# Organize Images
Since we can't load all the images into memory for training, it is better to organize the image directory so that we can use the `image_generator.flow_from_directory` method.

This notebook will organize the images directory into the following layout:
- images:
    - training
        - train
            - mask
            - unmask
        - validation
            - mask
            - unmask
    - test
    
**This notebook is meant to be run only once. It moves the image files out of `images/`, so running it a second time may fail with an error.**

In [1]:
import pandas as pd
import shutil
import os

from sklearn.model_selection import train_test_split

In [2]:
# load the table that pairs each image file name with its target label
df = pd.read_csv(filepath_or_buffer='train_labels.csv')

In [3]:
# preview the first five rows of the label table
df.head(n=5)

Unnamed: 0,image,target
0,tfymlmdkpzkqdjhdxyhnoeuqszxphw.jpg,0
1,rxgismgsvmaayzjarbfjaljhqmpbrt.jpg,1
2,uuzshfrhkgrkolhwdvliqauzulurnz.jpg,0
3,mjspxsagzusaznvnyxgamtrlqkqklp.jpg,0
4,rlbmuajgezfiddjzlyeoupxpqubkpt.jpg,1


In [4]:
# Split the labelled data into a training set (80%) and a validation set (20%).
# random_state pins the shuffle so the same split is produced after a kernel
# restart; stratify keeps the mask/unmask ratio equal in both subsets.
train_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['target'],
)

In [5]:
# Collect the image file names of each split by label:
# target == 0 -> unmask, target == 1 -> mask.
train_unmask = train_df.loc[train_df['target'] == 0, 'image'].tolist()
train_mask = train_df.loc[train_df['target'] == 1, 'image'].tolist()

validation_unmask = validation_df.loc[validation_df['target'] == 0, 'image'].tolist()
validation_mask = validation_df.loc[validation_df['target'] == 1, 'image'].tolist()

In [6]:
# Paths of the directory tree this notebook builds under images/.
# top-level training directory
training_path = "images/training/"

# train and validation directories inside the training directory
train_path = "images/training/train"
validation_path = "images/training/validation"

# mask and unmask directories inside the train and validation directories
train_mask_path = "images/training/train/mask/"
train_unmask_path = "images/training/train/unmask/"

validation_mask_path = "images/training/validation/mask/"
validation_unmask_path = "images/training/validation/unmask/"

# test directory
test_path = "images/test/"

# os.makedirs creates any missing parent directories, and exist_ok=True makes
# it a no-op for directories that already exist, so re-running this cell does
# not raise FileExistsError the way repeated os.mkdir calls would.
for directory in (
    training_path,
    train_path,
    validation_path,
    train_mask_path,
    train_unmask_path,
    validation_mask_path,
    validation_unmask_path,
    test_path,
):
    os.makedirs(directory, exist_ok=True)


In [7]:
def move_files_to_directory(files, directory_path, source_dir="images"):
    """
    Move every file named in `files` from `source_dir` into `directory_path`.

    Parameters
    ----------
    files : iterable of str
        File names, relative to `source_dir`.
    directory_path : str
        Destination directory. It is created (including parents) when it
        does not exist yet.
    source_dir : str, optional
        Directory the files currently live in. Defaults to "images", the
        location this function always read from before the parameter existed.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If a file is present in neither `source_dir` nor `directory_path`.
    shutil.Error
        If a file is present in both places (raised by `shutil.move`).
    """
    # Make sure the destination really is a directory. Given a destination
    # that does not exist, shutil.move uses it as the new *file* path, so
    # successive files would be renamed onto the same path.
    os.makedirs(directory_path, exist_ok=True)

    for file in files:
        previous = os.path.join(source_dir, file)
        # shutil.move places the file under its base name inside the directory
        after = os.path.join(directory_path, os.path.basename(file))

        # Already moved by an earlier run: nothing left to do for this file.
        if not os.path.exists(previous) and os.path.exists(after):
            continue

        shutil.move(previous, directory_path)
        

In [8]:
# send each group of training/validation images to its matching directory
for image_names, destination in (
    (train_mask, train_mask_path),
    (train_unmask, train_unmask_path),
    (validation_mask, validation_mask_path),
    (validation_unmask, validation_unmask_path),
):
    move_files_to_directory(image_names, destination)


In [10]:
# the sample submission's `image` column names the test images
ss = pd.read_csv(filepath_or_buffer='SampleSubmission.csv')

In [11]:
# preview the first five rows of the sample submission
ss.head(n=5)

Unnamed: 0,image,target
0,aadawlxbmapqrblgxyzarhjasgiobu.png,0
1,abpxvdfyhaaohzcrngcucmhffwizxs.jpg,0
2,aclkjfvackiieiznzfcwienplielrj.jpg,0
3,aelkivmayxgsdjosiibfgmkvfrjvjk.jpg,0
4,aelzzshpfxerelefnkatpczktuxjln.jpg,0


In [12]:
# file names of the test images, taken from the sample submission
test_files = ss['image'].tolist()

In [15]:
# images_files = os.listdir('images/')
# for file in test_files:
#     if file not in images_files:
#         print(f"{file} not in image files")

# result of the check above: every test image listed in SampleSubmission.csv is present in images/

In [16]:
# relocate every test image into the test directory
move_files_to_directory(files=test_files, directory_path=test_path)