# Data Extraction Notebook
This notebook gathers all the training and test images, converts them to NumPy arrays, and stores them in a single archive file for easier use while training the CNN. Images are resized to 32x32x3.

In [None]:
import os
import piexif
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
import sys

In [None]:
# Images are resized to a square, so a single value (in pixels) is used for both width and height.
img_size = 32
# Root directories passed to PreProcess_Images; replace the placeholders before running.
# For training data, the name of each image's parent directory is used as its class label.
train_dir = "{Insert directory with training images here}"
test_dir = "{Insert directory with test images here}"

In [None]:
def PreProcess_Images(root, train=True):
    """Load every image under ``root`` into a scaled float32 feature array.

    The directory tree is walked recursively. Each image has its EXIF data
    stripped, is decoded with OpenCV, resized to ``img_size`` x ``img_size``
    (module-level setting) and scaled to the range [0, 1].

    NOTE: ``piexif.remove`` rewrites the image files on disk in place.

    Zero-byte files and files OpenCV cannot decode are skipped, so the
    returned features and labels always stay aligned one-to-one.

    Args:
        root: Directory to walk for image files.
        train: When truthy, labels are one-hot class vectors derived from the
            name of the directory that contains each image ("1" -> [1,0,0],
            "2" -> [0,1,0], anything else -> [0,0,1]). When falsy, labels are
            the image file names, used later to format the submission data.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is a float32 NumPy array
        of the stacked images divided by 255. ``labels`` is a uint8 NumPy
        array of one-hot rows when ``train`` is truthy, otherwise a plain
        list of file names.
    """
    # One-hot encodings keyed by class directory name; every other directory
    # name falls into the third class.
    one_hot = {"1": [1, 0, 0], "2": [0, 1, 0]}
    other_class = [0, 0, 1]

    features = []
    labels = []

    for subdir, _dirs, files in os.walk(root):
        # Drop macOS Finder metadata up front so the progress total only
        # counts files that are actually processed.
        image_files = [name for name in files if name != ".DS_Store"]
        tot_files = len(image_files)

        for count, file in enumerate(image_files, start=1):
            sys.stdout.write("\rFile = " + file + " ----- Progress: {:2.1f}%".format(100 * count / float(tot_files)))
            # Flush right away so the progress line also updates for files
            # that end up being skipped below.
            sys.stdout.flush()
            img = os.path.join(subdir, file)

            # Zero-byte files cannot be decoded; skip them.
            if os.stat(img).st_size == 0:
                continue

            # Remove exif data - This is because there is a lot of corrupt
            # exif data in this dataset.
            piexif.remove(img)

            # Open the image. cv2.imread signals an unreadable or unsupported
            # file by returning None instead of raising, which would make
            # cv2.resize fail; skip such files rather than abort the run.
            im = cv2.imread(img)
            if im is None:
                sys.stderr.write("\nSkipping unreadable image: " + img + "\n")
                continue

            # Resize to the designated width and height and store the pixels
            # as a float32 array.
            im = cv2.resize(im, (img_size, img_size))
            features.append(np.array(im, dtype=np.float32))

            if train:
                # Label comes from the directory name; copy the template so
                # rows never share one list object.
                class_name = os.path.basename(subdir)
                labels.append(list(one_hot.get(class_name, other_class)))
            else:
                # Keep the image filename for formatting of submission data.
                labels.append(os.path.basename(img))

    if train:
        labels = np.array(labels, np.uint8)

    # Scale 8-bit pixel values into [0, 1].
    features = np.array(features, np.float32) / 255.

    return features, labels

In [None]:
# Get the training features X and labels y; with train=True the labels are
# one-hot vectors derived from the name of each image's directory.
X, y = PreProcess_Images(train_dir, train=True)

In [None]:
# Get the test features and the filenames of the test images; with train=False
# the second return value is the list of image filenames rather than class labels.
X_test, flnm_test = PreProcess_Images(test_dir, train=False)

In [None]:
# Split the training data into training (80%) and validation (20%) sets.
# The fixed random_state makes the split reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Save all information to a NumPy archive file to be used when training the model.
# np.savez appends ".npz" to the filename, and because the arrays are passed
# positionally they are stored under the keys arr_0 ... arr_5 in the order given here.
np.savez('data_arrays', X_train, y_train, X_valid, y_valid, X_test, flnm_test)