## Import libraries

In [141]:
# perform train test split & create new directories for train test split
import os
from shutil import copyfile
from random import seed
from random import random

# convert train images to array
import numpy as np
from numpy import asarray
from numpy import save
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array

# SMOTE
from imblearn.over_sampling import SMOTE

# train validation split
from sklearn.model_selection import train_test_split

1. To execute SMOTE
from imblearn.over_sampling import SMOTE
'sm = SMOTE(random_state=42)
'X_res, y_res = sm.fit_resample(X_train, y_train)

'model.fit(X_smote, y_smote)'
=> X_train and y_train need to be in array form, hence need convert image data into array data

Step 1: split into train and test dataset first
Step 2: convert train image data into array data
Step 3: execute SMOTE
Step 4: train model with SMOTEd inputs + imagedatagenerator augmentation

## Train - Test Split

In [113]:
# perform train test split & create new directories for train test split

seed(1) # seed random number generator
val_ratio = .20 # set ratio of pictures to use for validation

newdir = 'train_test_dataset/'
subdirs = ['train/','test/']
for sub in subdirs:
    labeldirs=['good/','bad/']
    for labeldir in labeldirs:
        os.makedirs(newdir + sub + labeldir, exist_ok=True)
copy_from_dir = 'good_bad_carton_images/'
copy_from_folders_list = os.listdir(copy_from_dir) #['internal_bad','internal_good','webscraped_bad','webscraped_good']

for folder in copy_from_folders_list:
    copy_from_folder = copy_from_dir + folder
    for file in os.listdir(copy_from_folder):
        src = copy_from_folder + '/' + file
        dst_dir = 'train/'
        if random() < val_ratio:
            dst_dir = 'test/'
        if folder in ['internal_bad','webscraped_bad']:
            dst = newdir + dst_dir + 'bad/' + file
        else:
            dst = newdir + dst_dir + 'good/' + file
        copyfile(src, dst)

In [114]:
# check train test split
num_test_img = len(os.listdir('train_test_dataset/test/good')) + len(os.listdir('train_test_dataset/test/bad'))
num_train_img = len(os.listdir('train_test_dataset/train/good')) + len(os.listdir('train_test_dataset/train/bad'))
print('# of test images:', num_test_img)
print('# of train images:', num_train_img)
print('actual test ratio: ', num_test_img/num_train_img)

# 24.2% of images (438) are put away for testing the model after training

# of test images: 438
# of train images: 1810
actual test ratio:  0.2419889502762431


## Resizing and converting images to numpy array

In [143]:
# resize train images to 224x224 (size required for VGG16) and convert train images to numpy array

images, labels = list(), list()
train_dir = 'train_test_dataset/train/'  
train_subdirs = os.listdir(train_dir) # ['bad', 'good']
for subdir in train_subdirs:
    # determine class, 0 is bad, 1 is good
    output=0
    if subdir == 'good':
        output=1
    # enumerate images in each subdir
    for file in os.listdir(train_dir + subdir):
        # load image
        image = load_img(train_dir + subdir + '/' + file, target_size=(224,224))
        # convert to numpy array
        image = img_to_array(image)
        # store
        images.append(image)
        labels.append(output)
X_train = np.array(images)
y_train = np.array(labels)

In [144]:
# check shape and class count before SMOTE

print(X_train.shape)
print(np.bincount(y_train))

(1810, 224, 224, 3)
[1095  715]


## SMOTE

In [145]:
#need to reshape X_train from shape (a,b,c,d) into (a,b) for SMOTE
X_train = X_train.reshape(1810, 224*224*3)

In [146]:
# executing SMOTE; res stands for resampled - to denote training dataset balanced by SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [147]:
# check shape and class count after SMOTE
# note that the class count is now balanced; this means SMOTE oversampling is successful
print(X_res.shape)
print(np.bincount(y_res))

(2190, 150528)
[1095 1095]


In [148]:
#reshape back from (a,b) to (a,b,c,d)
X_res = X_res.reshape(2190, 224, 224, 3)

## Further splitting train set into train-validation test set

In [149]:
#Train-validation split for train and validation sets
#15/85 split with random state of 32
X_train, X_val, y_train, y_val = train_test_split(X_res, y_res, random_state=32, test_size = 0.15, stratify=y_res)

In [151]:
print('X_train: ', X_train.shape)
print('y_train: ', y_train.shape)
print('X_val: ', X_val.shape)
print('y_val: ', y_val.shape)
print(np.bincount(y_train))
print(np.bincount(y_val))

X_train:  (1861, 224, 224, 3)
y_train:  (1861,)
X_val:  (329, 224, 224, 3)
y_val:  (329,)
[931 930]
[164 165]


In [152]:
# export train validation data
np.save('X_train.npy', X_train)
np.save('y_train.npy', y_train)
np.save('X_val.npy', X_val)
np.save('y_val.npy', y_val)

In [140]:
# export data
np.save('X_res.npy', X_res)
np.save('y_res.npy', y_res)