In [1]:
# some standard packages
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
from random import shuffle

# modelling packages
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
# from tensorflow.keras.optimizers import RMSprop
# from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau 
# from tensorflow.keras.preprocessing.image import ImageDataGenerator
# from tensorflow.keras.models import model_from_json
# from tensorflow.keras.models import load_model

# Importing tf tools
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model, Sequential

# Model Evaluation
from sklearn.metrics import classification_report, confusion_matrix


Utilize the cell below only if using all of the Data Loader file instead of using the Keras Image Generator.

In [2]:
# Open the data files
# X = pickle.load(open('X.pickle', 'rb'))
# y = pickle.load(open('y.pickle', 'rb'))

In [3]:
os.getcwd()

'/home/ec2-user/SageMaker/InvasiveID-Model'

In [13]:
categories = ['class_0', 'class_1', 'class_2', 'class_3']

In [15]:
# Creating a nested for-loop that will move 20% of the images for each class into a test set
for each_category in categories: 
    
    # Make the directory
    #os.mkdir(path + f'test/{each_category}')
    
    # Getting a list of the images
    list_images = [file for file in os.listdir(f'/home/ec2-user/Sagemaker/data/BC-images-clean/{each_category}') if file.endswith('.jpg')]
    
    # Randomly shuffling the order 
    shuffle(list_images)
    
    # Getting the names of the first 20% of images
    num_test_images = int(len(list_images)*0.2)
    for_testset = list_images[ : num_test_images]
    
    # Looping through each image in this list 'for_testset'
    for each_image in for_testset:
        
        # Renaming the file path to move those image to test set
        os.rename(f'/home/ec2-user/Sagemaker/data/BC-images-clean/{each_category}/{each_image}', 
                  f'/home/ec2-user/Sagemaker/data/BC-images-clean/test/{each_category}/{each_image}')
        
        # Sanity check
        print(f'{each_image} moved to "{each_category}" in test.')
    
    # Another check
    print(f'{each_category} completed.')

10087202275_c4f81c0677_q.jpg moved to "class_0" in test.
49712435158_399492654d_q.jpg moved to "class_0" in test.
28240249143_6a64e83012_q.jpg moved to "class_0" in test.
6335417120_cca0a227ce_q.jpg moved to "class_0" in test.
42270525730_5b8c52e89d_q.jpg moved to "class_0" in test.
19322657711_9cb82c1412_q.jpg moved to "class_0" in test.
14931400283_c59efb8b09_q.jpg moved to "class_0" in test.
7407987842_1f970588b2_q.jpg moved to "class_0" in test.
5600722309_819b5f524b_q.jpg moved to "class_0" in test.
29547980327_95d33e5321_q.jpg moved to "class_0" in test.
16470575575_685c47a41b_q.jpg moved to "class_0" in test.
14510174863_47a82e2418_q.jpg moved to "class_0" in test.
50046758966_304418869a_q.jpg moved to "class_0" in test.
2131971052_a4768cf518_q.jpg moved to "class_0" in test.
13454254984_aeab44985b_q.jpg moved to "class_0" in test.
2978177831_de9bacc44d_q.jpg moved to "class_0" in test.
2989142167_3b304632ac_q.jpg moved to "class_0" in test.
5453880238_f7038228a3_q.jpg moved to 

In [None]:
test


In [16]:
# Use the Keras ImageDataGenerator for memory efficiency and preprocessing ease
# This process replaces the method of obtaining our data via DataLoader.ipynb
train_datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=0.2,
)

In [17]:
test_datagen = ImageDataGenerator(rescale = 1./255)

In [6]:
# train_img_dir = 'Data/train/'
# test_img_dir = 'Data/test/'

In [18]:
batch_size = 32

In [8]:
# # Open the data files
# train = pickle.load(open('train_labels.pickle', 'rb'))
# test = pickle.load(open('test_labels.pickle', 'rb'))

In [9]:
# train.head()

In [10]:
# train['Label'].value_counts()

In [11]:
# train.shape

In [20]:
train_generator = train_datagen.flow_from_directory(
                                                    '/home/ec2-user/Sagemaker/data/BC-images-clean',
                                                    target_size=(32, 32),
                                                    color_mode='rgb',
                                                    batch_size=batch_size,
                                                    class_mode='categorical',
                                                    shuffle=True,
                                                    subset='training')

Found 6729 images belonging to 2 classes.


In [21]:
validation_generator = train_datagen.flow_from_directory(
                                                        '/home/ec2-user/Sagemaker/data/BC-images-clean/train/',
                                                        target_size=(32, 32),
                                                        color_mode='rgb',
                                                        batch_size=batch_size,
                                                        class_mode='categorical',
                                                        shuffle=False,
                                                        subset='validation'
                                                        )

Found 1344 images belonging to 4 classes.


In [22]:
test_generator = test_datagen.flow_from_directory(
                                                  '/home/ec2-user/Sagemaker/data/BC-images-clean/test/',
                                                  target_size=(32, 32),
                                                  color_mode='rgb',
                                                  batch_size=batch_size,
                                                  class_mode='categorical',
                                                  shuffle=False)

Found 1680 images belonging to 4 classes.


In [23]:
# Saving the number of stepsizes for the training, validation and test sets 
train_stepsize = train_generator.samples//train_generator.batch_size 

valid_stepsize = validation_generator.samples//validation_generator.batch_size 

test_stepsize = test_generator.samples//test_generator.batch_size 

# Sanity check 
print(f'Training step size = {train_stepsize} \nValidation step size = {valid_stepsize} \nTest step size = {test_stepsize}')

Training step size = 210 
Validation step size = 42 
Test step size = 52


In [24]:
# 1.  Import the pretrained VGG16 network, do not include the top layers
pretrained = VGG16(weights='imagenet', include_top=False, pooling='max', input_shape=(32, 32, 3))

# 2.  Setting all layers to not trainable so weights wont be tweaked
for layer in pretrained.layers:
    layer.trainable=False
    
# Display VGG16 architecture
pretrained.summary()

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 32, 3)]       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 32, 32, 64)        1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 32, 32, 64)        36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 16, 16, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 16, 16, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 16, 16, 128) 

In [25]:

# Instantiate the NN model 
weeds_model = Sequential()

# Add the pretrained layers 
weeds_model.add(pretrained)

# Add fully-connected dense layers -- plus a dropout layer to help prevent overfitting
weeds_model.add(Dense(256, activation='relu'))
weeds_model.add(Dropout(0.5))
weeds_model.add(Dense(512, activation='relu'))

# Adding our activation 
weeds_model.add(Dense(4, activation='softmax'))

In [26]:
# Initiate early stop based on validation accuracy
ES = EarlyStopping(monitor='val_acc', patience=5, mode='auto', min_delta=0.0001, verbose=1)

In [27]:
# Istantiating Adam optimizer with a learning rate of 0.0001 and saving to variable 'optim'
optim = Adam(lr=0.0001)

# Compiling the CNN model 
weeds_model.compile(optimizer=optim, loss='categorical_crossentropy', metrics=['acc'])

# Summary 
weeds_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Model)                (None, 512)               14714688  
_________________________________________________________________
dense (Dense)                (None, 256)               131328    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               131584    
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 2052      
Total params: 14,979,652
Trainable params: 264,964
Non-trainable params: 14,714,688
_________________________________________________________________


In [28]:
# Fitting the model to the training data
history = weeds_model.fit_generator(generator=train_generator,
                                steps_per_epoch=train_stepsize,
                                epochs=30,
                                validation_data=validation_generator,
                                validation_steps=valid_stepsize,
                                callbacks=[ES])

Instructions for updating:
Please use Model.fit, which supports generators.
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 210 steps, validate for 42 steps
Epoch 1/30


InvalidArgumentError:  logits and labels must be broadcastable: logits_size=[32,4] labels_size=[32,2]
	 [[node loss/dense_2_loss/softmax_cross_entropy_with_logits (defined at <ipython-input-28-c027add54467>:7) ]] [Op:__inference_distributed_function_1705]

Function call stack:
distributed_function


In [None]:
# Getting bestmodel's predictions (as probabilities) on the test set 
test_probas = weeds_model.predict_generator(test_generator, steps=test_stepsize)

# Setting the model's class prediction as the class that received the highest probability for each image
test_predictions = test_probas.argmax(axis=1)

In [None]:
# Getting the true class labels for the test set
test_true = test_generator.classes

# Sanity check 
test_true

In [None]:
# Look at what our model predicted
test_predictions

In [None]:
# Displaying the classification report for the test set
print('Classification Report\n \n', classification_report(test_true, test_predictions, target_names=categories))

In [None]:
# Get a confusion matrix 
test_matrix = pd.DataFrame(confusion_matrix(test_true, test_predictions), 
                           columns=['Predicted ' + cat_name for cat_name in categories], 
                           index=['True ' + cat_name for cat_name in categories])

# Plotting as a heatmap 
plt.figure()
sns.heatmap(test_matrix, cmap='Blues', annot=True, fmt='g')
plt.title('Normalized Confusion Matrix: Test Data')
plt.show()

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y) 

In [None]:
# # Building the Model
# model = Sequential()

# # # 3 convolutional layers
# model.add(Conv2D(28, (3,3), input_shape = (28,28,3)))
# model.add(Activation('relu'))
# model.add(MaxPooling2D(pool_size=(2,2)))

# model.add(Conv2D(64, (3, 3)))
# model.add(Activation('relu'))
# model.add(MaxPooling2D(pool_size=(2, 2)))

# model.add(Conv2D(64, (3,3)))
# model.add(Activation('relu'))
# model.add(MaxPooling2D(pool_size=(2,2)))
# model.add(Dropout(0.25))

# # # 2 hidden layers
# model.add(Flatten())
# model.add(Dense(128))
# model.add(Activation('relu'))

# model.add(Dense(128))
# model.add(Activation('relu'))

# # # The output layer with 9 neurons for 9 classes
# model.add(Dense(9))
# model.add(Activation('softmax'))

# # # Compiling the model using some basic parameters
# model.compile(loss='sparse_categorical_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy'])

In [None]:
train_generator.filenames

In [None]:
# # Train the model
# history = model.fit_generator(generator=train_generator,
#                     steps_per_epoch=(11209) // batch_size,
#                     epochs=50, 
#                     validation_data=validation_generator,
#                     validation_steps=(2798) // batch_size,
#                     callbacks=[
#                         EarlyStopping(patience=3, restore_best_weights=True),
#                         ReduceLROnPlateau(patience=2)],
#                     verbose=1)