In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from keras.models import Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import models
from tensorflow.keras import optimizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# 1. Initialize functions and generator for training and evaluation

In [None]:
def myGenerator(gen1, gen2, batch_size=32, max_redraws=10):
    """Combine generators of imbalanced datasets into one.

    Every yielded batch stacks one batch from ``gen1`` on top of one batch
    from ``gen2``, widens both one-hot label arrays to a shared label space
    (``gen1``'s classes first, then ``gen2``'s) and shuffles the rows.

    Args:
        gen1 (generator): First generator of image data split
            by classes; yields ``(images, one_hot_labels)``
        gen2 (generator): Second generator of image data split
            by classes; yields ``(images, one_hot_labels)``
        batch_size (int): The combined number of samples of a yielded
            batch should be a multiple of this value. Defaults to 32.
        max_redraws (int): Maximum number of misaligned pairs discarded
            per yielded batch. When the limit is reached the most recently
            drawn pair is yielded as is, so the loop can never spin
            forever on sources that cannot line up. Defaults to 10.

    Returns:
        gen3 (generator): Generator that combines 2 generators
    """
    while True:
        img_data1, label1 = next(gen1)
        img_data2, label2 = next(gen2)

        # Discard pairs whose combined size is off (typically the short
        # final batch of an epoch).  The re-drawn pair is checked again,
        # because it can be misaligned as well.
        redraws = 0
        while ((label1.shape[0] + label2.shape[0]) % batch_size != 0
               and redraws < max_redraws):
            img_data1, label1 = next(gen1)
            img_data2, label2 = next(gen2)
            redraws += 1
        img_data_c = np.concatenate((img_data1, img_data2))

        # Widen both label arrays to the joint label space: columns
        # [0, n_classes1) belong to gen1, the remaining ones to gen2.
        # Slicing with the positive width keeps this correct even when
        # gen2 contributes zero label columns.
        n_classes1 = label1.shape[1]
        label_size = n_classes1 + label2.shape[1]
        new_labels1 = np.zeros((label1.shape[0], label_size))
        new_labels1[:, :n_classes1] = label1
        new_labels2 = np.zeros((label2.shape[0], label_size))
        new_labels2[:, n_classes1:] = label2

        labels_c = np.concatenate((new_labels1, new_labels2), axis=0)

        # Apply one permutation to images and labels so pairs stay aligned.
        shuffler = np.random.permutation(labels_c.shape[0])
        labels_c_shuffled = labels_c[shuffler]
        img_data_c_shuffled = img_data_c[shuffler]

        yield img_data_c_shuffled, labels_c_shuffled

def plot_accuracy(history_obj, title, ylim=(0.1, 0.9)):
    """Plots trained ML model accuracy scores.

    Draws the per-epoch training and validation categorical accuracy as
    two lines on one figure and shows it.

    Args:
        history_obj (obj): Output object from trained ML model; its
            ``history`` mapping must contain the keys
            ``'categorical_accuracy'`` and ``'val_categorical_accuracy'``
        title (str): The desired title of the plot
        ylim (tuple): ``(lower, upper)`` limits of the accuracy axis.
            Defaults to ``(0.1, 0.9)``; widen it when scores fall outside
            that window, otherwise the curve is cut off.

    Returns:
        None
    """
    # One long-format frame per data split: epoch number, score, split name.
    split_frames = []
    for metric_key, split_name in (
            ('categorical_accuracy', 'training'),
            ('val_categorical_accuracy', 'validation')):
        scores = history_obj.history[metric_key]
        split_df = pd.DataFrame({
            'epoch': list(range(1, len(scores) + 1)),
            'accuracy': scores})
        split_df['Data'] = split_name
        split_frames.append(split_df)

    # ignore_index avoids duplicate row labels in the combined frame.
    data_m_df = pd.concat(split_frames, axis=0, ignore_index=True)

    plt.figure(figsize=(12, 5))
    sns.lineplot(data=data_m_df, x='epoch', y='accuracy', hue='Data')
    plt.xlabel('Epoch', fontsize=16)
    plt.ylabel('Accuracy', fontsize=16)
    plt.title(title, fontsize=18)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.ylim(ylim)
    plt.show()
    
# Set up plot styling
plt.style.use('fivethirtyeight')
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-<name>' and later releases dropped the old names, so use
# whichever spelling the installed version provides.
_notebook_style = 'seaborn-notebook'
if _notebook_style not in plt.style.available:
    _notebook_style = 'seaborn-v0_8-notebook'
plt.style.use(_notebook_style)

In [None]:
# Initialize training generator
# A single augmenting ImageDataGenerator feeds both training directories;
# the two directory streams are then merged by myGenerator.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=90,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=30,
    zoom_range=0.1,
    brightness_range=[0.7, 1.0],
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest')

# Settings shared by both training streams; only the directory and the
# per-stream batch size differ (26 + 6 = 32 samples per combined batch).
train_flow_kwargs = dict(
    target_size=(200, 200),
    class_mode='categorical',
    shuffle=True,
    seed=15)
train_generator_1 = train_datagen.flow_from_directory(
    'model_dataset/train/augment', batch_size=26, **train_flow_kwargs)
train_generator_2 = train_datagen.flow_from_directory(
    'model_dataset/train/no_augment', batch_size=6, **train_flow_kwargs)
train_generator = myGenerator(train_generator_1, train_generator_2)

# Initialize validation generator
# Validation images are only rescaled, never augmented.
valid_datagen = ImageDataGenerator(rescale=1./255)
valid_generator = valid_datagen.flow_from_directory(
    directory='model_dataset/validation',
    target_size=(200, 200),
    class_mode='categorical',
    batch_size=32,
    shuffle=True)

# Initialize test generator
# One image per batch and no shuffling, so the images arrive in a fixed
# order.
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory(
    'model_dataset/test',
    target_size=(200, 200),
    class_mode='categorical',
    batch_size=1,
    shuffle=False,
    seed=42)

Found 280 images belonging to 4 classes.
Found 440 images belonging to 1 classes.
Found 75 images belonging to 5 classes.
Found 75 images belonging to 5 classes.


In [None]:
# Number of combined training batches drawn per epoch when fitting the CNN;
# the same count is reused later as the number of training batches collected
# for feature extraction.
steps_per_epoch = 10

# 2. Train CNN model for feature extraction

In [None]:
# Weight Layers: 7 | Activation: Relu | Max Filter: 256 | Max FCN: 128
# Convolutional feature extractor: three stages of two 3x3 convolutions
# followed by 2x2 max pooling (64 -> 128 -> 256 filters), then flattened.
feat_extract = models.Sequential([
    layers.Conv2D(64, (3, 3), activation='relu',
                  input_shape=(200, 200, 3)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
])

# Classification head stacked on the extractor's output: one hidden dense
# layer with dropout, then a 5-way softmax.
x = layers.Dense(128, activation='relu')(feat_extract.output)
x = layers.Dropout(0.2)(x)
prediction_layer = layers.Dense(5, activation='softmax')(x)

ResourceExhaustedError: OOM when allocating tensor with shape[112896,128] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:RandomUniform]

In [None]:
# Fit and train CNN model
# Build the end-to-end classifier with tensorflow.keras' own Model class so
# that it comes from the same Keras implementation as `layers` and `models`
# above, instead of mixing in the standalone `keras` package.
# The model runs from the feature extractor's input to the softmax head, so
# fitting it also updates the layers used later for feature extraction.
vgg16 = models.Model(inputs=feat_extract.input, outputs=prediction_layer)
vgg16.compile(loss='categorical_crossentropy', optimizer='rmsprop',
              metrics=['CategoricalAccuracy'])

history_vgg16 = vgg16.fit(train_generator, steps_per_epoch=steps_per_epoch,
                          epochs=75, validation_data=valid_generator,
                          validation_steps=2, verbose=0)

In [None]:
def _collect_batches(generator, n_batches):
    """Draw ``n_batches`` batches from ``generator`` and stack them.

    Args:
        generator (generator): Yields ``(images, one_hot_labels)`` batches
        n_batches (int): Number of batches to draw; must be at least 1

    Returns:
        tuple: ``(images, class_indices)`` where ``images`` is all drawn
            image batches concatenated along the first axis and
            ``class_indices`` is the argmax of the one-hot labels, in the
            same order.
    """
    # Collect in lists and concatenate once; growing an array with
    # np.append inside the loop copies everything gathered so far on every
    # iteration.
    image_batches = []
    label_batches = []
    for _ in range(n_batches):
        X_sub, y_sub = next(generator)
        image_batches.append(X_sub)
        label_batches.append(np.argmax(y_sub, axis=1))
    return (np.concatenate(image_batches, axis=0),
            np.concatenate(label_batches, axis=0))


# Extract train features for RF model
X_train, y_train = _collect_batches(train_generator, steps_per_epoch)
X_ext_train = feat_extract.predict(X_train)

# Extract validation features for RF model
# Start from the first batch of an epoch and draw exactly one epoch's worth
# of batches, so every validation image is used once.
valid_generator.reset()
X_val, y_val = _collect_batches(valid_generator, len(valid_generator))
X_ext_val = feat_extract.predict(X_val)

# Extract test features for RF model
# The labels come from `test_generator.classes`, so the images must be read
# from the start of the (unshuffled) iterator to stay aligned with them.
test_generator.reset()
y_test = test_generator.classes
X_test, _ = _collect_batches(test_generator, len(test_generator))
X_ext_test = feat_extract.predict(X_test)

# 3. Evaluate RF models

In [None]:
# Class names in the key order of the test generator's class-index mapping;
# used as `target_names` for the classification reports.
# NOTE(review): training labels are produced by myGenerator, which puts the
# classes of its first generator before those of its second -- confirm that
# this ordering matches the test directory's class ordering.
class_labels = list(test_generator.class_indices.keys())

## Without grid search

In [None]:
# Initialize random forest classifier
# Baseline forest without depth limit, trained on the CNN-extracted
# training features.
rfc = RandomForestClassifier(n_estimators=1000,
                             class_weight='balanced',
                             n_jobs=-1)
rfc.fit(X_ext_train, y_train)

# Check for overfitting
# Scores on the data the forest was fitted on; compare with validation.
y_train_pred = rfc.predict(X_ext_train)
train_report = classification_report(
    y_train, y_train_pred, target_names=class_labels)
print(train_report)

In [None]:
# Evaluate model
# Per-class precision/recall/F1 of the fitted forest on validation features.
y_val_pred = rfc.predict(X_ext_val)
val_report = classification_report(
    y_val, y_val_pred, target_names=class_labels)
print(val_report)

In [None]:
# Reduce overfitting
# Sweep max_depth and compare the mean per-class precision-recall AUC on
# the training and validation features.
average_auc = []
# Fit the encoder on the training labels only and reuse it for validation,
# so both one-hot matrices share the column order of the classes the forest
# was trained on (the order of its predict_proba columns).  Re-fitting on
# the validation labels would shift columns whenever a class is absent
# there.  Labels unseen during training are encoded as all-zero rows.
enc = OneHotEncoder(handle_unknown='ignore')
y_train_OHE = enc.fit_transform(y_train.reshape(-1, 1)).toarray()
y_val_OHE = enc.transform(y_val.reshape(-1, 1)).toarray()
n_classes = y_train_OHE.shape[1]
for max_depth in range(1, 32):
    rfc = RandomForestClassifier(n_jobs=-1,
                                 class_weight='balanced',
                                 n_estimators=1000,
                                 max_depth=max_depth)
    rfc.fit(X_ext_train, y_train)
    y_train_score = rfc.predict_proba(X_ext_train)
    y_val_score = rfc.predict_proba(X_ext_val)

    train_auc_list = []
    val_auc_list = []
    # One-vs-rest precision-recall curve for each class
    for i in range(n_classes):
        precision, recall, _ = precision_recall_curve(y_train_OHE[:, i],
                                                      y_train_score[:, i])
        train_auc = auc(recall, precision)
        train_auc_list.append(train_auc)

        precision, recall, _ = precision_recall_curve(y_val_OHE[:, i],
                                                      y_val_score[:, i])
        val_auc = auc(recall, precision)
        val_auc_list.append(val_auc)
    average_auc.append([max_depth, np.mean(train_auc_list), np.mean(val_auc_list)])
average_auc_df = pd.DataFrame(average_auc,
                              columns=['max_depth', 'average train auc', 'average val auc'])
sns.lineplot(x='max_depth', y='average train auc', data=average_auc_df, label='training')
sns.lineplot(x='max_depth', y='average val auc', data=average_auc_df, label='validation')
plt.ylabel('auc score')
plt.xlabel('max_depth')
plt.show()

## With grid search

In [None]:
# Initialize random forest classifier
rfc = RandomForestClassifier(n_jobs=-1,
                             class_weight='balanced',
                             n_estimators=1000)
criterion = ['gini', 'entropy']
# 'auto' is left out: for a classifier it means the same as 'sqrt', and
# recent scikit-learn releases no longer accept it.
max_features = ['sqrt', 'log2']
grid = {'max_features': max_features,
        'criterion': criterion}
grid_search = GridSearchCV(estimator=rfc, param_grid=grid,
                           cv=5, n_jobs=-1, verbose=2)

# Fit grid search
grid_search.fit(X_ext_train, y_train)
print('Best Parameters:')
print(grid_search.best_params_)

# Check for overfitting
# GridSearchCV fits clones of `rfc`, so `rfc` itself stays unfitted;
# predict through the search object, which uses the refitted best estimator.
y_train_pred = grid_search.predict(X_ext_train)
print(classification_report(y_train, y_train_pred,
                            target_names=class_labels))

In [None]:
# Initialize random forest classifier with max_depth
rfc = RandomForestClassifier(n_jobs=-1,
                             class_weight='balanced',
                             n_estimators=1000,
                             max_depth=2)
criterion = ['gini', 'entropy']
# 'auto' is left out: for a classifier it means the same as 'sqrt', and
# recent scikit-learn releases no longer accept it.
max_features = ['sqrt', 'log2']
grid = {'max_features': max_features,
        'criterion': criterion}
grid_search = GridSearchCV(estimator=rfc, param_grid=grid,
                           cv=5, n_jobs=-1, verbose=2)

# Fit grid search
grid_search.fit(X_ext_train, y_train)
print('Best Parameters:')
print(grid_search.best_params_)

# Evaluate model
# Predictions come from the search object, i.e. from the best estimator
# refitted on the full training features.
y_val_pred = grid_search.predict(X_ext_val)
print(classification_report(y_val, y_val_pred,
                            target_names=class_labels))

# 4. Evaluate best RF model

In [None]:
# Fit optimal model
# Hyper-parameters of the final forest, gathered in one place.
optimal_rf_params = dict(
    n_estimators=1000,
    max_depth=2,
    max_features='log2',
    class_weight='balanced',
    n_jobs=-1)
rfc = RandomForestClassifier(**optimal_rf_params)
rfc.fit(X_ext_train, y_train)

In [None]:
# Check for overfitting
# Report on the training features the final forest was fitted on.
y_train_pred = rfc.predict(X_ext_train)
train_report = classification_report(
    y_train, y_train_pred, target_names=class_labels)
print(train_report)

In [None]:
# Evaluate model
# Per-class report of the final forest on the held-out test features.
y_test_pred = rfc.predict(X_ext_test)
test_report = classification_report(
    y_test, y_test_pred, target_names=class_labels)
print(test_report)

# Confusion matrix: rows are true classes, columns are predicted classes.
# NOTE(review): the axis names below are hard-coded, while the report above
# takes its names from `class_labels` -- confirm both lists are in the same
# class order.
c_labels = ['basil', 'oregano', 'parsley', 'thyme', 'non-spice']
cm = confusion_matrix(y_test, y_test_pred)
df_cm = pd.DataFrame(cm, index=c_labels, columns=c_labels)

plt.figure(figsize=(10, 7))
sns.heatmap(df_cm, annot=True, cmap=plt.cm.Blues)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('True', fontsize=14)
plt.show()