# Data Mining & Machine Learning - Final Project
## Rotten Fruit Classification

## Imports & Data Load

In [None]:
# Import the required Libraries
import numpy as np
import pandas as pd
import os
import cv2
from tqdm import tqdm
from random import shuffle
from sklearn.preprocessing import LabelEncoder
from numpy import expand_dims
from keras.preprocessing.image import load_img, img_to_array, array_to_img, ImageDataGenerator
import sys
import sklearn
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import math
import re
from PIL import Image
import imagehash

In [None]:
# This notebook is meant to be run on Google Colab.
# drive.mount makes Google Drive available below /content/drive, which is
# where the following cells read their data from and write their results to.

from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Data loader with list output
def load_labeled_fruits_data(path):
  '''
  Loads all images below the given dataset path and returns three parallel lists:
  1) the images 2) the labels 3) the name of the image_file

  path is expected to contain one sub-folder per class; the name of the
  sub-folder is used as label for every image inside it. Images are read with
  cv2 and converted from BGR to RGB.

  The result is shuffled with a fixed seed and folders/files are visited in
  sorted order, so repeated runs return the same order. Entries of path that
  are not folders and files that cv2 cannot decode are skipped.
  '''
  import random  # local import: the file only imports `shuffle` from this module

  # Kept for backward compatibility: later cells may rely on numpy being seeded here.
  # It does NOT influence the shuffle below, which uses the `random` module.
  np.random.seed(42)

  samples = []
  for cata in tqdm(sorted(os.listdir(path))): # go through the folders within the file path
    path_main = os.path.join(path, cata)
    if not os.path.isdir(path_main):
      continue
    for img_name in sorted(os.listdir(path_main)): # go through the files within the folder
      img_path = os.path.join(path_main, img_name)
      img = cv2.imread(img_path)
      if img is None: # cv2.imread returns None instead of raising for unreadable files
        print('Skipping unreadable file:', img_path)
        continue
      img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
      samples.append((img, cata, img_name)) # store image, folder_name and image_file_name

  print('Shuffling your data.....')
  # a dedicated, seeded generator makes the order reproducible without touching global state
  random.Random(42).shuffle(samples)

  X = [sample[0] for sample in samples]
  Y = [sample[1] for sample in samples] # folder_name will be the label
  img_names = [sample[2] for sample in samples]
  return X,Y,img_names

In [None]:
# The DMML_Project folder is available with the following link:
# https://drive.google.com/drive/folders/1iyMlsS_q1w8iVuEvoK77muTG9p1ihE9V?usp=sharing

# Load Kaggle data (X = images, Y = labels taken from the folder names, Z = file names)
X_k,Y_k,Z_k = load_labeled_fruits_data('/content/drive/My Drive/DMML_Project/Kaggle_Fruits_Data')
# The original data is available here:
# https://www.kaggle.com/datasets/sriramr/fruits-fresh-and-rotten-for-classification

# Load Fruitnet data (X = images, Y = labels taken from the folder names, Z = file names)
X_f,Y_f,Z_f = load_labeled_fruits_data('/content/drive/My Drive/DMML_Project/FruitNet_Data')
# The DMML_Project folder contains most of the Fruitnet data.
# The Fruitnet data is available here: https://data.mendeley.com/datasets/b6fftwbr2v/2
# All images except those of pomegranates were used. We decided not to use pomegranates
# as this data set was pre-augmented in a visually different way than the other fruits.

## Exploratory Analysis

In [None]:
def check_amounts(input_list):
  '''
  This function takes a list of fruit_labels as input and prints their counts

  For every distinct label (in the sorted order delivered by np.unique) one
  line with the absolute count and the share in percent (rounded to two
  decimals) is printed, followed by the total number of entries.
  Nothing is returned.
  '''
  total = len(input_list)
  # np.unique is evaluated once here instead of three times per printed line
  categories, counts = np.unique(np.array(input_list), return_counts=True)
  for category, count in zip(categories, counts):
    print("Amount of fruits in dataset classified as '" + str(category) + "': "
        + str(count) + " (" + str(round(count/total*100,2)) + "%)")
  print("")
  print("Total amount of pictures from dataset: " + str(total))

In [None]:
# Print the number of images per label (fruit + quality), separately for each dataset
print("Kaggle dataset:")
check_amounts(Y_k)
# two empty lines as visual separator between the two reports
print("")
print("")
print("Fruitnet dataset:")
check_amounts(Y_f)

In [None]:
def fruit_quality_count(fruits,quality):
  '''
  This function takes fruit labels and corresponding quality labels as input
  and prints the count of each type of fruit and of each quality

  fruits  : list with one fruit name per image
  quality : list with one quality label per image
  Two blocks are printed, separated by an empty line: first one line per fruit,
  then one line per quality, each with count and share in percent (rounded to
  two decimals). Nothing is returned.
  '''
  def report(values, prefix, suffix):
    # np.unique is evaluated once per list instead of three times per printed line
    total = len(values)
    names, counts = np.unique(np.array(values), return_counts=True)
    for name, count in zip(names, counts):
      print(prefix + str(name) + suffix + str(count) + " ("
          + str(round(count/total*100,2)) + "%)")

  report(fruits, "Total amount of ", "s: ")
  print("")
  report(quality, "Total amount of fruits declared as '", "': ")


In [None]:
# Print amount of images in high level categories
# Every label has the form "<fruit>_<quality>"; both datasets are evaluated together.
all_labels = Y_k + Y_f
fruits = []
quality = []
for label in all_labels:
  label_parts = label.split("_")
  fruits.append(label_parts[0])
  quality.append(label_parts[1])

fruit_quality_count(fruits,quality)

In [None]:
def fruitlist_generator(X,Y,catList,L,random_state=None):
  '''
  Creates a list with length L of fruit from the given categories

  From every category in catList, L//len(catList) randomly chosen images are
  taken (fewer if the category does not contain that many). A label belongs to
  a category when it contains the category name as a literal substring.

  X, Y         : parallel lists of images and labels
  catList      : category names; the output is grouped in this order
  L            : total number of images wanted
  random_state : optional seed passed on to DataFrame.sample to make the
                 selection reproducible (default: a different selection per call)
  returns      : two parallel lists (images, labels)
  '''
  images = []
  labels = []
  if not catList: # avoids a division by zero below
    return images, labels
  per_category = L//len(catList)
  temp = pd.DataFrame({"data":X,"label":Y})
  for cat in catList:
    # regex=False: the category name is meant as plain text, not as a pattern
    filtered = temp[temp["label"].str.contains(cat, regex=False)]
    # sample() returns a new frame, so its result has to be assigned to take effect
    filtered = filtered.sample(frac=1, random_state=random_state)
    images.extend(filtered["data"].values.tolist()[:per_category])
    labels.extend(filtered["label"].values.tolist()[:per_category])
  return images, labels

In [None]:
def show_subplot(X,Y=None):
  '''
  creates a plot of 6x6 images

  X : sequence of images, only the first 36 are shown
  Y : optional list or numpy array with one title per image; titles are only
      drawn when Y has exactly as many entries as X
  '''
  # Parentheses matter: `A or B and C` is read as `A or (B and C)`, which
  # skipped the length check for numpy arrays.
  show_titles = isinstance(Y,(np.ndarray,list)) and len(Y) == len(X)
  f, ax= plt.subplots(6,6, figsize=(40,60))
  for axis in ax.flat: # also hides the frames of cells that stay empty
    axis.set_axis_off()
  for i,img in enumerate(X[:36]):
    ax[i//6][i%6].imshow(img, aspect='auto')
    if show_titles:
      ax[i//6][i%6].set_title(Y[i], fontsize=28)
  plt.show()

In [None]:
# Generate lists of sample images for plotting (equal number of images per category) ...

# ... for Kaggle dataset: 6 categories, 36 images in total
plot_X_k,plot_Y_k  = fruitlist_generator(X_k,Y_k,["Apple_Fresh","Apple_Rotten","Banana_Fresh","Banana_Rotten","Orange_Fresh","Orange_Rotten"],36)

# ... for FruitNet dataset: 10 categories, 60 images in total
plot_X_f,plot_Y_f  = fruitlist_generator(X_f,Y_f,["Apple_Fresh","Apple_Rotten","Banana_Fresh","Banana_Rotten","Orange_Fresh","Orange_Rotten","Guava_Fresh","Guava_Rotten","Lime_Fresh","Lime_Rotten"],60)

In [None]:
# Plot the Kaggle sample images generated above, titled with their labels
show_subplot(plot_X_k,plot_Y_k)

In [None]:
# We can see that quite a few of the images are already augmented.

In [None]:
# Look at unique file name prefixes in Kaggle dataset
# (prefix = leading run of letters of the file name, empty if the name starts with another character)
# A list is built in one pass; np.append copies the whole array on every call.
beginning = [re.search(r"[a-z]*", file_name, re.IGNORECASE).group() for file_name in Z_k]
print(np.unique(beginning))


# We see that the augmented pictures we identified in the subplot above have 'rotated', 'saltandpepper', 'translation' or 'vertical' at the beginning of their file name.
# We will use this information later on to drop the pre-augmented versions, so that we can augment the images ourselves.
# Unaugmented images start with "Screen" (the file names start with "Screen Shot", but only the leading letters up to the first non-letter character are printed).

In [None]:
def show_subplot_60(X,Y=None):
  '''
  creates a plot of 10x6 images (10 rows, 6 columns)

  X : sequence of images, only the first 60 are shown
  Y : optional list or numpy array with one title per image; titles are only
      drawn when Y has exactly as many entries as X
  '''
  # Parentheses matter: `A or B and C` is read as `A or (B and C)`, which
  # skipped the length check for numpy arrays.
  show_titles = isinstance(Y,(np.ndarray,list)) and len(Y) == len(X)
  f, ax= plt.subplots(10,6, figsize=(40,60))
  for axis in ax.flat: # also hides the frames of cells that stay empty
    axis.set_axis_off()
  for i,img in enumerate(X[:60]):
    ax[i//6][i%6].imshow(img, aspect='auto')
    if show_titles:
      ax[i//6][i%6].set_title(Y[i], fontsize=28)
  plt.show()

In [None]:
# Plot the FruitNet sample images generated above (10 rows x 6 columns), titled with their labels
show_subplot_60(plot_X_f,plot_Y_f)

In [None]:
# Now we print all versions of one (supposedly) augmented image:
# every Kaggle file whose name contains the name of the chosen screenshot

target_name = "Screen Shot 2018-06-08 at 5.00.50 PM"
augmented = []
augmented_label = []
for img, label, file_name in zip(X_k, Y_k, Z_k):
  if target_name in file_name:
    augmented.append(img)
    augmented_label.append(label)

# Deliberately NOT named show_subplot: that would replace the 6x6 version
# which later cells still call with up to 36 images.
def show_subplot_2x4(X,Y=None):
  '''
  creates a plot of 2x4 images (first 8 entries of X)
  titles from Y are only drawn when Y is a list/array with one entry per image
  '''
  show_titles = isinstance(Y,(np.ndarray,list)) and len(Y) == len(X)
  f, ax= plt.subplots(2,4, figsize=(40,20))
  for axis in ax.flat:
    axis.set_axis_off()
  for i,img in enumerate(X[:8]):
    ax[i//4][i%4].imshow(img, aspect='auto')
    if show_titles:
      ax[i//4][i%4].set_title(Y[i], fontsize=28)
  plt.show()

show_subplot_2x4(augmented, augmented_label)

In [None]:
def size_aspect_ratio_scatter(X, Y):
  '''
  Creates a scatterplot that plots square root size vs aspect ratio
  values are colored by the given label

  X : sequence of images; height is taken as len(img), width as len(img[0])
  Y : one label per image, used for the colour (hue)
  size is sqrt(height*width), aspect ratio is height/width
  '''
  size = []
  aspect_ratio = []
  for img in X:
    height, width = len(img), len(img[0])
    size.append(math.sqrt(height*width))
    aspect_ratio.append(height/width)
  df = pd.DataFrame({'size': size, 'aspect ratio': aspect_ratio, 'label':Y})
  # x and y are passed by keyword: current seaborn releases reject them as positional arguments
  sns.scatterplot(x='size', y='aspect ratio', data=df, hue='label')
  plt.show()

In [None]:
def darken_color(color, amount=0.5):
    '''
    Returns color as RGB tuple with its lightness changed to
    1 - amount * (1 - lightness), clamped to the valid range [0, 1].

    amount > 1 makes the colour darker, amount < 1 makes it lighter and
    amount == 1 leaves it unchanged.

    color  : matplotlib colour name or any other value matplotlib.colors.to_rgb accepts
    amount : scaling factor described above
    '''
    import matplotlib.colors as mc
    import colorsys
    try:
        c = mc.cnames[color]
    except (KeyError, TypeError):
        # not a named colour (or not hashable, e.g. a list): pass the value on unchanged
        c = color
    hue, lightness, saturation = colorsys.rgb_to_hls(*mc.to_rgb(c))
    new_lightness = min(1, max(0, 1 - amount * (1 - lightness)))
    return colorsys.hls_to_rgb(hue, new_lightness, saturation)

def average_RGB_boxplot(X, Y):
  '''
  Draws one box per label showing the distribution of the mean pixel value
  (np.mean over the complete image array) of the images in X.

  X : sequence of images
  Y : one label per image
  The number of evaluated images is printed and the figure is saved to
  foo.png in the DMML_Project folder before it is shown.
  '''
  avg_RGB = []
  for img in tqdm(X):
    avg_RGB.append(np.mean(img))
  print(len(avg_RGB))
  df = pd.DataFrame({'average RGB value':avg_RGB, 'label':Y})

  # fixed order of the boxes; the fresh/rotten pair of a fruit shares a base
  # colour and the rotten box uses a darkened variant of it
  label_order = ["Apple_Fresh","Apple_Rotten","Banana_Fresh","Banana_Rotten","Guava_Fresh","Guava_Rotten","Lime_Fresh","Lime_Rotten","Orange_Fresh","Orange_Rotten"]
  box_colors = [
      'red', darken_color('red',1.2),
      'yellow', darken_color('yellow',1.2),
      'green', darken_color("green",1.1),
      'lightgreen', darken_color("lightgreen",1.6),
      'orange', darken_color("orange",1.2),
  ]
  sns.boxplot(x='label', y='average RGB value', data=df, order=label_order, palette=box_colors)
  plt.xticks(rotation=340,ha="left")
  plt.savefig('/content/drive/My Drive/DMML_Project/foo.png', bbox_inches='tight')
  plt.show()

In [None]:
# Create joined dataset (Kaggle + Fruitnet) for general analysis purposes
X_joined = X_k + X_f
Y_joined = Y_k + Y_f
# second part of each "<fruit>_<quality>" label
quality_labels = [label.split("_")[1] for label in Y_joined]

In [None]:
# Plot (square root) size against aspect ratio of all pictures, coloured by quality label
size_aspect_ratio_scatter(X_joined, quality_labels)

In [None]:
# Boxplot of the average pixel value per label (fruit + quality) over both datasets
average_RGB_boxplot(X_joined,Y_joined)

In [None]:
# Barplot of different fruits
# NOTE(review): the counts are hard-coded; presumably they were copied from the
# output of fruit_quality_count - confirm whenever the data changes.
counts = [7306,6950,2281,2187,6237]
labels = ["Apple","Banana","Guava","Lime","Orange"]
plt.figure()
plt.bar(range(len(counts)), counts, alpha = .7)
plt.xticks(range(len(counts)), labels)#, rotation = 30)
plt.xlabel("label")
plt.ylabel("amount")
# NOTE(review): same target file as the savefig call in average_RGB_boxplot, so
# whichever plot is saved last overwrites the other - confirm this is intended.
plt.savefig('/content/drive/My Drive/DMML_Project/foo.png', bbox_inches='tight')
plt.show()

## Preprocessing

### Data Filtering

In [None]:
# Image filtering
def augmented_filter(X,Y,img_names): # only for Kaggle dataset
  """
  Keeps only the entries whose file name starts with "Screen Shot" and thereby
  drops the pre-augmented images, whose file names carry a different prefix.

  X, Y, img_names : parallel lists of images, labels and file names
  Prints how many entries were dropped and returns the remaining images and
  labels as two lists.
  """
  frame = pd.DataFrame({"data":X,"label":Y,"img_name":img_names})
  is_unaugmented = frame["img_name"].str.startswith("Screen Shot")
  kept = frame[is_unaugmented]
  dropped_count = len(X)-len(kept)
  print("Number of filtered-out entries:",dropped_count)
  kept_images = kept["data"].values.tolist()
  kept_labels = kept["label"].values.tolist()
  return kept_images,kept_labels

def duplicate_filter(X,Y):
  """
  Drops every image whose perceptual hash (imagehash.phash) was already seen
  for an earlier image, i.e. the first image of each hash value is kept.

  X, Y : parallel lists of images and labels
  Prints how many entries were dropped and returns the remaining images and
  labels as two lists.
  """
  pHash = []
  for img in X:
    pil_image = Image.fromarray(img)
    pHash.append(imagehash.phash(pil_image))
  frame = pd.DataFrame({"data":X,"label":Y,"perceptual hash":pHash})
  unique_rows = frame.drop_duplicates(subset="perceptual hash")
  dropped_count = len(X)-len(unique_rows)
  print("Number of filtered-out entries:",dropped_count)
  unique_images = unique_rows["data"].values.tolist()
  unique_labels = unique_rows["label"].values.tolist()
  return unique_images, unique_labels

def aspect_ratio_filter(X,Y):
  """
  Drops images that are small AND extremely elongated.

  An image is removed when its aspect ratio (height/width) is below 0.25 or
  above 4 while its square root size sqrt(height*width) is not above 300.
  Images with a square root size above 300 are always kept.

  X, Y : parallel lists of images (numpy arrays) and labels
  Prints how many entries were dropped and returns the remaining images and
  labels as two lists.
  """
  aspect_ratios = []
  sqrt_sizes = []
  for img in X:
    height, width = img.shape[0], img.shape[1]
    aspect_ratios.append(height/width)
    sqrt_sizes.append(math.sqrt(height*width))
  frame = pd.DataFrame({"data":X,"label":Y,"aspect ratio":aspect_ratios,"size":sqrt_sizes})
  is_large = frame["size"] > 300
  not_too_wide = frame["aspect ratio"]>=0.25
  not_too_tall = frame["aspect ratio"]<=4
  kept = frame.loc[(not_too_wide | is_large) & (not_too_tall | is_large)]
  print("Number of filtered-out entries:",len(X)-len(kept))
  kept_images = kept["data"].values.tolist()
  kept_labels = kept["label"].values.tolist()
  return kept_images,kept_labels

def RGB_filter(X,Y):
  """
  Drops images that are almost completely dark or bright in at least one
  colour channel.

  For every image the mean of each of the first three channels is computed;
  the image is kept only if all three means lie within [5, 250].

  X, Y : parallel lists of images (arrays of shape height x width x channels) and labels
  Prints how many entries were dropped and returns the remaining images and
  labels as two lists.
  """
  channel_means = [np.mean(img, axis=(0,1)) for img in X]
  frame = pd.DataFrame({
      "data":X,
      "label":Y,
      "average R":[means[0] for means in channel_means],
      "average G":[means[1] for means in channel_means],
      "average B":[means[2] for means in channel_means],
  })
  keep = None
  for column in ("average R","average G","average B"):
    within_limits = (frame[column]>=5) & (frame[column]<=250)
    keep = within_limits if keep is None else keep & within_limits
  kept = frame.loc[keep]
  print("Number of filtered-out entries:",len(X)-len(kept))
  kept_images = kept["data"].values.tolist()
  kept_labels = kept["label"].values.tolist()
  return kept_images,kept_labels

In [None]:
# Filter out already augmented pictures from Kaggle dataset, identified by their file names Z_k
# (there are no augmented pictures in the Fruitnet dataset)
X_k,Y_k = augmented_filter(X_k,Y_k,Z_k)

In [None]:
# Filter out duplicates (images with an identical perceptual hash) ...

# ... from Kaggle dataset
X_k,Y_k = duplicate_filter(X_k,Y_k)

# ... from Fruitnet dataset
X_f,Y_f = duplicate_filter(X_f,Y_f)

In [None]:
# Filter out pictures which have a (square root) size of at most 300 and an aspect ratio below 0.25 or above 4 ...

# ... from Kaggle dataset
X_k,Y_k = aspect_ratio_filter(X_k,Y_k)

# ... from Fruitnet dataset
X_f,Y_f = aspect_ratio_filter(X_f,Y_f)

In [None]:
# Filter out too bright or too dark pictures (average value of a colour channel above 250 or below 5) ...

# ... from Kaggle dataset
X_k,Y_k = RGB_filter(X_k,Y_k)

# ... from Fruitnet dataset
X_f,Y_f = RGB_filter(X_f,Y_f)

### Data Normalization

In [None]:
# Resize images
def resize(X,V):
  """
  Resizes every image of a list X to a width and height of V pixels
  (cv2.resize with bicubic interpolation) and returns the results as a new list
  """
  target_size = (V,V)
  return [cv2.resize(img, target_size, interpolation=cv2.INTER_CUBIC) for img in X]

# Rescale images
def rescale(X):
  """
  Divides every image of X by 255 and returns the results as a new list
  (the input images are left unchanged)
  """
  rescaled = []
  for img in X:
    rescaled.append(img/255)
  return rescaled

In [None]:
# Resize all images to 128x128 pixels ...

# ... Kaggle dataset
X_k = resize(X_k, 128)

# ... Fruitnet dataset
X_f = resize(X_f, 128)

In [None]:
# Rescale pictures (divide all pixel values by 255) ...
# WARNING: this cell used to crash - presumably it ran out of RAM, and paying
# for more resources solved it (TODO confirm).

# ... from Kaggle dataset
X_k= rescale(X_k)

# ... from Fruitnet dataset
X_f = rescale(X_f)


In [None]:
# Merge datasets: Kaggle entries first, followed by the Fruitnet entries
X = X_k + X_f
Y = Y_k + Y_f

In [None]:
# Check amount of pictures left after filtering, per dataset and in total
print("Amount of pictures left from Kaggle dataset: " + str(len(X_k)))
print("Amount of pictures left from Fruitnet dataset: " + str(len(X_f)))
print("Amount of pictures in resulting total dataset: " + str(len(X)))

In [None]:
# LabelEncoder - split labels of the form "<fruit>_<quality>" and encode both parts
# (LabelEncoder numbers the classes in sorted order)

# fruit type; Y_fruit_encoded has to be defined before it is printed below
Y_fruit = [label.split("_")[0] for label in Y]
Y_fruit_encoded = LabelEncoder().fit_transform(Y_fruit)

# quality - this is the target used for the train/test split
Y_fresh = [label.split("_")[1] for label in Y]
Y_fresh_encoded = LabelEncoder().fit_transform(Y_fresh)

print(Y_fruit_encoded)
print(Y_fresh_encoded)

In [None]:
# Train-Test-Split: Train = 80%, Test = 20%
# The target is the encoded quality label; random_state fixes the split.
X = np.array(X)
X_train, X_test, y_train, y_test = train_test_split(X, Y_fresh_encoded, test_size=0.2, random_state=1)



In [None]:
# Train-Validation-Split: Train = 70%, Validation = 10% (both relative to the complete dataset)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.125, random_state=1) # 0.125 x 0.8 = 0.1

#### PCA

In [None]:
def Dim_Reduction(X):
  '''
  Function to apply PCA to the given 3-channel image data
  returns reconstructed images

  Every colour channel of every image is treated as its own data matrix: a
  PCA keeping 99% of the variance (with whitening) is fitted to the channel,
  the channel is projected and transformed back. The three reconstructed
  channels are merged again in their original order.
  '''
  reconstructed_X = []
  for img in tqdm(X):
    first, second, third = cv2.split(img) # requires exactly three channels
    restored_channels = []
    for channel in (first, second, third):
      pca = PCA(n_components=0.99, whiten=True, random_state=42)
      reduced = pca.fit_transform(np.array(channel))
      #print("Channel dimension reduced from", img.shape,"to",reduced.shape)
      restored_channels.append(pca.inverse_transform(reduced))
    reconstructed_X.append(cv2.merge(tuple(restored_channels)))
  return reconstructed_X

In [None]:
# Plot some reconstructed images
# PCA is applied to the complete dataset X here; within the visible notebook the
# result is only used for this plot.
reconstructed = Dim_Reduction(X)
# NOTE(review): this expression is not the last one of the cell, so its value is not displayed
len(reconstructed)
show_subplot(reconstructed,Y)

### Augmentation

In [None]:
# Create new pictures from original data with ImageDataGenerator
# (the generator only describes the random transformations, they are applied in augment)
datagen = ImageDataGenerator(
    rotation_range=180,         # randomly rotates pictures
    width_shift_range=10,       # randomly shifts pictures horizontally
    height_shift_range=10,      # randomly shifts pictures vertically
    shear_range=5,              # randomly shears pictures
    brightness_range=[0.8,1.2], # randomly changes brightness of pictures
    horizontal_flip=True,       # randomly flips pictures horizontally
    vertical_flip=True          # randomly flips pictures vertically
)

In [None]:
# Data Augmentation func
def augment(X, Y):
  '''
  Creates one randomly transformed copy of every picture in X with the
  module-level ImageDataGenerator `datagen`.

  X : pictures
  Y : one label per picture
  returns the augmented pictures (cast to uint8 and then divided by 255 via
  rescale) and the labels belonging to them
  '''
  copies_per_picture = 1          # reduced to 1 to save RAM
  aug_X = []                      # list of augmented pictures
  aug_Y = []                      # list of respective labels for augmented pictures
  for idx, picture in enumerate(X):
    # the generator works on batches, so the picture becomes a batch of size one
    batch = expand_dims(picture,0)
    it = datagen.flow(batch)
    for _ in range(copies_per_picture):
      # first (and only) picture of the generated batch, converted to unsigned integers
      augmented_picture = next(it)[0].astype('uint8')
      aug_X.append(augmented_picture)
      aug_Y.append(Y[idx])
  return rescale(aug_X), aug_Y

In [None]:
# Data Augmentation: one augmented copy per picture, separately for every split

# Train data
aug_X_train, aug_y_train = augment(X_train, y_train)

# Validation data
aug_X_valid, aug_y_valid = augment(X_valid, y_valid)

# Test data
aug_X_test, aug_y_test = augment(X_test, y_test)

In [None]:
# plot some augmented pictures:
# the first 18 train pictures, each directly followed by its augmented version
pair_count = 18
images = []
labels = []
for k in range(pair_count):
  images += [X_train[k], aug_X_train[k]]
  labels += ["Before", "After"]
show_subplot(images,labels)

In [None]:
# add augmented data to original data and shuffle every split
# The seed is reset before each shuffle call; this is meant to reorder the
# pictures and the labels of a split in the same way so that they stay aligned.

# Train data
X_train = np.concatenate((X_train, np.array(aug_X_train)), axis=0)
y_train = np.concatenate((y_train, np.array(aug_y_train)), axis=0)
for split_part in (X_train, y_train):
  np.random.seed(42)
  np.random.shuffle(split_part)

# Validation data
X_valid = np.concatenate((X_valid, np.array(aug_X_valid)), axis=0)
y_valid = np.concatenate((y_valid, np.array(aug_y_valid)), axis=0)
for split_part in (X_valid, y_valid):
  np.random.seed(42)
  np.random.shuffle(split_part)

# Test data
X_test = np.concatenate((X_test, np.array(aug_X_test)), axis=0)
y_test = np.concatenate((y_test, np.array(aug_y_test)), axis=0)
for split_part in (X_test, y_test):
  np.random.seed(42)
  np.random.shuffle(split_part)


In [None]:
# Sizes of the three splits, each consisting of the original and the augmented pictures
print("Number of train instances after Augmentation:", len(X_train))
print("Number of validation instances after Augmentation:", len(X_valid))
print("Number of test instances after Augmentation:", len(X_test))

In [None]:
# Plot some examples from final train data, titled with their encoded quality label
show_subplot(X_train,y_train)

## Modelling

### General Function

In [None]:
# import needed libraries
import keras 
from keras.layers import Dense,Dropout, Conv2D,MaxPooling2D , Activation, Flatten, BatchNormalization, SeparableConv2D
from keras.models import Sequential
import tensorflow as tf
from keras.models import Model, load_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
def plot_training_history(history, metric="accuracy"):
  '''
  Function plots training history for a model

  Two panels are drawn next to each other: training/validation loss on the
  left and training/validation metric on the right.

  history : object with a `history` dict as returned by model.fit; it has to
            contain the keys "loss", "val_loss", metric and "val_" + metric
  metric  : name under which the accuracy was recorded; pass "acc" for models
            compiled with metrics=['acc']
  '''
  metric_title = "Accuracy" if metric in ("accuracy", "acc") else metric
  panels = (("loss", "Loss"), (metric, metric_title))

  plt.figure(1, figsize = (20, 12))
  for position, (key, title) in enumerate(panels, start=1):
    plt.subplot(1,2,position)
    plt.xlabel("Epochs")
    plt.ylabel(title)
    plt.plot( history.history[key], label = "Training " + title)
    plt.plot( history.history["val_" + key], label = "Validation " + title)
    plt.grid(True)
    plt.legend()

In [None]:
# Load the final (augmented) data directly, without the need to rerun all previous steps
# Data can be found here: https://drive.google.com/drive/folders/17RRL9rI-dLtBB4B0En5iDPF38WhuyLjs?usp=sharing
# Running this cell replaces the splits that are currently held in memory.
X_train = np.load('/content/drive/My Drive/DMML_Project/Final_Data/After_Augmentation/X_train_aug.npy')
X_test = np.load('/content/drive/My Drive/DMML_Project/Final_Data/After_Augmentation/X_test_aug.npy')
X_valid = np.load('/content/drive/My Drive/DMML_Project/Final_Data/After_Augmentation/X_valid_aug.npy')
y_train = np.load('/content/drive/My Drive/DMML_Project/Final_Data/After_Augmentation/y_train_aug.npy')
y_test = np.load('/content/drive/My Drive/DMML_Project/Final_Data/After_Augmentation/y_test_aug.npy')
y_valid = np.load('/content/drive/My Drive/DMML_Project/Final_Data/After_Augmentation/y_valid_aug.npy')

### CNN1

In [None]:
# Create CNN with 8 layers: two convolutions, pooling, and a dense classifier with dropout
# No input shape is declared here; presumably the model is built when it first sees data in fit().
model = keras.models.Sequential([
    keras.layers.Conv2D(32, kernel_size=3, padding="same", activation="relu"),
    keras.layers.Conv2D(64, kernel_size=3, padding="same", activation="relu"),
    keras.layers.MaxPool2D(),
    keras.layers.Flatten(),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation="sigmoid")  # single sigmoid output for the binary quality label
])

In [None]:
# Compile model: binary cross-entropy loss, Adam optimizer, accuracy as reported metric
model.compile(loss="binary_crossentropy", optimizer="adam",
              metrics=["accuracy"])

In [None]:
# Fit model
%%time
tf.random.set_seed(42)
history = model.fit(X_train, y_train, epochs=10, validation_data=[X_valid, y_valid])


In [None]:
# print model summary (layers, output shapes and number of parameters) of CNN1
model.summary()

In [None]:
# plot training history (loss and accuracy per epoch) of the fit above
plot_training_history(history)

In [None]:
# model evaluation: loss and accuracy of the model trained above on the test data
model.evaluate(X_test, y_test)

In [None]:
# reload fitted model from Drive (the cells below use this stored model, not `model` from above)
# find folder to load this model under this link: https://drive.google.com/drive/folders/1GhNa-RbiSuJIRKCsu1OB2B1a1ngNMnlx?usp=sharing
CNN1 = load_model('/content/drive/My Drive/DMML_Project/Final_Models/CNN1.h5')

In [None]:
# Generate predictions for test data with the reloaded CNN1 model
y_pred_CNN1 = CNN1.predict(X_test)

In [None]:
# print classification report; the predictions are rounded to obtain class labels
print(classification_report(y_test, y_pred_CNN1.round()))

In [None]:
# plot the confusion matrix of CNN1
mat = confusion_matrix(y_test, y_pred_CNN1.round())
# the matrix is transposed for plotting, matching the axis labels set below
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');

### CNN2

In [None]:
# Create CNN with 19 layers: three convolutional stages followed by a dense classifier
model = keras.models.Sequential()

# Stage 1: convolution on the 128x128 RGB input
model.add(Conv2D(64, (3, 3), kernel_initializer='he_uniform', padding='same', activation='relu', input_shape=(128,128,3)))
model.add(BatchNormalization()) # normalizes the output of the preceding layer
model.add(MaxPooling2D((2, 2)))
model.add(BatchNormalization())
model.add(Dropout(0.2)) # reduce overfitting and improve generalization

# Stage 2: depthwise separable convolution
model.add(SeparableConv2D(64, (3, 3), kernel_initializer='he_uniform', padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D((3, 3)))
model.add(Dropout(0.3))

# Stage 3: two stacked convolutions
model.add(Conv2D(128, (3, 3), kernel_initializer='he_uniform', padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(Conv2D(64, (3, 3), kernel_initializer='he_uniform', padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.5))

model.add(Flatten())

# Classifier
model.add(Dense(64, activation='relu', kernel_initializer='he_uniform'))
model.add(Dropout(0.2))

# single sigmoid output for the binary quality label
model.add(Dense(1, activation='sigmoid'))


In [None]:
# print model summary (layers, output shapes and number of parameters) of CNN2
model.summary()

In [None]:
# Compile model: binary cross-entropy loss, Adam optimizer, accuracy as reported metric
model.compile(loss='binary_crossentropy', optimizer="adam",metrics=["accuracy"])

In [None]:
# Fit model
%%time
tf.random.set_seed(42)    # set seed to make model runs comparable
history = model.fit(X_train, y_train, epochs=10, validation_data=[X_valid, y_valid])

In [None]:
# Plot training history (loss and accuracy per epoch) of the fit above
plot_training_history(history)

In [None]:
# model evaluation: loss and accuracy of the model trained above on the test data
model.evaluate(X_test, y_test)

In [None]:
# reload fitted model from Drive (the cells below use this stored model, not `model` from above)
# find folder to load this model under this link: https://drive.google.com/drive/folders/1GhNa-RbiSuJIRKCsu1OB2B1a1ngNMnlx?usp=sharing
CNN2 = load_model('/content/drive/My Drive/DMML_Project/Final_Models/CNN2.h5')

In [None]:
# Generate predictions for test data with the reloaded CNN2 model
y_pred_CNN2 = CNN2.predict(X_test)

In [None]:
# Print classification report; the predictions are rounded to obtain class labels
print(classification_report(y_test, y_pred_CNN2.round()))

In [None]:
# plot the confusion matrix of CNN2
mat = confusion_matrix(y_test, y_pred_CNN2.round())
# the matrix is transposed for plotting, matching the axis labels set below
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');

### VGG16

In [None]:
# import required libraries
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras import layers 

In [None]:
# load base model with pre-trained ImageNet weights
base_model = VGG16(input_shape = (128, 128, 3), # Shape of our images
include_top = False, # Leave out the fully connected top of the network to be able to set the input_shape
weights = 'imagenet')

In [None]:
# set all layers of the pre-trained base model non-trainable
for layer in base_model.layers:
    layer.trainable = False

In [None]:
# Build the classification head on top of the base model

# Flatten the output layer to 1 dimension
x = layers.Flatten()(base_model.output)

# Add a fully connected layer with 512 hidden units and ReLU activation
x = layers.Dense(512, activation='relu')(x)

# Add a dropout rate of 0.5
x = layers.Dropout(0.5)(x)

# Add a final sigmoid layer with 1 node for classification output
x = layers.Dense(1, activation='sigmoid')(x)

# Complete model: input of the base model -> output of the new head
model = tf.keras.models.Model(base_model.input, x)

In [None]:
# Compile model: RMSprop with a small learning rate, binary cross-entropy loss
# `learning_rate` replaces the deprecated `lr` keyword.
# The metric is registered as 'acc', so the history keys are "acc" and "val_acc".
model.compile(optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0001), loss = 'binary_crossentropy',metrics = ['acc'])

In [None]:
# print model summary (layers, output shapes and number of parameters) of the VGG16-based model
model.summary()

In [None]:
# Fit model
%%time
tf.random.set_seed(42)
history = model.fit(X_train, y_train, validation_data=[X_valid, y_valid], steps_per_epoch = 100, epochs = 10)

In [None]:
# plot training history of the VGG16-based model
# Same layout as plot_training_history, but the accuracy is read from the keys
# "acc"/"val_acc" because this model was compiled with metrics = ['acc'].
plt.figure(1, figsize = (20, 12))
# left panel: loss per epoch
plt.subplot(1,2,1)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.plot( history.history["loss"], label = "Training Loss")
plt.plot( history.history["val_loss"], label = "Validation Loss")
plt.grid(True)
plt.legend()

# right panel: accuracy per epoch
plt.subplot(1,2,2)
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.plot( history.history["acc"], label = "Training Accuracy")
plt.plot( history.history["val_acc"], label = "Validation Accuracy")
plt.grid(True)
plt.legend()

In [None]:
# model evaluation: loss and accuracy of the model trained above on the test data
model.evaluate(X_test, y_test)

In [None]:
# reload fitted model
# find folder to load this model under this link: https://drive.google.com/drive/folders/1GhNa-RbiSuJIRKCsu1OB2B1a1ngNMnlx?usp=sharing
VGG16 = load_model('/content/drive/My Drive/DMML_Project/Final_Models/VGG16.h5')

In [None]:
# generate predictions
y_pred_VGG16 = VGG16.predict(X_test)

In [None]:
# print classification report; the predictions are rounded to obtain class labels
print(classification_report(y_test, y_pred_VGG16.round()))


In [None]:
# plot the confusion matrix of the VGG16-based model
mat = confusion_matrix(y_test, y_pred_VGG16.round())
# the matrix is transposed for plotting, matching the axis labels set below
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');

### Performance Comparison

In [None]:
# Results per model, in the order given by modelnames
# NOTE(review): the values are hard-coded; presumably they were transcribed by
# hand from the runs above - confirm after retraining.
params = [33.6,56.5,42]     # number of parameters in millions
accuracy = [92,96,96]       # in %
precision = [90,97,98]
recall = [94,94,94]
f1 = [92,96,96]
runtime = [52,37,28]        # in minutes
modelnames = ["CNN1","CNN2","VGG16"]

In [None]:
# plot accuracy vs number of params (one point per model)
plt.scatter(params, accuracy, s=80)
plt.ylim(90, 100)
plt.xlim(30, 63)
plt.grid()
plt.xlabel("Number of Parameters in millions")
plt.ylabel("Accuracy(%)")
# model names next to the points; the coordinates repeat the values of params/accuracy
# and xytext moves the text slightly to the right of each point
plt.annotate("CNN1", (33.6, 92), xytext=(34.5,92),fontsize=14)
plt.annotate("CNN2", (56.5, 96), xytext=(57.5,96),fontsize=14)
plt.annotate("VGG16", (42, 96), xytext=(43,96),fontsize=14)
#plt.savefig('/content/drive/My Drive/DMML_Project/complexity_plot.png', bbox_inches='tight')
plt.show()

In [None]:
def bar_plot(metric,labels):
  '''
  Plots a bar chart for a given metrics across the given models
  '''
  plt.figure()
  plt.bar(range(len(labels)), metric)
  plt.ylim(0, 60)
  plt.grid(axis='y')
  plt.xticks(range(len(labels)), labels)
  plt.xlabel("Model")
  plt.ylabel("Runtime(min)")
  #plt.savefig('/content/drive/My Drive/DMML_Project/Runtime_scaled.png', bbox_inches='tight')
  plt.show()

In [None]:
# Plot precision
bar_plot(precision,modelnames)

In [None]:
# plot recall
bar_plot(recall,modelnames)

In [None]:
# plot f1 score
bar_plot(f1,modelnames)

In [None]:
# plot accuracy
bar_plot(accuracy,modelnames)

In [None]:
# plot runtime
bar_plot(runtime,modelnames)

### Error Analysis for CNN2

In [None]:
# Reload the final (augmented) data for the error analysis
# NOTE(review): these paths point to 'My Drive/Final_Data/...', while the loading cell in the
# Modelling section reads the same file names from 'My Drive/DMML_Project/Final_Data/...' -
# confirm which location is the intended one.
X_train = np.load('/content/drive/My Drive/Final_Data/After_Augmentation/X_train_aug.npy')
X_test = np.load('/content/drive/My Drive/Final_Data/After_Augmentation/X_test_aug.npy')
X_valid = np.load('/content/drive/My Drive/Final_Data/After_Augmentation/X_valid_aug.npy')
y_train = np.load('/content/drive/My Drive/Final_Data/After_Augmentation/y_train_aug.npy')
y_test = np.load('/content/drive/My Drive/Final_Data/After_Augmentation/y_test_aug.npy')
y_valid = np.load('/content/drive/My Drive/Final_Data/After_Augmentation/y_valid_aug.npy')

In [None]:
# get mislabeled data and labels from CNN2
# A boolean mask on the numpy arrays selects the wrong predictions directly;
# a temporary DataFrame would require converting every test image into nested Python lists.
predicted_class = y_pred_CNN2.round().astype(int)[:, 0]   # one rounded prediction per test image
is_wrong = np.asarray(y_test) != predicted_class

X_wrong = X_test[is_wrong]
# 0 is shown as "Fresh", every other predicted class as "Rotten"
Y_wrong = ["Prediction: Fresh" if y==0 else "Prediction: Rotten" for y in predicted_class[is_wrong]]


In [None]:
# plot mislabeled examples, titled with the (wrong) prediction of CNN2
show_subplot(X_wrong,Y_wrong)

In [None]:
# create selection of mislabeled images
# (hand-picked positions within X_wrong / Y_wrong)
selected_positions = [1, 3, 5, 9, 34]
selection = [X_wrong[position] for position in selected_positions]
selection_labels = [Y_wrong[position] for position in selected_positions]