In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import math
from imblearn.over_sampling import RandomOverSampler
from scipy.stats import ttest_ind
from sqlalchemy import create_engine
from scipy.stats.mstats import winsorize
from scipy.stats import boxcox
from scipy.stats import jarque_bera
from scipy.stats import normaltest
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from sklearn.datasets import load_files

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from statsmodels.tools.eval_measures import mse, rmse
from wordcloud import WordCloud
import statsmodels.api as sm
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.svm import SVC
from sklearn import tree
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn import neighbors
from IPython.display import Image


import pydotplus
from sklearn import ensemble

import warnings

%matplotlib inline
sns.set()

warnings.filterwarnings('ignore')
import time
import cv2
import glob

Using TensorFlow backend.


## Introduction

- This amazing dataset contains tens of thousands of carefully curated images of 120 different fruits.
- It can be found here: https://www.kaggle.com/moltean/fruits
- Let's build some neural networks

In [2]:
import tensorflow as tf
import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, Conv3D, MaxPooling2D
from keras.layers import LSTM, Input, TimeDistributed
from keras.models import Model
from keras.optimizers import RMSprop

from keras import backend as K

In [3]:
#import the data
start_time = time.time()

train_dir = 'Data/fruits-360_dataset/fruits-360/Training' 
test_dir = 'Data/fruits-360_dataset/fruits-360/Test'

def load_dataset(path):
    """Index the image folder at `path` without reading the image bytes.

    Parameters
    ----------
    path : str
        Directory handed to sklearn's `load_files`.

    Returns
    -------
    files : np.ndarray
        The 'filenames' entry of the loaded bunch.
    targets : np.ndarray
        The 'target' entry (integer label per file).
    target_labels : np.ndarray
        The 'target_names' entry (label names).
    """
    # Only paths and labels are consumed downstream (pixels are decoded later
    # with cv2), so skip load_files' default of reading every file's content
    # into memory. The filenames/target/target_names entries are unaffected.
    data = load_files(path, load_content=False)
    files = np.array(data['filenames'])
    targets = np.array(data['target'])
    target_labels = np.array(data['target_names'])
    return files, targets, target_labels

x_train, y_train, target_labels = load_dataset(train_dir)
# Label names of the test folder are discarded; the training ones are kept.
x_test, y_test, _ = load_dataset(test_dir)

print("--- %s seconds ---" % (time.time() - start_time))

--- 25.87718415260315 seconds ---


In [4]:
#convert image files to matrices
start_time = time.time()

def _read_images(paths):
    """Decode each path with cv2.imread and return the arrays as a list.

    Raises IOError on the first file cv2 cannot decode: cv2.imread signals
    failure by returning None rather than raising, and a None entry would
    otherwise only surface later as an opaque reshape error.
    """
    images = []
    for path in paths:
        pixels = cv2.imread(path)
        if pixels is None:
            raise IOError("cv2.imread could not read image: %s" % path)
        images.append(pixels)
    return images

x_train_mat = _read_images(x_train)
x_test_mat = _read_images(x_test)

print("--- %s seconds ---" % (time.time() - start_time))

--- 28.49203395843506 seconds ---


In [5]:
# The file-path arrays are not used again; drop the names.
del x_train, x_test

In [6]:
# Turn the two Python lists of decoded images into ndarrays.
start_time = time.time()

x_train_matnp, x_test_matnp = np.array(x_train_mat), np.array(x_test_mat)
print("--- %s seconds ---" % (time.time() - start_time))

--- 1.8697540760040283 seconds ---


In [7]:
start_time = time.time()
# The list versions are redundant now that the ndarray copies exist.
del x_train_mat, x_test_mat

print("--- %s seconds ---" % (time.time() - start_time))

--- 0.32676196098327637 seconds ---


In [8]:
#reshape and normalize data

start_time = time.time()

img_rows, img_cols = 100, 100
num_classes = 120

# Pin both arrays to (N, rows, cols, 3). On well-formed data this changes
# nothing; it raises if the arrays do not hold img_rows*img_cols*3 values per
# image. (cv2.imread yields channels-last arrays -- see the OpenCV docs.)
x_train_matnp = x_train_matnp.reshape(x_train_matnp.shape[0], img_rows, img_cols, 3)
x_test_matnp = x_test_matnp.reshape(x_test_matnp.shape[0], img_rows, img_cols, 3)

if K.image_data_format() == 'channels_first':
    # Moving the channel axis requires a transpose. A plain reshape to
    # (N, 3, rows, cols) would reinterpret the same buffer and scramble pixels.
    x_train_matnp = x_train_matnp.transpose(0, 3, 1, 2)
    x_test_matnp = x_test_matnp.transpose(0, 3, 1, 2)
    input_shape = (3, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 3)

# Scale pixel values by 1/255 as float32.
x_train_matnp = x_train_matnp.astype('float32')
x_test_matnp = x_test_matnp.astype('float32')
x_train_matnp /= 255
x_test_matnp /= 255
print('x_train_matnp shape:', x_train_matnp.shape)
print(x_train_matnp.shape[0], 'train samples')
print(x_test_matnp.shape[0], 'test samples')

print("--- %s seconds ---" % (time.time() - start_time))

x_train_matnp shape: (60498, 100, 100, 3)
60498 train samples
20622 test samples
--- 32.652241230010986 seconds ---


In [9]:
# convert class vectors to binary class matrices
# Pin the width to num_classes: left to infer it, to_categorical uses
# max(label) + 1, so the two splits could disagree with each other or with the
# num_classes-wide softmax layer. A label >= num_classes now fails here.
y_train_cat = keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test_cat = keras.utils.to_categorical(y_test, num_classes=num_classes)

## Network 1

In [10]:
# Network 1: 16-filter 5x5 convolution -> 2x2 max-pool -> 128-unit dense head.
model = Sequential()

# Use the input_shape computed in the preprocessing cell rather than a
# hard-coded (100,100,3), so the model follows the backend's channel ordering.
model.add(Conv2D(16, kernel_size=(5, 5),
                 activation='relu',
                 input_shape=input_shape))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# One softmax output per class.
model.add(Dense(num_classes, activation='softmax'))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 96, 96, 16)        1216      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 48, 48, 16)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 48, 48, 16)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 36864)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               4718720   
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 120)              

In [11]:
# Train Network 1 for 10 epochs.
# NOTE(review): the per-epoch validation data and the final evaluation below
# use the same test arrays, so the reported score is not an independent
# estimate if it is also used to choose between networks.
held_out = (x_test_matnp, y_test_cat)
model.fit(x_train_matnp, y_train_cat, epochs=10, verbose=1, validation_data=held_out)

# score[0] is the loss, score[1] the accuracy metric.
score = model.evaluate(*held_out, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 60498 samples, validate on 20622 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.159725972155145
Test accuracy: 0.9594607949256897


## Network 2

In [29]:
# Network 2: as Network 1 but with 32 convolution filters instead of 16.
model_2 = Sequential()

# Use the input_shape computed in the preprocessing cell rather than a
# hard-coded (100,100,3), so the model follows the backend's channel ordering.
model_2.add(Conv2D(32, kernel_size=(5, 5),
                 activation='relu',
                 input_shape=input_shape))
model_2.add(MaxPooling2D(pool_size=(2, 2)))
model_2.add(Dropout(0.25))
model_2.add(Flatten())
model_2.add(Dense(128, activation='relu'))
model_2.add(Dropout(0.5))
# One softmax output per class.
model_2.add(Dense(num_classes, activation='softmax'))

model_2.summary()

model_2.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_18 (Conv2D)           (None, 96, 96, 32)        2432      
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 48, 48, 32)        0         
_________________________________________________________________
dropout_23 (Dropout)         (None, 48, 48, 32)        0         
_________________________________________________________________
flatten_12 (Flatten)         (None, 73728)             0         
_________________________________________________________________
dense_23 (Dense)             (None, 128)               9437312   
_________________________________________________________________
dropout_24 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_24 (Dense)             (None, 120)             

In [30]:
# Train Network 2 for 10 epochs.
# NOTE(review): validation and final evaluation share the same test arrays.
held_out = (x_test_matnp, y_test_cat)
model_2.fit(x_train_matnp, y_train_cat, epochs=10, verbose=1, validation_data=held_out)

# score[0] is the loss, score[1] the accuracy metric.
score = model_2.evaluate(*held_out, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 60498 samples, validate on 20622 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.14352463840097535
Test accuracy: 0.9649403691291809


## Network 3

In [31]:
# Network 3: 16 convolution filters with a smaller 64-unit dense layer.
model_3 = Sequential()

# Use the input_shape computed in the preprocessing cell rather than a
# hard-coded (100,100,3), so the model follows the backend's channel ordering.
model_3.add(Conv2D(16, kernel_size=(5, 5),
                 activation='relu',
                 input_shape=input_shape))
model_3.add(MaxPooling2D(pool_size=(2, 2)))
model_3.add(Dropout(0.25))
model_3.add(Flatten())
model_3.add(Dense(64, activation='relu'))
model_3.add(Dropout(0.5))
# One softmax output per class.
model_3.add(Dense(num_classes, activation='softmax'))

model_3.summary()

model_3.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_19 (Conv2D)           (None, 96, 96, 16)        1216      
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 48, 48, 16)        0         
_________________________________________________________________
dropout_25 (Dropout)         (None, 48, 48, 16)        0         
_________________________________________________________________
flatten_13 (Flatten)         (None, 36864)             0         
_________________________________________________________________
dense_25 (Dense)             (None, 64)                2359360   
_________________________________________________________________
dropout_26 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_26 (Dense)             (None, 120)             

In [32]:
# Train Network 3 for 10 epochs.
# NOTE(review): validation and final evaluation share the same test arrays.
held_out = (x_test_matnp, y_test_cat)
model_3.fit(x_train_matnp, y_train_cat, epochs=10, verbose=1, validation_data=held_out)

# score[0] is the loss, score[1] the accuracy metric.
score = model_3.evaluate(*held_out, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 60498 samples, validate on 20622 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.27795652542980437
Test accuracy: 0.928086519241333


## Network 4

In [None]:
# Network 4: 32 convolution filters with a 64-unit dense layer.
model_4 = Sequential()

# Use the input_shape computed in the preprocessing cell rather than a
# hard-coded (100,100,3), so the model follows the backend's channel ordering.
model_4.add(Conv2D(32, kernel_size=(5, 5),
                 activation='relu',
                 input_shape=input_shape))
model_4.add(MaxPooling2D(pool_size=(2, 2)))
model_4.add(Dropout(0.25))
model_4.add(Flatten())
model_4.add(Dense(64, activation='relu'))
model_4.add(Dropout(0.5))
# One softmax output per class.
model_4.add(Dense(num_classes, activation='softmax'))

model_4.summary()

model_4.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_20 (Conv2D)           (None, 96, 96, 32)        2432      
_________________________________________________________________
max_pooling2d_14 (MaxPooling (None, 48, 48, 32)        0         
_________________________________________________________________
dropout_27 (Dropout)         (None, 48, 48, 32)        0         
_________________________________________________________________
flatten_14 (Flatten)         (None, 73728)             0         
_________________________________________________________________
dense_27 (Dense)             (None, 64)                4718656   
_________________________________________________________________
dropout_28 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_28 (Dense)             (None, 120)             

In [None]:
# Train Network 4 for 10 epochs.
# NOTE(review): validation and final evaluation share the same test arrays.
held_out = (x_test_matnp, y_test_cat)
model_4.fit(x_train_matnp, y_train_cat, epochs=10, verbose=1, validation_data=held_out)

# score[0] is the loss, score[1] the accuracy metric.
score = model_4.evaluate(*held_out, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 60498 samples, validate on 20622 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.1839948054649649
Test accuracy: 0.9572301506996155


## Network 5

In [10]:
# Network 5: two stacked 16-filter 5x5 convolutions before pooling, 64-unit dense layer.
model_5 = Sequential()

# Use the input_shape computed in the preprocessing cell rather than a
# hard-coded (100,100,3), so the model follows the backend's channel ordering.
model_5.add(Conv2D(16, kernel_size=(5, 5),
                 activation='relu',
                 input_shape=input_shape))
model_5.add(Conv2D(16, (5, 5), activation='relu'))
model_5.add(MaxPooling2D(pool_size=(2, 2)))
model_5.add(Dropout(0.25))
model_5.add(Flatten())
model_5.add(Dense(64, activation='relu'))
model_5.add(Dropout(0.5))
# One softmax output per class.
model_5.add(Dense(num_classes, activation='softmax'))

model_5.summary()

model_5.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 96, 96, 16)        1216      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 92, 92, 16)        6416      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 46, 46, 16)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 46, 46, 16)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 33856)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                2166848   
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)               

In [11]:
# Train Network 5 for 10 epochs.
# NOTE(review): validation and final evaluation share the same test arrays.
held_out = (x_test_matnp, y_test_cat)
model_5.fit(x_train_matnp, y_train_cat, epochs=10, verbose=1, validation_data=held_out)

# score[0] is the loss, score[1] the accuracy metric.
score = model_5.evaluate(*held_out, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 60498 samples, validate on 20622 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.1678199889935895
Test accuracy: 0.9563087821006775


## Summary

Not surprisingly, the models with the best accuracy have more complexity and more trainable parameters, meaning that they are more resource intensive and take longer to learn. There are so many trade-offs and options with these models that it seems as though one could easily spend an entire career or more optimizing them.