In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import math
from imblearn.over_sampling import RandomOverSampler
from scipy.stats import ttest_ind
from sqlalchemy import create_engine
from scipy.stats.mstats import winsorize
from scipy.stats import boxcox
from scipy.stats import jarque_bera
from scipy.stats import normaltest
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from sklearn.datasets import load_files

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from statsmodels.tools.eval_measures import mse, rmse
from wordcloud import WordCloud
import statsmodels.api as sm
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.svm import SVC
from sklearn import tree
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn import neighbors
from IPython.display import Image


import pydotplus
from sklearn import ensemble

import warnings

%matplotlib inline
sns.set()

warnings.filterwarnings('ignore')
import time
import cv2
import glob

Using TensorFlow backend.


In [2]:
import tensorflow as tf
import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, Conv3D, MaxPooling2D
from keras.layers import LSTM, Input, TimeDistributed
from keras.models import Model
from keras.optimizers import RMSprop

from keras import backend as K

In [3]:
#import the data
start_time = time.time()

train_dir = 'Data/fruits-360_dataset/fruits-360/Training' 
test_dir = 'Data/fruits-360_dataset/fruits-360/Test'

def load_dataset(path): 
    data = load_files(path) 
    files = np.array(data['filenames']) 
    targets = np.array(data['target']) 
    target_labels = np.array(data['target_names']) 
    return files,targets,target_labels

x_train, y_train, target_labels = load_dataset(train_dir) 
x_test, y_test,_ = load_dataset(test_dir)

print("--- %s seconds ---" % (time.time() - start_time))

--- 27.221718072891235 seconds ---


In [4]:
#convert image files to matrices
start_time = time.time()

x_train_mat = []
x_test_mat = []

for img in x_train:
    n= cv2.imread(img)
    x_train_mat.append(n)

for img in x_test:
    n= cv2.imread(img)
    x_test_mat.append(n)
    
print("--- %s seconds ---" % (time.time() - start_time))

--- 27.880234003067017 seconds ---


In [5]:
del(x_train, x_test)

In [6]:
#convert data to numpy arrays
start_time = time.time()

x_train_matnp = np.array(x_train_mat)
x_test_matnp = np.array(x_test_mat)
print("--- %s seconds ---" % (time.time() - start_time))

--- 1.6575748920440674 seconds ---


In [17]:
from sys import getsizeof

getsizeof(x_train_matnp)

7259760144

In [14]:
start_time = time.time()
del(x_train_mat, x_test_mat)

print("--- %s seconds ---" % (time.time() - start_time))

In [7]:
#reshape and normalize data

start_time = time.time()

img_rows, img_cols = 100, 100
num_classes = 120

if K.image_data_format() == 'channels_first':
    x_train_matnp = x_train_matnp.reshape(x_train_matnp.shape[0], 3, img_rows, img_cols)
    x_test_matnp = x_test_matnp.reshape(x_test_matnp.shape[0], 3, img_rows, img_cols)
    input_shape = (3, img_rows, img_cols)
else:
    x_train_matnp = x_train_matnp.reshape(x_train_matnp.shape[0], img_rows, img_cols, 3)
    x_test_matnp = x_test_matnp.reshape(x_test_matnp.shape[0], img_rows, img_cols, 3)
    input_shape = (img_rows, img_cols, 3)

x_train_matnp = x_train_matnp.astype('float32')
x_test_matnp = x_test_matnp.astype('float32')
x_train_matnp /= 255
x_test_matnp /= 255
print('x_train_matnp shape:', x_train_matnp.shape)
print(x_train_matnp.shape[0], 'train samples')
print(x_test_matnp.shape[0], 'test samples')

print("--- %s seconds ---" % (time.time() - start_time))

x_train_matnp shape: (60498, 100, 100, 3)
60498 train samples
20622 test samples
--- 36.766772985458374 seconds ---


In [8]:
print(y_test)
print(y_test.shape)

[  0 104  83 ...  58  64  16]
(20622,)


In [9]:
# convert class vectors to binary class matrices

y_train_cat = keras.utils.to_categorical(y_train)
y_test_cat = keras.utils.to_categorical(y_test)

In [10]:
print(y_test_cat)
print(y_test_cat.shape)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(20622, 120)


In [11]:
start_time = time.time()

x_train_matnp_big, x_train_matnp_small, y_train_cat_big, y_train_cat_small = train_test_split(
    x_train_matnp,
    y_train_cat,
    test_size=0.01,
    random_state=42)

print("--- %s seconds ---" % (time.time() - start_time))

--- 42.670034885406494 seconds ---


In [12]:
start_time = time.time()

x_test_matnp_big, x_test_matnp_small, y_test_cat_big, y_test_cat_small = train_test_split(
    x_test_matnp,
    y_test_cat,
    test_size=0.01,
    random_state=42)

print("--- %s seconds ---" % (time.time() - start_time))

--- 38.2055459022522 seconds ---


In [18]:
start_time = time.time()
del(x_train_matnp, x_test_matnp)

print("--- %s seconds ---" % (time.time() - start_time))

--- 0.42603588104248047 seconds ---


In [13]:
y_test_cat_small.shape

(207, 120)

In [19]:
model = Sequential()

model.add(Conv2D(16, kernel_size=(5, 5),
                 activation='relu',
                 input_shape=(100,100,3)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 96, 96, 16)        1216      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 48, 48, 16)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 48, 48, 16)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 36864)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               4718720   
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 120)              

In [20]:
y_train_cat_small.shape
x_train_matnp_small.shape

(605, 100, 100, 3)

In [21]:
model.fit(x_train_matnp_small, y_train_cat_small,
          epochs=10,
          verbose=1,
          validation_data=(x_test_matnp_small, y_test_cat_small))
score = model.evaluate(x_test_matnp_small, y_test_cat_small, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 605 samples, validate on 207 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 4.302891929368466
Test accuracy: 0.10144927352666855


In [22]:
batch_size = 64
row_hidden, col_hidden = 100, 100
num_classes = 120
epochs = 3

x_train_matnp_small_rnn = x_train_matnp_small.reshape(x_train_matnp_small.shape[0], 100, 100, 3)
x_test_matnp_small_rnn = x_test_matnp_small.reshape(x_test_matnp_small.shape[0], 100, 100, 3)


In [24]:
row, col, pixel = x_train_matnp_small_rnn.shape[1:]

# 4D input.
x = Input(shape=(row, col, pixel))

# Encodes a row of pixels using TimeDistributed Wrapper.
encoded_rows = TimeDistributed(LSTM(row_hidden))(x)

# Encodes columns of encoded rows.
encoded_columns = LSTM(col_hidden)(encoded_rows)

In [25]:
start_time = time.time()

# Final predictions and model.
prediction = Dense(num_classes, activation='softmax')(encoded_columns)
model_rnn = Model(x, prediction)
model_rnn.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

print("--- %s seconds ---" % (time.time() - start_time))

--- 0.05601215362548828 seconds ---


In [27]:
start_time = time.time()

# Training.
model_rnn.fit(x_train_matnp_small_rnn, y_train_cat_small,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test_matnp_small_rnn, y_test_cat_small))

# Evaluation.
scores = model.evaluate(x_test_matnp_small_rnn, y_test_cat_small, verbose=0)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])

print("--- %s seconds ---" % (time.time() - start_time))

Train on 605 samples, validate on 207 samples
Epoch 1/10

KeyboardInterrupt: 