In [3]:
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd 

from tensorflow.keras import datasets, layers, models, regularizers, initializers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from skimage.io import imshow
from skimage.transform import rotate
from skimage.filters.edges import convolve

from skimage.io import imshow, imread
from skimage.transform import resize
from skimage.color import rgb2gray

import dill as pickle

np.random.seed(33)

In [4]:
df_test = pd.read_csv('data/test_labels.csv')
df_train = pd.read_csv('data/train_labels.csv')

bw_loaded = np.load('data/color_images.npz')
X_train= bw_loaded['a']
X_test = bw_loaded['b']

X_test = X_test.reshape(-1,80,60,3)
X_train = X_train.reshape(-1,80,60,3)

In [3]:
# from model_functions import pick_ylabels, multi_index_counts, test_counts_by_cat, train_counts_by_cat, category_codes, class_weights

In [4]:
def pick_ylabels(column):
    y_train = df_train[column].copy().astype('category').cat.codes
    y_test = df_test[column].copy().astype('category').cat.codes
    return (y_train.values, y_test.values)
def multi_index_counts(col, col2):
    counts = df_test.groupby([col, col2]).count().id
    return counts

def category_codes(column):
    _, ytest = pick_ylabels(column)
    cat_codes = {}
    cat_code_list = []
    for i in range(len(df_test[column].value_counts().index)):
        s = i
        t = df_test[column].value_counts().index[i]
        cat_codes[s] = t
#         cat_code_list.append([s, t])

#     for key in sorted(cat_codes):
#         print("%s: %s" % (key, cat_codes[key]))
    return cat_codes

def test_counts_by_cat(column):
    _, ytest = pick_ylabels(column)
    test_counts_dict = {}
    test_counts = []
    for i in range(len(df_test[column].value_counts().index)):
        s = df_test[column].value_counts().values[i]
        t = df_test[column].value_counts().index[i]
        test_counts_dict[t] = s
        test_counts.append([t, s])
#     for i in sorted(test_counts):
#         print("%s: %s" % (test_counts[0], test_counts[1]))
    return test_counts

def train_counts_by_cat(column):
    y_train = pick_ylabels(column)
    train_counts_dict = {}
    train_counts = []
    for i in range(len(df_test[column].value_counts().index)):
        s = df_train[column].value_counts().values[i]
        t = df_train[column].value_counts().index[i]
        train_counts_dict[t] = s
        train_counts.append([t, s])
#     for i in sorted(test_counts):
#         print("%s: %s" % (test_counts[0], test_counts[1]))
    return train_counts

def class_weights(column):
    train_counts = train_counts_by_cat(column)
    counts_list = []
    ratio_list = []
    ratio_dict = {}
    for x in test_counts:
        counts_list.append(x[1])
    #print(counts_list)
    z = sum(counts_list)
    for x in counts_list:
        if np.round(x/z, 3) ==0:
            ratio_list.append(.001)
        else: 
            ratio_list.append(np.round(x/z, 2))
    for k, v in enumerate(ratio_list):
        ratio_dict[k] = v
    return ratio_dict

def model_results(model, column):
    yhat = model.predict(X_test)
    _ , ytest = pick_ylabels(column)
    accuracy = accuracy_score(ytest, yhat)
    print( 'Accuracy Score: ', accuracy )
    recall = recall_score(ytest, yhat, average='weighted')
    print( 'Recall Score: ', recall)
    y_proba = model.predict_proba(X_test)
    wrong_id_list = []
    pred_cat_list = []
    real_cat_list = []
    for row_idx in range(len(ytest)):
        if ytest[row_idx]!=yhat[row_idx]:
            wrong_id_list.append(row_idx)
            pred_cat_list.append(yhat[row_idx])
            real_cat_list.append(ytest[row_idx])

    arr = np.array([ real_cat_list, pred_cat_list])
    arr = arr.transpose()
    wrong_df = pd.DataFrame( arr, index= wrong_id_list, columns = ['actual', 'predicted'] )
    return (accuracy, recall, wrong_df)

In [16]:
y_train0.value_counts()

 0     5745
 18    3127
 1     2801
 3     2752
 8     2563
 15    1754
 7     1323
 13    1073
 14    1031
 11     923
 5      784
 12     352
 17     317
 19     290
 10     251
 16     179
 2       65
 4       62
 9       31
-1       12
 6        4
dtype: int64

In [11]:
train = df_train[df_train.masterCategory=='Apparel']
train_idx = list(train.index)
X_train_app = X_train[train_idx]

test = df_test[df_test.masterCategory=='Apparel']
test_idx = list(test.index)
X_test_app = X_test[test_idx]

In [12]:
y_train0 = df_train.baseColour.copy().astype('category').cat.codes
y_test0 = df_test.baseColour.copy().astype('category').cat.codes

In [13]:
from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train0)
y_test = to_categorical(y_test0)

In [14]:
print(y_train.shape)
print(y_test.shape)

(25439, 20)
(6440, 19)


In [60]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D

In [68]:
cnn_sub = models.Sequential()

In [69]:
#MaxPool reduces dimensionality of each feature
#Dropout to reduce overfitting

cnn_sub.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(80,60,1)))
cnn_sub.add(layers.MaxPooling2D((2, 2)))
cnn_sub.add(Dropout(0.2))

cnn_sub.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
cnn_sub.add(MaxPooling2D(pool_size=(2, 2)))
cnn_sub.add(Dropout(0.25))

cnn_sub.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
cnn_sub.add(Dropout(0.4))

cnn_sub.add(Flatten())

cnn_sub.add(Dense(128, activation='relu'))
cnn_sub.add(Dropout(0.3))
cnn_sub.add(Dense(6, activation='softmax'))

In [70]:
cnn_sub.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 78, 58, 32)        320       
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 39, 29, 32)        0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 39, 29, 32)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 37, 27, 64)        18496     
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 18, 13, 64)        0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 18, 13, 64)        0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 16, 11, 128)      

In [71]:
cnn_sub.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
cnn_sub.fit(X_train_app, y_train, epochs=20, validation_data=(X_test_app, y_test), verbose=1)

Train on 9194 samples, validate on 2619 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

In [56]:
# save the fitted model
model_json = cnn_sub.to_json()
with open("cnn_sub.json", "w") as json_file:
    json_file.write(model_json)

In [45]:
y_true = y_test0

In [43]:
# get the predictions for the test data
predicted_classes = model.predict_classes(X_test)

# get the indices to be plotted
correct = np.nonzero(predicted_classes==y_true)[0]
incorrect = np.nonzero(predicted_classes!=y_true)[0]

(8022,)

In [48]:
predicted_classes

array([1, 2, 1, ..., 0, 1, 1])

In [47]:
from sklearn.metrics import classification_report
target_names = ["Class {}".format(i) for i in range(4)]
print(classification_report(y_true, predicted_classes, target_names=target_names))

              precision    recall  f1-score   support

     Class 0       0.88      0.84      0.86      2035
     Class 1       0.93      0.95      0.94      4431
     Class 2       0.97      0.94      0.95      1827
     Class 3       0.72      0.75      0.74       478

    accuracy                           0.91      8771
   macro avg       0.87      0.87      0.87      8771
weighted avg       0.91      0.91      0.91      8771



In [17]:
from imblearn.over_sampling import SMOTE

In [45]:
X_train.shape, y_train.shape

((30791, 80, 60, 1), (30791, 7))

In [23]:
from sklearn.utils import class_weight

In [None]:
def model_results(model, column):
    yhat = model.predict(X_test)
    _ , ytest = pick_ylabels(column)
    accuracy = accuracy_score(ytest, yhat)
    print( 'Accuracy Score: ', accuracy )
    recall = recall_score(ytest, yhat, average='weighted')
    print( 'Recall Score: ', recall)
    y_proba = model.predict_proba(X_test)
    wrong_id_list = []
    pred_cat_list = []
    real_cat_list = []
    for row_idx in range(len(ytest)):
        if ytest[row_idx]!=yhat[row_idx]:
            wrong_id_list.append(row_idx)
            pred_cat_list.append(yhat[row_idx])
            real_cat_list.append(ytest[row_idx])

    arr = np.array([ real_cat_list, pred_cat_list])
    arr = arr.transpose()
    wrong_df = pd.DataFrame( arr, index= wrong_id_list, columns = ['actual', 'predicted'] )
    return (accuracy, recall, wrong_df)