In [1]:
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd 

from tensorflow.keras import datasets, layers, models, regularizers, initializers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from skimage.io import imshow
from skimage.transform import rotate
from skimage.filters.edges import convolve

from skimage.io import imshow, imread
from skimage.transform import resize
from skimage.color import rgb2gray

import dill as pickle

np.random.seed(33)

In [2]:
df_test = pd.read_csv('data/test_labels.csv')
df_train = pd.read_csv('data/train_labels.csv')

bw_loaded = np.load('data/bw_images.npz')
X_train= bw_loaded['a']
X_test = bw_loaded['b']

X_test = X_test.reshape(-1,80,60,1)
X_train = X_train.reshape(-1,80,60,1)

In [3]:
# from model_functions import pick_ylabels, multi_index_counts, test_counts_by_cat, train_counts_by_cat, category_codes, class_weights

In [4]:
def pick_ylabels(column):
    y_train = df_train[column].copy().astype('category').cat.codes
    y_test = df_test[column].copy().astype('category').cat.codes
    return (y_train.values, y_test.values)
def multi_index_counts(col, col2):
    counts = df_test.groupby([col, col2]).count().id
    return counts

def category_codes(column):
    _, ytest = pick_ylabels(column)
    cat_codes = {}
    cat_code_list = []
    for i in range(len(df_test[column].value_counts().index)):
        s = i
        t = df_test[column].value_counts().index[i]
        cat_codes[s] = t
#         cat_code_list.append([s, t])

#     for key in sorted(cat_codes):
#         print("%s: %s" % (key, cat_codes[key]))
    return cat_codes

def test_counts_by_cat(column):
    _, ytest = pick_ylabels(column)
    test_counts_dict = {}
    test_counts = []
    for i in range(len(df_test[column].value_counts().index)):
        s = df_test[column].value_counts().values[i]
        t = df_test[column].value_counts().index[i]
        test_counts_dict[t] = s
        test_counts.append([t, s])
#     for i in sorted(test_counts):
#         print("%s: %s" % (test_counts[0], test_counts[1]))
    return test_counts

def train_counts_by_cat(column):
    y_train = pick_ylabels(column)
    train_counts_dict = {}
    train_counts = []
    for i in range(len(df_test[column].value_counts().index)):
        s = df_train[column].value_counts().values[i]
        t = df_train[column].value_counts().index[i]
        train_counts_dict[t] = s
        train_counts.append([t, s])
#     for i in sorted(test_counts):
#         print("%s: %s" % (test_counts[0], test_counts[1]))
    return train_counts

def class_weights(column):
    train_counts = train_counts_by_cat(column)
    counts_list = []
    ratio_list = []
    ratio_dict = {}
    for x in test_counts:
        counts_list.append(x[1])
    #print(counts_list)
    z = sum(counts_list)
    for x in counts_list:
        if np.round(x/z, 3) ==0:
            ratio_list.append(.001)
        else: 
            ratio_list.append(np.round(x/z, 2))
    for k, v in enumerate(ratio_list):
        ratio_dict[k] = v
    return ratio_dict

def model_results(model, column):
    yhat = model.predict(X_test)
    _ , ytest = pick_ylabels(column)
    accuracy = accuracy_score(ytest, yhat)
    print( 'Accuracy Score: ', accuracy )
    recall = recall_score(ytest, yhat, average='weighted')
    print( 'Recall Score: ', recall)
    y_proba = model.predict_proba(X_test)
    wrong_id_list = []
    pred_cat_list = []
    real_cat_list = []
    for row_idx in range(len(ytest)):
        if ytest[row_idx]!=yhat[row_idx]:
            wrong_id_list.append(row_idx)
            pred_cat_list.append(yhat[row_idx])
            real_cat_list.append(ytest[row_idx])

    arr = np.array([ real_cat_list, pred_cat_list])
    arr = arr.transpose()
    wrong_df = pd.DataFrame( arr, index= wrong_id_list, columns = ['actual', 'predicted'] )
    return (accuracy, recall, wrong_df)

### Predict Article Types for Apparel

In [5]:
app_train = df_train[df_train.masterCategory=='Apparel']
app_train_idx = list(app_train.index)
X_train_app = X_train[app_train_idx]

app_test = df_test[df_test.masterCategory=='Apparel']
app_test_idx = list(app_test.index)
X_test_app = X_test[app_test_idx]

In [6]:
y_train0 = app_train.articleType.copy().astype('category').cat.codes
y_test0 = app_test.articleType.copy().astype('category').cat.codes

In [7]:
from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train0)
y_test = to_categorical(y_test0)

In [8]:
print(y_train.shape)
print(y_test.shape)

(10394, 33)
(2603, 33)


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D

In [21]:
cnn_sub = models.Sequential()

In [22]:
#MaxPool reduces dimensionality of each feature
#Dropout to reduce overfitting

cnn_sub.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(80,60,1)))
cnn_sub.add(layers.MaxPooling2D((2, 2)))
cnn_sub.add(Dropout(0.25))

# cnn_sub.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
# cnn_sub.add(MaxPooling2D(pool_size=(2, 2)))
# cnn_sub.add(Dropout(0.25))

cnn_sub.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
cnn_sub.add(MaxPooling2D(pool_size=(2, 2)))
cnn_sub.add(Dropout(0.25))

cnn_sub.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
# cnn_sub.add(MaxPooling2D(pool_size=(2, 2)))
cnn_sub.add(Dropout(0.4))

cnn_sub.add(Flatten())

# cnn_sub.add(Dense(512, activation='relu'))
# cnn_sub.add(Dropout(0.5))
cnn_sub.add(Dense(128, activation='relu'))
cnn_sub.add(Dropout(0.5))
cnn_sub.add(Dense(33, activation='softmax'))

In [None]:
cnn_sub.summary()

In [19]:
cnn_sub.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [20]:
cnn_sub.fit(X_train_app, y_train, epochs=20, validation_data=(X_test_app, y_test), verbose=1)

Train on 10394 samples, validate on 2603 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7feb002fecf8>

In [91]:
# save the fitted model
model_json = cnn_sub.to_json()
with open("cnn_apparel.json", "w") as json_file:
    json_file.write(model_json)

In [74]:
y_true = y_test0

In [76]:
# get the predictions for the test data
predicted_classes = cnn_sub.predict_classes(X_test_app)

# get the indices to be plotted
correct = np.nonzero(predicted_classes==y_true)[0]
incorrect = np.nonzero(predicted_classes!=y_true)[0]

  return getattr(obj, method)(*args, **kwds)


In [77]:
predicted_classes

array([1, 0, 5, ..., 0, 5, 1])

In [79]:
from sklearn.metrics import classification_report
target_names = ["Class {}".format(i) for i in range(6)]
print(classification_report(y_true, predicted_classes, target_names=target_names))

              precision    recall  f1-score   support

     Class 0       0.95      0.98      0.96       368
     Class 1       0.96      0.91      0.93       347
     Class 2       0.90      0.55      0.68        96
     Class 3       0.95      0.98      0.96        82
     Class 4       0.98      0.93      0.95       135
     Class 5       0.96      0.99      0.98      1591

    accuracy                           0.96      2619
   macro avg       0.95      0.89      0.91      2619
weighted avg       0.96      0.96      0.96      2619



### Predict Sub-Categories for Accessories

In [89]:
acc_train = df_train[df_train.masterCategory=='Accessories']
acc_train_idx = list(acc_train.index)
X_train_acc = X_train[acc_train_idx]

acc_test = df_test[df_test.masterCategory=='Accessories']
acc_test_idx = list(acc_test.index)
X_test_acc = X_test[acc_test_idx]

y_train_acc0 = acc_train.subCategory.copy().astype('category').cat.codes
y_test_acc0 = acc_test.subCategory.copy().astype('category').cat.codes

from tensorflow.keras.utils import to_categorical
y_train_acc = to_categorical(y_train_acc0)
y_test_acc = to_categorical(y_test_acc0)

print(y_train_acc.shape)
print(y_test_acc.shape)

(5586, 6)
(1656, 6)


In [84]:
cnn_acc = models.Sequential()

In [85]:
#MaxPool reduces dimensionality of each feature
#Dropout to reduce overfitting

cnn_acc.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(80,60,1)))
cnn_acc.add(layers.MaxPooling2D((2, 2)))
cnn_acc.add(Dropout(0.2))

cnn_acc.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
cnn_acc.add(MaxPooling2D(pool_size=(2, 2)))
cnn_acc.add(Dropout(0.25))

cnn_acc.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
cnn_acc.add(Dropout(0.4))


cnn_acc.add(Flatten())

cnn_acc.add(Dense(128, activation='relu'))
cnn_acc.add(Dropout(0.3))
cnn_acc.add(Dense(6, activation='softmax'))

In [86]:
cnn_acc.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_13 (Conv2D)           (None, 78, 58, 32)        320       
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 39, 29, 32)        0         
_________________________________________________________________
dropout_12 (Dropout)         (None, 39, 29, 32)        0         
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 37, 27, 64)        18496     
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 18, 13, 64)        0         
_________________________________________________________________
dropout_13 (Dropout)         (None, 18, 13, 64)        0         
_________________________________________________________________
conv2d_15 (Conv2D)           (None, 16, 11, 128)      

In [87]:
cnn_acc.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [90]:
cnn_acc.fit(X_train_acc, y_train_acc, epochs=10, validation_data=(X_test_acc, y_test_acc), verbose=1)

Train on 5586 samples, validate on 1656 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7efe905dee48>

In [92]:
y_true_acc = y_test_acc0

In [97]:
# get the predictions for the test data
predicted_classes_acc = cnn_acc.predict_classes(X_test_acc)

# get the indices to be plotted
correct = (predicted_classes_acc ==y_true_acc).to_numpy().nonzero()[0]
incorrect = (predicted_classes_acc !=y_true_acc).to_numpy().nonzero()[0]

In [98]:
from sklearn.metrics import classification_report
target_names_acc = ["Class {}".format(i) for i in range(6)]
print(classification_report(y_true_acc, predicted_classes_acc, target_names=target_names_acc))

              precision    recall  f1-score   support

     Class 0       0.99      0.97      0.98       597
     Class 1       1.00      1.00      1.00       213
     Class 2       1.00      0.79      0.88        56
     Class 3       0.93      0.98      0.96       230
     Class 4       1.00      0.98      0.99        50
     Class 5       0.97      0.99      0.98       510

    accuracy                           0.98      1656
   macro avg       0.98      0.95      0.96      1656
weighted avg       0.98      0.98      0.98      1656



In [99]:
# save the fitted model
model_json = cnn_acc.to_json()
with open("cnn_accessories.json", "w") as json_file:
    json_file.write(model_json)

### Predict Sub-Categories for Footwear

In [100]:
fw_train = df_train[df_train.masterCategory=='Footwear']
fw_train_idx = list(fw_train.index)
X_train_fw = X_train[fw_train_idx]

fw_test = df_test[df_test.masterCategory=='Footwear']
fw_test_idx = list(fw_test.index)
X_test_fw = X_test[fw_test_idx]

y_train_fw0 = fw_train.subCategory.copy().astype('category').cat.codes
y_test_fw0 = fw_test.subCategory.copy().astype('category').cat.codes

from tensorflow.keras.utils import to_categorical
y_train_fw = to_categorical(y_train_fw0)
y_test_fw = to_categorical(y_test_fw0)

print(y_train_fw.shape)
print(y_test_fw.shape)

(6296, 3)
(1796, 3)


In [106]:
cnn_fw = models.Sequential()

In [107]:
#MaxPool reduces dimensionality of each feature
#Dropout to reduce overfitting

#MaxPool reduces dimensionality of each feature
#Dropout to reduce overfitting

cnn_fw.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(80,60,1)))
cnn_fw.add(layers.MaxPooling2D((2, 2)))
cnn_fw.add(Dropout(0.25))

cnn_fw.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
cnn_fw.add(MaxPooling2D(pool_size=(2, 2)))
cnn_fw.add(Dropout(0.25))

cnn_fw.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
cnn_fw.add(MaxPooling2D(pool_size=(2, 2)))
cnn_fw.add(Dropout(0.25))

cnn_fw.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
cnn_fw.add(Dropout(0.25))


cnn_fw.add(Flatten())
cnn_fw.add(Dense(512, activation='relu'))
cnn_fw.add(Dropout(0.5))
cnn_fw.add(Dense(128, activation='relu'))
cnn_fw.add(Dropout(0.5))
cnn_fw.add(Dense(3, activation='softmax'))

In [108]:
cnn_fw.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_19 (Conv2D)           (None, 78, 58, 32)        320       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 39, 29, 32)        0         
_________________________________________________________________
dropout_16 (Dropout)         (None, 39, 29, 32)        0         
_________________________________________________________________
conv2d_20 (Conv2D)           (None, 37, 27, 64)        18496     
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 18, 13, 64)        0         
_________________________________________________________________
dropout_17 (Dropout)         (None, 18, 13, 64)        0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 16, 11, 128)      

In [109]:
cnn_fw.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
cnn_fw.fit(X_train_fw, y_train_fw, epochs=20, validation_data=(X_test_fw, y_test_fw), verbose=1)

Train on 6296 samples, validate on 1796 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
1152/6296 [====>.........................] - ETA: 6s - loss: 0.1787 - acc: 0.9297

In [92]:
y_true_fw = y_test_fw0

In [97]:
# get the predictions for the test data
predicted_classes_fw = cnn_fw.predict_classes(X_test_fw)

# get the indices to be plotted
correct = (predicted_classes_fw ==y_true_fw).to_numpy().nonzero()[0]
incorrect = (predicted_classes_fw !=y_true_fw).to_numpy().nonzero()[0]

In [98]:
from sklearn.metrics import classification_report
target_names_fw = ["Class {}".format(i) for i in range(3)]
print(classification_report(y_true_fw, predicted_classes_fw, target_names=target_names_fw))

              precision    recall  f1-score   support

     Class 0       0.99      0.97      0.98       597
     Class 1       1.00      1.00      1.00       213
     Class 2       1.00      0.79      0.88        56
     Class 3       0.93      0.98      0.96       230
     Class 4       1.00      0.98      0.99        50
     Class 5       0.97      0.99      0.98       510

    accuracy                           0.98      1656
   macro avg       0.98      0.95      0.96      1656
weighted avg       0.98      0.98      0.98      1656



In [99]:
# save the fitted model
model_json = cnn_fw.to_json()
with open("cnn_footwear.json", "w") as json_file:
    json_file.write(model_json)

### Predict Sub-Categories for Personal Care

In [100]:
pc_train = df_train[df_train.masterCategory=='Personal Care']
pc_train_idx = list(pc_train.index)
X_train_pc = X_train[pc_train_idx]

pc_test = df_test[df_test.masterCategory=='Personal Care']
pc_test_idx = list(pc_test.index)
X_test_pc = X_test[pc_test_idx]

y_train_pc0 = fw_train.subCategory.copy().astype('category').cat.codes
y_test_pc0 = fw_test.subCategory.copy().astype('category').cat.codes

from tensorflow.keras.utils import to_categorical
y_train_pc = to_categorical(y_train_pc0)
y_test_pc = to_categorical(y_test_pc0)

print(y_train_pc.shape)
print(y_test_pc.shape)

(6296, 3)
(1796, 3)


In [106]:
cnn_pc = models.Sequential()

In [107]:
#MaxPool reduces dimensionality of each feature
#Dropout to reduce overfitting

cnn_pc.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(80,60,1)))
cnn_pc.add(layers.MaxPooling2D((2, 2)))
cnn_pc.add(Dropout(0.2)

cnn_pc.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
cnn_pc.add(MaxPooling2D(pool_size=(2, 2)))
cnn_pc.add(Dropout(0.25)

cnn_pc.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
cnn_pc.add(Dropout(0.4)
           
cnn_pc.add(Flatten()
           
cnn_pc.add(Dense(128, activation='relu'))
cnn_pc.add(Dropout(0.3))
cnn_pc.add(Dense(3, activation='softmax'))

In [108]:
cnn_fw.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_19 (Conv2D)           (None, 78, 58, 32)        320       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 39, 29, 32)        0         
_________________________________________________________________
dropout_16 (Dropout)         (None, 39, 29, 32)        0         
_________________________________________________________________
conv2d_20 (Conv2D)           (None, 37, 27, 64)        18496     
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 18, 13, 64)        0         
_________________________________________________________________
dropout_17 (Dropout)         (None, 18, 13, 64)        0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 16, 11, 128)      

In [109]:
cnn_fw.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
cnn_fw.fit(X_train_fw, y_train_fw, epochs=10, validation_data=(X_test_fw, y_test_fw), verbose=1)

Train on 6296 samples, validate on 1796 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

In [92]:
y_true_fw = y_test_fw0

In [97]:
# get the predictions for the test data
predicted_classes_fw = cnn_fw.predict_classes(X_test_fw)

# get the indices to be plotted
correct = (predicted_classes_fw ==y_true_fw).to_numpy().nonzero()[0]
incorrect = (predicted_classes_fw !=y_true_fw).to_numpy().nonzero()[0]

In [98]:
from sklearn.metrics import classification_report
target_names_fw = ["Class {}".format(i) for i in range(3)]
print(classification_report(y_true_fw, predicted_classes_fw, target_names=target_names_fw))

              precision    recall  f1-score   support

     Class 0       0.99      0.97      0.98       597
     Class 1       1.00      1.00      1.00       213
     Class 2       1.00      0.79      0.88        56
     Class 3       0.93      0.98      0.96       230
     Class 4       1.00      0.98      0.99        50
     Class 5       0.97      0.99      0.98       510

    accuracy                           0.98      1656
   macro avg       0.98      0.95      0.96      1656
weighted avg       0.98      0.98      0.98      1656



In [99]:
# save the fitted model
model_json = cnn_fw.to_json()
with open("cnn_footwear.json", "w") as json_file:
    json_file.write(model_json)