In [1]:
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd 

from tensorflow.keras import datasets, layers, models, regularizers, initializers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from skimage.io import imshow
from skimage.transform import rotate
from skimage.filters.edges import convolve

from skimage.io import imshow, imread
from skimage.transform import resize
from skimage.color import rgb2gray

import dill as pickle

np.random.seed(42)

In [2]:
df_test = pd.read_csv('data/test_labels.csv')
df_train = pd.read_csv('data/train_labels.csv')

bw_loaded = np.load('data/bw_images.npz')
X_train= bw_loaded['a']
X_test = bw_loaded['b']

X_test = X_test.reshape(-1,80,60,1)
X_train = X_train.reshape(-1,80,60,1)

In [3]:
# from model_functions import pick_ylabels, multi_index_counts, test_counts_by_cat, train_counts_by_cat, category_codes, class_weights

In [41]:
def pick_ylabels(column):
    y_train = df_train[column].copy().astype('category').cat.codes
    y_test = df_test[column].copy().astype('category').cat.codes
    return (y_train.values, y_test.values)
def multi_index_counts(col, col2):
    counts = df_test.groupby([col, col2]).count().id
    return counts

def category_codes(column):
    _, ytest = pick_ylabels(column)
    cat_codes = {}
    cat_code_list = []
    for i in range(len(df_test[column].value_counts().index)):
        s = i
        t = df_test[column].value_counts().index[i]
        cat_codes[s] = t
#         cat_code_list.append([s, t])

#     for key in sorted(cat_codes):
#         print("%s: %s" % (key, cat_codes[key]))
    return cat_codes

def test_counts_by_cat(column):
    _, ytest = pick_ylabels(column)
    test_counts_dict = {}
    test_counts = []
    for i in range(len(df_test[column].value_counts().index)):
        s = df_test[column].value_counts().values[i]
        t = df_test[column].value_counts().index[i]
        test_counts_dict[t] = s
        test_counts.append([t, s])
#     for i in sorted(test_counts):
#         print("%s: %s" % (test_counts[0], test_counts[1]))
    return test_counts

def train_counts_by_cat(column):
    y_train = pick_ylabels(column)
    train_counts_dict = {}
    train_counts = []
    for i in range(len(df_test[column].value_counts().index)):
        s = df_train[column].value_counts().values[i]
        t = df_train[column].value_counts().index[i]
        train_counts_dict[t] = s
        train_counts.append([t, s])
#     for i in sorted(test_counts):
#         print("%s: %s" % (test_counts[0], test_counts[1]))
    return train_counts

def class_weights(column):
    train_counts = train_counts_by_cat(column)
    counts_list = []
    ratio_list = []
    ratio_dict = {}
    for x in test_counts:
        counts_list.append(x[1])
    #print(counts_list)
    z = sum(counts_list)
    for x in counts_list:
        if np.round(x/z, 3) ==0:
            ratio_list.append(.001)
        else: 
            ratio_list.append(np.round(x/z, 2))
    for k, v in enumerate(ratio_list):
        ratio_dict[k] = v
    return ratio_dict

def model_results(model, column):
    yhat = model.predict(X_test)
    _ , ytest = pick_ylabels(column)
    accuracy = accuracy_score(ytest, yhat)
    print( 'Accuracy Score: ', accuracy )
    recall = recall_score(ytest, yhat, average='weighted')
    print( 'Recall Score: ', recall)
    y_proba = model.predict_proba(X_test)
    wrong_id_list = []
    pred_cat_list = []
    real_cat_list = []
    for row_idx in range(len(ytest)):
        if ytest[row_idx]!=yhat[row_idx]:
            wrong_id_list.append(row_idx)
            pred_cat_list.append(yhat[row_idx])
            real_cat_list.append(ytest[row_idx])

    arr = np.array([ real_cat_list, pred_cat_list])
    arr = arr.transpose()
    wrong_df = pd.DataFrame( arr, index= wrong_id_list, columns = ['actual', 'predicted'] )
    return (accuracy, recall, wrong_df)

In [9]:
y_train0, y_test0 = pick_ylabels('subCategory')

In [51]:
train_cats_arr = np.array(train_counts_by_cat('subCategory'))
train_cats = set(x[:,0])
test_cats_arr = np.array(test_counts_by_cat('subCategory'))
test_cats = set(x[:,0])
l

{'Accessories',
 'Apparel Set',
 'Bags',
 'Bath and Body',
 'Belts',
 'Bottomwear',
 'Cufflinks',
 'Dress',
 'Eyes',
 'Eyewear',
 'Flip Flops',
 'Fragrance',
 'Gloves',
 'Hair',
 'Headwear',
 'Innerwear',
 'Jewellery',
 'Lips',
 'Loungewear and Nightwear',
 'Makeup',
 'Mufflers',
 'Nails',
 'Sandal',
 'Saree',
 'Scarves',
 'Shoe Accessories',
 'Shoes',
 'Skin',
 'Skin Care',
 'Socks',
 'Stoles',
 'Ties',
 'Topwear',
 'Wallets',
 'Watches',
 'Water Bottle'}

In [10]:
from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train0)
y_test = to_categorical(y_test0)

In [11]:
y_train.shape

(30700, 39)

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten

In [33]:
cnn_sub = models.Sequential()

In [34]:
cnn_sub.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(80,60,1)))
cnn_sub.add(layers.MaxPooling2D((2, 2)))

In [35]:
cnn_sub.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 78, 58, 32)        320       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 39, 29, 32)        0         
Total params: 320
Trainable params: 320
Non-trainable params: 0
_________________________________________________________________


In [36]:
cnn_sub.add(Flatten())
cnn_sub.add(Dense(39, activation='softmax'))


In [37]:
cnn_sub.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [38]:
cnn_sub.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), verbose=1)

ValueError: A target array with shape (8771, 36) was passed for an output of shape (None, 39) while using as loss `categorical_crossentropy`. This loss expects targets to have the same shape as the output.

In [56]:
# save the fitted model
model_json = cnn_sub.to_json()
with open("cnn_sub.json", "w") as json_file:
    json_file.write(model_json)

In [45]:
y_true = y_test0

In [43]:
# get the predictions for the test data
predicted_classes = model.predict_classes(X_test)

# get the indices to be plotted
correct = np.nonzero(predicted_classes==y_true)[0]
incorrect = np.nonzero(predicted_classes!=y_true)[0]

(8022,)

In [48]:
predicted_classes

array([1, 2, 1, ..., 0, 1, 1])

In [47]:
from sklearn.metrics import classification_report
target_names = ["Class {}".format(i) for i in range(4)]
print(classification_report(y_true, predicted_classes, target_names=target_names))

              precision    recall  f1-score   support

     Class 0       0.88      0.84      0.86      2035
     Class 1       0.93      0.95      0.94      4431
     Class 2       0.97      0.94      0.95      1827
     Class 3       0.72      0.75      0.74       478

    accuracy                           0.91      8771
   macro avg       0.87      0.87      0.87      8771
weighted avg       0.91      0.91      0.91      8771



In [17]:
from imblearn.over_sampling import SMOTE

In [45]:
X_train.shape, y_train.shape

((30791, 80, 60, 1), (30791, 7))

In [23]:
from sklearn.utils import class_weight

In [None]:
def model_results(model, column):
    yhat = model.predict(X_test)
    _ , ytest = pick_ylabels(column)
    accuracy = accuracy_score(ytest, yhat)
    print( 'Accuracy Score: ', accuracy )
    recall = recall_score(ytest, yhat, average='weighted')
    print( 'Recall Score: ', recall)
    y_proba = model.predict_proba(X_test)
    wrong_id_list = []
    pred_cat_list = []
    real_cat_list = []
    for row_idx in range(len(ytest)):
        if ytest[row_idx]!=yhat[row_idx]:
            wrong_id_list.append(row_idx)
            pred_cat_list.append(yhat[row_idx])
            real_cat_list.append(ytest[row_idx])

    arr = np.array([ real_cat_list, pred_cat_list])
    arr = arr.transpose()
    wrong_df = pd.DataFrame( arr, index= wrong_id_list, columns = ['actual', 'predicted'] )
    return (accuracy, recall, wrong_df)