# 1 Imports

In [266]:
import numpy as np 
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.metrics import top_k_categorical_accuracy
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from sklearn.model_selection import train_test_split

# 2 Fetching the data

In [267]:
#finding the amount of rows for each category
df = pd.read_csv('5_animals.csv')
df.tail()

Unnamed: 0,countrycode,drawing,key_id,timestamp,word
49995,TR,"[[[22, 0, 22], [65, 74, 80]], [[22, 25, 38, 57...",6082320509435904,2017-03-18 23:49:34.788410,bird
49996,US,"[[[177, 168, 222, 226, 236, 254, 255, 249, 233...",6349598891180032,2017-03-05 21:16:16.265550,dog
49997,RS,"[[[0, 2, 24, 41, 84, 106, 110, 118, 131, 183, ...",5359595658674176,2017-03-25 19:51:32.382570,bird
49998,NL,"[[[112, 90, 58, 28, 4, 0, 2, 59, 75, 80, 69, 5...",6052595476987904,2017-03-22 19:23:08.989450,butterfly
49999,US,"[[[42, 55, 64, 85, 139, 167, 179, 192, 218, 23...",6420716016631808,2017-03-15 14:15:01.841960,bird


In [268]:
df['word'].value_counts()

butterfly    10000
cat          10000
elephant     10000
bird         10000
dog          10000
Name: word, dtype: int64

There are 5 categories with 10k images of each available as you can see above.

Now i'm going to split the dataframe into a train/test/validation set like 70/15/15

# 3 Prepairing the data

In [269]:
X = df.drop(columns = ['word'])
y = df['word']
X

Unnamed: 0,countrycode,drawing,key_id,timestamp
0,US,"[[[61, 58, 59, 70, 87, 100, 109, 130, 136, 135...",6144641424424960,2017-03-15 00:32:07.462090
1,SE,"[[[66, 85, 111, 157, 183, 192, 250, 255, 255, ...",5425744563929088,2017-03-16 14:50:56.360420
2,US,"[[[66, 62, 56, 55, 69, 90], [87, 88, 104, 139,...",5828817815863296,2017-03-26 02:29:38.282890
3,IN,"[[[162, 132, 113, 94, 87, 81, 81, 87, 99, 136,...",6166647058464768,2017-03-27 12:05:10.695330
4,CZ,"[[[35, 35, 46, 56, 74, 103, 120, 192, 209, 227...",6400753277075456,2017-03-14 20:20:56.641680
...,...,...,...,...
49995,TR,"[[[22, 0, 22], [65, 74, 80]], [[22, 25, 38, 57...",6082320509435904,2017-03-18 23:49:34.788410
49996,US,"[[[177, 168, 222, 226, 236, 254, 255, 249, 233...",6349598891180032,2017-03-05 21:16:16.265550
49997,RS,"[[[0, 2, 24, 41, 84, 106, 110, 118, 131, 183, ...",5359595658674176,2017-03-25 19:51:32.382570
49998,NL,"[[[112, 90, 58, 28, 4, 0, 2, 59, 75, 80, 69, 5...",6052595476987904,2017-03-22 19:23:08.989450


In [270]:
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.7)
X_val, X_test, y_val, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

In [271]:
%%capture
X_train.reset_index()
y_train.reset_index()
X_test.reset_index()
y_test.reset_index()
X_val.reset_index()
y_val.reset_index()

In [272]:
print(X_train.shape), print(y_train.shape)
print(X_val.shape), print(y_val.shape)
print(X_test.shape), print(y_test.shape)

(35000, 4)
(35000,)
(7500, 4)
(7500,)
(7500, 4)
(7500,)


(None, None)

# 4 Training the model

In [273]:
num_classes = 5

model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu', input_shape=(32, 32, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(680, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.summary()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_34 (Conv2D)          (None, 32, 32, 32)        320       
                                                                 
 max_pooling2d_34 (MaxPoolin  (None, 16, 16, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_35 (Conv2D)          (None, 16, 16, 64)        18496     
                                                                 
 max_pooling2d_35 (MaxPoolin  (None, 8, 8, 64)         0         
 g2D)                                                            
                                                                 
 dropout_34 (Dropout)        (None, 8, 8, 64)          0         
                                                                 
 flatten_17 (Flatten)        (None, 4096)            

In [275]:
def top_3_accuracy(x,y): 
    t3 = top_k_categorical_accuracy(x,y, 3)
    return t3

reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, 
                                   verbose=1, mode='auto', min_delta=0.005, cooldown=5, min_lr=0.0001)
earlystop = EarlyStopping(monitor='val_top_3_accuracy', mode='max', patience=5) 
callbacks = [reduceLROnPlat, earlystop]

model.compile(loss='categorical_crossentropy',
              optimizer='RMSprop',
              metrics=['accuracy', top_3_accuracy])

model.fit(x=X_train, y=y_train,
          batch_size = 32,
          epochs = 25,
          validation_data = (X_val, y_val),
          callbacks = callbacks,
          verbose = 1)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).