# 1 Imports

In [100]:
import os
import glob
import numpy as np 
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.metrics import top_k_categorical_accuracy
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from sklearn.model_selection import train_test_split

In [101]:
#setting the directory of the dataset
os.chdir("C:/Users/Brune/Documents/Stage Arinti 2022/Dataset/Simplified")

# 2 Loading and preprocessing the data

In [102]:
#getting the data from 5 different csvs and combining them in to 1 pandas dataframe
df = pd.concat(map(pd.read_csv, glob.glob(os.path.join('five_animals/', "*.csv"))),ignore_index=True)

#We drop the Recognized=False rows as there is no point in having unrecognized drawings who will not contribute to the training phase.
df = df[df["recognized"]==True].reset_index(drop=True)

#the recognized column is now uneccesary and can be dropped
df = df.drop(columns=['recognized'])

#lets check out the dataframe in its current state
df.head()


Unnamed: 0,countrycode,drawing,key_id,timestamp,word
0,VE,"[[[130, 113, 99, 109, 76, 64, 55, 48, 48, 51, 59, 86, 133, 154, 170, 203, 214, 217, 215, 208, 186, 176, 162, 157, 132], [72, 40, 27, 79, 82, 88, 100, 120, 134, 152, 165, 184, 189, 186, 179, 152, 131, 114, 100, 89, 76, 0, 31, 65, 70]], [[76, 28, 7], [136, 128, 128]], [[76, 23, 0], [160, 164, 175]], [[87, 52, 37], [175, 191, 204]], [[174, 220, 246, 251], [134, 132, 136, 139]], [[175, 255], [147, 168]], [[171, 208, 215], [164, 198, 210]], [[130, 110, 108, 111, 130, 139, 139, 119], [129, 134, 137, 144, 148, 144, 136, 130]], [[107, 106], [96, 113]]]",5201136883597312,2017-03-02 23:25:10.074530,cat
1,BR,"[[[222, 196, 176, 173, 179, 191, 233, 251, 255, 250, 231, 219], [31, 32, 43, 60, 72, 79, 79, 69, 54, 35, 27, 27]], [[176, 198, 203], [42, 0, 26]], [[219, 238, 249, 252, 252], [29, 11, 5, 16, 43]], [[190, 188, 191, 200, 204, 200], [44, 45, 49, 51, 43, 41]], [[233, 228, 233, 241, 243, 231], [44, 48, 53, 53, 46, 43]], [[203, 208, 213, 218, 227, 234], [63, 69, 69, 62, 70, 62]], [[168, 101, 75, 54, 53, 59, 69, 132, 180, 195, 199, 197], [41, 50, 62, 81, 101, 108, 110, 108, 83, 67, 61, 60]], [[57, 16, 0, 3, 23, 28, 40, 56], [76, 101, 130, 138, 137, 100, 90, 87]], [[74, 75, 78, 85, 94, 97, 96], [106, 139, 143, 145, 145, 140, 106]], [[163, 170, 172, 177, 183, 183, 176, 175], [92, 132, 134, 132, 122, 104, 93, 82]]]",4647694628814848,2017-03-26 18:43:54.780830,cat
2,RO,"[[[65, 71, 74, 68, 70, 65, 65], [64, 67, 72, 72, 65, 69, 67]], [[107, 124, 127, 109, 104, 108, 116, 117, 117, 119], [67, 61, 63, 69, 64, 62, 64, 69, 63, 65]], [[78, 85, 97, 114, 115, 92, 78], [111, 124, 128, 121, 118, 111, 111]], [[87, 88], [115, 122]], [[100, 100], [116, 118]], [[68, 94, 110, 122, 138], [153, 168, 173, 171, 162]], [[140, 228, 255], [138, 102, 87]], [[161, 232], [148, 141]], [[144, 191, 204, 213], [120, 103, 95, 85]], [[50, 19], [122, 99]], [[67, 34], [121, 91]], [[66, 0], [127, 141]], [[85, 43, 29, 22, 20, 26, 50, 70, 98, 127, 153, 156, 150, 144, 93, 65], [36, 48, 66, 81, 122, 141, 177, 193, 200, 197, 162, 145, 105, 91, 42, 24]], [[22, 7, 4, 7, 19, 46, 61], [61, 32, 17, 2, 1, 14, 35]], [[133, 150, 231, 226], [51, 52, 43, 54]]]",6411036099870720,2017-03-18 09:36:08.290570,cat
3,US,"[[[148, 127, 119, 80, 60, 53, 48, 49, 56, 67, 80, 93, 127, 165, 183, 208, 214, 217, 215, 200, 183, 164, 146], [43, 41, 43, 68, 95, 113, 136, 182, 204, 226, 241, 251, 252, 237, 223, 190, 173, 157, 128, 88, 67, 55, 51]], [[71, 67, 62, 44, 49, 68, 92, 113, 125], [76, 75, 63, 1, 0, 9, 28, 50, 55]], [[174, 224, 235, 221, 219], [62, 18, 13, 95, 125]], [[110, 107], [87, 112]], [[167, 165], [82, 116]], [[72, 0], [165, 164]], [[70, 23, 13], [201, 202, 206]], [[70, 26], [212, 255]], [[185, 241], [177, 152]], [[192, 221, 232], [199, 190, 190]], [[198, 237], [208, 215]], [[111, 116, 131, 146, 157], [183, 191, 197, 197, 187]], [[139, 123, 130, 139], [139, 150, 149, 140]]]",6196755135725568,2017-01-25 16:05:51.963360,cat
4,ID,"[[[91, 77, 74, 75, 90, 116, 134, 148, 164, 165, 145], [47, 67, 82, 110, 132, 134, 127, 116, 90, 67, 40]], [[84, 97, 76, 90, 99, 104, 109, 126, 127, 135, 145, 148], [49, 49, 7, 10, 16, 32, 35, 32, 21, 0, 16, 35]], [[104, 109], [72, 72]], [[134, 132], [63, 68]], [[123, 228, 255, 233, 211], [89, 40, 45, 53, 57]], [[127, 171, 229, 238, 242], [100, 73, 72, 72, 77]], [[99, 53, 23, 5, 11], [96, 77, 76, 82, 84]], [[92, 55, 32, 13, 2], [102, 97, 97, 100, 107]], [[113, 104], [88, 108]]]",4597292839469056,2017-01-29 02:42:16.318460,cat


In [104]:
#finding the amount of rows for each category
df['word'].value_counts()

horse       156302
dog         143285
zebra       127159
elephant    104986
cat         103031
Name: word, dtype: int64

As you can see above, the dataset is not balanced yet. We will do this in the next step by keeping 10k of each category.

In [105]:
#balancing of the dataset: keep 10000 rows from each animal
df = df.groupby('word').sample(n=10000, random_state=1).reset_index(drop=True)

In [106]:
df['word'].value_counts()

cat         10000
dog         10000
elephant    10000
horse       10000
zebra       10000
Name: word, dtype: int64

Now we are going to split the dataframe into a train/test/validation set like 70/15/15

In [107]:
X = df.drop(columns = ['word']).copy()
y = df['word']

In [108]:
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.7)
X_val, X_test, y_val, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

In [109]:
print(X_train.shape), print(y_train.shape)
print(X_val.shape), print(y_val.shape)
print(X_test.shape), print(y_test.shape)

(35000, 4)
(35000,)


(None, None)

(7500, 4)
(7500,)


(None, None)

(7500, 4)
(7500,)


(None, None)

In [111]:
num_classes = 5

model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu', input_shape=(32, 32, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(680, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_12 (Conv2D)          (None, 32, 32, 32)        320       
                                                                 
 max_pooling2d_12 (MaxPoolin  (None, 16, 16, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_13 (Conv2D)          (None, 16, 16, 64)        18496     
                                                                 
 max_pooling2d_13 (MaxPoolin  (None, 8, 8, 64)         0         
 g2D)                                                            
                                                                 
 dropout_12 (Dropout)        (None, 8, 8, 64)          0         
                                                                 
 flatten_6 (Flatten)         (None, 4096)             

In [112]:
def top_3_accuracy(x,y): 
    t3 = top_k_categorical_accuracy(x,y, 3)
    return t3

reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, 
                                   verbose=1, mode='auto', min_delta=0.005, cooldown=5, min_lr=0.0001)
earlystop = EarlyStopping(monitor='val_top_3_accuracy', mode='max', patience=5) 
callbacks = [reduceLROnPlat, earlystop]

model.compile(loss='categorical_crossentropy',
              optimizer='RMSprop',
              metrics=['accuracy', top_3_accuracy])

model.fit(x=X_train, y=y_train,
          batch_size = 32,
          epochs = 25,
          validation_data = (X_val, y_val),
          callbacks = callbacks,
          verbose = 1)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).