In [1]:
import os
from glob import glob
import re
import ast
import numpy as np 
import pandas as pd
from PIL import Image, ImageDraw 
from tqdm import tqdm
from dask import bag
# Directory holding the per-class training CSV files.
# Set the QUICKDRAW_TRAIN_DIR environment variable to run the notebook on
# another machine; the original location is kept as the default.
train_dir = os.environ.get(
    'QUICKDRAW_TRAIN_DIR',
    '/Users/taoli/Documents/Columbia_University/W4995_Deep_Learning/Project/train_simplified')
#print(os.listdir(train_dir))
import warnings
warnings.filterwarnings('ignore') # to suppress some matplotlib deprecation warnings
import ast
import math
import matplotlib.pyplot as plt
import matplotlib.style as style
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.metrics import top_k_categorical_accuracy
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

In [2]:
# Load the first 100 rows of a single class file as a quick sample,
# indexed by the drawing's key_id.
data = pd.read_csv(
    train_dir + '/roller coaster.csv',
    nrows=100,
    index_col='key_id',
)
# Swap spaces for underscores inside the class label (regex substitution).
data['word'] = data['word'].replace({' ': '_'}, regex=True)
data.head()

Unnamed: 0_level_0,countrycode,drawing,recognized,timestamp,word
key_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6538378709303296,US,"[[[0, 28, 50, 73, 88, 94, 101, 101, 98, 100, 1...",True,2017-03-01 03:23:58.188330,roller_coaster
4974848176553984,US,"[[[0, 50, 78, 126, 142, 152, 157, 157, 141, 12...",True,2017-03-11 04:04:09.020040,roller_coaster
5822246901776384,US,"[[[2, 0, 17, 42, 68, 97, 120, 140, 151, 151, 1...",True,2017-03-15 03:05:49.143180,roller_coaster
5572615668236288,GB,"[[[192, 130, 101, 72, 0], [4, 99, 132, 148, 17...",True,2017-03-18 21:36:45.223520,roller_coaster
6266343907131392,US,"[[[9, 11, 19, 41, 114, 136, 153, 167, 179, 230...",True,2017-01-27 18:41:09.065050,roller_coaster


In [None]:
# Peek at column 1 of the first sample row through the underlying NumPy array.
data.values[0, 1]

In [3]:
# Build the index -> class-name lookup from the same glob pattern that the
# training cell enumerates to assign label indices. os.listdir() also returns
# non-CSV entries (hidden files, checkpoints, ...) that glob('*.csv') skips;
# a single such entry would shift every index and mislabel all predictions.
classfiles = [os.path.basename(p) for p in glob(train_dir + '/*.csv')]
# Drop the file extension and replace spaces so names are single tokens.
numstonames = {i: os.path.splitext(v)[0].replace(" ", "_") for i, v in enumerate(classfiles)} #adds underscores

num_classes = 340    #340 max 
imheight, imwidth = 32, 32  
ims_per_class = 2000  #max?

In [4]:
# faster conversion function
def draw_it(strokes):
    """Rasterize one stroke-encoded drawing into a small 2-D array.

    Parameters
    ----------
    strokes : str
        Text of a Python literal (parsed with ``ast.literal_eval``) holding a
        list of strokes, where ``stroke[0]`` and ``stroke[1]`` are the x and y
        coordinate sequences of one stroke.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(imheight, imwidth)`` holding the pixel values of the
        resized image divided by 255 (the canvas is created with value 255 and
        strokes are drawn with value 0).
    """
    image = Image.new("P", (256, 256), color=255)
    image_draw = ImageDraw.Draw(image)
    for stroke in ast.literal_eval(strokes):
        xs, ys = stroke[0], stroke[1]
        # Draw every consecutive pair of points as its own line segment.
        for i in range(len(xs) - 1):
            image_draw.line([xs[i], ys[i], xs[i + 1], ys[i + 1]],
                            fill=0, width=5)
    # PIL's resize expects (width, height). Passing (imheight, imwidth) would
    # transpose the result for non-square sizes and break the later reshape
    # to (imheight, imwidth, 1).
    image = image.resize((imwidth, imheight))
    return np.array(image) / 255.

In [6]:
#%% get train arrays
# Build one (n_rows, 1 + imheight*imwidth) block per class: column 0 is the
# class index, the remaining columns are the flattened image.
train_grand = []
class_paths = glob(train_dir+'/*.csv')
for i,c in enumerate(tqdm(class_paths[0: num_classes])):
    # Read 25% more rows than needed so that enough remain after dropping
    # drawings that were not recognized.
    train = pd.read_csv(c, usecols=['drawing', 'recognized'], nrows=ims_per_class*5//4)
    train = train[train.recognized == True].head(ims_per_class)
    imagebag = bag.from_sequence(train.drawing.values).map(draw_it) 
    trainarray = np.array(imagebag.compute())  # PARALLELIZE
    # Flatten using the number of rows actually kept: a class can yield fewer
    # than ims_per_class recognized drawings, and the label column below is
    # sized from train.shape[0] as well.
    trainarray = np.reshape(trainarray, (train.shape[0], -1))
    labelarray = np.full((train.shape[0], 1), i)
    trainarray = np.concatenate((labelarray, trainarray), axis=1)
    train_grand.append(trainarray)
    
# Stack the per-class blocks row-wise, popping from the end as before.
# Unlike np.array(list).reshape(...), np.concatenate also works when classes
# contribute different numbers of rows or fewer than num_classes files exist.
train_grand = np.concatenate([train_grand.pop() for _ in range(len(train_grand))], axis=0)

del trainarray
del train

100%|██████████| 340/340 [04:06<00:00,  1.44it/s]


In [7]:
# memory-friendly alternative to train_test_split?
# Shuffle the combined array in place, then carve the first `valfrac` share of
# rows off as the validation set and keep the remainder for training.
valfrac = 0.1
cutpt = int(train_grand.shape[0] * valfrac)

np.random.shuffle(train_grand)

# Column 0 is taken as the label; every other column is pixel data.
y_val, X_val = train_grand[:cutpt, 0], train_grand[:cutpt, 1:] #validation set is recognized==True
y_train, X_train = train_grand[cutpt:, 0], train_grand[cutpt:, 1:]

del train_grand

# Restore the image layout (rows, height, width, channel) for the conv net.
X_train = X_train.reshape((len(X_train), imheight, imwidth, 1))
X_val = X_val.reshape((len(X_val), imheight, imwidth, 1))

# One-hot encode the labels.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes)

print(*(arr.shape for arr in (y_train, X_train, y_val, X_val)), sep=" \n ")

(612000, 340) 
 (612000, 32, 32, 1) 
 (68000, 340) 
 (68000, 32, 32, 1)


In [8]:
# Small CNN: two conv/pool stages, then a dense classifier head.
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu',
           input_shape=(imheight, imwidth, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    Flatten(),
    Dense(680, activation='relu'),
    Dropout(0.5),
    # One softmax output per class.
    Dense(num_classes, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 32, 32, 32)        320       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 16, 16, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 16, 16, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 8, 8, 64)          0         
_________________________________________________________________
dropout (Dropout)            (None, 8, 8, 64)          0         
_________________________________________________________________
flatten (Flatten)            (None, 4096)              0         
_________________________________________________________________
dense (Dense)                (None, 680)               2785960   
__________

In [9]:
def top_3_accuracy(x,y): 
    """Top-k categorical accuracy with k fixed at 3.

    Both arguments are forwarded unchanged, in order, to
    ``top_k_categorical_accuracy``.
    """
    return top_k_categorical_accuracy(x, y, k=3)

# Adam optimizer with cross-entropy loss; report plain and top-3 accuracy.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', top_3_accuracy])

# Learning-rate schedule driven by validation loss (factor 0.5, floor 1e-4).
reduceLROnPlat = ReduceLROnPlateau(
    monitor='val_loss', mode='auto',
    factor=0.5, patience=3, cooldown=5,
    min_delta=0.005, min_lr=0.0001,
    verbose=1,
)
# Stop once validation top-3 accuracy has not improved for 5 epochs.
earlystop = EarlyStopping(monitor='val_top_3_accuracy', patience=5, mode='max')
callbacks = [reduceLROnPlat, earlystop]

model.fit(X_train, y_train,
          validation_data=(X_val, y_val),
          epochs=22,
          batch_size=32,
          callbacks=callbacks,
          verbose=1)

Train on 612000 samples, validate on 68000 samples
Epoch 1/22
Epoch 2/22
Epoch 3/22
Epoch 4/22
Epoch 5/22
Epoch 6/22
Epoch 7/22
Epoch 8/22
Epoch 9/22
Epoch 10/22
Epoch 11/22
Epoch 12/22
Epoch 13/22
Epoch 14/22
Epoch 15/22
Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 16/22
Epoch 17/22
Epoch 18/22
Epoch 19/22
Epoch 20/22
Epoch 21/22
Epoch 22/22


<tensorflow.python.keras.callbacks.History at 0xb3ae6ab70>

In [10]:
#%% get test set
# Stream the test CSV in chunks, rasterize each chunk, and keep the indices of
# the three highest-scoring classes per drawing.
ttvlist = []
reader = pd.read_csv(
    '/Users/taoli/Documents/Columbia_University/W4995_Deep_Learning/Project/test_simplified.csv',
    chunksize=2048,
    index_col=['key_id'],
)
# total=55 only sizes the progress bar.
for chunk in tqdm(reader, total=55):
    imagebag = bag.from_sequence(chunk.drawing.values).map(draw_it)
    testarray = np.array(imagebag.compute())
    testarray = testarray.reshape((len(testarray), imheight, imwidth, 1))
    testpreds = model.predict(testarray, verbose=0)
    # Negate so the ascending argsort lists the largest scores first.
    ttvs = np.argsort(-testpreds, axis=-1)[:, :3]  # top 3
    ttvlist.append(ttvs)

ttvarray = np.concatenate(ttvlist, axis=0)

100%|██████████| 55/55 [01:39<00:00,  1.70s/it]


In [11]:
# Turn the three predicted class indices per row into class names.
preds_df = pd.DataFrame(
    {'first': ttvarray[:, 0], 'second': ttvarray[:, 1], 'third': ttvarray[:, 2]}
).replace(numstonames)
# Submission format: the three guesses joined by single spaces.
preds_df['words'] = [
    " ".join(guesses)
    for guesses in zip(preds_df['first'], preds_df['second'], preds_df['third'])
]

# Fill the sample submission positionally and write it out.
sub = pd.read_csv(
    '/Users/taoli/Documents/Columbia_University/W4995_Deep_Learning/Project/sample_submission.csv',
    index_col=['key_id'],
)
sub['word'] = preds_df.words.values
sub.to_csv('subcnn_small.csv')
sub.head()

Unnamed: 0_level_0,word
key_id,Unnamed: 1_level_1
9000003627287624,radio motorbike bicycle
9000010688666847,sandwich hockey_puck belt
9000023642890129,The_Great_Wall_of_China castle bridge
9000038588854897,mountain tent triangle
9000052667981386,campfire fireplace hedgehog


In [12]:
import sys

# Names that IPython itself puts in the namespace, plus this list's own name,
# so none of them show up in the report below.
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# (name, shallow size in bytes) for every remaining public global, largest first.
# No extra module-level names are created here, since they would appear in the listing.
sorted(
    ((name, sys.getsizeof(globals().get(name)))
     for name in dir()
     if not (name.startswith('_') or name in sys.modules or name in ipython_vars)),
    key=lambda pair: pair[1],
    reverse=True,
)

[('preds_df', 30590222),
 ('sub', 9909531),
 ('ttvarray', 2692888),
 ('testpreds', 2185632),
 ('chunk', 1053432),
 ('data', 70120),
 ('labelarray', 16112),
 ('numstonames', 9320),
 ('class_paths', 3104),
 ('Sequential', 3096),
 ('classfiles', 2896),
 ('Dense', 2000),
 ('Dropout', 2000),
 ('Flatten', 2000),
 ('MaxPooling2D', 2000),
 ('EarlyStopping', 1464),
 ('ReduceLROnPlateau', 1464),
 ('ModelCheckpoint', 1056),
 ('Conv2D', 888),
 ('ttvlist', 528),
 ('c', 148),
 ('X_train', 144),
 ('X_val', 144),
 ('testarray', 144),
 ('draw_it', 136),
 ('top_3_accuracy', 136),
 ('top_k_categorical_accuracy', 136),
 ('train_dir', 136),
 ('ttvs', 112),
 ('y_train', 112),
 ('y_val', 112),
 ('Image', 80),
 ('ImageDraw', 80),
 ('bag', 80),
 ('callbacks', 80),
 ('keras', 80),
 ('np', 80),
 ('pd', 80),
 ('plt', 80),
 ('style', 80),
 ('tf', 80),
 ('earlystop', 56),
 ('imagebag', 56),
 ('model', 56),
 ('reader', 56),
 ('reduceLROnPlat', 56),
 ('cutpt', 28),
 ('i', 28),
 ('imheight', 28),
 ('ims_per_class', 28