In [6]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import os
import ast
import datetime as dt
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [16, 10]
plt.rcParams['font.size'] = 14
import seaborn as sns
import cv2
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dense, Dropout, Flatten, Activation, BatchNormalization
from keras.metrics import categorical_accuracy, top_k_categorical_accuracy, categorical_crossentropy
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.optimizers import Adam, SGD
from keras.applications import NASNetMobile, MobileNetV2, InceptionResNetV2, ResNet50, MobileNet, VGG19
from keras.applications.mobilenet import preprocess_input
from keras.models import load_model
from keras.models import Model
from functools import partial
from multiprocessing import Pool
from super_convergence.clr import OneCycleLR
from PIL import Image

import time
# Wall-clock time at which the notebook started; the last cell subtracts it to report the elapsed time.
start = dt.datetime.now()

In [33]:
# Path prefixes. DP_DIR is joined with 'full_train/train_k{}.csv.gz' and INPUT_DIR
# with the category directory / 'test_simplified.csv'; empty strings leave those
# paths relative to the current working directory.
DP_DIR = ''
INPUT_DIR = ''

# Side length of the square canvas draw_cv2 renders onto before any resizing.
BASE_SIZE = 256
# Number of train_k{}.csv.gz shards; shard NCSVS - 1 is read for validation and
# the training generator is given range(NCSVS - 1).
NCSVS = 100
# Number of classes: width of the softmax layer and of the one-hot label vectors.
NCATS = 340
# Seed NumPy (used for the shard permutation) and TensorFlow with the same fixed value.
np.random.seed(seed=1987)
tf.set_random_seed(seed=1987)

def f2cat(filename: str) -> str:
    """Return the part of ``filename`` that precedes its first ``'.'``.

    A name containing no dot is returned unchanged.
    """
    stem, _dot, _rest = filename.partition('.')
    return stem

def list_all_categories():
    """List the category names found in the ``train_simplified`` directory.

    Every directory entry is reduced to a category name with ``f2cat`` and the
    names are returned sorted case-insensitively.
    """
    train_dir = os.path.join(INPUT_DIR, '../../data/train_simplified')
    categories = [f2cat(entry) for entry in os.listdir(train_dir)]
    categories.sort(key=str.lower)
    return categories

In [8]:
def apk(actual, predicted, k=3):
    """
    Average precision at k for one sample.

    Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py

    Parameters
    ----------
    actual : sequence
        The correct labels (order is irrelevant). Lists and NumPy arrays are
        both accepted.
    predicted : sequence
        Predicted labels, best guess first. Only the first ``k`` are scored.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        Sum of precision-at-rank over the ranks that hit a correct label,
        divided by ``min(len(actual), k)``; ``0.0`` when ``actual`` is empty.
    """
    # Emptiness is checked with len(), not truthiness: a one-element NumPy
    # array holding label 0 is falsy, so `not actual` scored every correct
    # class-0 prediction as 0.0 (and arrays with several elements raise on bool()).
    if len(actual) == 0:
        return 0.0
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        # A label counts only the first time it appears in the ranking.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """
    Mean of ``apk`` over the paired entries of ``actual`` and ``predicted``.

    Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
    """
    per_sample = []
    for truth, ranked in zip(actual, predicted):
        per_sample.append(apk(truth, ranked, k))
    return np.mean(per_sample)

def preds2catids(predictions):
    """Return the column indices of the three largest scores in each row.

    ``predictions`` is a 2-D array of per-class scores. The result is a
    DataFrame with columns ``'a'``, ``'b'``, ``'c'`` holding the indices of the
    highest, second-highest and third-highest score of each row.
    """
    ranked = np.argsort(-predictions, axis=1)
    top_three = ranked[:, :3]
    return pd.DataFrame(top_three, columns=['a', 'b', 'c'])

def top_3_accuracy(y_true, y_pred):
    """Keras metric: ``top_k_categorical_accuracy`` with ``k`` fixed at 3."""
    top_k = 3
    return top_k_categorical_accuracy(y_true, y_pred, k=top_k)

In [48]:
# Side length in pixels of the square images fed to the network.
size = 128
# Rows per CSV chunk, i.e. images per training batch.
batchsize = 64
# Samples consumed per "epoch" of the endless generator, and the resulting
# number of batches passed as steps_per_epoch.
NUM_SAMPLES = 500000
STEPS = NUM_SAMPLES // batchsize
# Upper bound on the number of epochs requested from fit_generator.
EPOCHS = 1000



In [42]:
# MobileNet backbone with ImageNet weights and without its classification head.
base_model = MobileNet(weights='imagenet',include_top=False, input_shape=(size, size, 3))

# Add a new top layer
x = base_model.output
x = Flatten()(x)
predictions = Dense(NCATS, activation='softmax')(x)

# This is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

model.compile(optimizer=Adam(lr=0.00085), loss='categorical_crossentropy',
              metrics=[categorical_crossentropy, categorical_accuracy, top_3_accuracy])

# The accuracy metric is logged as `categorical_accuracy`, so its validation
# counterpart is `val_categorical_accuracy`. Monitoring `val_acc` referenced a
# key that never appears in the logs, so early stopping could never trigger.
# NOTE(review): the fit cells pass `callbacks`, not this list — add it there if
# early stopping is actually wanted.
callbacks_list = [keras.callbacks.EarlyStopping(monitor='val_categorical_accuracy', mode='max',
                                                patience=3, verbose=1)]
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 128, 128, 3)       0         
_________________________________________________________________
conv1_pad (ZeroPadding2D)    (None, 129, 129, 3)       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 64, 64, 32)        864       
_________________________________________________________________
conv1_bn (BatchNormalization (None, 64, 64, 32)        128       
_________________________________________________________________
conv1_relu (ReLU)            (None, 64, 64, 32)        0         
_________________________________________________________________
conv_dw_1 (DepthwiseConv2D)  (None, 64, 64, 32)        288       
_________________________________________________________________
conv_dw_1_bn (BatchNormaliza (None, 64, 64, 32)        128       
__________

In [43]:
def draw_cv2(raw_strokes, size=256, lw=6):
    """Rasterise a list of strokes into a 3-channel ``uint8`` image.

    Each stroke is indexed as ``stroke[0]`` (first point coordinate) and
    ``stroke[1]`` (second point coordinate). Consecutive points are joined with
    white lines of thickness ``lw`` on a black ``BASE_SIZE`` x ``BASE_SIZE``
    canvas. The canvas is resized to ``size`` x ``size`` when ``size`` differs
    from ``BASE_SIZE``; otherwise it is returned as drawn.
    """
    # Every stroke is drawn in plain white; a per-stroke fade
    # (255 - min(t, 10) * 13) exists in the history of this cell but is disabled.
    white = (255, 255, 255)
    canvas = np.zeros((BASE_SIZE, BASE_SIZE, 3), np.uint8)
    for stroke in raw_strokes:
        for j in range(1, len(stroke[0])):
            segment_start = (stroke[0][j - 1], stroke[1][j - 1])
            segment_end = (stroke[0][j], stroke[1][j])
            cv2.line(canvas, segment_start, segment_end, white, lw)
    if size == BASE_SIZE:
        return canvas
    return cv2.resize(canvas, (size, size))

def get_input(df, size, lw):
    """Build one ``(x, y)`` training batch from a chunk of a training CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk with a ``drawing`` column (stroke lists stored as strings) and a
        ``y`` column of integer class ids.
    size : int
        Side length of the rendered square images.
    lw : int
        Line thickness passed on to ``draw_cv2``.

    Returns
    -------
    tuple
        ``x``: the preprocessed ``float32`` image array produced by
        ``df_to_image_array_xd``; ``y``: one-hot labels with ``NCATS`` columns.
    """
    # The rendering + preprocessing steps used to be repeated here verbatim;
    # delegating keeps the training and inference pipelines identical.
    x = df_to_image_array_xd(df, size, lw=lw)
    y = keras.utils.to_categorical(df.y, num_classes=NCATS)
    return x, y


def image_generator_xd(size, batchsize, ks, lw=6):
    """Endlessly yield ``(x, y)`` batches read from the sharded training CSVs.

    On every pass the shard ids in ``ks`` are visited in a fresh random order;
    each shard is read in chunks of ``batchsize`` rows and every chunk is
    converted with ``get_input``. The generator never terminates.
    """
    shard_template = 'full_train/train_k{}.csv.gz'
    while True:
        for shard_id in np.random.permutation(ks):
            shard_path = os.path.join(DP_DIR, shard_template.format(shard_id))
            for chunk in pd.read_csv(shard_path, chunksize=batchsize):
                yield get_input(chunk, size=size, lw=lw)
                
def df_to_image_array_xd(df, size, lw=6):
    """Render the ``drawing`` column of ``df`` into a preprocessed image array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``drawing`` column holding stroke lists as strings.
        The frame is not modified.
    size : int
        Side length of the rendered square images.
    lw : int
        Line thickness passed on to ``draw_cv2``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(df), size, size, 3)`` after
        ``preprocess_input``.
    """
    # Parse into a local Series. Writing the parsed lists back into `df` made a
    # second call on the same frame fail, because ast.literal_eval only accepts
    # strings.
    drawings = df['drawing'].apply(ast.literal_eval)
    # float32 from the start: np.zeros defaults to float64, which made this
    # staging buffer twice as large as the array that is finally returned.
    x = np.zeros((len(df), size, size, 3), dtype=np.float32)
    for i, raw_strokes in enumerate(drawings.values):
        x[i, :, :, :] = draw_cv2(raw_strokes, size=size, lw=lw)
    # copy=False avoids a second full-size copy when the dtype is already float32.
    x = preprocess_input(x).astype(np.float32, copy=False)
    return x



In [44]:
# Hold-out set: the first 30000 rows of the last shard, which the training
# generator is not given (it receives ks=range(NCSVS - 1)).
valid_path = os.path.join(DP_DIR, 'full_train/train_k{}.csv.gz'.format(NCSVS - 1))
valid_df = pd.read_csv(valid_path, nrows=30000)
x_valid = df_to_image_array_xd(valid_df, size)
y_valid = keras.utils.to_categorical(valid_df.y, num_classes=NCATS)
print(x_valid.shape, y_valid.shape)
valid_gib = x_valid.nbytes / 1024.**3
print('Validation array memory {:.2f} GB'.format(valid_gib))

(30000, 128, 128, 3) (30000, 340)
Validation array memory 5.49 GB


In [None]:
#model.load_weights("model")

In [45]:
# Endless batch generator over shards 0 .. NCSVS - 2; shard NCSVS - 1 is the one read for validation.
train_datagen = image_generator_xd(size=size, batchsize=batchsize, ks=range(NCSVS - 1))

In [None]:
# Pull one batch from the training generator and preview its first n*n drawings.
x, y = next(train_datagen)
print(x.shape)
n = 2
fig, axs = plt.subplots(nrows=n, ncols=n, sharex=True, sharey=True, figsize=(6, 6))
for i in range(n**2):
    ax = axs[i // n, i % n]
    # Only channel 0 is shown. (-v + 1) / 2 flips the sign and rescales;
    # presumably v is in [-1, 1] after preprocess_input, giving [0, 1] — confirm.
    # (A stray copy of this expression that computed and discarded a full
    # array on every iteration has been removed.)
    ax.imshow((-x[i, :, :, 0] + 1)/2)
    ax.axis('off')
plt.tight_layout()
fig.savefig('gs.png', dpi=300)
plt.show();

In [19]:
# Callbacks passed to both fit_generator cells. The checkpoint overwrites the
# file 'model' whenever the monitored quantity improves (save_best_only=True);
# the training logs show that quantity to be val_loss.
callbacks = [
    ModelCheckpoint(filepath='model', verbose=1, save_best_only=True),
    # Disabled learning-rate schedule.
    # NOTE(review): min_lr=0.001 is above the Adam rate of 0.00085 used at
    # compile time, so this would presumably never lower the rate if re-enabled.
#     ReduceLROnPlateau(monitor='val_loss', factor=0.2,
#                               patience=5, min_lr=0.001)
]

In [41]:
# Holds the History object of every completed fit call; the plotting cell concatenates them.
hists = []
# One "epoch" here is STEPS batches drawn from the endless generator, not a
# full pass over the training shards. Validation uses the in-memory hold-out arrays.
hist = model.fit_generator(
    train_datagen, steps_per_epoch=STEPS, epochs=EPOCHS, verbose=1,
    validation_data=(x_valid, y_valid),
    callbacks = callbacks,
    use_multiprocessing=True
)
# Only reached when fit_generator returns; an interrupted run leaves hists unchanged.
hists.append(hist)

Epoch 1/100

Epoch 00001: val_loss did not improve from 1.37920
Epoch 2/100

Epoch 00002: val_loss did not improve from 1.37920
Epoch 3/100

Epoch 00003: val_loss did not improve from 1.37920
Epoch 4/100

Epoch 00004: val_loss did not improve from 1.37920
Epoch 5/100

Epoch 00005: val_loss improved from 1.37920 to 1.33106, saving model to model
Epoch 6/100

Epoch 00006: val_loss improved from 1.33106 to 1.30775, saving model to model
Epoch 7/100

Process ForkPoolWorker-20:
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/utils/data_utils.py", line 626, in next_sample
    return six.next(_SHARED_SEQUENCES[uid])
  File "<ipython-input-27-ad8ccf95b951>", line 30, in image_generator_xd
    yield partial_get_input(chunk)
  File "<ipython-input-37-ad8ccf95b951>", line 18, in get_input
    x[i, :, :, :] = draw_cv2(raw_strokes, size=size, lw=lw)
  File "<ipython-input-37-ad8ccf95b951>", line 7, in draw_cv2
    (stroke[0][i + 1], stro

KeyboardInterrupt: 

In [None]:
# Same training call as the previous cell, without use_multiprocessing.
# Training continues from the model's current weights; the History is appended
# to the existing hists list once the call returns.
hist = model.fit_generator(
    train_datagen, steps_per_epoch=STEPS, epochs=EPOCHS, verbose=1,
    validation_data=(x_valid, y_valid),
    callbacks = callbacks
)
hists.append(hist)

Epoch 1/1000

Epoch 00001: val_loss did not improve from 0.90425
Epoch 2/1000

Epoch 00002: val_loss improved from 0.90425 to 0.89953, saving model to model
Epoch 3/1000

Epoch 00003: val_loss did not improve from 0.89953
Epoch 4/1000

Epoch 00004: val_loss improved from 0.89953 to 0.89449, saving model to model
Epoch 5/1000

Epoch 00005: val_loss did not improve from 0.89449
Epoch 6/1000

Epoch 00006: val_loss improved from 0.89449 to 0.89150, saving model to model
Epoch 7/1000

Epoch 00007: val_loss did not improve from 0.89150
Epoch 8/1000

Epoch 00008: val_loss improved from 0.89150 to 0.89103, saving model to model
Epoch 9/1000

Epoch 00009: val_loss improved from 0.89103 to 0.88895, saving model to model
Epoch 10/1000

Epoch 00010: val_loss improved from 0.88895 to 0.87750, saving model to model
Epoch 11/1000

Epoch 00011: val_loss did not improve from 0.87750
Epoch 12/1000

Epoch 00012: val_loss improved from 0.87750 to 0.87532, saving model to model
Epoch 13/1000

Epoch 00013: 


Epoch 00023: val_loss did not improve from 0.86408
Epoch 24/1000

Epoch 00024: val_loss improved from 0.86408 to 0.86184, saving model to model
Epoch 25/1000

Epoch 00025: val_loss did not improve from 0.86184
Epoch 26/1000

Epoch 00026: val_loss did not improve from 0.86184
Epoch 27/1000

Epoch 00027: val_loss improved from 0.86184 to 0.86049, saving model to model
Epoch 28/1000

Epoch 00028: val_loss did not improve from 0.86049
Epoch 29/1000

Epoch 00029: val_loss did not improve from 0.86049
Epoch 30/1000

Epoch 00030: val_loss did not improve from 0.86049
Epoch 31/1000

Epoch 00031: val_loss improved from 0.86049 to 0.85746, saving model to model
Epoch 32/1000

Epoch 00032: val_loss improved from 0.85746 to 0.85490, saving model to model
Epoch 33/1000

Epoch 00033: val_loss did not improve from 0.85490
Epoch 34/1000

Epoch 00034: val_loss did not improve from 0.85490
Epoch 35/1000

Epoch 00035: val_loss improved from 0.85490 to 0.85340, saving model to model
Epoch 36/1000

Epoch 


Epoch 00045: val_loss improved from 0.84665 to 0.84579, saving model to model
Epoch 46/1000

Epoch 00046: val_loss improved from 0.84579 to 0.84460, saving model to model
Epoch 47/1000

Epoch 00047: val_loss did not improve from 0.84460
Epoch 48/1000

Epoch 00048: val_loss improved from 0.84460 to 0.84122, saving model to model
Epoch 49/1000
 264/7812 [>.............................] - ETA: 10:26 - loss: 0.8835 - categorical_crossentropy: 0.8835 - categorical_accuracy: 0.7783 - top_3_accuracy: 0.9094

In [None]:
# Stitch the histories of all completed fit calls into one frame and renumber
# the rows 1..N so the x axis reads as a cumulative epoch count.
hist_df = pd.concat([pd.DataFrame(hist.history) for hist in hists], sort=True)
hist_df.index = np.arange(1, len(hist_df)+1)
# Two stacked panels sharing the epoch axis: accuracy on top, cross-entropy below.
fig, axs = plt.subplots(nrows=2, sharex=True, figsize=(16, 10))
axs[0].plot(hist_df.val_categorical_accuracy, lw=5, label='Validation Accuracy')
axs[0].plot(hist_df.categorical_accuracy, lw=5, label='Training Accuracy')
axs[0].set_ylabel('Accuracy')
axs[0].set_xlabel('Epoch')
axs[0].grid()
axs[0].legend(loc=0)
axs[1].plot(hist_df.val_categorical_crossentropy, lw=5, label='Validation MLogLoss')
axs[1].plot(hist_df.categorical_crossentropy, lw=5, label='Training MLogLoss')
axs[1].set_ylabel('MLogLoss')
axs[1].set_xlabel('Epoch')
axs[1].grid()
axs[1].legend(loc=0)
# Persist the figure next to the notebook before showing it.
fig.savefig('hist.png', dpi=300)
plt.show();

In [None]:
# Score the hold-out set: class scores -> top-3 class ids -> MAP@3 against the true ids.
valid_predictions = model.predict(x_valid, batch_size=128, verbose=1)
valid_top3 = preds2catids(valid_predictions).values
map3 = mapk(valid_df[['y']].values, valid_top3)
print('Map3: {:.3f}'.format(map3))

In [None]:
# Per-drawing AP@3, in the same row order as valid_df.
true_ids = valid_df[['y']].values
top3_ids = preds2catids(valid_predictions).values
scores = [apk(truth, ranked) for truth, ranked in zip(true_ids, top3_ids)]

In [None]:
# Attach the per-drawing AP@3 values to the validation frame (same row order).
valid_df['scores'] = scores

In [None]:
# Distribution of the `recognized` flag among drawings whose AP@3 is exactly 1.
valid_df.loc[valid_df['scores'] == 1, 'recognized'].value_counts()

In [None]:
# Store each rendered validation image (one row of x_valid) alongside its metadata.
valid_df['images'] = list(x_valid)

In [None]:
# Quick look at the first rows, now including the `scores` and `images` columns.
valid_df.head()



In [None]:
# Pick the rendered image of the third validation row for inspection.
img = valid_df.iloc[2]['images']

In [None]:
# Display the raw array values of the selected image.
img

In [None]:
# `img` is a preprocessed float RGB array; imshow expects float RGB in [0, 1]
# and clips anything outside it. Presumably preprocess_input leaves values in
# [-1, 1] (the batch-preview cell rescales on the same assumption), so map
# that range onto [0, 1] before plotting.
plt.imshow((img + 1) / 2)

In [None]:
# Predict the test set in chunks of 1024 rows and collect one score vector per drawing.
test_predictions = []
for chunk in pd.read_csv(os.path.join(INPUT_DIR, 'test_simplified.csv'), chunksize=1024):
    x_test = df_to_image_array_xd(chunk, size)
    # Kept under its own name: `predictions` is the model's output tensor from
    # the model-definition cell and was being overwritten with a list here.
    chunk_predictions = list(model.predict(x_test, batch_size=128, verbose=1))
    test_predictions.extend(chunk_predictions)


In [None]:

# Indices of the three best-scoring classes for every test drawing.
top3 = preds2catids(np.array(test_predictions))
top3.head()
top3.shape

# Translate class ids to category names (spaces replaced by underscores); ids
# are positions in the case-insensitively sorted category list.
cats = list_all_categories()
id2cat = dict(enumerate(cat.replace(' ', '_') for cat in cats))
top3cats = top3.replace(id2cat)
top3cats.head()
top3cats.shape

In [None]:
# No visible cell defines `test`, so this cell raised NameError on a fresh
# kernel. Load the same file the prediction loop iterated over so the rows
# line up with top3cats.
test = pd.read_csv(os.path.join(INPUT_DIR, 'test_simplified.csv'))
assert len(test) == len(top3cats), 'test rows and predictions are out of sync'
# Three space-separated guesses per drawing, best first.
test['word'] = top3cats['a'] + ' ' + top3cats['b'] + ' ' + top3cats['c']
submission = test[['key_id', 'word']]
# NOTE(review): '{2}' is written literally into the file name (there is no
# .format call) — confirm that this is the intended name.
submission.to_csv('gs_mn_submission_{2}.csv', index=False)
submission.head()
submission.shape

In [None]:
end = dt.datetime.now()
# timedelta.seconds is only the sub-day remainder (0..86399) and silently
# drops whole days; total_seconds() covers the full span. int() keeps the
# printed value an integer as before.
print('Latest run {}.\nTotal time {}s'.format(end, int((end - start).total_seconds())))