In [67]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import os
import ast
import datetime as dt
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [16, 10]
plt.rcParams['font.size'] = 14
import seaborn as sns
import cv2
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dense, Dropout, Flatten, Activation, BatchNormalization
from keras.metrics import categorical_accuracy, top_k_categorical_accuracy, categorical_crossentropy
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.optimizers import Adam, SGD
from keras.applications import NASNetMobile, MobileNetV2, InceptionResNetV2, ResNet50, MobileNet, VGG19
from keras.applications.mobilenet import preprocess_input
from keras.models import load_model
from keras.models import Model
from functools import partial
from multiprocessing import Pool
from super_convergence.clr import OneCycleLR
from os import listdir
from keras.utils import to_categorical

from PIL import Image
import pickle
import time
start = dt.datetime.now()

In [5]:
DP_DIR = ''
INPUT_DIR = ''

BASE_SIZE = 256
NCSVS = 100
NCATS = 340
np.random.seed(seed=1987)
tf.set_random_seed(seed=1987)

def f2cat(filename: str) -> str:
    return filename.split('.')[0]

def list_all_categories():
    files = os.listdir(os.path.join(INPUT_DIR, '../../data/train_simplified'))
    return sorted([f2cat(f) for f in files], key=str.lower)

In [8]:
def apk(actual, predicted, k=3):
    """
    Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
    """
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """
    Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
    """
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

def preds2catids(predictions):
    return pd.DataFrame(np.argsort(-predictions, axis=1)[:, :3], columns=['a', 'b', 'c'])

def top_3_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=3)

In [51]:
batchsize = 512
NUM_SAMPLES = 50000
STEPS = NUM_SAMPLES//batchsize 
EPOCHS = 1000
size = 128



In [9]:
base_model =MobileNet(input_shape=(size, size, 1), alpha=1., weights=None, classes=NCATS)

# This is the model we will train
model = base_model

model.compile(optimizer=Adam(lr=0.00085), loss='categorical_crossentropy',
              metrics=[categorical_crossentropy, categorical_accuracy, top_3_accuracy])

callbacks_list = [keras.callbacks.EarlyStopping(monitor='val_acc', patience=3, verbose=1)]
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 128, 128, 1)       0         
_________________________________________________________________
conv1_pad (ZeroPadding2D)    (None, 129, 129, 1)       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 64, 64, 32)        288       
_________________________________________________________________
conv1_bn (BatchNormalization (None, 64, 64, 32)        128       
_________________________________________________________________
conv1_relu (ReLU)            (None, 64, 64, 32)        0         
_________________________________________________________________
conv_dw_1 (DepthwiseConv2D)  (None, 64, 64, 32)        288       
_________________________________________________________________
conv_dw_1_bn (BatchNormaliza (None, 64, 64, 32)        128       
__________

In [68]:
def draw_cv2(raw_strokes, size, lw):
    img = np.zeros((BASE_SIZE, BASE_SIZE), np.uint8)
    for t, stroke in enumerate(raw_strokes):
        for i in range(len(stroke[0]) - 1):
            # colors = (255, 255, 255)
            # #color = 255
            # color = 255 - min(t, 10) * 13
            # colors = [0]*3
            # colors[0] = 255
            # colors[min(i,2)] = color
            # colors =  (255, 255, 255)
            _ = cv2.line(img, (stroke[0][i], stroke[1][i]),
                         (stroke[0][i + 1], stroke[1][i + 1]), 255, lw)
    if size != BASE_SIZE:
        img = cv2.resize(img, (size, size))
        return img
    else:
        return img


def image_generator_xd(size, batchsize, ks, lw):
    while True:
        for k in np.random.permutation(ks):
            filename = "/home/ubuntu/data/" + 'train_k{}.csv.gz'.format(k)
            for chunk in pd.read_csv(filename, chunksize=batchsize):
                x = df_to_image_array_xd(chunk, size, lw)
                y = to_categorical(chunk.y, num_classes=NCATS)
                yield x, y


def df_to_image_array_xd(df, size, lw):
    df['drawing'] = df['drawing'].apply(ast.literal_eval)
    x = np.zeros((len(df), size, size, 1))
    for i, raw_strokes in enumerate(df.drawing.values):
        x[i, :, :, 0] = draw_cv2(raw_strokes, size=size, lw=lw)
    x = preprocess_input(x).astype(np.float32)
    return x



In [26]:
valid_df = pd.read_csv(os.path.join(DP_DIR, '../../data/train_k{}.csv.gz'.format(NCSVS - 1)), nrows=100)
x_valid = df_to_image_array_xd(valid_df, size)
y_valid = keras.utils.to_categorical(valid_df.y, num_classes=NCATS)
print(x_valid.shape, y_valid.shape)
print('Validation array memory {:.2f} GB'.format(x_valid.nbytes / 1024.**3 ))

ValueError: could not broadcast input array from shape (128,128) into shape (128,128,1)

In [79]:
model.load_weights("model")

In [17]:
def fast_image_generator(batchsize):
    while True:
        indir = "../../data/numpy_format/"
        files = listdir(indir)
        np.random.shuffle(files)
        for file in files:
            with open(indir + file, 'rb') as infile:
                x, y = pickle.load(infile)
            for i in range(len(x)//batchsize):
                yield x[i*batchsize:(i+1)*batchsize], y[i*batchsize:(i+1)*batchsize]

In [70]:
train_datagen = image_generator_xd(size=size, batchsize=batchsize, ks=range(NCSVS - 1), lw=6)

In [None]:
x, y = next(train_datagen)
print(x.shape)
n = 2
fig, axs = plt.subplots(nrows=n, ncols=n, sharex=True, sharey=True, figsize=(6, 6))
for i in range(n**2):
    ax = axs[i // n, i % n]
    (-x[i]+1)/2
    ax.imshow((-x[i, :, :, 0] + 1)/2)
    ax.axis('off')
plt.tight_layout()
fig.savefig('gs.png', dpi=300)
plt.show();

In [18]:
callbacks = [
#     _fModelCheckpoint(filepath='model', verbose=1, save_best_only=True),
#     ReduceLROnPlateau(monitor='val_loss', factor=0.2,
#                               patience=5, min_lr=0.001)
]

In [65]:
gen = fast_image_generator(batchsize=batchsize)

In [47]:
a=  next(gen)

In [None]:
hist = model.fit_generator(
    train_datagen, steps_per_epoch=STEPS, epochs=EPOCHS, verbose=1,
    callbacks = callbacks, use_multiprocessing=False
)
hists.append(hist)

Epoch 1/1000
Epoch 2/1000




Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000

In [None]:
hist_df = pd.concat([pd.DataFrame(hist.history) for hist in hists], sort=True)
hist_df.index = np.arange(1, len(hist_df)+1)
fig, axs = plt.subplots(nrows=2, sharex=True, figsize=(16, 10))
axs[0].plot(hist_df.val_categorical_accuracy, lw=5, label='Validation Accuracy')
axs[0].plot(hist_df.categorical_accuracy, lw=5, label='Training Accuracy')
axs[0].set_ylabel('Accuracy')
axs[0].set_xlabel('Epoch')
axs[0].grid()
axs[0].legend(loc=0)
axs[1].plot(hist_df.val_categorical_crossentropy, lw=5, label='Validation MLogLoss')
axs[1].plot(hist_df.categorical_crossentropy, lw=5, label='Training MLogLoss')
axs[1].set_ylabel('MLogLoss')
axs[1].set_xlabel('Epoch')
axs[1].grid()
axs[1].legend(loc=0)
fig.savefig('hist.png', dpi=300)
plt.show();

In [82]:
valid_predictions = model.predict(x_valid, batch_size=128, verbose=1)
map3 = mapk(valid_df[['y']].values, preds2catids(valid_predictions).values)
print('Map3: {:.3f}'.format(map3))

Map3: 0.780


In [None]:
scores = [apk(actual, predicted) for actual,predicted in zip(valid_df[['y']].values, preds2catids(valid_predictions).values)]

In [None]:
valid_df['scores'] = scores

In [None]:
valid_df[valid_df['scores']==1]['recognized'].value_counts()

In [None]:
valid_df['images'] = list(x_valid)

In [None]:
valid_df.head()



In [None]:
img = valid_df.iloc[2]['images']

In [None]:
img

In [None]:
plt.imshow(img)

In [None]:
test_predictions = [] 
for chunk in pd.read_csv(os.path.join(INPUT_DIR, 'test_simplified.csv'), chunksize=1024):
    x_test = df_to_image_array_xd(chunk, size)
    predictions = list(model.predict(x_test, batch_size=128, verbose=1))
    test_predictions.extend(predictions)



top3 = preds2catids(np.array(test_predictions))
top3.head()
top3.shape

cats = list_all_categories()
id2cat = {k: cat.replace(' ', '_') for k, cat in enumerate(cats)}
top3cats = top3.replace(id2cat)
top3cats.head()
top3cats.shape

test['word'] = top3cats['a'] + ' ' + top3cats['b'] + ' ' + top3cats['c']
submission = test[['key_id', 'word']]
submission.to_csv('gs_mn_submission_{2}.csv', index=False)
submission.head()
submission.shape

In [None]:
end = dt.datetime.now()
print('Latest run {}.\nTotal time {}s'.format(end, (end - start).seconds))