In [1]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import os
import ast
import datetime as dt
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [16, 10]
plt.rcParams['font.size'] = 14
import seaborn as sns
import cv2
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.metrics import categorical_accuracy, top_k_categorical_accuracy, categorical_crossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.applications import NASNetMobile, MobileNet, InceptionResNetV2, ResNet50
from tensorflow.keras.applications.mobilenet import preprocess_input
from tensorflow.keras.models import load_model
from joblib import Parallel, delayed
from functools import partial
from multiprocessing import Pool
from super_convergence.clr import OneCycleLR

import time
start = dt.datetime.now()

Using TensorFlow backend.


In [2]:
DP_DIR = ''
INPUT_DIR = ''

BASE_SIZE = 256
NCSVS = 100
NCATS = 340
np.random.seed(seed=1987)
tf.set_random_seed(seed=1987)

def f2cat(filename: str) -> str:
    return filename.split('.')[0]

def list_all_categories():
    files = os.listdir(os.path.join(INPUT_DIR, '../../data/train_simplified'))
    return sorted([f2cat(f) for f in files], key=str.lower)

In [3]:
def apk(actual, predicted, k=3):
    """
    Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
    """
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """
    Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
    """
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

def preds2catids(predictions):
    return pd.DataFrame(np.argsort(-predictions, axis=1)[:, :3], columns=['a', 'b', 'c'])

def top_3_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=3)

In [4]:

batchsize = 64
NUM_SAMPLES = 50000 
STEPS = NUM_SAMPLES//batchsize 
EPOCHS = 20
size = 256



In [5]:
def draw_cv2(raw_strokes, size=256, lw=6, time_color=True):
    img = np.zeros((BASE_SIZE, BASE_SIZE), np.uint8)
    for t, stroke in enumerate(raw_strokes):
        for i in range(len(stroke[0]) - 1):
            color = 255 - min(t, 10) * 13 if time_color else 255
            _ = cv2.line(img, (stroke[0][i], stroke[1][i]),
                         (stroke[0][i + 1], stroke[1][i + 1]), color, lw)
    if size != BASE_SIZE:
        return cv2.resize(img, (size, size))
    else:
        return img

def get_input(df, size, lw, time_color):
    df['drawing'] = df['drawing'].apply(ast.literal_eval)
    x = np.zeros((len(df), size, size, 1))
    for i, raw_strokes in enumerate(df.drawing.values):
        x[i, :, :, 0] = draw_cv2(raw_strokes, size=size, lw=lw,
                                 time_color=time_color)
    x = preprocess_input(x).astype(np.float32)
    y = keras.utils.to_categorical(df.y, num_classes=NCATS)
    return x, y

    
def image_generator_xd(size, batchsize, ks, lw=6, time_color=True):
    partial_get_input = partial(get_input, size = size, lw=lw, time_color=time_color)
    while True:
        for k in np.random.permutation(ks):
            filename = os.path.join(DP_DIR, 'train_k{}.csv.gz'.format(k))
            for chunk in pd.read_csv(filename, chunksize=batchsize):
                yield partial_get_input(chunk)
# #             p = Pool(1)
#             batches = map(partial_get_input, chunks)
#             #batches = Parallel(n_jobs=-1)(delayed(partial_get_input)(chunk) for chunk in chunks)
#             for batch in batches:
#                 yield batch 
                
def df_to_image_array_xd(df, size, lw=6, time_color=True):
    df['drawing'] = df['drawing'].apply(ast.literal_eval)
    x = np.zeros((len(df), size, size, 1))
    for i, raw_strokes in enumerate(df.drawing.values):
        x[i, :, :, 0] = draw_cv2(raw_strokes, size=size, lw=lw, time_color=time_color)
    x = preprocess_input(x).astype(np.float32)
    return x



In [None]:
train_datagen = image_generator_xd(size=size, batchsize=batchsize, ks=range(NCSVS - 1))

In [18]:
all_predictions = []  
for size, model_file in [(64, "saved_models/model_v1_32459"), 
                         (128, "saved_models/model_v5_increased_image_size_to_128"), 
                         (256, "saved_models/model_v5_increased_image_size_1_percent_increase_val")]:
    model = MobileNet(input_shape=(size, size, 1), alpha=1., weights=None, classes=NCATS)
    model.load_weights(model_file)
    test_predictions = [] 
    for chunk in pd.read_csv(os.path.join(INPUT_DIR, 'test_simplified.csv'), chunksize=128):
        x_test = df_to_image_array_xd(chunk, size)
        predictions = list(model.predict(x_test, batch_size=128, verbose=1))
        test_predictions.extend(predictions)
    all_predictions.append(test_predictions)







































In [19]:
average_predictions = [np.mean([all_predictions[i][0], all_predictions[i][1], all_predictions[i][2]]) for i in range(len(all_predictions))]

In [20]:

top3 = preds2catids(np.array(test_predictions))
top3.head()
top3.shape

cats = list_all_categories()
id2cat = {k: cat.replace(' ', '_') for k, cat in enumerate(cats)}
top3cats = top3.replace(id2cat)
top3cats.head()
top3cats.shape

Unnamed: 0,a,b,c
0,234,281,266
1,144,36,125
2,305,62,309
3,187,304,303
4,113,56,165


(112199, 3)

Unnamed: 0,a,b,c
0,radio,stereo,snorkel
1,hockey_puck,bottlecap,frying_pan
2,The_Great_Wall_of_China,castle,toe
3,mountain,The_Eiffel_Tower,tent
4,fireplace,campfire,leaf


(112199, 3)

In [21]:
test = pd.read_csv("test_simplified.csv")

In [22]:
test['word'] = top3cats['a'] + ' ' + top3cats['b'] + ' ' + top3cats['c']
submission = test[['key_id', 'word']]
submission.to_csv('../../submission_4.csv', index=False)
submission.head()
submission.shape

Unnamed: 0,key_id,word
0,9000003627287624,radio stereo snorkel
1,9000010688666847,hockey_puck bottlecap frying_pan
2,9000023642890129,The_Great_Wall_of_China castle toe
3,9000038588854897,mountain The_Eiffel_Tower tent
4,9000052667981386,fireplace campfire leaf


(112199, 2)

In [None]:
end = dt.datetime.now()
print('Latest run {}.\nTotal time {}s'.format(end, (end - start).seconds))