In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show what is available as input, then what already sits in the working directory.
for listing_dir in ("../input", './'):
    print(os.listdir(listing_dir))

# Any results you write to the current directory are saved as output.

In [None]:
import ast
import cv2
from keras.utils import to_categorical
from tqdm import tqdm
import gc

# Directory that is scanned for the per-class drawing files.
train_simplified = os.path.join('../input', 'train_simplified')
# os.listdir() returns entries in arbitrary order. The position of a file in
# this list is used as its numeric class id everywhere below, so sort the
# listing to keep the id <-> class mapping stable across runs and machines.
sample_list = sorted(os.listdir(train_simplified))

class load_img_data(object):
    """Build cached train/validation CSVs and rasterise stroke drawings.

    The class reads the module-level ``sample_list`` (file names, whose list
    position is the class id) and ``train_simplified`` (their directory).

    Attributes:
        size: Side length, in pixels, of the square images that are produced.
        train_path: CSV file written by ``trainset`` and read by ``traingen``.
        test_path: CSV file written by ``testset``.
        train_rows: Number of leading data rows taken per class for training.
        valid_rows: Number of data rows taken per class for validation; they
            are read immediately after the ``train_rows`` training rows.
    """

    def __init__(self):
        self.size = 64
        self.train_path = os.path.join('train_df.csv')
        self.test_path = os.path.join('test_df.csv')
        self.train_rows = 30000
        self.valid_rows = 100

    def draw_line(self, raw_stroke, lw=6, time_color=True):
        """Rasterise one drawing into a ``size`` x ``size`` uint8 image.

        Args:
            raw_stroke: Sequence of strokes; each stroke is indexed as
                ``stroke[0]`` (x coordinates) and ``stroke[1]`` (y coordinates).
            lw: Line width passed to ``cv2.line``.
            time_color: When true, stroke ``t`` is drawn with intensity
                ``255 - 13 * min(t, 10)`` so later strokes are darker;
                otherwise every stroke is drawn at 255.

        Returns:
            The 256x256 canvas resized to ``(self.size, self.size)``.
        """
        # NOTE(review): the canvas is 256x256, so coordinates are presumably
        # expected to lie in 0..255 -- confirm against the input data.
        img = np.zeros((256, 256), np.uint8)
        for t, stroke in enumerate(raw_stroke):
            # The intensity depends only on the stroke index, not on the segment.
            color = 255 - min(t, 10) * 13 if time_color else 255
            xs, ys = stroke[0], stroke[1]
            # A stroke with a single point has no segment and draws nothing.
            for i in range(len(xs) - 1):
                cv2.line(img, (xs[i], ys[i]), (xs[i + 1], ys[i + 1]), color, lw)
        return cv2.resize(img, (self.size, self.size))

    def _build_split(self, out_path, skip_rows, n_rows):
        """Write a shuffled CSV holding ``n_rows`` rows of every class file.

        Args:
            out_path: Destination CSV path.
            skip_rows: Number of leading data rows to skip in each class file.
            n_rows: Maximum number of data rows to read from each class file.

        Raises:
            ValueError: If ``sample_list`` is empty.
        """
        # Line 0 of each file is the header, so data row k sits on line k + 1;
        # skipping lines 1..skip_rows drops exactly the first skip_rows rows.
        skip = range(1, skip_rows + 1) if skip_rows else None
        frames = []
        for class_id, sample_name in enumerate(tqdm(sample_list)):
            path = os.path.join(train_simplified, sample_name)
            df = pd.read_csv(path, skiprows=skip, header=0, nrows=n_rows)
            df['y'] = class_id
            frames.append(df)
        if not frames:
            raise ValueError('sample_list is empty: no class files to read')
        # Concatenate once instead of appending inside the loop (quadratic, and
        # DataFrame.append no longer exists in pandas 2.x).
        all_df = pd.concat(frames).sample(frac=1, random_state=0)
        all_df.to_csv(out_path)

    def trainset(self):
        """Write the shuffled training CSV to ``self.train_path``.

        Takes the first ``self.train_rows`` data rows of every class file and
        labels them with the file's position in ``sample_list`` (column ``y``).
        """
        self._build_split(self.train_path, 0, self.train_rows)

    def testset(self):
        """Write the shuffled validation CSV to ``self.test_path``.

        Takes ``self.valid_rows`` data rows of every class file, starting right
        after the ``self.train_rows`` rows used by ``trainset`` so that the two
        splits never share a row.
        """
        self._build_split(self.test_path, self.train_rows, self.valid_rows)

    def traingen(self, batch_size=680):
        """Yield ``(images, one_hot_labels)`` batches forever.

        Args:
            batch_size: Number of CSV rows read per yielded batch.

        Yields:
            Tuples ``(x, y)`` where ``x`` comes from ``df_to_image_data`` and
            ``y`` is ``to_categorical`` of the ``y`` column with
            ``len(sample_list)`` classes.
        """
        while True:
            # (Re)create the cached training CSV whenever it is missing.
            if not os.path.exists(self.train_path):
                self.trainset()
            for df in pd.read_csv(self.train_path, chunksize=batch_size):
                x = self.df_to_image_data(df, lw=6)
                y = to_categorical(df.y, num_classes=len(sample_list))
                yield x, y

    def df_to_image_data(self, df, lw=6):
        """Convert the ``drawing`` column of ``df`` into an image array.

        The input frame is left untouched, so the same frame can be converted
        more than once. Entries that are strings are parsed with
        ``ast.literal_eval``; entries that are already parsed are used as-is.

        Args:
            df: Frame with a ``drawing`` column.
            lw: Line width forwarded to ``draw_line``.

        Returns:
            Array of shape ``(len(df), self.size, self.size, 1)``.
        """
        drawings = [ast.literal_eval(d) if isinstance(d, str) else d
                    for d in df.drawing]
        x = np.zeros((len(drawings), self.size, self.size, 1))
        for i, raw_stroke in enumerate(drawings):
            x[i, :, :, 0] = self.draw_line(raw_stroke, lw=lw)
        return x


In [None]:
# Write the validation CSV (test_df.csv) that the next cell loads.
validation_builder = load_img_data()
validation_builder.testset()

In [None]:
# Load the cached validation rows and turn them into model-ready arrays.
testset = pd.read_csv('test_df.csv')

validation_loader = load_img_data()
x_vaild = validation_loader.df_to_image_data(testset, lw=6)
y_vaild = to_categorical(testset.y, num_classes=len(sample_list))

# Report the image-array shape first, then the one-hot label shape.
for validation_array in (x_vaild, y_vaild):
    print(validation_array.shape)

In [None]:
import matplotlib.pyplot as plt

# Preview one validation drawing: negate and halve the pixel values for
# display, and print the class id encoded in its one-hot label.
sample_idx = 32
img = -x_vaild[sample_idx, :, :, 0] / 2
label = y_vaild[sample_idx, :]
print(label.argmax(axis=0))

plt.imshow(img, cmap=plt.cm.gray)
plt.axis('off')
plt.show()

In [None]:
from keras import metrics
from keras.applications.mobilenet import MobileNet
from keras.applications.mobilenet import preprocess_input
from sklearn.model_selection import train_test_split
from keras import optimizers

def top_3_accuracy(y_true, y_pred):
    """Return ``metrics.top_k_categorical_accuracy`` of the inputs with k=3."""
    top_k = 3
    score = metrics.top_k_categorical_accuracy(y_true, y_pred, k=top_k)
    return score
    
# Size the classifier head from the class list so it always matches the width
# of the one-hot labels, which are built with num_classes=len(sample_list).
num_classes = len(sample_list)

# MobileNet trained from scratch (weights=None) on 64x64 single-channel images.
model = MobileNet(input_shape=(64, 64, 1), include_top=True, weights=None,
                  classes=num_classes)
model.compile(optimizer=optimizers.Adam(lr=0.002), loss='categorical_crossentropy',
              metrics=[metrics.categorical_accuracy, top_3_accuracy])

# The generator loops over the cached training CSV forever, so the length of
# an epoch is set by steps_per_epoch rather than by the size of the data.
traingen = load_img_data().traingen()
history = model.fit_generator(traingen, steps_per_epoch=800, epochs=70, verbose=1,
                              validation_data=(x_vaild, y_vaild))

In [None]:
def preds2catids(predictions):
    """Return the three highest-scoring column indices of every row.

    Args:
        predictions: 2-D array of scores, one row per sample.

    Returns:
        DataFrame with columns ``a``, ``b``, ``c`` holding the indices of the
        best, second-best and third-best score of each row.
    """
    # Sorting the negated scores ascending ranks the columns best-first.
    ranked = np.argsort(-predictions, axis=1)
    top_three = ranked[:, :3]
    return pd.DataFrame(top_three, columns=['a', 'b', 'c'])

In [None]:
test_file = os.path.join('../input', 'test_simplified.csv')

# One loader is enough for every chunk.
test_loader = load_img_data()
top3_chunks = []

# Predict the test file chunk by chunk and keep the top-3 class ids of each row.
for df in pd.read_csv(test_file, chunksize=1000):
    test_imgs = test_loader.df_to_image_data(df, lw=6)
    test_predictions = model.predict(test_imgs, batch_size=128, verbose=1)
    top3 = preds2catids(test_predictions)
    top3_chunks.append(top3)

# Concatenate once at the end: appending inside the loop copies the whole
# frame on every iteration, and DataFrame.append no longer exists in pandas 2.x.
all_top3 = pd.concat(top3_chunks, ignore_index=True) if top3_chunks else pd.DataFrame()

In [None]:
# Number of rows for which top-3 predictions were collected.
all_top3.shape[0]

In [None]:
# Map each class id (position in sample_list) to its submission label: the
# file name without '.csv', with inner spaces turned into underscores.
to_class = {
    class_id: file_name.replace('.csv', ' ').strip().replace(' ', '_')
    for class_id, file_name in enumerate(sample_list)
}

print(to_class)

In [None]:
# Swap every numeric class id in the frame for its label from to_class.
all_top3 = all_top3.replace(to_replace=to_class)
all_top3.head()

In [None]:
# Join the three predicted labels of each row into one space-separated string.
word = all_top3['a'] + ' ' + all_top3['b'] + ' ' + all_top3['c']

# Read only the id column in one pass; growing a Series chunk by chunk is
# quadratic and Series.append no longer exists in pandas 2.x.
key_id = pd.read_csv(test_file, usecols=['key_id'])['key_id']

# Both series are aligned on a 0..n-1 index, so a length mismatch would
# silently produce missing values in the submission.
assert len(key_id) == len(word), 'number of predictions does not match number of test rows'

submission = pd.DataFrame({'key_id': key_id,
                           'word': word})

# Drop the intermediate CSV caches; skip any that are already gone so that
# re-running this cell does not fail.
for cache_file in ('train_df.csv', 'test_df.csv'):
    if os.path.exists(cache_file):
        os.remove(cache_file)

print(submission.head())
submission.to_csv('submission.csv', index=False)
print(os.listdir('./'))