In [1]:
# environment: Paperspace Quadro P4000 GPU, as dataset size is small 
import numpy as np 
import pandas as pd 
import os 
import tensorflow as tf
import tensorflow.keras # run pip install keras==2.3 beforehand for compatability 
from tensorflow.keras import Input, Model 
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Conv2D, Dropout, AlphaDropout, MaxPooling2D, AveragePooling2D, BatchNormalization, Concatenate, Flatten, Reshape, Add, Activation
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from keras.utils.np_utils import to_categorical
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import skimage
from skimage.transform import rotate
from skimage.util import random_noise
from skimage.transform import warp, AffineTransform
import cv2 # opencv, when error occurs due to libGL, you may refer to https://github.com/conda-forge/pygridgen-feedstock/issues/10
import random 
from sklearn.model_selection import KFold


Using TensorFlow backend.


# Read in file and preprocess data

In [2]:
# Input CSV locations, relative to the notebook's working directory.
train_path, test_path, submission_path = (
    './storage/modified_mnist_dataset/train.csv',
    './storage/modified_mnist_dataset/test.csv',
    './storage/modified_mnist_dataset/submission.csv',
)

In [3]:
# Load the train, test and submission CSV files into DataFrames (in that order).
train, test, submission = (
    pd.read_csv(csv_file)
    for csv_file in (train_path, test_path, submission_path)
)

In [4]:
# Re-assign the digit/letter label columns as pandas Categoricals:
# positions 1 and 2 in `train`, position 1 in `test`.
# NOTE(review): positional `.iloc` assignment may keep the column's existing
# dtype depending on the pandas version -- confirm the columns really end up categorical.
for frame, label_positions in ((train, (1, 2)), (test, (1,))):
    for pos in label_positions:
        frame.iloc[:, pos] = pd.Categorical(frame.iloc[:, pos])

In [5]:
# Build model inputs and targets from the raw frames.
# Train columns by position: 1 = digit label, 2 = letter, 3+ = pixel values.
# Pixels are reshaped to (N, 28, 28, 1) float32 images and scaled by 1/255.
x_train = train.iloc[:, 3:].values.reshape(-1, 28, 28, 1).astype(np.float32) / 255.0
digit_labels = np.asarray(train.iloc[:, 1].values)
y_train = to_categorical(digit_labels, num_classes = 10)  # one-hot digit targets
train_letters = train.iloc[:, 2].values

# Test columns by position: 1 = letter, 2+ = pixel values (no digit label).
x_test = test.iloc[:, 2:].values.reshape(-1, 28, 28, 1).astype(np.float32) / 255.0
test_letters = test.iloc[:, 1].values

In [6]:
# Map each letter to its offset from "A", then one-hot encode over 26 classes.
letter_origin = ord("A")
train_letters_numeric = np.asarray([ord(ch) - letter_origin for ch in train_letters])
test_letters_numeric = np.asarray([ord(ch) - letter_origin for ch in test_letters])

train_letters_numeric = to_categorical(train_letters_numeric, num_classes = 26)
test_letters_numeric = to_categorical(test_letters_numeric, num_classes = 26)

# Display the resulting shapes as the cell output.
x_train.shape, y_train.shape, x_test.shape, train_letters_numeric.shape, test_letters_numeric.shape

((2048, 28, 28, 1), (2048, 10), (20480, 28, 28, 1), (2048, 26), (20480, 26))

# Augment Data

For now, we will try augmenting the data with the following methods:
- rotation 
- adding noise  
- adding gaussian blur 
- shifting image 

Please refer to [this notebook](https://github.com/iljimae0418/overlapping-digit-and-letter-mnist/blob/master/Examples%20of%20data%20augmentations.ipynb) for examples.  

In [7]:
# Rotated copy of every training image; each image draws its own positive
# integer angle between 10 and 40 degrees (inclusive).
x_train_rotated = np.asarray(
    [rotate(image, angle = random.randint(10, 40)) for image in x_train]
)

In [8]:
# Clockwise counterpart of the previous cell: same 10-40 degree range,
# but the drawn angle is negated before rotating.
x_train_rotated_2 = np.asarray(
    [rotate(image, angle = -random.randint(10, 40)) for image in x_train]
)

In [9]:
# Noisy copy of every training image, using skimage's random_noise with its default settings.
x_noised = np.asarray([random_noise(image) for image in x_train])

In [10]:
# Gaussian-blurred copy of every training image; the square kernel side is
# drawn per image from {3, 5, 9} and sigma is passed as 0.
blur_kernel_sides = [3, 5, 9]
x_blurred = []
for image in x_train:
    side = random.choice(blur_kernel_sides)
    x_blurred.append(cv2.GaussianBlur(image, (side, side), 0))
# Reshape back to (N, 28, 28, 1) so the result lines up with the other image arrays.
x_blurred = np.asarray(x_blurred).reshape(-1, 28, 28, 1)

In [11]:
# Translated copy of every training image: independent x and y offsets are drawn
# from {-2, -1, 1, 2}; pixels leaving one edge re-enter on the other (mode "wrap").
shift_choices = [-2, -1, 1, 2]
x_shifted = []
for image in x_train:
    offset = (random.choice(shift_choices), random.choice(shift_choices))  # (dx, dy)
    shift = AffineTransform(translation = offset)
    x_shifted.append(warp(image, shift, mode = "wrap"))
x_shifted = np.asarray(x_shifted)

In [12]:
# Append the five augmented copies after the original images, and repeat the
# labels / letter encodings once per copy so the rows stay aligned.
# Caution: this rebinds x_train / y_train / train_letters_numeric from their own
# previous values, so re-running this cell grows the data again.
image_sets = (x_train, x_train_rotated, x_train_rotated_2, x_noised, x_blurred, x_shifted)
n_copies = len(image_sets)
x_train = np.concatenate(image_sets, axis = 0)
y_train = np.concatenate([y_train] * n_copies, axis = 0)
train_letters_numeric = np.concatenate([train_letters_numeric] * n_copies, axis = 0)

x_train.shape, y_train.shape, train_letters_numeric.shape


((12288, 28, 28, 1), (12288, 10), (12288, 26))

# Conduct Training

In [13]:
# On-the-fly augmenting data generator for training.
# It yields transformed batches in place of the originals; it does not enlarge the dataset.
# In this notebook it is only referenced by a disabled fit_generator snippet.
augmentation_options = dict(
    rotation_range = 40,
    width_shift_range = 0.2,
    height_shift_range = 0.2,
    shear_range = 0.2,
    zoom_range = 0.2,
)
train_datagen = ImageDataGenerator(**augmentation_options)

In [13]:
# define models to be used for experimentation
# a basic model; the author's note reports around 74% on a 9:1 train/validation split
# (the note says "validation loss" -- presumably validation accuracy is meant).
def base_cnn():
    """Build and compile a plain stacked CNN for 28x28x1 images.

    Layout: two stages of (Conv 3x3 x2 -> 2x2 max-pool -> batch norm) with 64 and
    128 filters, then Conv 3x3 with 256 filters and a max-pool, followed by
    Flatten -> batch norm -> Dense(512, relu) -> Dense(10, softmax).

    Returns:
        A Keras Model compiled with the Adam optimizer, categorical
        cross-entropy loss and the accuracy metric.
    """
    image_in = Input((28,28,1))
    x = image_in
    for n_filters in (64, 128):
        x = Conv2D(n_filters, 3, activation = 'relu')(x)
        x = Conv2D(n_filters, 3, activation = 'relu')(x)
        x = MaxPooling2D((2,2))(x)
        x = BatchNormalization()(x)
    x = Conv2D(256, 3, activation = 'relu')(x)
    x = MaxPooling2D((2,2))(x)
    x = Flatten()(x)
    x = BatchNormalization()(x)
    x = Dense(512, activation = 'relu')(x)
    probabilities = Dense(10, activation = 'softmax')(x)
    net = Model(inputs = image_in, outputs = probabilities)
    net.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return net

In [14]:
# uses a skip connection and sums the outputs of MaxPooling2D and AveragePooling2D
def conv2d_block(input_layer, n_filters, kernel):
    """Residual-style convolution block that halves the spatial size.

    Conv -> batch norm -> Conv, with the second conv's output added back onto
    the normalized first conv, batch-normalized again, then pooled with both
    2x2 max- and 2x2 average-pooling whose results are summed.

    Args:
        input_layer: Keras tensor to convolve.
        n_filters: number of filters for both convolutions.
        kernel: kernel size passed to both convolutions ('same' padding).

    Returns:
        The Keras tensor produced by adding the two pooled outputs.
    """
    stem = Conv2D(n_filters, kernel, activation = 'relu', padding = 'same')(input_layer)
    stem = BatchNormalization()(stem)
    branch = Conv2D(n_filters, kernel, activation = 'relu', padding = 'same')(stem)
    merged = Add()([stem, branch])
    merged = BatchNormalization()(merged)
    pooled_max = MaxPooling2D((2,2))(merged)
    pooled_avg = AveragePooling2D((2,2))(merged)
    return Add()([pooled_max, pooled_avg])

def dense_block(input_layer, units):
    """Dense layer followed by three residual (batch norm -> Dense -> Add) steps.

    Args:
        input_layer: Keras tensor fed to the first Dense layer.
        units: width of every Dense layer in the block.

    Returns:
        The Keras tensor after the third residual addition.
    """
    running = Dense(units, activation = 'relu')(input_layer)
    for _ in range(3):
        normalized = BatchNormalization()(running)
        residual = Dense(units, activation = 'relu')(normalized)
        running = Add()([running, residual])
    return running
    
# author's note: around 75% on a 9:1 train/validation split
# (the note says "validation loss" -- presumably validation accuracy is meant).
def base_cnn_grade_2():
    """Build and compile an image-only multi-kernel CNN.

    Two stages of parallel conv2d_block branches with kernel sizes 7, 5, 4 and 3
    (64 filters per branch, then 32), each stage concatenated; then Flatten,
    three SELU Dense layers (256, 128, 64; lecun_normal init) and a 10-way softmax.

    Returns:
        A Keras Model compiled with Adam, categorical cross-entropy and accuracy.
    """
    kernel_sizes = (7, 5, 4, 3)
    image_in = Input((28,28,1))
    branches = [conv2d_block(image_in, 64, size) for size in kernel_sizes]
    features = Concatenate()(branches)
    branches = [conv2d_block(features, 32, size) for size in kernel_sizes]
    features = Concatenate()(branches)
    head = Flatten()(features)
    for width in (256, 128, 64):
        head = Dense(width, activation = 'selu', kernel_initializer = 'lecun_normal')(head)
    probabilities = Dense(10, activation = 'softmax')(head)
    net = Model(inputs = image_in, outputs = probabilities)
    net.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return net

# author's note: around 80% on a 9:1 train/validation split
# (the note says "validation loss" -- presumably validation accuracy is meant).
def base_cnn_grade_3():
    """Build and compile a two-input (image + one-hot letter) multi-kernel CNN.

    Same convolutional trunk as base_cnn_grade_2 (kernel sizes 7, 5, 4, 3; 64 then
    32 filters). The flattened features are concatenated with the 26-wide letter
    vector, then passed through SELU Dense layers (512, 256, 128; lecun_normal
    init) and a 10-way softmax.

    Returns:
        A Keras Model taking [image, letter] inputs, compiled with Adam,
        categorical cross-entropy and accuracy.
    """
    kernel_sizes = (7, 5, 4, 3)
    image_in = Input((28,28,1))
    letter_in = Input((26,))
    branches = [conv2d_block(image_in, 64, size) for size in kernel_sizes]
    features = Concatenate()(branches)
    branches = [conv2d_block(features, 32, size) for size in kernel_sizes]
    features = Concatenate()(branches)
    head = Flatten()(features)
    head = Concatenate()([head, letter_in])
    for width in (512, 256, 128):
        head = Dense(width, activation = 'selu', kernel_initializer = 'lecun_normal')(head)
    probabilities = Dense(10, activation = 'softmax')(head)
    net = Model(inputs = [image_in, letter_in], outputs = probabilities)
    net.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return net

# author's note: around 82% on a 9:1 train/validation split
# (the note says "validation loss" -- presumably validation accuracy is meant).
# the most promising model so far, until we come up with a potentially more powerful grade 5 model
def base_cnn_grade_4():
    """Build and compile the two-input model used by the k-fold training cells.

    Two stages of parallel conv2d_block branches with kernel sizes 7, 5 and 3
    (64 filters per branch, then 32), each stage concatenated. The flattened
    features are joined with the 26-wide letter vector, then passed through
    three (Dense relu -> batch norm) pairs of width 512, 256, 128, a single
    Dropout(0.4) and a 10-way softmax.

    Returns:
        A Keras Model taking [image, letter] inputs, compiled with Adam,
        categorical cross-entropy and accuracy.
    """
    kernel_sizes = (7, 5, 3)
    image_in = Input((28,28,1))
    letter_in = Input((26,))
    branches = [conv2d_block(image_in, 64, size) for size in kernel_sizes]
    features = Concatenate()(branches)
    branches = [conv2d_block(features, 32, size) for size in kernel_sizes]
    features = Concatenate()(branches)
    head = Flatten()(features)
    head = Concatenate()([head, letter_in])
    for width in (512, 256, 128):
        head = Dense(width, activation = 'relu')(head)
        head = BatchNormalization()(head)
    # Dropout is applied once, after the last Dense/batch-norm pair.
    head = Dropout(0.4)(head)
    probabilities = Dense(10, activation = 'softmax')(head)
    net = Model(inputs = [image_in, letter_in], outputs = probabilities)
    net.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return net

# baseline model presented by Dacon
# author's note: 74% accuracy for a 9:1 train/validation split
def cnn_dacon():
    """Build and compile the two-input baseline CNN.

    Two stages of (batch norm -> Conv) x2 followed by a 2x2 max-pool: 128 filters
    with kernel sizes 5 and 2, then 256 filters with kernel sizes 2 and 2. The
    flattened features are concatenated with the 26-wide letter vector, then
    batch norm -> Dense(1000, relu) -> batch norm -> Dense(10, softmax).

    Returns:
        A Keras Model taking [image, letter] inputs, compiled with Adam,
        categorical cross-entropy and accuracy.
    """
    image_in = Input((28,28,1))
    letter_in = Input((26,))

    x = image_in
    for n_filters, kernel_sizes in ((128, (5, 2)), (256, (2, 2))):
        for size in kernel_sizes:
            x = BatchNormalization()(x)
            x = Conv2D(n_filters, kernel_size=size, strides=1, padding='same', activation='relu')(x)
        x = MaxPooling2D((2, 2))(x)

    x = Flatten()(x)
    x = Concatenate()([x, letter_in])
    x = BatchNormalization()(x)
    x = Dense(1000, activation='relu')(x)

    x = BatchNormalization()(x)
    probabilities = Dense(10, activation='softmax')(x)
    net = Model(inputs = [image_in, letter_in], outputs = probabilities)
    net.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return net
    


In [22]:
'''
# codeblock to delete files in model directory - for experimentation 
# use with caution! 
files = [os.path.join('./storage/mnist_test/kfold1_sklearn/',x) for x in os.listdir('./storage/mnist_test/kfold1_sklearn/')] 
for file in files:  
    if 'epoch' in file: 
        os.remove(file)
'''

In [16]:
# implement k-fold cv
def k_fold(k,files):
    """Split a sliceable sequence into k contiguous folds, in order.

    Each of the first k-1 folds holds len(files) // k items; the last fold
    takes everything that remains, so it absorbs any remainder.

    Args:
        k: number of folds.
        files: any object supporting len() and slicing (list, numpy array, ...).

    Returns:
        A list of k slices of `files`.
    """
    chunk = len(files) // k
    starts = [index * chunk for index in range(k)]
    # Each fold stops where the next one starts; the final fold is open-ended.
    stops = starts[1:] + [None]
    return [files[lo:hi] for lo, hi in zip(starts, stops)]

# Hand-rolled 5-fold cross-validation over contiguous (unshuffled) folds.
# NOTE(review): x_train holds the original images followed by their augmented
# copies, so a contiguous validation fold can contain images whose augmented
# variants sit in the training folds. This likely inflates some folds'
# validation accuracy -- consider splitting before augmenting.
x_train_folds = k_fold(5, x_train)
y_train_folds = k_fold(5, y_train)
letter_train_folds = k_fold(5,train_letters_numeric)

for t in range(5):
    print("************ Fold {} training ************".format(t+1))
    # Fold t is held out for validation.
    cur_val_x = x_train_folds[t]
    cur_val_y = y_train_folds[t]
    cur_val_letter = letter_train_folds[t]

    # Stack all remaining folds into the training arrays in one vectorized call
    # (instead of appending sample by sample in Python).
    cur_train_x = np.concatenate(x_train_folds[:t] + x_train_folds[t+1:], axis = 0)
    cur_train_y = np.concatenate(y_train_folds[:t] + y_train_folds[t+1:], axis = 0)
    cur_letter = np.concatenate(letter_train_folds[:t] + letter_train_folds[t+1:], axis = 0)

    # Make sure the per-fold checkpoint directory exists before ModelCheckpoint writes into it.
    model_dir = './storage/mnist_test/' + 'kfold' + str(t+1)
    os.makedirs(model_dir, exist_ok = True)
    model_path = model_dir + '/epoch_{epoch:03d}_val_{val_loss:.3f}_acc_{val_accuracy:.3f}.h5'

    learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy', patience=3, verbose=1, factor=0.8)
    checkpoint = ModelCheckpoint(filepath=model_path,monitor='val_accuracy',verbose=1,save_best_only=True)
    early_stopping = EarlyStopping(monitor='val_accuracy',patience=25)
    # Possible alternative to ReduceLROnPlateau; defined but not passed to fit() below.
    annealer = LearningRateScheduler(lambda x: 1e-3 * 0.95 ** x)
    model = base_cnn_grade_4()

    # Disabled alternative: train an image-only model through the augmenting generator.
    # history = model.fit_generator(
    #     train_datagen.flow(cur_train_x, cur_train_y, batch_size = 32, shuffle = True),
    #     epochs = 200,
    #     steps_per_epoch = cur_train_x.shape[0]//32,
    #     verbose = 1,
    #     validation_data = (cur_val_x, cur_val_y),
    #     callbacks=[learning_rate_reduction, checkpoint, early_stopping]
    # )

    history = model.fit([cur_train_x,cur_letter],
                        cur_train_y,
                        batch_size = 32,
                        shuffle = True,
                        validation_data = ([cur_val_x,cur_val_letter],cur_val_y),
                        verbose = 1,
                        epochs = 300,
                        callbacks = [learning_rate_reduction, checkpoint, early_stopping])


************ Fold 1 training ************
Train on 9831 samples, validate on 2457 samples
Epoch 1/300
Epoch 00001: val_accuracy improved from -inf to 0.09768, saving model to ./storage/mnist_test/kfold1/epoch_001_val_3.610_acc_0.098.h5
Epoch 2/300
Epoch 00002: val_accuracy improved from 0.09768 to 0.70981, saving model to ./storage/mnist_test/kfold1/epoch_002_val_0.860_acc_0.710.h5
Epoch 3/300
Epoch 00003: val_accuracy improved from 0.70981 to 0.80667, saving model to ./storage/mnist_test/kfold1/epoch_003_val_0.564_acc_0.807.h5
Epoch 4/300
Epoch 00004: val_accuracy improved from 0.80667 to 0.85429, saving model to ./storage/mnist_test/kfold1/epoch_004_val_0.442_acc_0.854.h5
Epoch 5/300
Epoch 00005: val_accuracy improved from 0.85429 to 0.88523, saving model to ./storage/mnist_test/kfold1/epoch_005_val_0.372_acc_0.885.h5
Epoch 6/300
Epoch 00006: val_accuracy improved from 0.88523 to 0.89255, saving model to ./storage/mnist_test/kfold1/epoch_006_val_0.333_acc_0.893.h5
Epoch 7/300
Epoch 0

In [15]:
# k-fold version 2, using the sklearn utility.
# KFold(shuffle=True) shuffles the sample indices before splitting them into folds
# (it does not shuffle batches), so every fold mixes original and augmented images.
# we can try comparing these two methods
# The splitter gets its own name so it does not rebind the k_fold() helper function.
fold_splitter = KFold(n_splits=5, shuffle=True, random_state=7777)
for fold_no, (train_idx, val_idx) in enumerate(fold_splitter.split(x_train), start = 1):
    # Held-out fold for validation.
    cur_val_x = x_train[val_idx]
    cur_val_y = y_train[val_idx]
    cur_val_letter = train_letters_numeric[val_idx]

    # Remaining samples for training.
    cur_train_x = x_train[train_idx]
    cur_train_y = y_train[train_idx]
    cur_letter = train_letters_numeric[train_idx]

    # Make sure the per-fold checkpoint directory exists before ModelCheckpoint writes into it.
    model_dir = './storage/mnist_test/' + 'kfold' + str(fold_no) + '_sklearn'
    os.makedirs(model_dir, exist_ok = True)
    model_path = model_dir + '/epoch_{epoch:03d}_val_{val_loss:.3f}_acc_{val_accuracy:.3f}.h5'

    learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy', patience=3, verbose=1, factor=0.8)
    checkpoint = ModelCheckpoint(filepath=model_path,monitor='val_accuracy',verbose=1,save_best_only=True)
    early_stopping = EarlyStopping(monitor='val_accuracy',patience=25)
    # Defined but not passed to fit() below.
    annealer = LearningRateScheduler(lambda x: 1e-3 * 0.95 ** x)
    model = base_cnn_grade_4()
    history = model.fit([cur_train_x,cur_letter],
                        cur_train_y,
                        batch_size = 32,
                        shuffle=True,
                        validation_data = ([cur_val_x,cur_val_letter],cur_val_y),
                        verbose = 1,
                        epochs = 300,
                        callbacks = [learning_rate_reduction, checkpoint, early_stopping])

Train on 9830 samples, validate on 2458 samples
Epoch 1/300
Epoch 00001: val_accuracy improved from -inf to 0.12368, saving model to ./storage/mnist_test/kfold1_sklearn/epoch_001_val_3.423_acc_0.124.h5
Epoch 2/300
Epoch 00002: val_accuracy improved from 0.12368 to 0.60334, saving model to ./storage/mnist_test/kfold1_sklearn/epoch_002_val_1.183_acc_0.603.h5
Epoch 3/300
Epoch 00003: val_accuracy improved from 0.60334 to 0.69894, saving model to ./storage/mnist_test/kfold1_sklearn/epoch_003_val_0.859_acc_0.699.h5
Epoch 4/300
Epoch 00004: val_accuracy improved from 0.69894 to 0.75712, saving model to ./storage/mnist_test/kfold1_sklearn/epoch_004_val_0.721_acc_0.757.h5
Epoch 5/300
Epoch 00005: val_accuracy improved from 0.75712 to 0.77339, saving model to ./storage/mnist_test/kfold1_sklearn/epoch_005_val_0.644_acc_0.773.h5
Epoch 6/300
Epoch 00006: val_accuracy did not improve from 0.77339
Epoch 7/300
Epoch 00007: val_accuracy improved from 0.77339 to 0.78234, saving model to ./storage/mnist

### Make prediction for the non-shuffling k-fold method

In [17]:
# One saved checkpoint per fold of the non-shuffling k-fold run
# (presumably the best epoch of each fold -- the paths are hard-coded).
checkpoint_paths = (
    './storage/mnist_test/kfold1/epoch_052_val_0.181_acc_0.971.h5',
    './storage/mnist_test/kfold2/epoch_162_val_1.666_acc_0.732.h5',
    './storage/mnist_test/kfold3/epoch_048_val_0.651_acc_0.882.h5',
    './storage/mnist_test/kfold4/epoch_072_val_1.140_acc_0.793.h5',
    './storage/mnist_test/kfold5/epoch_097_val_0.868_acc_0.833.h5',
)
model1, model2, model3, model4, model5 = [load_model(path) for path in checkpoint_paths]

# Every fold model predicts on the same [image, letter] test inputs.
test_inputs = [x_test, test_letters_numeric]
pred1, pred2, pred3, pred4, pred5 = [
    fold_model.predict(test_inputs)
    for fold_model in (model1, model2, model3, model4, model5)
]

# Ensemble by averaging the five per-class probability arrays.
pred_avg = (pred1+pred2+pred3+pred4+pred5)/5.0

In [19]:
# Predicted digit for each test row = index of the highest averaged probability.
# Vectorized over rows instead of looping over every prediction in Python.
result_arr = np.argmax(pred_avg, axis = 1)
result_arr

array([6, 9, 8, ..., 6, 8, 0])

In [20]:
# Store the predicted digits in the submission frame's 'digit' column and preview the first rows.
submission['digit'] = result_arr
submission.head()

Unnamed: 0,id,digit
0,2049,6
1,2050,9
2,2051,8
3,2052,0
4,2053,3


In [21]:
# Save the submission frame as CSV, without writing the DataFrame index.
submission.to_csv('./storage/augmented_custom_kfold.csv',index=False)

### Make prediction for the shuffling k-fold method

In [16]:
# One saved checkpoint per fold of the shuffling (sklearn KFold) run
# (presumably the best epoch of each fold -- the paths are hard-coded).
checkpoint_paths = (
    './storage/mnist_test/kfold1_sklearn/epoch_053_val_0.583_acc_0.884.h5',
    './storage/mnist_test/kfold2_sklearn/epoch_071_val_0.684_acc_0.880.h5',
    './storage/mnist_test/kfold3_sklearn/epoch_077_val_0.583_acc_0.893.h5',
    './storage/mnist_test/kfold4_sklearn/epoch_065_val_0.603_acc_0.887.h5',
    './storage/mnist_test/kfold5_sklearn/epoch_088_val_0.553_acc_0.897.h5',
)
model1, model2, model3, model4, model5 = [load_model(path) for path in checkpoint_paths]

# Every fold model predicts on the same [image, letter] test inputs.
test_inputs = [x_test, test_letters_numeric]
pred1, pred2, pred3, pred4, pred5 = [
    fold_model.predict(test_inputs)
    for fold_model in (model1, model2, model3, model4, model5)
]

# Ensemble by averaging the five per-class probability arrays.
pred_avg = (pred1+pred2+pred3+pred4+pred5)/5.0

In [17]:
# Predicted digit for each test row = index of the highest averaged probability.
# Vectorized over rows instead of looping over every prediction in Python.
result_arr = np.argmax(pred_avg, axis = 1)
result_arr

array([6, 3, 8, ..., 6, 1, 0])

In [18]:
# Store the predicted digits in the submission frame's 'digit' column and preview the first rows.
submission['digit'] = result_arr
submission.head()

Unnamed: 0,id,digit
0,2049,6
1,2050,3
2,2051,8
3,2052,0
4,2053,3


In [19]:
# Save the submission frame as CSV, without writing the DataFrame index.
submission.to_csv('./storage/augmented_sklearn_kfold.csv',index=False)

Interesting reads and references:
- https://www.kdnuggets.com/2018/09/dropout-convolutional-networks.html
- https://kangbk0120.github.io/articles/2018-01/inception-googlenet-review