In [1]:
# environment: Paperspace Quadro P6000 GPU  
import numpy as np 
import pandas as pd 
import os 
import tensorflow as tf
import tensorflow.keras # run pip install keras==2.3 beforehand for compatability 
from tensorflow.keras import Input, Model 
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Conv2D, Dropout, AlphaDropout, MaxPooling2D, AveragePooling2D, BatchNormalization, Concatenate, Flatten, Reshape, Add, Activation
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from keras.utils.np_utils import to_categorical
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import skimage
from skimage.transform import rotate
from skimage.util import random_noise
from skimage.transform import warp, AffineTransform
from skimage.transform import resize
import cv2
import random 
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils import shuffle # shuffle dataset before splitting into folds 
from scipy.ndimage.filters import gaussian_filter # for elastic distortion 
from scipy.ndimage.interpolation import map_coordinates # for elastic distortion 

Using TensorFlow backend.


# Read in file and preprocess data

In [2]:
# Locations of the competition CSV files, relative to the notebook's working directory.
train_path = './storage/modified_mnist_dataset/train.csv'  
test_path = './storage/modified_mnist_dataset/test.csv' 
submission_path = './storage/modified_mnist_dataset/submission.csv'

In [3]:
# Read the training set, the test set and the sample submission into DataFrames.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, submission_path)
)

In [4]:
# Convert the digit and letter columns to pandas categoricals: columns 1 and 2
# of `train`, column 1 of `test`.
# The columns are assigned by label rather than with `df.iloc[:, i] = ...`:
# recent pandas releases treat a full-column `iloc` assignment as an in-place
# write into the existing array, which keeps the old dtype and silently turns
# the conversion into a no-op.
for frame, positions in ((train, (1, 2)), (test, (1,))):
    for position in positions:
        label = frame.columns[position]
        frame[label] = pd.Categorical(frame[label])

In [5]:
# define and re-format train and test data
# Pixel values start at column 3 of `train` and column 2 of `test`; every row
# becomes one 28x28 single-channel float32 image.
x_train = train.iloc[:, 3:].values.reshape(-1, 28, 28, 1).astype(np.float32)
# Digit labels (column 1 of `train`) as one-hot vectors over 10 classes.
y_train = to_categorical(np.asarray(train.iloc[:, 1].values), num_classes = 10)
# Letter labels are kept raw here; they are encoded numerically in a later step.
train_letters = train.iloc[:, 2].values

x_test = test.iloc[:, 2:].values.reshape(-1, 28, 28, 1).astype(np.float32)
test_letters = test.iloc[:, 1].values

In [6]:
# Encode every letter as its offset from "A" (so "A" -> 0), then one-hot encode
# the offsets over 26 classes.
letter_base = ord("A")
train_letters_numeric = to_categorical(
    np.asarray([ord(letter) - letter_base for letter in train_letters]),
    num_classes = 26)
test_letters_numeric = to_categorical(
    np.asarray([ord(letter) - letter_base for letter in test_letters]),
    num_classes = 26)

x_train.shape, y_train.shape, x_test.shape, train_letters_numeric.shape, test_letters_numeric.shape

((2048, 28, 28, 1), (2048, 10), (20480, 28, 28, 1), (2048, 26), (20480, 26))

In [7]:
# Divide all pixel values by 255, in place. This maps [0, 255] onto [0, 1]
# provided the raw pixels are 8-bit intensities — TODO confirm against the CSV.
# Note: this is a fixed rescale, not a data-dependent min-max scaling, and
# re-running the cell divides again.
for pixel_array in (x_train, x_test):
    pixel_array /= 255.0

# Augment Data

For now, we will try augmenting the data using the following methods:
- rotation 
- adding noise  
- adding gaussian blur 
- shifting image 

Please refer to [this notebook](https://github.com/iljimae0418/overlapping-digit-and-letter-mnist/blob/master/Examples%20of%20data%20augmentations.ipynb) for examples.  

We later decided to add some more augmentations:
- modifying brightness 
- ZCA Whitening 
- random crops
- elastic distortions
- Autoencoder generated images 

Further augmentations that are being planned:
- GAN generated images

Please refer to [this notebook](https://github.com/iljimae0418/overlapping-digit-and-letter-mnist/blob/master/Examples%20of%20augmentation%202%20(further%20augmentation).ipynb) for examples. 

In [8]:
# Rotate every training image by a random whole number of degrees in [10, 40].
# Positive angles rotate counter-clockwise in skimage.
x_train_rotated = np.asarray(
    [rotate(image, angle = random.randint(10, 40)) for image in x_train]
)

In [9]:
# Same as the rotation above but with the sign flipped: each image is rotated
# clockwise by a random whole number of degrees in [10, 40].
x_train_rotated_2 = np.asarray(
    [rotate(image, angle = -random.randint(10, 40)) for image in x_train]
)

In [10]:
# Add random noise to every training image, using random_noise's default settings.
x_noised = np.asarray([random_noise(image) for image in x_train])

In [11]:
# Gaussian-blur every training image with a square kernel whose side is picked
# at random from 3, 5 or 9 (sigma 0 lets OpenCV derive it from the kernel size).
blurred_images = []
for image in x_train:
    ksize = random.choice([3,5,9])
    blurred_images.append(cv2.GaussianBlur(image, (ksize, ksize), 0))
# Restore the explicit channel axis expected by the rest of the pipeline.
x_blurred = np.asarray(blurred_images).reshape(-1,28,28,1)

In [12]:
# Translate every training image by a random non-zero offset of 1 or 2 pixels
# along each axis. mode="wrap" fills the vacated border by wrapping the image
# around instead of padding with a constant.
shift_choices = [-2,-1,1,2]
shifted_images = []
for image in x_train:
    offset = (random.choice(shift_choices), random.choice(shift_choices))
    shifted_images.append(
        warp(image, AffineTransform(translation = offset), mode = "wrap")
    )
x_shifted = np.asarray(shifted_images)

In [13]:
# apply brightness modifications
# Every image is multiplied by its own factor drawn uniformly from [0.5, 1.5].
# The product is not clipped, so scaled pixels may exceed the original maximum.
max_brightness_delta = 0.5
x_brightness = np.asarray([
    image * (1.0 + random.uniform(-max_brightness_delta, max_brightness_delta))
    for image in x_train
])

In [14]:
# apply zca whitening 
def zca_whitening(sample): 
    """Whiten a single 2-D image with ZCA and rescale the result to [0, 1].

    The rows of `sample` are treated as observations and its columns as
    variables: the columns are mean-centred, their covariance matrix is
    decomposed with SVD, and the centred image is multiplied by
    U * diag(1 / sqrt(S + epsilon)) * U^T.

    Args:
        sample: 2-D array of shape (height, width).

    Returns:
        Array of shape (height, width, 1) whose values are min-max rescaled
        to [0, 1]. If the whitened image is constant (for example a blank
        input), an all-zero array is returned instead of NaNs.
    """
    centered = sample - sample.mean(axis=0)
    cov = np.cov(centered, rowvar = False)
    U, S, _ = np.linalg.svd(cov)
    epsilon = 0.1  # regulariser: keeps 1/sqrt(S) finite for near-zero singular values
    X_ZCA = U.dot(np.diag(1.0/np.sqrt(S + epsilon))).dot(U.T).dot(centered.T).T
    lowest = X_ZCA.min()
    value_range = X_ZCA.max() - lowest
    if value_range == 0:
        # Constant result: (x - min) / 0 would produce NaN for every pixel.
        X_ZCA_rescaled = np.zeros_like(X_ZCA)
    else:
        X_ZCA_rescaled = (X_ZCA - lowest) / value_range
    # Append the channel axis; derived from the input so any 2-D size works.
    return X_ZCA_rescaled.reshape(sample.shape + (1,))

# Whiten every training image on its own; the channel axis is dropped before
# the call and added back by zca_whitening.
x_zca_whitened = np.asarray(
    [zca_whitening(image.reshape((28,28))) for image in x_train]
)


In [15]:
# add random cropping (zooming effect)
def random_crop(img, min_size=22, max_size=24): 
    """Cut a random square window out of an image.

    Args:
        img: Array whose first two axes are (height, width); any trailing
            axes (e.g. channels) are kept as they are.
        min_size: Smallest side length of the window, inclusive.
        max_size: Largest side length of the window, inclusive. The defaults
            (22 to 24) were chosen for 28x28 images.

    Returns:
        A new array of shape (size, size, ...) holding the cropped window.

    Raises:
        ValueError: If the drawn window size is larger than the image
            (raised by `np.random.randint`).
    """
    size = random.randint(min_size, max_size)
    height, width = img.shape[:2]
    # `np.random.randint(high)` excludes `high`, so add 1 to make the last
    # valid offset reachable (and to allow size == image size).
    left = np.random.randint(width - size + 1)
    top = np.random.randint(height - size + 1)
    # Copy so the caller's image is never aliased by the crop.
    return img[top:top + size, left:left + size].copy()

# Crop a random window from every training image and scale it back up to
# 28x28x1, which produces a zoom-in effect.
x_random_crop = np.asarray(
    [resize(random_crop(image), (28,28,1)) for image in x_train]
)

In [16]:
# add elastic distortions 
# from https://www.kaggle.com/babbler/mnist-data-augmentation-with-elastic-distortion
def elastic_transform(image, alpha_range, sigma, random_state=None):
    """Elastic deformation of images as described in [Simard2003]_.

    .. [Simard2003] Simard, Steinkraus and Platt, "Best Practices for
       Convolutional Neural Networks applied to Visual Document Analysis", in
       Proc. of the International Conference on Document Analysis and
       Recognition, 2003.

    Two random displacement fields (one per spatial axis) are drawn uniformly
    from [-1, 1], smoothed with a Gaussian filter, scaled by `alpha`, and used
    to resample the image with linear interpolation.

    Args:
        image: Numpy array with shape (height, width, channels).
        alpha_range: Float for a fixed value, or [lower, upper] to draw the
            value from a uniform distribution. Controls the intensity of the
            deformation.
        sigma: Float, sigma of the Gaussian filter that smooths the
            displacement fields.
        random_state: `numpy.random.RandomState` used for ALL random draws
            (alpha and both displacement fields). A fresh, unseeded one is
            created when omitted.

    Returns:
        Numpy array with the same shape as `image`.
    """
    if random_state is None:
        random_state = np.random.RandomState(None)

    if np.isscalar(alpha_range):
        alpha = alpha_range
    else:
        # Draw from `random_state`, not the global NumPy RNG, so that passing
        # a seeded RandomState makes the whole transform reproducible.
        alpha = random_state.uniform(low=alpha_range[0], high=alpha_range[1])

    shape = image.shape
    # Smoothed random displacements along the first (dx) and second (dy) axis.
    dx = gaussian_filter((random_state.rand(*shape) * 2 - 1), sigma) * alpha
    dy = gaussian_filter((random_state.rand(*shape) * 2 - 1), sigma) * alpha

    # Source coordinates for every output pixel; the channel index is not displaced.
    x, y, z = np.meshgrid(np.arange(shape[0]), np.arange(shape[1]), np.arange(shape[2]), indexing='ij')
    indices = np.reshape(x+dx, (-1, 1)), np.reshape(y+dy, (-1, 1)), np.reshape(z, (-1, 1))

    # order=1: linear interpolation; coordinates outside the image are reflected.
    return map_coordinates(image, indices, order=1, mode='reflect').reshape(shape)

# Elastically distort every training image: alpha drawn from [8, 10], sigma 3.
x_elastic_distort = np.asarray(
    [elastic_transform(image, [8,10], 3) for image in x_train]
)

In [17]:
# load autoencoder generated images 
# NOTE(review): these images are later concatenated with the other image sets
# and paired with a plain copy of y_train / train_letters_numeric, so the file
# is presumably ordered like x_train and shaped (n_train, 28, 28, 1) — confirm
# against the notebook that produced it.
x_en = np.load('./storage/ae_gen_2.npy')

In [18]:
# concatenating augmented data to the original 
# Every image set keeps the sample order of the original x_train, so the digit
# labels and letter encodings are simply repeated once per set.
image_sets = (
    x_train,
    x_train_rotated,
    x_train_rotated_2,
    x_noised,
    x_blurred,
    x_shifted,
    x_brightness,
    x_zca_whitened,
    x_random_crop,
    x_elastic_distort,
    x_en,
)
n_image_sets = len(image_sets)
x_train = np.concatenate(image_sets, axis = 0)
y_train = np.concatenate((y_train,) * n_image_sets, axis = 0)
train_letters_numeric = np.concatenate((train_letters_numeric,) * n_image_sets, axis = 0)

x_train.shape, y_train.shape, train_letters_numeric.shape


((22528, 28, 28, 1), (22528, 10), (22528, 26))

# Conduct Training

In [19]:
# Keras image generator applying small random shifts, shear and zoom.
# A generator like this "replaces" the training data with transformed versions;
# it does not add to it.
# It is not used for now: augmented copies are added to the training set directly.
augmentation_options = dict(
    width_shift_range = 0.1,
    height_shift_range = 0.1,
    shear_range = 0.1,
    zoom_range = 0.1,
)
train_datagen = ImageDataGenerator(**augmentation_options)

In [20]:
# uses skip connections and also adds information from both MaxPooling2D and AveragePooling2D 
def conv2d_block(input_layer, n_filters, kernel):
    """Two-convolution residual block followed by a combined 2x2 pooling.

    Args:
        input_layer: Keras tensor fed into the block.
        n_filters: Number of filters of both convolutions.
        kernel: Kernel size of both convolutions.

    Returns:
        Keras tensor: element-wise sum of the max-pooled and the
        average-pooled block output.
    """
    conv_options = dict(activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')
    first = Conv2D(n_filters, kernel, **conv_options)(input_layer)
    first = BatchNormalization()(first)
    second = Conv2D(n_filters, kernel, **conv_options)(first)
    # Skip connection: the normalised first convolution is added to the second.
    merged = Add()([first, second])
    merged = BatchNormalization()(merged)
    max_pooled = MaxPooling2D((2,2))(merged)
    avg_pooled = AveragePooling2D((2,2))(merged)
    return Add()([max_pooled, avg_pooled])

# obtains around 82% validation accuracy on a 9:1 train/validation split
# the most promising model so far, until we come up with a potentially more powerful grade 5 model 
def base_cnn_grade_4(): 
    """Build and compile the two-stage multi-kernel CNN.

    Each stage runs three parallel `conv2d_block` branches (kernel sizes 7, 5
    and 3) on the same input and concatenates them; the stages use 64 and 32
    filters. The flattened features are joined with the 26-dimensional letter
    input and passed through Dense(512/256/128) + BatchNorm, dropout 0.4 and a
    10-way softmax.

    Returns:
        Compiled Keras `Model` taking `[image, letter]` inputs.
    """
    inputs = Input((28,28,1))
    letter_input = Input((26,))
    features = inputs
    for n_filters in (64, 32):
        branches = [conv2d_block(features, n_filters, kernel) for kernel in (7, 5, 3)]
        features = Concatenate()(branches)
    hidden = Flatten()(features)
    hidden = Concatenate()([hidden, letter_input])
    for width in (512, 256, 128):
        hidden = Dense(width, activation = 'relu')(hidden)
        hidden = BatchNormalization()(hidden)
    hidden = Dropout(0.4)(hidden)
    predictions = Dense(10, activation = 'softmax')(hidden)
    model = Model(inputs = [inputs, letter_input], outputs = predictions)
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

# obtains 91.2% accuracy on public leaderboard. 
# increased to three convolutional blocks 
# batchnorm input layer to normalize the input layer  
def base_cnn_grade_5(): 
    """Build and compile the three-stage multi-kernel CNN.

    The image input is batch-normalised first. Each stage runs four parallel
    `conv2d_block` branches (kernel sizes 7, 5, 3 and 1), concatenates them and
    batch-normalises the result; the stages use 64, 32 and 16 filters. The
    flattened features are joined with the 26-dimensional letter input and
    passed through he_normal Dense(512/256/128) + BatchNorm, dropout 0.4 and a
    10-way softmax.

    Returns:
        Compiled Keras `Model` taking `[image, letter]` inputs.
    """
    inputs = Input((28,28,1))
    letter_input = Input((26,))
    features = BatchNormalization()(inputs)
    for n_filters in (64, 32, 16):
        branches = [conv2d_block(features, n_filters, kernel) for kernel in (7, 5, 3, 1)]
        features = Concatenate()(branches)
        features = BatchNormalization()(features)
    hidden = Flatten()(features)
    hidden = Concatenate()([hidden, letter_input])
    for width in (512, 256, 128):
        hidden = Dense(width, activation = 'relu', kernel_initializer = 'he_normal')(hidden)
        hidden = BatchNormalization()(hidden)
    hidden = Dropout(0.4)(hidden)
    predictions = Dense(10, activation = 'softmax')(hidden)
    model = Model(inputs = [inputs, letter_input], outputs = predictions)
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model


# reduced to two convolutional blocks, but used one more filter size. 
# more dense layers at the end 
# batchnorm input layer to normalize the input layer 
def base_cnn_grade_6(): 
    """Build and compile the two-stage, five-kernel CNN.

    The image input is batch-normalised first. Each stage runs five parallel
    `conv2d_block` branches (kernel sizes 7, 5, 4, 3 and 1), concatenates them
    and batch-normalises the result; the stages use 64 and 32 filters. The
    flattened features are joined with the 26-dimensional letter input and
    passed through he_normal Dense(1024/512/256/128) + BatchNorm, dropout 0.4
    and a 10-way softmax.

    Returns:
        Compiled Keras `Model` taking `[image, letter]` inputs.
    """
    inputs = Input((28,28,1))
    letter_input = Input((26,))
    features = BatchNormalization()(inputs)
    for n_filters in (64, 32):
        branches = [conv2d_block(features, n_filters, kernel) for kernel in (7, 5, 4, 3, 1)]
        features = Concatenate()(branches)
        features = BatchNormalization()(features)
    hidden = Flatten()(features)
    hidden = Concatenate()([hidden, letter_input])
    for width in (1024, 512, 256, 128):
        hidden = Dense(width, activation = 'relu', kernel_initializer = 'he_normal')(hidden)
        hidden = BatchNormalization()(hidden)
    hidden = Dropout(0.4)(hidden)
    predictions = Dense(10, activation = 'softmax')(hidden)
    model = Model(inputs = [inputs, letter_input], outputs = predictions)
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model





In [21]:
# implement k-fold cv 
def k_fold(k,files):  
    """Split a sequence into `k` consecutive folds.

    Every fold except the last holds `len(files) // k` items; the last fold
    also receives the remainder. Order is preserved and nothing is shuffled.

    Args:
        k: Number of folds, at least 1.
        files: Any sliceable sequence with a length (list, numpy array, ...).

    Returns:
        List of `k` slices of `files`.

    Raises:
        ValueError: If `k` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))
    fold_size = len(files) // k
    # Fold boundaries: k start offsets plus the end of the data, so the last
    # fold absorbs whatever is left over after integer division.
    bounds = [i * fold_size for i in range(k)] + [len(files)]
    return [files[start:stop] for start, stop in zip(bounds, bounds[1:])]

# Shuffle the images, digit labels and letter encodings with one shared
# permutation before cutting them into folds.
# NOTE(review): x_train already contains the augmented copies at this point, so
# after the shuffle augmented variants of one source image can end up in both
# the training and the validation folds; val_accuracy is then likely
# optimistic. Splitting by source image before augmenting would avoid this.
x_train, y_train, train_letters_numeric = shuffle(x_train, y_train, train_letters_numeric)
# split data into 10 folds 
k = 10
x_train_folds = k_fold(k, x_train)
y_train_folds = k_fold(k, y_train) 
letter_train_folds = k_fold(k,train_letters_numeric)

fold_histories = []  # one Keras History object per fold, in fold order

for t in range(k):  
    print("************ Fold {} training ************".format(t+1)) 
    # Fold t is held out for validation; the remaining k-1 folds are the training data.
    cur_val_x = x_train_folds[t] 
    cur_val_y = y_train_folds[t] 
    cur_val_letter = letter_train_folds[t]
    cur_train_x = np.concatenate(x_train_folds[:t] + x_train_folds[t+1:], axis = 0)
    cur_train_y = np.concatenate(y_train_folds[:t] + y_train_folds[t+1:], axis = 0)
    cur_letter = np.concatenate(letter_train_folds[:t] + letter_train_folds[t+1:], axis = 0)

    # Drop graph/layer state left over from the previous fold's model so memory
    # does not grow with every fold.
    tf.keras.backend.clear_session()

    # Make sure the per-fold checkpoint directory exists before the first save.
    fold_dir = './storage/mnist_test_2/' + 'kfold' + str(t+1)
    os.makedirs(fold_dir, exist_ok = True)
    model_path = fold_dir + '/epoch_{epoch:03d}_val_{val_loss:.3f}_acc_{val_accuracy:.3f}.h5' 
    # Multiply the learning rate by 0.8 after 3 epochs without val_accuracy improvement.
    # (A LearningRateScheduler with an exponential decay is a possible alternative.)
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy', patience=3, verbose=1, factor=0.8)
    checkpoint = ModelCheckpoint(filepath=model_path,monitor='val_accuracy',verbose=1,save_best_only=True)
    early_stopping = EarlyStopping(monitor='val_accuracy',patience=25)
    model = base_cnn_grade_6() 
    
    history = model.fit([cur_train_x,cur_letter],
                        cur_train_y,
                       batch_size = 32,
                       shuffle = True, 
                       validation_data = ([cur_val_x,cur_val_letter],cur_val_y),
                       verbose = 1, 
                       epochs = 300,
                       callbacks = [learning_rate_reduction, checkpoint, early_stopping])
    fold_histories.append(history)


************ Fold 1 training ************
Train on 20276 samples, validate on 2252 samples
Epoch 1/300
Epoch 00001: val_accuracy improved from -inf to 0.59813, saving model to ./storage/mnist_test_2/kfold1/epoch_001_val_1.198_acc_0.598.h5
Epoch 2/300
Epoch 00002: val_accuracy improved from 0.59813 to 0.69805, saving model to ./storage/mnist_test_2/kfold1/epoch_002_val_0.894_acc_0.698.h5
Epoch 3/300
Epoch 00003: val_accuracy improved from 0.69805 to 0.80417, saving model to ./storage/mnist_test_2/kfold1/epoch_003_val_0.642_acc_0.804.h5
Epoch 4/300
Epoch 00004: val_accuracy improved from 0.80417 to 0.80639, saving model to ./storage/mnist_test_2/kfold1/epoch_004_val_0.633_acc_0.806.h5
Epoch 5/300
Epoch 00005: val_accuracy did not improve from 0.80639
Epoch 6/300
Epoch 00006: val_accuracy improved from 0.80639 to 0.83925, saving model to ./storage/mnist_test_2/kfold1/epoch_006_val_0.526_acc_0.839.h5
Epoch 7/300
Epoch 00007: val_accuracy did not improve from 0.83925
Epoch 8/300
Epoch 00008