In [None]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline


from sklearn.model_selection import train_test_split
from sklearn.metrics import *
import itertools

from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.models import *
from keras.layers import *
from keras.optimizers import *
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import *
import os

import xgboost as xgb 
from xgboost import plot_importance , XGBClassifier, DMatrix

from random import *

sns.set(style='white', context='notebook', palette='deep')

In [None]:
# Mount Google Drive (Colab runtime) and read the train/test CSVs stored there.
from google.colab import drive
drive.mount('/content/drive')

train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './drive/My Drive/DACON/data_file/train.csv',
        './drive/My Drive/DACON/data_file/test.csv',
    )
)
# The rest of the notebook reads these copies rather than the loaded frames.
train_copy, test_copy = train.copy(), test.copy()

In [None]:
# Settings shared by every augmentation generator below.  All geometric
# transforms are switched off here; each generator re-enables only the one or
# two it is responsible for.  Pixels are rescaled by 1/255 and areas exposed by
# a transform are filled with the constant 0.0.
_BASE_AUG_KWARGS = dict(
    featurewise_center=False,
    samplewise_center=False,
    featurewise_std_normalization=False,
    samplewise_std_normalization=False,
    zca_whitening=False,
    zca_epsilon=1e-06,
    rotation_range=0,
    width_shift_range=0.0,
    height_shift_range=0.0,
    brightness_range=None,
    shear_range=0,
    zoom_range=0,
    channel_shift_range=0.0,
    fill_mode='constant',
    cval=0.0,
    horizontal_flip=False,
    vertical_flip=False,
    rescale=1./255, # Rescale
    preprocessing_function=None,
    data_format=None,
    validation_split=0,
    dtype=None,
)


def _make_generator(**overrides):
    """Return an ImageDataGenerator using the shared base settings plus `overrides`."""
    return ImageDataGenerator(**{**_BASE_AUG_KWARGS, **overrides})


# Rotation only.
rot_gen = _make_generator(rotation_range=45)

# Horizontal / vertical translation only.
trans_gen = _make_generator(width_shift_range=0.2, height_shift_range=0.2)

# Shear and zoom only.
shear_zoom_gen = _make_generator(shear_range=0.2, zoom_range=0.2)

# Random horizontal and vertical flips only.
flip_gen = _make_generator(horizontal_flip=True, vertical_flip=True)

In [None]:
def augmentation( input_imgs ):
    """Build a normalised training table made of the original and augmented images.

    `input_imgs` is read positionally: column 1 is used as the digit label,
    column 2 as the letter label and columns 3 onward as the 784 pixel values
    of one 28x28 image.

    For ``len(input_imgs) // 2`` rounds, one source row is drawn at random
    (with replacement) for each of the module-level generators `rot_gen`,
    `trans_gen`, `shear_zoom_gen` and `flip_gen`, and every drawn row is then
    augmented three times by its generator.  The generators already rescale
    pixels by 1/255, so the original rows are divided by 255 here to match.

    Parameters
    ----------
    input_imgs : pandas.DataFrame
        Raw image table laid out as described above.
        NOTE(review): the pixel columns are assumed to be named '0'..'783' so
        that they line up with the augmented rows when concatenated — confirm.

    Returns
    -------
    pandas.DataFrame
        The label columns and pixel columns of the original rows (normalised),
        followed by the augmented rows, with a fresh 0..n-1 index.
    """
    df = input_imgs
    num_of_training_set = df.shape[0]
    generators = (rot_gen, trans_gen, shear_zoom_gen, flip_gen)
    new_data_set = []

    for _ in range(num_of_training_set//2):
        # Draw one source row per generator and extract its labels / pixels
        # once; the same row is reused for all three augmentation passes.
        sources = []
        for gen in generators:
            row = np.random.randint(num_of_training_set)
            labels = [df.iloc[row,1], df.iloc[row,2]]
            image = np.array(df.iloc[row,3:]).reshape(1,28,28,1)
            sources.append((gen, labels, image))

        for _ in range(3):
            # Order per pass: rotation, translation, shear/zoom, flip.
            for gen, labels, image in sources:
                pixels = gen.flow(image).next().reshape(784,)
                new_data_set.append(labels + list(pixels))

    columns = ['digit', 'letter'] + [str(x) for x in range(784)]
    aug = pd.DataFrame(new_data_set, columns=columns)

    train_norm = pd.concat([ df.iloc[:,1:3], np.divide(df.iloc[:,3:],255) ],axis=1)
    # ignore_index: without it the augmented rows reuse the labels 0..m-1 of
    # the original rows, leaving duplicate index labels in the result.
    train_aug = pd.concat([train_norm,aug], ignore_index=True)

    return train_aug


In [None]:
def train_test_gen(input_imgs):
    """Augment `input_imgs` and return a 90/10 train/validation split.

    The table returned by `augmentation` is turned into image tensors of shape
    (n, 28, 28, 1) taken from every column after the first two, and into
    one-hot digit labels with 10 classes.  The split seed is drawn at random
    from 1..100 on every call, so repeated calls give different splits.

    Returns the four arrays produced by `train_test_split`, in the order
    x_train, x_val, y_train, y_val.

    NOTE(review): the split is made after augmentation, so augmented variants
    of one source image can end up on both sides of the split; validation
    scores may therefore be optimistic.
    """
    augmented = augmentation(input_imgs)

    # Everything after the two label columns is pixel data.
    pixels = augmented.iloc[:, 2:].values.copy()
    images = pixels.reshape(-1, 28, 28, 1)

    one_hot_digits = to_categorical(augmented['digit'], num_classes=10)

    split_seed = randint(1, 100)
    return train_test_split(
        images,
        one_hot_digits,
        test_size=0.1,
        random_state=split_seed,
    )

In [None]:
def load_best(file_name):
    """Load the most recently created model file from a saved-model folder.

    The folder searched is ``./drive/My Drive/DACON/saved_model/<file_name>``.
    Entries are ranked by `os.path.getctime`; the newest one is passed to
    `load_model`, and its name is printed in red followed by a blank line.
    "Best" therefore means "newest file", which matches the best checkpoint
    only if the checkpoint writer saves improvements only.

    Parameters
    ----------
    file_name : str
        Name of the sub-folder holding the saved model files.

    Returns
    -------
    The object returned by `load_model` for the newest file.

    Raises
    ------
    FileNotFoundError
        If the folder does not exist or contains no entries.
    """
    folder = os.path.join('./drive/My Drive/DACON/saved_model', file_name)
    candidates = os.listdir(folder)
    if not candidates:
        # Previously this surfaced as an opaque IndexError on an empty list.
        raise FileNotFoundError('No saved model files found in ' + folder)

    # max() keeps the first entry among ties, like the earlier stable sort did.
    best_name = max(
        candidates,
        key=lambda f_name: os.path.getctime(os.path.join(folder, f_name)),
    )
    model = load_model(os.path.join(folder, best_name))
    print('\033[31m' + best_name + '\033[0m')
    print()
    return model

In [None]:
def cnn_model():
    """Build the (uncompiled) CNN used as classifier and feature extractor.

    Architecture, for 28x28x1 inputs:
      * two 5x5 convolutions with 64 filters, 2x2 max-pooling, dropout 0.25
      * two 3x3 convolutions with 128 filters, 2x2 max-pooling (stride 2),
        dropout 0.25
      * flatten, a 256-unit ReLU dense layer named 'my_dense', dropout 0.5
      * a 10-way softmax output

    The dense layer carries an explicit name so it can be retrieved later with
    `get_layer`.
    """
    base_filters = 64
    conv_kwargs = dict(padding='Same', activation='relu')

    stack = [
        # Block 1: wide 5x5 kernels on the raw image.
        Conv2D(filters=base_filters, kernel_size=(5, 5),
               input_shape=(28, 28, 1), **conv_kwargs),
        Conv2D(filters=base_filters, kernel_size=(5, 5), **conv_kwargs),
        MaxPool2D(pool_size=(2, 2)),
        Dropout(0.25),

        # Block 2: twice the filters, smaller 3x3 kernels.
        Conv2D(filters=2 * base_filters, kernel_size=(3, 3), **conv_kwargs),
        Conv2D(filters=2 * base_filters, kernel_size=(3, 3), **conv_kwargs),
        MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
        Dropout(0.25),

        # Classifier head.
        Flatten(),
        Dense(4 * base_filters, activation="relu", name='my_dense'),
        Dropout(0.5),
        Dense(10, activation="softmax"),
    ]

    return Sequential(stack)

In [None]:
def compare(file1,file2, submission_dir='./drive/My Drive/DACON/submission/'):
    """Return the fraction of rows on which two submission files agree.

    Both files are read as ``<submission_dir>/<name>.csv`` and their 'digit'
    columns are compared row by row, by position.

    Parameters
    ----------
    file1, file2 : str
        Submission file names without the '.csv' extension.
    submission_dir : str, optional
        Folder containing the submission files.  Defaults to the notebook's
        Drive submission folder.

    Returns
    -------
    float
        Number of matching rows divided by the total number of rows.

    Raises
    ------
    ValueError
        If the two files have different row counts or contain no rows.
    """
    f1 = pd.read_csv(os.path.join(submission_dir, file1 + '.csv'))
    f2 = pd.read_csv(os.path.join(submission_dir, file2 + '.csv'))

    if len(f1) != len(f2):
        raise ValueError(
            'Submissions differ in length: {} has {} rows, {} has {} rows'.format(
                file1, len(f1), file2, len(f2)))
    if len(f1) == 0:
        raise ValueError('Submissions {} and {} contain no rows'.format(file1, file2))

    match = f1['digit'].values == f2['digit'].values
    return int(np.count_nonzero(match)) / len(match)

In [None]:
def pred_acc(file_name,file_list):
    """Print how closely `file_name` agrees with each submission in `file_list`.

    For every entry of `file_list`, `compare(file_name, entry)` is evaluated
    and one line ``Compared with <entry> : <agreement>`` is printed, with the
    'submission_' prefix removed from the entry name for readability.

    Parameters
    ----------
    file_name : str
        Submission name (without '.csv') to evaluate.
    file_list : list of str
        Submission names (without '.csv') to compare against.

    Returns
    -------
    None
        Results are only printed.
    """
    for other in file_list:
        acc = compare(file_name, other)
        # The prefix was previously misspelled 'submision_', so it never
        # matched and the full file name was printed.
        print( 'Compared with ' + other.replace('submission_','') + ' : {}'.format(acc) )
    

In [None]:
# Train `num` CNNs, each on its own random augmentation/split of the training
# data, and keep a truncated copy of each that outputs the 'my_dense' layer.
epochs = 50 
batch_size = 50
num = 3

inter_model_list = []
layer_name='my_dense'

# The checkpoint folder is the same for every model, so create it once.
# makedirs also creates a missing parent folder and tolerates an existing one,
# where os.mkdir would fail if 'saved_model' did not exist yet.
MODEL_SAVE_FOLDER_PATH = './drive/My Drive/DACON/saved_model/model_cnn/'
os.makedirs(MODEL_SAVE_FOLDER_PATH, exist_ok=True)

for i in range(num):

    model = cnn_model()

    # Only the model index is formatted here; '{val_accuracy:.4f}' is left as
    # a placeholder in the path handed to ModelCheckpoint.
    model_path = MODEL_SAVE_FOLDER_PATH + '{}'.format(i) + '_{val_accuracy:.4f}.hdf5'

    # callbacks
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, verbose=1, mode='max')
    mcp_save = ModelCheckpoint(filepath = model_path, save_best_only=True, monitor='val_accuracy', mode='max', verbose=1)
    reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1, min_delta=1e-4, mode='min')

    optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(optimizer = optimizer , loss = "categorical_crossentropy", metrics=["accuracy"])

    # fit model (a fresh augmentation and split for every ensemble member)
    x_train, x_val, y_train, y_val = train_test_gen(train_copy)

    hist = model.fit(x_train, y_train, batch_size=batch_size, epochs = epochs, 
                validation_data = (x_val,y_val),
                steps_per_epoch=x_train.shape[0]// batch_size, 
                callbacks=[early_stopping,mcp_save,reduce_lr_loss])
    # Reload the newest checkpoint in the folder instead of keeping the
    # weights from the last epoch.
    model = load_best('model_cnn')
    
    # Feature extractor: same input, output taken at the named dense layer.
    inter_model = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)
    inter_model_list.append(inter_model)

In [None]:
# Fit one XGBoost classifier per CNN feature extractor.
# A new augmented table is generated here; augmentation() samples rows at
# random, so it is not the same data any of the CNNs was fitted on.
train_aug = augmentation(train_copy)
# Everything after the two label columns, reshaped to (n, 28, 28, 1) images.
inter_train = train_aug.iloc[:,2:].values.reshape(-1,28,28,1)
cnn_output_list = []
cnn_val_list = []
for i in range(num):
    # Output of the i-th truncated CNN becomes the feature table for XGBoost.
    cnn_output = inter_model_list[i].predict( inter_train ) 
    cnn_output = pd.DataFrame( data=cnn_output )
    cnn_output_list.append(cnn_output)
    # The target is the same digit column for every model.
    cnn_val = train_aug['digit']
    cnn_val_list.append(cnn_val)

xgb_model_list = []
for i in range(num):
    # 90/10 split with a random seed in 1..100, different for each model.
    x_train, x_val, y_train, y_val = train_test_split(cnn_output_list[i], cnn_val_list[i],test_size=0.1,random_state=randint(1,100))

    xgb_model = XGBClassifier(max_depth=10, num_class=10, objective='multi:softprob', booster='gbtree', n_estimators=300, learning_rate=0.1 )
    # The held-out part is passed as eval_set with a 10-round early-stopping limit.
    xgb_model.fit( x_train, y_train, eval_set=[(x_val, y_val)], eval_metric='mlogloss', early_stopping_rounds=10)

    xgb_model_list.append(xgb_model)

In [None]:
# Score of the j-th XGBoost model on its full feature table.  That table
# includes the rows the model was fitted on, so this is not a held-out score.
j = 0
xgb_model_list[j].score(cnn_output_list[j], cnn_val_list[j])

In [None]:
def xgb_ensemble(input_imgs):
    """Predict one class per image by summing the ensemble's class probabilities.

    For each of the first `num` pairs in the module-level `inter_model_list`
    and `xgb_model_list`, the images are passed through the feature extractor,
    the resulting features are wrapped in a DataFrame and scored with the
    matching XGBoost model's `predict_proba`.  The probability tables of all
    pairs are added together and the highest-scoring column index is returned
    for every image.

    Parameters
    ----------
    input_imgs : array-like
        Image data that can be reshaped to (n, 28, 28, 1); its first dimension
        is taken as the number of images.

    Returns
    -------
    list
        One predicted class index per image.
    """
    # The network input is identical for every ensemble member: build it once.
    batch = np.array(input_imgs).reshape(-1,28,28,1).astype(np.float32)
    label_list = np.zeros((input_imgs.shape[0],10))

    for i in range(num):
        cnn_output = DataFrame(inter_model_list[i].predict_on_batch(batch))
        label_list += xgb_model_list[i].predict_proba( cnn_output )

    # Row-wise argmax over the summed probabilities.
    return list(np.argmax(label_list, axis=1))

In [None]:
# Scale the test pixels (every column after the first two) to [0, 1] and shape
# them as 28x28x1 images before running the ensemble.
x_test = (test_copy.iloc[:, 2:].values / 255).reshape(-1, 28, 28, 1)

pred = xgb_ensemble(x_test)

In [None]:
# Quick look at the first ten predicted classes.
pred[:10]

In [None]:
# Pair every test id with its predicted digit and write the submission CSV.
data = dict(id=test_copy['id'], digit=pred)
submission = DataFrame(data)
submission.to_csv(
    './drive/My Drive/DACON/submission/submission_xgb_deep.csv',
    index=False,
)

In [None]:
# Earlier submission files (names without '.csv') used as reference points.
# NOTE(review): the numbers in the names look like past scores — confirm.
file_list = [ 'submission_82',
             'submission_84',
             'submission_85',
             'submission_86_xgb_ensemble',
             'submission_87_ensembles',
             'submission_88_ensemble_2_2_4_try3',
             'submission_89_ensemble_2_2',
             'submission_89_xgb_10',
             'submission_89']
             
# Print the agreement rate between the new submission and each earlier one.
pred_acc('submission_xgb_deep',file_list)