In [None]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
sns.set(style='white', context='notebook', palette='deep')
%matplotlib inline


from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import *
import itertools

from keras.utils.np_utils import to_categorical 
from keras.models import *
from keras.layers import *
from keras.optimizers import *
from keras.callbacks import *
from keras.preprocessing.image import ImageDataGenerator
import os

import xgboost as xgb 
from xgboost import plot_importance , XGBClassifier, DMatrix
import joblib


In [None]:
# Load the competition data from Google Drive (this cell only works on Colab).
# Drive is mounted at the absolute path /content/drive while the CSVs are read
# through the relative path ./drive — assumes the working directory is
# /content (TODO confirm when running elsewhere).
from google.colab import drive
drive.mount('/content/drive')
train = pd.read_csv('./drive/My Drive/DACON/data_file/train.csv')
test = pd.read_csv('./drive/My Drive/DACON/data_file/test.csv')
# An earlier submission file; neither test_pred nor test_pred_copy is
# referenced again in the visible cells.
test_pred = pd.read_csv('./drive/My Drive/DACON//submission/submission_91_ensembles_6+2_bn_08.csv')
# Work on copies so the frames read from disk stay untouched.
train_copy = train.copy()
test_copy = test.copy()
test_pred_copy = test_pred.copy()

In [None]:
def _build_generator(**overrides):
    """Create an ImageDataGenerator from a shared baseline plus overrides.

    The baseline disables every augmentation, fills exposed pixels with the
    constant 0.0 and rescales pixel values by 1/255. Each keyword in
    `overrides` replaces the baseline value of the same name.
    """
    settings = dict(
        featurewise_center=False,
        samplewise_center=False,
        featurewise_std_normalization=False,
        samplewise_std_normalization=False,
        zca_whitening=False,
        zca_epsilon=1e-06,
        rotation_range=0,
        width_shift_range=0.0,
        height_shift_range=0.0,
        brightness_range=None,
        shear_range=0,
        zoom_range=0,
        channel_shift_range=0.0,
        fill_mode='constant',
        cval=0.0,
        horizontal_flip=False,
        vertical_flip=False,
        rescale=1./255,
        preprocessing_function=None,
        data_format=None,
        validation_split=0,
        dtype=None,
    )
    settings.update(overrides)
    return ImageDataGenerator(**settings)


# Random rotation with rotation_range=45.
rot_gen = _build_generator(rotation_range=45)

# Random horizontal / vertical shifts with a range of 0.2 each.
trans_gen = _build_generator(width_shift_range=0.2, height_shift_range=0.2)

# Random shear and zoom, both with a range of 0.2.
shear_zoom_gen = _build_generator(shear_range=0.2, zoom_range=0.2)

# Random horizontal and vertical flips.
flip_gen = _build_generator(horizontal_flip=True, vertical_flip=True)

In [None]:
def augmentation( input_imgs, aug_size, generators=None ):
    """Return the normalised input rows plus randomly augmented copies.

    Expected column layout of `input_imgs` (accessed by position):
    column 0 is ignored, column 1 is the digit label, column 2 is the letter,
    and columns 3 onward are the 784 pixel values of a 28x28 image.

    For each of `len(input_imgs) // 2` rounds, one random source row is drawn
    per generator, and every generator then produces `aug_size` augmented
    versions of its own source row.

    Parameters
    ----------
    input_imgs : pandas.DataFrame
        Labelled images in the layout described above, pixels in 0..255.
    aug_size : int
        Number of augmented images each generator produces per sampled row.
    generators : sequence, optional
        Objects exposing `flow(batch)` that returns an iterator of augmented
        batches. Defaults to the four notebook generators
        (rot_gen, trans_gen, shear_zoom_gen, flip_gen), in that order.

    Returns
    -------
    pandas.DataFrame
        The original rows (columns 1 onward, pixels divided by 255) followed
        by the augmented rows. The index is not reset, so labels repeat.
    """
    if generators is None:
        generators = (rot_gen, trans_gen, shear_zoom_gen, flip_gen)

    df = input_imgs
    new_data_set = []
    num_of_training_set = df.shape[0]

    for _round in range(num_of_training_set//2):
        # Draw every source row before augmenting anything, so the random
        # number stream is consumed in the same order as the unrolled version.
        picks = [np.random.randint(num_of_training_set) for _gen in generators]

        # Row extraction does not depend on the inner loop; do it once per pick.
        sources = []
        for row in picks:
            labels = [df.iloc[row,1], df.iloc[row,2]]
            image = np.array(df.iloc[row,3:]).reshape(1,28,28,1)
            sources.append((labels, image))

        for _copy in range( aug_size ):
            for generator, (labels, image) in zip(generators, sources):
                augmented = next(generator.flow(image)).reshape(784,)
                new_data_set.append(labels + list(augmented))

    columns = ['digit', 'letter'] + [str(x) for x in range(784)]
    aug = pd.DataFrame(new_data_set, columns=columns)

    # The default generators rescale by 1/255, so bring the untouched rows to
    # the same scale before stacking them with the augmented ones.
    train_norm = pd.concat([ input_imgs.iloc[:,1:3], np.divide(input_imgs.iloc[:,3:],255) ],axis=1)
    train_aug = pd.concat([train_norm,aug])

    return train_aug


In [None]:
def train_test_gen(input_imgs, aug_size):
    """Augment the images and return a 90/10 train/validation split.

    The augmented frame's columns from position 2 onward are reshaped to
    (-1, 28, 28, 1) and the 'digit' column is one-hot encoded into 10 classes.
    Returns whatever train_test_split returns for (features, targets) with
    test_size=0.1 and random_state=15.
    """
    augmented = augmentation(input_imgs, aug_size)

    # Columns 0 and 1 of the augmented frame are the labels; the rest are pixels.
    features = augmented.iloc[:,2:].values.copy().reshape(-1,28,28,1)
    targets = to_categorical(augmented['digit'], num_classes = 10)

    return train_test_split(features, targets, test_size=0.1, random_state=15)

In [None]:
def load_best(file_name):
    """Load the newest checkpoint stored in a saved-model sub-folder.

    "Newest" means the largest os.path.getctime value among the entries of
    './drive/My Drive/DACON/saved_model/<file_name>/'. The chosen file name is
    printed in red (ANSI escape codes) followed by a blank line.

    Parameters
    ----------
    file_name : str
        Name of the sub-folder inside the saved_model directory.

    Returns
    -------
    The object returned by load_model for the chosen file.

    Raises
    ------
    FileNotFoundError
        If the folder does not exist (raised by os.listdir) or is empty.
    """
    filepath = './drive/My Drive/DACON/saved_model/' + file_name + '/'
    candidates = os.listdir(filepath)
    if not candidates:
        # Fail with a clear message instead of an IndexError on an empty list.
        raise FileNotFoundError('No saved model found in ' + filepath)

    # max() returns the first of several equally recent entries, which matches
    # a stable descending sort followed by taking element 0.
    best_name = max(candidates, key=lambda f_name: os.path.getctime(filepath + f_name))
    model = load_model( filepath + best_name )
    print('\033[31m' + best_name + '\033[0m')
    print()
    return model

In [None]:
def set_filepath(file_name):
    """Ensure the saved-model folder for `file_name` exists and return its path.

    Parameters
    ----------
    file_name : str
        Name of the sub-folder inside './drive/My Drive/DACON/saved_model/'.

    Returns
    -------
    str
        The folder path, ending with '/'.
    """
    MODEL_SAVE_FOLDER_PATH = './drive/My Drive/DACON/saved_model/' + file_name + '/'
    # makedirs also creates missing parent folders, and exist_ok avoids the
    # gap between checking for the folder and creating it.
    os.makedirs(MODEL_SAVE_FOLDER_PATH, exist_ok=True)

    return MODEL_SAVE_FOLDER_PATH

In [None]:
def compare(file1,file2):
    """Return the fraction of rows on which two submissions agree.

    Both arguments are file names without the '.csv' extension, looked up in
    './drive/My Drive/DACON/submission/'. The 'digit' columns are compared
    element-wise and the share of equal entries is returned as a float.
    """
    submission_dir = './drive/My Drive/DACON/submission/'
    first = pd.read_csv(submission_dir + file1 +'.csv')
    second = pd.read_csv(submission_dir + file2 +'.csv')

    agreement = (first['digit'] == second['digit']).to_numpy()
    return np.count_nonzero(agreement) / len(agreement)

In [None]:
def pred_acc(file_name,file_list):
    """Print how closely one submission agrees with each reference submission.

    For every name in `file_list`, compare() is called with `file_name` and
    that name, and one line is printed with the resulting agreement fraction.
    The 'submission_' prefix is removed from the printed reference name.

    Parameters
    ----------
    file_name : str
        Submission to evaluate (name without '.csv').
    file_list : list of str
        Reference submissions (names without '.csv').

    Returns
    -------
    None
    """
    for other in file_list:
        acc = compare(file_name, other)
        # The earlier spelling 'submision_' never matched, so nothing was stripped.
        print( 'Compared with ' + other.replace('submission_','') + ' : {}'.format(acc) )
    

# Load more models

In [None]:
# Folder holding the final checkpoints of all base models.
filepath = './drive/My Drive/DACON/saved_model/model_storage_final/'

# Five CNN checkpoints (the number in each file name is presumably a
# validation score — unverified).
cnn_0 = load_model( filepath + 'cnn_0_0.8652.hdf5' )
cnn_1 = load_model( filepath + 'cnn_1_0.8643.hdf5' )
cnn_2 = load_model( filepath + 'cnn_2_0.8633.hdf5' )
cnn_3 = load_model( filepath + 'cnn_3_0.8877.hdf5' )
cnn_4 = load_model( filepath + 'cnn_4_0.8447.hdf5' )

# Five ResNet checkpoints.
res_0 = load_model(filepath + 'ResNet_0_0.8613.hdf5' )
res_1 = load_model(filepath + 'ResNet_1_0.8945.hdf5' )
res_2 = load_model(filepath + 'ResNet_2_0.8809.hdf5' )
res_3 = load_model(filepath + 'ResNet_3_0.8711.hdf5' )
res_4 = load_model(filepath + 'ResNet_4_0.8994.hdf5' )

# Order matters: ensemble() weights positions 0-4 with w_cnn and 5-9 with w_res.
model_list = [
    cnn_0, cnn_1, cnn_2, cnn_3, cnn_4,
    res_0, res_1, res_2, res_3, res_4,
]

In [None]:
# Keras models whose outputs serve as input features for the XGBoost models.
inter_model_0 = load_model( filepath + 'inter_model.hdf5' )
inter_model_1 = load_model( filepath + 'inter_model_1.hdf5' )

# XGBoost models restored with joblib; entry k is paired with
# inter_model_list[k] by data_for_reg() and ensemble().
# NOTE(review): joblib.load unpickles the file — only load trusted artefacts.
xgb_0 = joblib.load( filepath + 'xgb.dat' )
xgb_1 = joblib.load( filepath + 'xgb_1.dat' )

inter_model_list = [inter_model_0, inter_model_1]
xgb_list = [xgb_0, xgb_1]

In [None]:
# Number of prediction blocks passed to data_for_reg() and used to size the
# stacking network input (10*num): 10 entries in model_list plus 2 in xgb_list.
num = 12

# Stacking using train data

In [None]:
def data_for_reg(input, model_num):
    """Build stacking features by concatenating base-model predictions.

    Every model in `model_list` predicts on `input`. Then each
    `inter_model_list[j]` predicts on `input` and its output, wrapped in a
    DataFrame, is passed to `xgb_list[j].predict_proba`. Progress indices are
    printed while the models run.

    Parameters
    ----------
    input : array-like
        Batch handed unchanged to every model's predict(). The name shadows
        the builtin `input`; it is kept because it is part of the signature.
    model_num : int
        Number of leading prediction blocks to keep. The first block is always
        included, so values below 1 behave like 1.

    Returns
    -------
    numpy.ndarray
        The selected prediction blocks joined along axis 1.

    Raises
    ------
    IndexError
        If `model_num` exceeds the number of available prediction blocks.
    """
    pred_data = []
    for i, model in enumerate(model_list):
        pred_data.append( model.predict( input ) )
        print('{}'.format(i),end=' ')
    for j, inter_model in enumerate(inter_model_list):
        cnn_output = DataFrame(inter_model.predict( input ))
        pred_data.append( xgb_list[j].predict_proba(cnn_output) )
        print('{}'.format(j))

    # Join once instead of growing the array block by block, which re-copies
    # everything accumulated so far on each step.
    selected = [pred_data[0]] + [pred_data[k] for k in range(1, model_num)]
    return np.concatenate(selected, axis=1)

In [None]:
# Normalised training rows plus augmented copies (aug_size=3). The result is
# random: augmentation() draws rows with np.random and no seed is set here.
train_aug = augmentation(train_copy, 3)

In [None]:
# augmentation() returns columns 'digit', 'letter', then the pixels, so the
# pixel block starts at position 2.
input_imgs =  train_aug.iloc[:,2:]
input_digit = train_aug['digit']

In [None]:
# Base-model predictions on the augmented training images become the features
# of the stacking model; the digit labels are one-hot encoded into 10 classes.
data = data_for_reg(input_imgs.values.reshape(-1, 28, 28, 1), num)
data_val = to_categorical(input_digit, 10)

# Hold out 10% of the rows for validation.
x_train, x_val, y_train, y_val = train_test_split(
    data, data_val, test_size=0.1, random_state=15
)

In [None]:
# Stacking meta-learner: a dense network over the 10*num concatenated
# base-model outputs, ending in a 10-way softmax.
N = 64
stack = Sequential()
stack.add(Input(shape=(10*num,)))
stack.add(Dense(N, activation = "relu"))
stack.add(BatchNormalization())
# stack.add(Dropout(0.5))
stack.add(Dense(2*N, activation = "relu"))
stack.add(BatchNormalization())
stack.add(Dense(4*N, activation = "relu"))
stack.add(BatchNormalization())
stack.add(Dense(10, activation = "softmax"))

# fit model
batch_size = 100
epochs = 100

# NOTE(review): `lr` is the older Keras argument name; newer releases call it
# `learning_rate` — confirm against the installed version.
optimizer = RMSprop(lr=0.01, rho=0.9, epsilon=1e-08, decay=0.0)
stack.compile(optimizer = optimizer , loss = "categorical_crossentropy", metrics=["accuracy"])

# Checkpoints go to the 'model_stack' folder (created if missing); the file
# name embeds the validation loss of the epoch being saved.
file_name = 'model_stack'
model_path = set_filepath(file_name) + 'stack_{val_loss:.4f}.hdf5'

# callbacks
# Stop after 10 epochs without val_loss improvement; keep only checkpoints
# that improve val_loss.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min')
mcp_save = ModelCheckpoint(filepath = model_path, save_best_only=True, monitor='val_loss', mode='min', verbose=1)

# steps_per_epoch uses floor division, so it counts full batches only.
hist = stack.fit(x_train, y_train, batch_size=batch_size, epochs = epochs, 
            validation_data = (x_val,y_val),
            steps_per_epoch=x_train.shape[0]// batch_size
            , callbacks = [early_stopping, mcp_save]
            )


In [None]:
# Test pixels start at column 2 here (the training frame uses column 3
# because it also carries the digit label). Scale to [0, 1] and reshape to
# 28x28x1 images before collecting base-model predictions.
x_test = np.divide(test_copy.iloc[:, 2:].values, 255).reshape(-1, 28, 28, 1)
data_test = data_for_reg(x_test, num)

In [None]:
# Class scores from the stacking model; the predicted digit is the index of
# the largest score in each row.
pred = stack.predict(data_test)
pred_test = [np.argmax(scores) for scores in pred]
pred_test[:10]

In [None]:
# Write the stacking predictions as a submission CSV.
# Note: `data` is rebound here to a dict; it previously held the stacking
# feature matrix.
data = {'id':test_copy['id'], 'digit':pred_test}
submission = DataFrame(data)
file_name = 'submission_ensembles_final_64_+xgb'
submission.to_csv('./drive/My Drive/DACON/submission/'+file_name+'.csv', index=False)

# Earlier submissions to compare against. The number after 'submission_' is
# presumably the score each one obtained — unverified.
file_list = [ 'submission_84',
             'submission_85',
             'submission_86_xgb_ensemble',
             'submission_87_ensembles',
             'submission_87_ensembles_10+5_bn_linearreg',
             'submission_87_ensembles_stack_more+res_13_256',
             'submission_88_ensemble_2_2_4_try3',
             'submission_88_ensembles_10+1_bn_linearreg',
             'submission_88_ensembles_6+2_bn_linearreg_2',
             'submission_89_ensemble_2_2',
             'submission_89_ensembles_stack_more',
             'submission_89_ensembles_stack_more_using_test',
             'submission_89_ensembles_stack_more++_using_test_overlap_wobn_512',
             'submission_90_ensembles_6+2_bn_08_retry',
             'submission_90_pretrain_using_test_layer_4_3ensemble',
             'submission_90_ensembles_6+2_bn_linearreg',
             'submission_90_ensembles_linear_using_test_1000',
             'submission_90_ensembles_stack_more+_using_test',
             'submission_91_ensembles_3+1_w1',
             'submission_91_ensembles_6+2_bn_08',
             'submission_91_ensembles_stack_more++_using_test_overlap_909191_2048_aug',
             'submission_91_ensembles_stack_more++_using_test_overlap_909191_64_aug'
             ]
# Prints the fraction of identical predictions per reference file; this is
# agreement between submissions, not accuracy against ground truth.
pred_acc(file_name,file_list)

# Ensemble

In [None]:
def ensemble(input_imgs,model_list, inter_model_list, xgb_list, w_cnn, w_res, w_xgb):
    """Predict digits by weighted soft voting over all base models.

    Class scores are summed over three groups with one weight per group:
    `model_list[0:5]` (weight `w_cnn`), `model_list[5:10]` (weight `w_res`),
    and each `xgb_list[i]` applied to the DataFrame-wrapped output of
    `inter_model_list[i]` (weight `w_xgb`). Progress indices are printed
    while the models run.

    Parameters
    ----------
    input_imgs : array-like with `.shape`
        Images that can be reshaped to (-1, 28, 28, 1).
    model_list : sequence
        At least 10 models exposing predict(); positions 0-4 and 5-9 form the
        two weighted groups.
    inter_model_list, xgb_list : sequences
        Paired by position; `inter_model_list` needs at least as many entries
        as `xgb_list`.
    w_cnn, w_res, w_xgb : float
        Group weights.

    Returns
    -------
    list
        One predicted class index per input image.
    """
    n_samples = input_imgs.shape[0]
    # Convert once: every model receives the same float32 batch.
    batch = np.array(input_imgs).reshape(-1,28,28,1).astype(np.float32)
    label_list = np.zeros((n_samples,10))

    for i in range(5):
        label_list += model_list[i].predict( batch )*w_cnn
        print('{}'.format(i),end=' ')
    for i in range(5, 10):
        label_list += model_list[i].predict( batch )*w_res
        print('{}'.format(i),end=' ')

    for i in range(len(xgb_list)):
        cnn_output = DataFrame(inter_model_list[i].predict( batch ))
        label_list += xgb_list[i].predict_proba( cnn_output )*w_xgb
        print('{}'.format(i), end= '')
        print()

    # Row-wise argmax in one call; ties resolve to the lowest class index.
    return list(np.argmax(label_list, axis=1))

In [None]:
# Weighted soft-voting submission: scale the test pixels to [0, 1], reshape
# to 28x28x1 and vote with weights w_cnn=1.0, w_res=1.0, w_xgb=5.0.
x_test = np.divide(test_copy.iloc[:,2:].values,255)
x_test = x_test.reshape(-1,28,28,1)
pred = ensemble(x_test, model_list, inter_model_list, xgb_list, 1.0, 1.0, 5.0)
data = {'id':test_copy['id'], 'digit':pred}
submission = DataFrame(data)
file_name = 'submission_ensembles_115'
submission.to_csv('./drive/My Drive/DACON/submission/'+file_name+'.csv', index=False)
print(pred[:10])
print()
# Earlier submissions to compare against. The number after 'submission_' is
# presumably the score each one obtained — unverified.
file_list = [ 'submission_84',
             'submission_85',
             'submission_86_xgb_ensemble',
             'submission_87_ensembles',
             'submission_87_ensembles_10+5_bn_linearreg',
             'submission_87_ensembles_stack_more+res_13_256',
             'submission_88_ensemble_2_2_4_try3',
             'submission_88_ensembles_10+1_bn_linearreg',
             'submission_88_ensembles_6+2_bn_linearreg_2',
             'submission_89_ensemble_2_2',
             'submission_89_ensembles_stack_more',
             'submission_89_ensembles_stack_more_using_test',
             'submission_89_ensembles_stack_more++_using_test_overlap_wobn_512',
             'submission_90_ensembles_6+2_bn_08_retry',
             'submission_90_pretrain_using_test_layer_4_3ensemble',
             'submission_90_ensembles_6+2_bn_linearreg',
             'submission_90_ensembles_linear_using_test_1000',
             'submission_90_ensembles_stack_more+_using_test',
             'submission_91_ensembles_3+1_w1',
             'submission_91_ensembles_6+2_bn_08',
             'submission_91_ensembles_stack_more++_using_test_overlap_909191_2048_aug',
             'submission_91_ensembles_stack_more++_using_test_overlap_909191_64_aug'
             ]
# Prints agreement with each reference submission (not ground-truth accuracy).
pred_acc(file_name,file_list)

# Using predicted data

In [None]:
# Pseudo-labelling: treat test rows on which three earlier submissions agree
# as labelled data for the stacking model.
pred1 = pd.read_csv('./drive/My Drive/DACON/submission/submission_90_ensembles_stack_more+_using_test.csv').copy()
pred2 = pd.read_csv('./drive/My Drive/DACON/submission/submission_91_ensembles_6+2_bn_08.csv').copy()
pred3 = pd.read_csv('./drive/My Drive/DACON/submission/submission_91_ensembles_3+1_w1.csv').copy()

# Row positions where pred1 agrees with pred2, and with pred3.
overlap_id1 = np.where((pred1['digit'] == pred2['digit']).to_numpy())[0]
overlap_id2 = np.where((pred1['digit'] == pred3['digit']).to_numpy())[0]
# Both arrays are sorted and duplicate-free, so intersect1d yields the same
# positions as filtering one by membership in the other, without the
# quadratic scan, and the result stays integer-typed even when empty.
overlap_id = np.intersect1d(overlap_id1, overlap_id2)
print(len(overlap_id))

# Columns become: id, digit (from pred1), then the test columns from position
# 1 onward — the layout augmentation() expects (digit at 1, pixels from 3).
aug_data = pd.concat([pred1.iloc[overlap_id,0:2], test_copy.iloc[overlap_id,1:]], axis=1)

aug = augmentation(aug_data, 2)
input_test = aug.iloc[:,2:]
input_digit = aug['digit']

In [None]:
# Stacking features and one-hot targets built from the pseudo-labelled,
# augmented test images.
data_reg = data_for_reg(input_test.values.reshape(-1, 28, 28, 1), num)
data_val = to_categorical(input_digit, 10)

# Hold out 10% of the rows for validation.
x_train, x_val, y_train, y_val = train_test_split(
    data_reg, data_val, test_size=0.1, random_state=24
)

In [None]:
# Disabled experiment kept as a bare string literal, so nothing in it runs.
# It calls data_for_reg_more, which is not defined in the visible part of the
# notebook. As the only expression in the cell, the string itself is echoed
# as the cell output.
"""train_aug = augmentation(train_copy,2)
data_reg1 = data_for_reg_more( train_aug.iloc[:,2:].values.reshape(-1,28,28,1) )
data_val1 = train_aug['digit']
data_val1 = to_categorical(data_val1,10)
x = np.concatenate([data_reg, data_reg1], axis=0)
y = np.concatenate([data_val, data_val1], axis=0)
x_train, x_val, y_train, y_val = train_test_split(data_reg1, data_val1,test_size=0.1,random_state=24)"""

In [None]:
# Second stacking model, trained on the pseudo-labelled split: wider layers
# (N=512), four hidden layers, batch normalisation disabled.
# Rebinds the global `stack`, replacing the model trained earlier.
N = 512
stack = Sequential()
stack.add(Input(shape=(10*num,)))
stack.add(Dense(N, activation = "relu"))
# stack.add(BatchNormalization())
stack.add(Dense(2*N, activation = "relu"))
# stack.add(BatchNormalization())
stack.add(Dense(4*N, activation = "relu"))
# stack.add(BatchNormalization())
stack.add(Dense(8*N, activation = "relu"))
# stack.add(BatchNormalization())
# stack.add(Dense(16*N, activation = "relu"))
# stack.add(BatchNormalization())
stack.add(Dense(10, activation = "softmax"))

# fit model
batch_size = 200
epochs = 5
# optimizer = RMSprop(lr=0.01, rho=0.9, epsilon=1e-08, decay=0.0)
optimizer = Adam()
stack.compile(optimizer = optimizer , loss = "categorical_crossentropy", metrics=["accuracy"])

# model_path is not used below because the callbacks argument is commented
# out; set_filepath() still creates the folder as a side effect.
file_name = 'model_stack'
model_path = set_filepath(file_name) + 'stack_{val_accuracy:.4f}.hdf5'

# Fixed 5 epochs with no early stopping or checkpointing; steps_per_epoch
# uses floor division, so it counts full batches only.
hist = stack.fit(x_train, y_train, batch_size=batch_size, epochs = epochs, 
            validation_data = (x_val,y_val),
            steps_per_epoch=x_train.shape[0]// batch_size
            # , callbacks = [early_stopping, mcp_save]
            )
# stack = load_best('model_stack')

In [None]:
# Rebuild the stacking features for the test set: pixels (column 2 onward)
# scaled to [0, 1], reshaped to 28x28x1, then passed through the base models.
x_test = np.divide(test_copy.iloc[:, 2:].values, 255).reshape(-1, 28, 28, 1)
data_test = data_for_reg(x_test, num)

In [None]:
# Predicted digit per test row = index of the largest score from the
# stacking model trained on pseudo-labelled data.
pred = stack.predict(data_test)
pred_test = [np.argmax(scores) for scores in pred]
pred_test[:10]

In [None]:
# Write the predictions of the pseudo-label stacking model as a submission CSV.
data = {'id':test_copy['id'], 'digit':pred_test}
submission = DataFrame(data)
file_name = 'submission_ensembles_final_512'
submission.to_csv('./drive/My Drive/DACON/submission/'+file_name+'.csv', index=False)

# Earlier submissions to compare against. The number after 'submission_' is
# presumably the score each one obtained — unverified.
file_list = [ 'submission_84',
             'submission_85',
             'submission_86_xgb_ensemble',
             'submission_87_ensembles',
             'submission_87_ensembles_10+5_bn_linearreg',
             'submission_87_ensembles_stack_more+res_13_256',
             'submission_88_ensemble_2_2_4_try3',
             'submission_88_ensembles_10+1_bn_linearreg',
             'submission_88_ensembles_6+2_bn_linearreg_2',
             'submission_89_ensemble_2_2',
             'submission_89_ensembles_stack_more',
             'submission_89_ensembles_stack_more_using_test',
             'submission_89_ensembles_stack_more++_using_test_overlap_wobn_512',
             'submission_90_ensembles_6+2_bn_08_retry',
             'submission_90_pretrain_using_test_layer_4_3ensemble',
             'submission_90_ensembles_6+2_bn_linearreg',
             'submission_90_ensembles_linear_using_test_1000',
             'submission_90_ensembles_stack_more+_using_test',
             'submission_91_ensembles_3+1_w1',
             'submission_91_ensembles_6+2_bn_08',
             'submission_91_ensembles_stack_more++_using_test_overlap_909191_2048_aug',
             'submission_91_ensembles_stack_more++_using_test_overlap_909191_64_aug'
             ]
# Prints agreement with each reference submission (not ground-truth accuracy).
pred_acc(file_name,file_list)

In [None]:
# Agreement of one earlier submission with every entry of file_list; the
# list contains this same name, so that entry compares the file with itself.
pred_acc('submission_89_ensembles_stack_more++_using_test_overlap_wobn_512',file_list)

In [None]:
# Agreement between a stored submission and the one named by `file_name`.
# `file_name` is a global rebound in several cells, so the result depends on
# which cell ran last.
compare('submission_ensembles_stack_more+res_using_test_overlap_wobn_512', file_name)