<a href="https://colab.research.google.com/github/jw00oo1/Dacon/blob/main/Ensemble_CNN_with_Bagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries & Prepare Dataset

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Model, Input, layers
from tensorflow.keras.layers import BatchNormalization, Conv2D, concatenate, Flatten, MaxPooling2D, Dropout
from pathlib import Path
from tensorflow.keras.callbacks import ModelCheckpoint
import os
from datetime import datetime
# Let TensorFlow allocate GPU memory on demand instead of reserving it up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Memory growth has to be configured identically on every visible GPU,
        # so enable it for all of them rather than only the first device.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Report the problem and continue with TensorFlow's default allocation.
        print(e)

data_url = {'train': 'https://bit.ly/34zEKdv',
            'test': 'https://bit.ly/2YGZ0WJ',
            'submission': 'https://bit.ly/3hDT2xi'}

cwd = Path(os.getcwd())

now = datetime.now()

suffix = ''
def set_suffix():
  """Store a timestamp tag for the current moment in the global ``suffix``.

  The tag has the form ``<year>_<month>_<day>__<hour>_<minute>_<second>``
  with no zero padding, and is used to label files produced by this run.
  """
  global suffix
  stamp = datetime.now()
  date_part = f'{stamp.year}_{stamp.month}_{stamp.day}'
  time_part = f'{stamp.hour}_{stamp.minute}_{stamp.second}'
  suffix = f'{date_part}__{time_part}'
set_suffix()
suffix


'2020_9_17__13_22_25'

In [None]:
def check_res():
  """Download the dataset CSV files into ``<cwd>/res`` when they are missing.

  Reads the module-level ``cwd`` and ``data_url``; the payload behind every
  entry of ``data_url`` is saved as ``res/<key>.csv``. Files that already
  exist are left untouched, so a run that was interrupted between two
  downloads is completed on the next call instead of being skipped because
  the directory is already there.
  """
  from urllib.request import urlopen

  res_dir = cwd / 'res'
  res_dir.mkdir(exist_ok=True)
  for key, download_url in data_url.items():
    target = res_dir / f'{key}.csv'
    if target.exists():
      continue
    # Release the HTTP connection as soon as the payload has been read.
    with urlopen(download_url) as response:
      file_data = response.read()
    with open(target, 'wb') as f:
      f.write(file_data)

In [None]:
check_res()

train_path, test_path = [cwd / 'res' / f'{file}.csv' for file in ['train', 'test']]
train_csv = pd.read_csv(train_path)
test_csv = pd.read_csv(test_path)

In [None]:
def get_dataset_from_csv(csv: pd.DataFrame, is_test=False, to_sparse=True, only_img=True, preprocessing=True):
    """Convert a competition data frame into model-ready NumPy arrays.

    Args:
        csv: frame with a ``letter`` column, a ``digit`` column for training
            data, and the pixel columns after the leading metadata columns
            (assumed layout: ``id, [digit,] letter, <784 pixels>`` - confirm
            against the CSV files).
        is_test: ``True`` when ``csv`` has no ``digit`` column; the pixel
            columns then start at index 2 instead of 3 and no labels are
            returned.
        to_sparse: encode each letter as a 26-element one-hot vector instead
            of a single integer index (``A`` -> 0).
        only_img: return just the images. When ``False`` the letter encoding
            is prepended to the flattened image, one row per sample.
        preprocessing: zero every pixel below 128 before scaling.

    Returns:
        ``features`` for test data, ``(features, digits)`` otherwise.
        ``features`` has shape ``(n, 28, 28, 1)`` when ``only_img`` is true
        and ``(n, letter_width + 784)`` otherwise; pixels are scaled to
        ``[0, 1]`` by dividing by 255.
    """
    img_key = csv.keys()[(2 if is_test else 3):]

    letter_idx = np.array([ord(letter) - ord('A') for letter in csv['letter']], dtype=int)
    if to_sparse:
        # Keep the one-hot rows; they used to be computed and then thrown away.
        letters = np.zeros((len(letter_idx), 26))
        letters[np.arange(len(letter_idx)), letter_idx] = 1
    else:
        letters = letter_idx.reshape(-1, 1)

    # np.array copies the frame's values, so the in-place threshold below
    # never touches the caller's data frame.
    imgs = np.array(csv[img_key]).reshape(-1, 28, 28, 1)
    if preprocessing:
        imgs[imgs < 128] = 0
    imgs = imgs / 255.

    if only_img:
        ret = imgs
    else:
        # The letter encoding is 2-D, so the images are flattened before joining;
        # appending the 4-D image tensor directly always raised.
        ret = np.concatenate([letters, imgs.reshape(len(imgs), -1)], axis=1)
    if not is_test:
        ret = ret, np.array(csv['digit'])
    return ret

In [None]:
# First load of the arrays: training images are thresholded here, test images are not.
train_ds = get_dataset_from_csv(train_csv, preprocessing=True, to_sparse=False)
test_ds = get_dataset_from_csv(test_csv, is_test=True, preprocessing=False, to_sparse=False)

(x_train, y_train), x_test = train_ds, test_ds
# The test set carries no labels, so -1 is used as a placeholder for each sample.
y_test = np.array([-1] * len(test_ds))
x_all = np.concatenate((x_train, x_test), axis=0)
y_all = np.concatenate((y_train, y_test), axis=0)

def setup_data():
    """Rebuild the module-level data arrays from ``train_csv`` and ``test_csv``.

    Both frames are converted without the pixel threshold. The globals
    ``train_ds``, ``test_ds``, ``x_train``, ``y_train``, ``x_test``,
    ``y_test``, ``x_all`` and ``y_all`` are overwritten, which also discards
    any train/validation split stored in them.
    """
    global train_ds, test_ds, x_train, y_train, x_test, y_test, x_all, y_all
    train_ds = get_dataset_from_csv(train_csv, is_test=False, to_sparse=False, preprocessing=False)
    test_ds = get_dataset_from_csv(test_csv, is_test=True, to_sparse=False, preprocessing=False)

    (x_train, y_train), x_test = train_ds, test_ds
    # Test samples have no labels; -1 marks each of them as unknown.
    y_test = np.array([-1] * len(test_ds))
    x_all = np.concatenate((x_train, x_test), axis=0)
    y_all = np.concatenate((y_train, y_test), axis=0)

In [None]:
from sklearn.model_selection import train_test_split
setup_data()

validation_split = 0.2

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=validation_split, random_state=2020)

In [None]:
import tensorflow.keras.layers as layers

def _residual_block(x, filters, pool=None):
    """Apply conv -> conv -> add -> ReLU to ``x``, then an optional 2x2 pooling layer."""
    first = Conv2D(filters, kernel_size=3, strides=1, padding='same', activation='relu')(x)
    second = Conv2D(filters, kernel_size=3, strides=1, padding='same')(first)
    # Skip connection: the first convolution's output is added to the second's.
    merged = layers.add([first, second])
    activated = layers.Activation('relu')(merged)
    if pool is None:
        return activated
    return pool(pool_size=2)(activated)


def CNN(layer=3):
    """Build and compile the residual CNN classifier for 28x28x1 images.

    Args:
        layer: ``4`` appends a fourth 256-filter residual block; any other
            value builds the three-block variant.

    Returns:
        A compiled ``Model`` with a 10-way softmax output, trained with Adam
        and sparse categorical cross-entropy. The flattened convolutional
        features are exposed through the layer named ``feature_output``.
    """
    img_input = Input(shape=(28, 28, 1))

    x = _residual_block(img_input, 64)
    x = _residual_block(x, 128, pool=layers.AvgPool2D)
    x = _residual_block(x, 256, pool=MaxPooling2D)
    if layer == 4:
        x = _residual_block(x, 256, pool=MaxPooling2D)

    x = Flatten(name='feature_output')(x)
    x = Dropout(0.4)(x)
    x = tf.keras.layers.Dense(512, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = tf.keras.layers.Dense(512, activation='relu')(x)
    x = tf.keras.layers.Dense(10, activation='softmax')(x)

    model = Model(inputs=[img_input], outputs=[x])
    # `learning_rate` is the supported argument name; the `lr` alias is deprecated.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0003),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

In [None]:
n_ensemble = 10

# Separately initialized four-block CNNs that make up the ensemble.
model_list = [CNN(4) for _ in range(n_ensemble)]
# Print the summary of the most recently built member.
model = model_list[-1]
model.summary()

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
reduceLR = ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = 0.5,
    patience=10,
)

class StopTraining(tf.keras.callbacks.Callback):
    """Keras callback that ends training once a logged metric exceeds a baseline.

    Args:
        monitor: key of the metric to read from the epoch logs.
        baseline: training stops after the first epoch whose metric value is
            strictly greater than this number.
    """

    def __init__(self, monitor='accuracy', baseline=0.97):
        # Let the base Callback set up its own attributes first.
        super().__init__()
        self.monitor = monitor
        self.baseline = baseline

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop when the monitored metric has passed the baseline."""
        logs = logs or {}
        value = logs.get(self.monitor)
        # Check for a missing metric before converting: float(None) raises.
        if value is None:
            return
        if float(value) > self.baseline:
            print(f'Epoch {epoch}: {self.monitor} reached {self.baseline}, terminating training')
            self.model.stop_training = True

# Ends a member's training as soon as its training accuracy exceeds 0.98.
stop_training = StopTraining(baseline=.98)

epochs = 100
batch_size = 32
validation_split = .2
# Checkpoints of every ensemble member are written below this directory.
model_path = cwd / 'h5_models'
model_path.mkdir(exist_ok=True)

for i, member in enumerate(model_list):
    print(f'**********model_{i}*********')
    filename = f'best_model_{i}_{suffix}.h5'
    path = model_path / filename
    # Keep only the weights from the epoch with the lowest validation loss.
    mc = ModelCheckpoint(path, monitor='val_loss', mode='min', save_best_only=True, verbose=1)

    # Reload the full training set and draw a fresh random train/validation
    # split, so every member is fitted on a different subset of the data.
    setup_data()
    random_state = np.random.randint(0, 2020)
    print(f'random_state: {random_state}')
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=validation_split, random_state=random_state)
    history = member.fit(x=x_train, y=y_train, epochs=epochs, batch_size=batch_size,
                         shuffle=True, validation_data=(x_val, y_val), callbacks=[mc, stop_training, reduceLR])

**********model_0*********
random_state: 1948
Epoch 1/100
Epoch 00001: val_loss improved from inf to 2.29153, saving model to /content/h5_models/best_model_0_2020_9_17__13_22_25.h5
Epoch 2/100
Epoch 00002: val_loss improved from 2.29153 to 2.23648, saving model to /content/h5_models/best_model_0_2020_9_17__13_22_25.h5
Epoch 3/100
Epoch 00003: val_loss improved from 2.23648 to 1.68739, saving model to /content/h5_models/best_model_0_2020_9_17__13_22_25.h5
Epoch 4/100
Epoch 00004: val_loss improved from 1.68739 to 1.12623, saving model to /content/h5_models/best_model_0_2020_9_17__13_22_25.h5
Epoch 5/100
Epoch 00005: val_loss improved from 1.12623 to 0.78027, saving model to /content/h5_models/best_model_0_2020_9_17__13_22_25.h5
Epoch 6/100
Epoch 00006: val_loss improved from 0.78027 to 0.60373, saving model to /content/h5_models/best_model_0_2020_9_17__13_22_25.h5
Epoch 7/100
Epoch 00007: val_loss improved from 0.60373 to 0.50517, saving model to /content/h5_models/best_model_0_2020_9_1

In [None]:
# Load each member's saved `best_model_<i>_<suffix>.h5` weights from model_path.
for i, model in enumerate(model_list):
    model.load_weights(model_path / f'best_model_{i}_{suffix}.h5')

In [None]:
layer_name = 'feature_output'
# One truncated copy of each CNN that ends at its flattened feature layer.
intermediate_layer_models = [
    tf.keras.Model(inputs=cnn_model.input,
                   outputs=cnn_model.get_layer(layer_name).output)
    for cnn_model in model_list
]

input_layer = tf.keras.Input((28, 28, 1))

# Feed the same input to every feature extractor and average their outputs.
avg_layer = []
for intermediate_model in intermediate_layer_models:
    avg_layer.append(intermediate_model(input_layer))
output_layer = tf.keras.layers.average(avg_layer)

ensembled_feature_model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

In [None]:
# Ensemble classifier: the averaged softmax output of every CNN in model_list.
inputs = tf.keras.Input((28, 28, 1))

avg_layer = []
for cnn_model in model_list:
    avg_layer.append(cnn_model(inputs))
output = tf.keras.layers.average(avg_layer)

ensembled_model = tf.keras.Model(inputs=inputs, outputs=output)

In [None]:
from xgboost import XGBClassifier
# Fit an XGBoost classifier on the averaged CNN features.
# Reload the full training arrays, then hold out 10% of them for scoring.
setup_data()
validation_split=.1

# NOTE(review): `random_state` is the value left behind by the last iteration of
# the CNN training loop; this cell raises NameError when that loop has not been
# run in the current session. Confirm the reuse is intended.
# NOTE(review): the held-out samples come from the same training set the CNN
# members were fitted on, so the score below is presumably optimistic.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=validation_split, random_state=random_state)

# Averaged `feature_output` activations serve as the tabular input features.
intermediate_output = ensembled_feature_model.predict(x_train)
train_labels = y_train

val_data = ensembled_feature_model.predict(x_val)
val_labels = y_val

xgbmodel = XGBClassifier(objective='multi:softprob', 
                      num_class= 10)
xgbmodel.fit(intermediate_output, train_labels)
# Score of the fitted classifier on the held-out 10%.
xgbmodel.score(val_data, val_labels)

In [None]:
setup_data()

intermediate_test_output = ensembled_feature_model.predict(x_test)

submission_xgb = xgbmodel.predict(intermediate_test_output)
submission_xgb

In [None]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian naive Bayes classifier on the averaged CNN features.
# Reload the full training arrays, then hold out 10% of them for scoring.
setup_data()
validation_split=.1

# NOTE(review): `random_state` is the value left behind by the last iteration of
# the CNN training loop, so this split equals the XGBoost cell's split only when
# that value has not changed in between. Confirm the reuse is intended.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=validation_split, random_state=random_state)

# Averaged `feature_output` activations serve as the input features.
intermediate_output = ensembled_feature_model.predict(x_train)
train_labels = y_train

val_data = ensembled_feature_model.predict(x_val)
val_labels = y_val

gnbmodel = GaussianNB().fit(intermediate_output, train_labels) 
# Score of the fitted classifier on the held-out 10%.
gnbmodel.score(val_data, val_labels)

0.8536585365853658

In [None]:
# Accuracy of the three-way ensemble (XGBoost + GaussianNB + CNN average) on a
# 20% hold-out drawn with a fixed seed.
setup_data()

validation_split = 0.2

# NOTE(review): this split uses seed 2020 while the XGBoost/GaussianNB cells
# split with `random_state`, so part of this hold-out was presumably seen by
# those classifiers during fitting - treat the accuracy as optimistic.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=validation_split, random_state=2020)

# NOTE(review): the model is called directly here instead of through
# `.predict(...)` as in the other cells, so the result is whatever the model
# call returns rather than the output of `predict` - confirm both classifiers
# accept it.
intermediate_output = ensembled_feature_model(x_val)

xgb_proba = xgbmodel.predict_proba(intermediate_output)
gnb_proba = gnbmodel.predict_proba(intermediate_output)
ens_proba = ensembled_model.predict(x_val)

# Sum the three probability tables and take the highest-scoring class.
y_pred = np.add(np.add(xgb_proba, gnb_proba), ens_proba).argmax(axis=-1)
y_true = y_val

# Fraction of hold-out samples whose predicted class matches the label.
mask = y_pred == y_true
f'val_acuracy={(len(y_true[mask]) / len(y_true)):.4f}'

'val_acuracy=0.9951'

In [None]:
from sklearn.metrics import classification_report

setup_data()
intermediate_output = ensembled_feature_model(x_train)

xgb_proba = xgbmodel.predict_proba(intermediate_output)
gnb_proba = gnbmodel.predict_proba(intermediate_output)
ens_proba = ensembled_model.predict(x_train)

y_pred = np.add(np.add(xgb_proba, gnb_proba), ens_proba).argmax(axis=-1)
y_true = y_train

cls_report = classification_report(
    y_true=y_true,
    y_pred=y_pred,
    output_dict=True,
    labels=np.arange(10)
)
cls_report

{'0': {'f1-score': 1.0, 'precision': 1.0, 'recall': 1.0, 'support': 191},
 '1': {'f1-score': 0.995049504950495,
  'precision': 0.995049504950495,
  'recall': 0.995049504950495,
  'support': 202},
 '2': {'f1-score': 0.9956896551724138,
  'precision': 1.0,
  'recall': 0.9914163090128756,
  'support': 233},
 '3': {'f1-score': 0.9927007299270073,
  'precision': 0.9902912621359223,
  'recall': 0.9951219512195122,
  'support': 205},
 '4': {'f1-score': 0.9783132530120482,
  'precision': 0.9759615384615384,
  'recall': 0.9806763285024155,
  'support': 207},
 '5': {'f1-score': 0.9866071428571429,
  'precision': 0.9910313901345291,
  'recall': 0.9822222222222222,
  'support': 225},
 '6': {'f1-score': 0.9905660377358491,
  'precision': 0.9905660377358491,
  'recall': 0.9905660377358491,
  'support': 212},
 '7': {'f1-score': 1.0, 'precision': 1.0, 'recall': 1.0, 'support': 194},
 '8': {'f1-score': 0.9889502762430938,
  'precision': 0.9944444444444445,
  'recall': 0.9835164835164835,
  'support': 1

In [None]:
# Predict the test set by summing the three models' class probabilities.
setup_data()
# Averaged CNN features of the test images, the input of both classical models.
intermediate_output = ensembled_feature_model.predict(x_test)

xgb_proba = xgbmodel.predict_proba(intermediate_output)
gnb_proba = gnbmodel.predict_proba(intermediate_output)
ens_proba = ensembled_model.predict(x_test)

# Element-wise sum of the three probability tables, then the best class per row.
total_prediction = np.add(np.add(xgb_proba, ens_proba), gnb_proba).argmax(axis=-1)
total_prediction

array([6, 9, 3, ..., 6, 1, 0])

In [None]:
# Replace the summed vote with a "most confident single model" vote: the three
# probability tables are placed side by side, the column with the largest value
# is selected per row, and `% 10` folds that column index back onto a digit.
# Assumes every table has exactly 10 columns in class order 0-9 - TODO confirm
# for the GaussianNB output.
total_prediction = np.append(np.append(xgb_proba, gnb_proba, axis=1), ens_proba, axis=1).argmax(axis=-1) % 10

## Evaluate with train.csv

In [None]:
setup_data()

def get_prediction(x):
  """Return, for every sample in ``x``, the class index that ``ensembled_model`` scores highest."""
  return np.argmax(ensembled_model.predict(x), axis=-1)

y_pred = get_prediction(x_train)
y_true = y_train

In [None]:
from sklearn.metrics import classification_report

# Evaluate the combined ensemble on the training set. `total_prediction` holds
# predictions for the *test* images, so it cannot be compared with `y_train`;
# the same max-confidence vote is recomputed here for the training images.
train_features = ensembled_feature_model.predict(x_train)
train_proba = np.concatenate(
    [
        xgbmodel.predict_proba(train_features),
        gnbmodel.predict_proba(train_features),
        ensembled_model.predict(x_train),
    ],
    axis=1,
)

y_true = y_train
# `% 10` folds the winning column of the side-by-side tables back onto a digit.
y_pred = train_proba.argmax(axis=-1) % 10
cls_report = classification_report(
    y_true=y_true,
    y_pred=y_pred,
    output_dict=True,
    labels=np.arange(10)
)
cls_report

## Save Submission CSV

In [None]:
submission_path = cwd / 'res/submission.csv'
submission_csv = pd.read_csv(submission_path)
submission_csv

Unnamed: 0,id,digit
0,2049,0
1,2050,0
2,2051,0
3,2052,0
4,2053,0
...,...,...
20475,22524,0
20476,22525,0
20477,22526,0
20478,22527,0


In [None]:
setup_data()

submission_csv['digit'] = total_prediction

In [None]:
# Write the submission below <cwd>/csvs and append the evaluation report to the run log.
csv_path = cwd / 'csvs'
csv_path.mkdir(exist_ok=True)
filename = f'submission_{suffix}_2.csv'
path = csv_path / filename

# The context manager closes the log file even when the write fails.
with open(cwd / 'log.txt', 'a') as f:
    f.write(f'{suffix} == {cls_report}\n')

submission_csv.to_csv(path, index=False)
# Read the file back to display what was actually written.
submission_csv = pd.read_csv(path)
submission_csv

Unnamed: 0,id,digit
0,2049,6
1,2050,9
2,2051,3
3,2052,0
4,2053,3
...,...,...
20475,22524,4
20476,22525,1
20477,22526,6
20478,22527,1


In [None]:
f'submission_{suffix}.csv'

'submission_2020_9_14__7_45_1.csv'

## Propagating labels of test.csv from CNN Model

In [None]:
# Build one pseudo-labelled training set and one validation set per ensemble member.
train_xs, train_ys = [], []
x_vals, y_vals = [], []

for i, _ in enumerate(model_list):
  # Start every iteration from the full, unsplit train and test arrays.
  setup_data()
  batch_size = 32
  # In-place shuffle of the freshly loaded test images.
  np.random.shuffle(x_test)

  # `n_pick` is first the sample count (twice the training-set size) and is then
  # rebound to the drawn indices. No `replace=False` is passed here, so the draw
  # presumably samples with replacement (NumPy's default).
  n_pick = int(len(x_train) * 2)
  n_pick = np.random.choice(len(x_test), n_pick)
  x_test = x_test[n_pick]
  # Label the sampled test images with the current `get_prediction`
  # (presumably the `ensembled_model` version at this point - verify cell order).
  y_test = get_prediction(x_test)

  # Half of the labelled training samples, drawn without replacement, become
  # the validation set; the other half stays in the training set.
  picks = np.random.choice(len(x_train), int(len(x_train) * .5), replace=False)

  x_val = x_train[picks]
  y_val = y_train[picks]
  x_train = np.delete(x_train, picks, axis=0)
  y_train = np.delete(y_train, picks, axis=0)

  # Training data = remaining labelled samples followed by the pseudo-labelled ones.
  train_x = np.append(x_train, x_test, axis=0)
  train_y = np.append(y_train, y_test, axis=0)

  train_xs.append(train_x)
  train_ys.append(train_y)

  x_vals.append(x_val)
  y_vals.append(y_val)

  print(train_x.shape, train_y.shape, x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_val.shape, y_val.shape)

In [None]:
# NOTE(review): this value is not read below; the loop size is taken from model_list.
n_ensemble = 20

# Replace every ensemble member with a fresh three-block CNN and fit it on its
# own pseudo-labelled training set.
for i in range(len(model_list)):
  model_list[i] = CNN(layer=3)
  print(f'**********model_{i}*********')
  filename = f'final_model_{i}_{suffix}.h5'
  path = model_path / filename

  # Report the shapes of the data this member is trained and validated on.
  print(train_xs[i].shape, train_ys[i].shape, x_vals[i].shape, y_vals[i].shape)

  # Keep only the weights from the epoch with the lowest validation loss.
  mc = ModelCheckpoint(path, monitor='val_loss', mode='min', save_best_only=True, verbose=1)
  stop_training = StopTraining(baseline=0.99)
  history = model_list[i].fit(x=train_xs[i], y=train_ys[i], epochs=epochs, batch_size=batch_size,
                              shuffle=True, validation_data=(x_vals[i], y_vals[i]),
                              callbacks=[mc, stop_training, reduceLR], verbose=2)

In [None]:
setup_data()

def get_prediction(x):
    """Predict class indices for ``x`` with the retrained ``final_model_*`` ensemble.

    For every model in the module-level ``model_list`` the checkpoint
    ``final_model_<i>_<suffix>.h5`` is loaded, the model's class probabilities
    for ``x`` are computed, and the probabilities of all models are summed.

    Args:
        x: batch of input images accepted by the models' ``predict`` method.

    Returns:
        Array with the index of the highest summed probability for each sample.
    """
    # The retraining loop writes its checkpoints below <cwd>/h5_models, so the
    # weights must be read from that directory rather than from <cwd> itself.
    weights_dir = cwd / 'h5_models'
    predictions = []
    for i, model in enumerate(model_list):
        model.load_weights(weights_dir / f'final_model_{i}_{suffix}.h5')
        predictions.append(model.predict(x))
    predictions = np.array(predictions)
    # Axis 0 indexes the models: summing over it gives one score row per sample.
    y_pred = predictions.sum(axis=0).argmax(axis=1)
    return y_pred

y_pred = get_prediction(x_train)
y_true = y_train

In [None]:
from sklearn.metrics import classification_report

cls_report = classification_report(
    y_true=y_true,
    y_pred=y_pred,
    output_dict=True,
    labels=np.arange(10)
)
cls_report

In [None]:
submission_path = cwd / 'res/submission.csv'
submission_csv = pd.read_csv(submission_path)
submission_csv

In [None]:
setup_data()

digits = get_prediction(x_test)
submission_csv['digit'] = digits

In [None]:
# Write the final submission below <cwd>/csvs and append the evaluation report to the run log.
csv_path = cwd / 'csvs'
csv_path.mkdir(exist_ok=True)
filename = f'submission_{suffix}_final.csv'
path = csv_path / filename

# Take the timestamp here: set_suffix keeps its year/month/... values local,
# so those names do not exist at module level.
logged_at = datetime.now()
# The context manager closes the log file even when the write fails.
with open(cwd / 'log.txt', 'a') as f:
    f.write(f'{logged_at.year}-{logged_at.month}-{logged_at.day} '
            f'{logged_at.hour}:{logged_at.minute}:{logged_at.second} == {cls_report}\n')

submission_csv.to_csv(path, index=False)
# Read the file back to display what was actually written.
submission_csv = pd.read_csv(path)
submission_csv

In [None]:
filename