## 1. 라이브러리 및 데이터

In [None]:
import pandas as pd # load and analysis
import numpy as np # load and analysis
from sklearn.model_selection import KFold, StratifiedKFold
import gc
from tqdm import tqdm_notebook as tqdm
import os

# model library
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.models import Model, load_model
from keras import optimizers, callbacks
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint,Callback, EarlyStopping
from swa.keras import SWA # swa optimizer - https://pypi.org/project/keras-swa/


In [None]:
# Load the training and test tables from their CSV files.
TRAIN_PATH = './data/train.csv'
TEST_PATH = './data/test.csv'

# Read both files in order (train first, then test).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))
gc.collect()

In [None]:
# learning rate scheduler 구현
# lr을 cyclic하게 변환해줌
import math

from keras.callbacks import Callback
class CosineAnnealingScheduler(Callback):
    """Keras callback that sets the learning rate on a cosine curve per epoch.

    At the start of epoch ``epoch`` the optimizer's learning rate is set to::

        eta_min + (eta_max - eta_min) * (1 + cos(pi * epoch / T_max)) / 2

    The rate is therefore ``eta_max`` at epoch 0, ``eta_min`` at
    ``epoch == T_max`` and, because the cosine is periodic, back at
    ``eta_max`` every ``2 * T_max`` epochs.

    Args:
        T_max: Number of epochs in half a cosine period.
        eta_max: Largest learning rate of the cycle.
        eta_min: Smallest learning rate of the cycle.
        verbose: When greater than 0, print the rate chosen for each epoch.
    """

    def __init__(self, T_max, eta_max, eta_min=0, verbose=0):
        super().__init__()
        self.T_max = T_max
        self.eta_max = eta_max
        self.eta_min = eta_min
        self.verbose = verbose

    def on_epoch_begin(self, epoch, logs=None):
        """Compute the rate for ``epoch`` and write it into the optimizer.

        Raises:
            ValueError: If the model's optimizer has no ``lr`` attribute.
        """
        if not hasattr(self.model.optimizer, 'lr'):
            raise ValueError('Optimizer must have a "lr" attribute.')
        # cosine_factor runs from 1 (epoch 0) down to 0 (epoch == T_max).
        cosine_factor = (1 + math.cos(math.pi * epoch / self.T_max)) / 2
        lr = self.eta_min + (self.eta_max - self.eta_min) * cosine_factor
        K.set_value(self.model.optimizer.lr, lr)
        if self.verbose > 0:
            print('\nEpoch %05d: CosineAnnealingScheduler setting learning '
                  'rate to %s.' % (epoch + 1, lr))

    def on_epoch_end(self, epoch, logs=None):
        """Record the current learning rate under ``logs['lr']``."""
        # Write into the dict handed in by the caller (even when it is empty),
        # so the value is not lost in a throwaway replacement dict.
        if logs is None:
            return
        logs['lr'] = K.get_value(self.model.optimizer.lr)

In [None]:
# gelu activation function -  Gaussian Error Linear Units (GELUs)
# https://arxiv.org/abs/1606.08415
from keras.layers import Activation
from keras.utils.generic_utils import get_custom_objects
import tensorflow as tf
class Gelu(Activation):
    """``Activation`` layer subclass whose instances carry ``__name__ == 'gelu'``."""

    def __init__(self, activation, **kwargs):
        super().__init__(activation, **kwargs)
        # Give the instance a function-like name.
        # NOTE(review): presumably needed so Keras can refer to this
        # activation by name when saving/loading - confirm.
        self.__name__ = 'gelu'
        
def gelu(x):
    """Apply the tanh-based GELU formula element-wise to tensor ``x``.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    """
    cubic_term = 0.044715 * tf.pow(x, 3)
    scale = tf.sqrt(2 / np.pi)
    return 0.5 * x * (1 + tf.tanh(scale * (x + cubic_term)))

# Register the activation under the string key 'gelu' in Keras' custom-object
# dictionary so it can be referenced by name (e.g. activation='gelu').
get_custom_objects()['gelu'] = Gelu(gelu)

## 2. 데이터 전처리(Data Cleansing & Pre-Processing)

In [None]:
# No dedicated preprocessing: only separate the target columns from the inputs.
y_columns = ['layer_1', 'layer_2', 'layer_3', 'layer_4']
y_train = df_train.loc[:, y_columns]
X_train = df_train.drop(y_columns, axis=1)


## 3. 탐색적 자료분석(Exploratory Data Analysis)

여러 가지 모델을 설계해 보면서 탐구해 보았지만 최종 결과에는 큰 지장이 없어 제거함

NeuralNet 모델을 사용하는 것이 맞다는 결론에 도달해 모든 피쳐를 사용하는 모델을 사용하기로 함

## 4. 변수 선택 및 모델 구축(Feature Engineering & Initial Modeling)

최종모델 선택에 도달하기까지의 모델 구현 과정의 아이디어를 기입하였습니다.

- 1 Feature 추가

모든 layer의 두께의 합을 예측 피쳐로 추가함 -> 결과에 큰 차이가 없어 최종 모델에서 제거

- 2 Dropout 조절

Underfit 되는 현상을 보고 0.5에서 0.01까지 계속 낮추며 실험을 함. 낮출수록 성능이 증가함.
실험을 하다 보니 training set에 overfit되지 않는 모습을 보게 되어 매우 작은 값으로 선택함 (valid set과 leaderboard상의 점수가 훨씬 높게 나옴)

- 3 Underfit  

Layer를 깊고 크게 쌓다 보니 학습이 잘 되지 않아 SWA, CosineAnnealing Scheduler, Gelu 등을 사용하여 최대한 학습하려고 함
특히 Batchnorm 추가 뒤에 성능이 크게 증가함

- 4 K-Fold Ensemble

구조가 다른 여러 모델을 사용해보았지만 크게 성능향상이 되지 않아 단일 모델의 K-Fold ensemble을 하였음

- 5 기타 실험 모델들

Batchnorm 추가 이후 0.8 -> 0.4 아래로 큰 향상
CNN base - 0.44
Layer-sum feature 추가 모델 - 0.38
기타 Boosting Tree model - 1점대

## 5. 모델 학습 및 검증(Model Tuning & Evaluation)

In [None]:
# Training hyper-parameters.
lr = 3e-4  # Adam's initial rate; the cosine scheduler overwrites it at every epoch start
lr_d = 0.0  # Adam learning-rate decay
patience = 200  # early-stopping patience on val_loss
dr_rate = 0.01  # dropout rate applied after every hidden block
kfold = 13  # number of cross-validation folds (one saved model per fold)
batch_size = 4096  # single value shared by model.fit and the SWA callback
# Widths of the successive Dense -> BatchNorm -> GELU -> Dropout blocks.
hidden_units = (2017, 2013, 1027, 1023, 517, 509, 503)

kf = KFold(n_splits=kfold, shuffle=True, random_state=7777)

# Train one network per fold. For each fold the weights with the lowest
# validation loss are written to that fold's own checkpoint file.
for enum, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    # Release the previous fold's model/graph so backend memory does not keep
    # growing with every additional fold.
    K.clear_session()
    gc.collect()

    file_path = f"bn_swa_nn_best_model_fold_{enum}.hdf5"
    check_point = ModelCheckpoint(file_path, monitor="val_loss", verbose=1,
                                  save_best_only=True, mode="min")
    early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=patience)
    cosine_scheduler = CosineAnnealingScheduler(T_max=100, eta_max=6e-4, eta_min=3e-5)
    # NOTE(review): only the ModelCheckpoint file is written in this loop, so
    # whatever weights SWA leaves in `model` when fit() returns are not saved
    # here - confirm that is intended.
    swa = SWA(start_epoch=20, lr_schedule='manual', swa_lr=3e-4, swa_freq=5,
              verbose=1, batch_size=batch_size)

    kf_x_train = X_train.iloc[train_index]
    kf_y_train = y_train.iloc[train_index]

    kf_x_val = X_train.iloc[valid_index]
    kf_y_val = y_train.iloc[valid_index]

    # One 1-D target array per output head, in the column order of y_train.
    train_values = np.array(kf_y_train)
    val_values = np.array(kf_y_val)
    n_targets = train_values.shape[1]
    train_targets = [train_values[:, col] for col in range(n_targets)]
    val_targets = [val_values[:, col] for col in range(n_targets)]

    # Input width follows the feature table instead of a hard-coded constant.
    inp = Input(shape=(kf_x_train.shape[1],))
    x = inp
    for units in hidden_units:
        x = Dense(units, activation=None)(x)
        x = BatchNormalization()(x)
        x = Activation(gelu)(x)
        x = Dropout(dr_rate)(x)
    # The last hidden layer has no batch normalisation and uses the
    # activation registered under the name 'gelu'.
    x = Dense(307, activation='gelu')(x)
    x = Dropout(dr_rate)(x)
    # One single-unit ReLU head per target column, named after that column.
    outputs = [Dense(1, activation="relu", name=target_name)(x)
               for target_name in y_train.columns]

    model = Model(inputs=inp, outputs=outputs)
    model.summary()
    model.compile(loss='mae', optimizer=Adam(lr=lr, decay=lr_d))
    model.fit(kf_x_train,
              train_targets,
              batch_size=batch_size, epochs=10000,
              # Keras documents validation_data as a tuple (x_val, y_val).
              validation_data=(kf_x_val, val_targets),
              verbose=2,
              callbacks=[early_stop, check_point, cosine_scheduler, swa])

In [None]:
# Inference features: every column of the test table except the first one.
X_test = df_test.iloc[:, 1:]

# Running sum of the per-fold predictions, one column per output head.
pred_test = np.zeros((len(X_test), 4))
checkpoint_template = "./bn_swa_nn_best_model_fold_{}.hdf5"

for fold_idx in range(kfold):
    print(fold_idx)
    model = load_model(checkpoint_template.format(fold_idx))
    # Assumes predict() returns one (n_rows, 1) array per output head, as the
    # shape of pred_test requires; place them side by side -> (n_rows, 4).
    head_preds = model.predict(X_test)
    pred_test += np.concatenate(head_preds, axis=1)

In [None]:
# Turn the summed fold predictions into their mean over the folds.
pred_test /= kfold

# The sample file is read with its first column as the index.
sample_sub = pd.read_csv('./data/sample_submission.csv', index_col=0)

# Element-wise sum of the sample frame and the averaged predictions; the
# result keeps the sample frame's index and column labels.
submission = sample_sub + pred_test
submission.to_csv('submission.csv')