In [None]:
pip install catboost

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import os
import random
from sklearn.preprocessing import LabelEncoder

# ML
from sklearn.ensemble import RandomForestClassifier  # Bagging
from xgboost.sklearn import XGBClassifier            # GBM
from sklearn.linear_model import LogisticRegression  # LogisticRegression
from sklearn.svm import SVC                          # SVM
from catboost import CatBoostClassifier as cat       # Catboost
from lightgbm import LGBMClassifier as lgb           # LGBM


# DL
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, ReLU, Softmax, Dropout
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# for checking multi-collinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# KFold(CV), partial : for optuna
from sklearn.model_selection import KFold, StratifiedKFold
from functools import partial

# AutoML framework
import optuna

In [2]:
# set configs
is_tuning = True   # run the Optuna studies and build the tuned models
is_scaling = True  # standardize the features with StandardScaler
is_pca = False     # project the features onto principal components
if is_tuning:
    n_trials=50    # trials per Optuna study; only defined when tuning is enabled
    
# Keras model compile
learning_rate = 1e-3  # handed to the Adam optimizer
batch_size = 64       # handed to nn.fit
epochs = 10           # handed to nn.fit

In [3]:
def seed_everything(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and TensorFlow,
    and writes ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Value handed to each generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


seed_everything()

In [4]:
def balance_logloss(y_true, y_pred):
    """Balanced binary log loss: the mean of the two per-class average log losses.

    Args:
        y_true: 1-D array-like of 0/1 labels (integer-valued floats are accepted).
        y_pred: Either an ``(n, 2)`` array of ``[P(class 0), P(class 1)]`` rows, or
            a 1-D / ``(n, 1)`` array holding ``P(class 1)`` only.

    Returns:
        The balanced log loss, or ``np.nan`` when one of the two classes has no
        samples in ``y_true``.

    Raises:
        ValueError: If ``y_true`` is empty or holds labels above 1, or if
            ``y_pred`` does not provide one row of two probabilities per label.
    """
    # Labels may arrive as floats (e.g. after imputation); np.bincount needs ints.
    y_true = np.asarray(y_true).ravel().astype(np.int64)
    y_pred = np.asarray(y_pred, dtype=np.float64)

    # A single probability per row is expanded to two columns. Reshaping a
    # two-column array to (-1, 1) instead would destroy the class axis and make
    # the column-1 lookup below fail.
    if y_pred.ndim == 1 or (y_pred.ndim == 2 and y_pred.shape[1] == 1):
        positive = y_pred.reshape(-1)
        y_pred = np.column_stack([1.0 - positive, positive])
    if y_pred.ndim != 2 or y_pred.shape[1] != 2 or y_pred.shape[0] != y_true.shape[0]:
        raise ValueError("Expected y_pred with one row of two class probabilities per label.")

    # Clip away exact 0/1 so the logarithm stays finite, then renormalize rows.
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    y_pred = y_pred / y_pred.sum(axis=1, keepdims=True)

    nc = np.bincount(y_true, minlength=2) if y_true.size else np.array([], dtype=np.int64)
    if len(nc) != 2:
        raise ValueError("Expected two classes in y_true.")
    if nc[0] == 0 or nc[1] == 0:
        # The per-class average is undefined for an absent class.
        return np.nan

    loss_class_0 = -np.log(y_pred[y_true == 0, 0]).sum() / nc[0]
    loss_class_1 = -np.log(y_pred[y_true == 1, 1]).sum() / nc[1]
    return (loss_class_0 + loss_class_1) / 2


# def balance_loglossv2(y_true, y_pred):
#     from sklearn.metrics import log_loss
    
#     target_mean = y_true.mean()
#     w0 = 1/(1-target_mean)
#     w1 = 1/target_mean
#     sample_weight = [w0 if y == 0 else w1 for y in y_true]
#     loss = log_loss(y_true, y_pred, sample_weight=sample_weight)
    
#     return loss

def b_logloss_keras(y_true, y_pred):
    """Keras metric wrapper around ``balance_logloss``.

    ``y_true`` is indexed as a two-column tensor: column 1 multiplied by
    ``1 - column 0`` is cast to int64 and used as the label vector. The NumPy
    metric is then run through ``tf.py_function`` with a float32 output.
    """
    labels = tf.cast(y_true[:, 1] * (1 - y_true[:, 0]), tf.int64)
    return tf.py_function(func=balance_logloss, inp=[labels, y_pred], Tout=tf.float32)


In [5]:
# Read the four input tables from the working directory.
train, test, greeks, submission = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', './greeks.csv', './sample_submission.csv')
)

In [6]:
# Copy of the training table without the target column; `train` itself is not modified.
train_no_class = train.drop(columns='Class')
train_no_class

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,3.583450,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,...,10.358927,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000
2,013f2bd269f5,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,...,11.626917,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,...,14.852022,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,...,13.666727,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,fd3dafe738fd,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,...,9.879296,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000
613,fd895603f071,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,...,10.910227,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340
614,fd8ef6377f76,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,...,12.029366,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000
615,fe1942975e40,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,...,8.026928,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622


In [8]:
# Drop Epsilon and transpose, so every greeks row becomes a column labelled by its row index.
greeks_cleanse = greeks.drop(columns='Epsilon').T
greeks_cleanse

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,607,608,609,610,611,612,613,614,615,616
Id,000ff2bfdfe9,007255e47698,013f2bd269f5,043ac50845d5,044fb8a146ec,04517a3c90bd,049232ca8356,057287f2da6d,0594b00fb30a,05f2bc0155cd,...,fb786fb02a65,fbb79ba9d642,fbc241daef00,fbd12c4ae88b,fd1dd68d51b4,fd3dafe738fd,fd895603f071,fd8ef6377f76,fe1942975e40,ffcca4ded3bb
Alpha,B,A,A,A,D,A,A,A,A,A,...,A,B,A,A,A,A,A,A,A,A
Beta,C,C,C,C,B,C,C,C,C,B,...,C,C,B,C,B,B,B,C,C,C
Gamma,G,M,M,M,F,M,M,M,M,M,...,M,H,M,M,M,M,M,M,M,M
Delta,D,B,B,B,B,B,B,B,B,B,...,B,B,B,C,B,B,B,B,B,B


In [9]:
# One string per greeks row: the row's single-character values joined in
# column order. Values longer than one character are left out.
TEMP_FINAL_LIST = [
    "".join(value for value in greeks_cleanse[i] if len(value) <= 1)
    for i in range(len(greeks_cleanse.T))
]

temp_df = pd.DataFrame(TEMP_FINAL_LIST)
temp_df.columns = ['Merged']

In [12]:
# Map every distinct merged string to an integer code.
# Note: `temp_df` is rebound here from a DataFrame to the array returned by fit_transform.
label_encoder = LabelEncoder()
temp_df = label_encoder.fit_transform(temp_df['Merged'])

In [13]:
# Wrap the encoded codes in a one-column DataFrame named 'greeks_labeled'.
meta_encoded = pd.DataFrame(temp_df)
meta_encoded.columns = ['greeks_labeled']

In [14]:
# Put the greeks Id column next to the codes; the two are aligned on the row index.
meta_encoded = pd.concat([greeks['Id'], meta_encoded], axis=1)
meta_encoded

Unnamed: 0,Id,greeks_labeled
0,000ff2bfdfe9,11
1,007255e47698,2
2,013f2bd269f5,2
3,043ac50845d5,2
4,044fb8a146ec,19
...,...,...
612,fd3dafe738fd,1
613,fd895603f071,1
614,fd8ef6377f76,2
615,fe1942975e40,2


In [18]:
# Join the combined greeks label onto the features by Id. Naming the key and
# validating the relationship makes a duplicated Id fail loudly instead of
# silently multiplying rows.
train_test_1 = pd.merge(train_no_class, meta_encoded, on='Id', how='right', validate='one_to_one')
# Attach Class by Id as well: the right join takes its row order from
# meta_encoded, so a positional concat would misalign the target without any
# error if that order ever differed from train's.
train_test_1 = train_test_1.merge(train[['Id', 'Class']], on='Id', how='left', validate='one_to_one')
train_test_1

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FR,FS,GB,GE,GF,GH,GI,GL,greeks_labeled,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,11,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,...,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,2,0
2,013f2bd269f5,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,...,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,2,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,...,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,2,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,...,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,19,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,fd3dafe738fd,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,...,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,1,0
613,fd895603f071,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,...,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,1,0
614,fd8ef6377f76,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,...,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,2,0
615,fe1942975e40,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,...,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,2,0


In [17]:
# Integer-encode each remaining greeks column on its own (the shared encoder is
# re-fitted per column), then place features, encoded greeks and Class side by side.
greek_columns = greeks.drop(columns=['Id', 'Epsilon'])
greeks_for_label_encoding = greek_columns.assign(
    **{name: label_encoder.fit_transform(greek_columns[name]) for name in greek_columns}
)

train_test_2 = pd.concat([train_no_class, greeks_for_label_encoding, train['Class']], axis=1)
train_test_2

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,GE,GF,GH,GI,GL,Alpha,Beta,Gamma,Delta,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,72.611063,2003.810319,22.136229,69.834944,0.120343,1,2,4,3,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,...,72.611063,27981.562750,29.135430,32.131996,21.978000,0,2,6,1,0
2,013f2bd269f5,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,...,88.609437,13676.957810,28.022851,35.192676,0.196941,0,2,6,1,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,...,82.416803,2094.262452,39.948656,90.493248,0.155829,0,2,6,1,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,...,146.109943,8524.370502,45.381316,36.262628,0.096614,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,fd3dafe738fd,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,...,217.148554,8095.932828,24.640462,69.191944,21.978000,0,1,6,1,0
613,fd895603f071,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,...,496.994214,3085.308063,29.648928,124.808872,0.145340,0,1,6,1,0
614,fd8ef6377f76,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,...,128.896894,6474.652866,26.166072,119.559420,21.978000,0,2,6,1,0
615,fe1942975e40,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,...,72.611063,1965.343176,25.116750,37.155112,0.184622,0,2,6,1,0


In [None]:
# # Build a 60-row test set (class 0 and class 1)
# ## 30 rows of class 0
# mask = train.Class == 0
# test =  train[mask][:30]
# dropIndex1 = train[mask][:30].index

In [None]:
# ## 30 rows of class 1
# mask = train.Class == 1
# test = test.append(train[mask][:30]) 
# dropIndex2 = train[mask][:30].index
# test.Class.value_counts() # 0 : 30, 1 : 30
# print(test.shape)
# test = test.drop(columns = ['Class'], axis = 1)
# print(test.shape)

In [None]:
# # drop
# print(train.shape)
# train = train.drop(index=dropIndex1)
# train = train.drop(index=dropIndex2)
# print(train.shape)

In [None]:
# test.to_csv('./test(make).csv',index = False)
# train.to_csv('./train(make).csv',index = False)

In [None]:
# train_make = pd.read_csv('./train(make).csv')
# test_make = pd.read_csv('./test(make).csv')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the categorical EJ column in place (A->0, B->1).
lb = LabelEncoder()
train['EJ'] = lb.fit_transform(train['EJ'])

# Modelling frame: every column except the row identifier.
train_make = train.drop(columns=["Id"])

In [None]:
from sklearn.impute import KNNImputer

# Fill missing feature values from the 5 nearest rows. The target is held out
# of the imputer: with Class in the matrix, the label would enter the
# neighbour distances and leak into the imputed features.
impute_cols = train_make.columns.drop('Class')
imp = KNNImputer(n_neighbors=5)
data = imp.fit_transform(train_make[impute_cols])
train = pd.DataFrame(columns=impute_cols,
                     data=data,
                     index=train_make.index)
# Re-attach the untouched target as integers (the imputer returns floats only).
train['Class'] = train_make['Class'].astype(int)
train

In [None]:
# VIF of every column of `train`, in column order (the Class column is still part of the frame here).
[variance_inflation_factor(train, i) for i in range (train.shape[1])]

In [None]:
def check_vif(df):
    """Rank the columns of ``df`` by variance inflation factor.

    Args:
        df: DataFrame whose columns are each scored with
            ``variance_inflation_factor`` (presumably all numeric — confirm).

    Returns:
        A tuple ``(vif_df, remove_col, top_vif)``: the feature/VIF table sorted
        from highest to lowest VIF, the name of the top-ranked feature, and that
        feature's VIF.
    """
    ranking = pd.DataFrame({
        "features": df.columns,
        "VIF": [variance_inflation_factor(df, col_idx) for col_idx in range(df.shape[1])],
    }).sort_values(by="VIF", ascending=False)
    return ranking, ranking.iloc[0, 0], ranking.iloc[0, 1]

In [None]:
# Repeatedly drop the feature with the highest VIF until none exceeds the threshold.
VIF_THRESHOLD = 5

# Class is the prediction target, not a predictor: it is kept out of the VIF
# ranking so it can never be the column that gets dropped. At least two
# features are kept so a VIF can still be computed.
while train.shape[1] > 2:
    vif_df, remove_col, top_vif = check_vif(train.drop(columns=['Class']))
    print(remove_col, top_vif)
    # `not >` also stops on NaN, and keeps a feature whose VIF equals the threshold.
    if not top_vif > VIF_THRESHOLD:
        break
    train = train.drop(columns=remove_col)

train

In [None]:
# feature selection via Feature Importance
X = train.drop(columns=['Class'])
y = train['Class']

# Fixed seed so the importance ranking, and the features later picked from it,
# is the same on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
fi_df = pd.DataFrame({'feature': X.columns, 'importance' : rf.feature_importances_})

# Plot against the feature names; without x= the bars are labelled with row numbers.
fi_df.sort_values(by='importance', ascending=False).plot(kind='bar', x='feature', y='importance')

In [None]:
# Pick the ten features with the highest importance.

ranked_features = fi_df.sort_values(by='importance', ascending=False)
selected_cols = ranked_features['feature'].iloc[:10].values
selected_cols

In [None]:
# class imbalance handling

## 1. undersampling: keep every class-1 row plus an equally sized random draw of class-0 rows
c1 = train.loc[train.Class == 1]
c0 = train.loc[train.Class == 0]

print(c1.shape, c0.shape)
c0 = c0.sample(n=len(c1))
train = pd.concat([c0, c1])
train.shape

In [None]:
from imblearn.over_sampling import SMOTE

# Resampling strategy. `sampling_method` is read further down but is not
# assigned anywhere earlier in the notebook, which raised a NameError; 'smote'
# reproduces the path this cell otherwise takes.
sampling_method = 'smote'

##### Oversampling

# Per-class feature means before resampling. `.assign` builds a new frame, so
# no column is written into a slice of `train`.
df = train[selected_cols].assign(Class=train['Class'])
display(pd.pivot_table(index='Class', data=df))

## 2. oversampling = SMOTE
X = train[selected_cols]
y = train['Class']

smote = SMOTE(k_neighbors=5, random_state=42)  # seeded so the synthetic rows are reproducible
X_resampled, y_resampled = smote.fit_resample(X, y)
print(X_resampled.shape, y_resampled.shape)

## 3. hybrid approach
## class0 : 509 -> 300
## class1 : 108 -> 300
# NOTE(review): unfinished branch - only the class-0 draw exists and its result
# is not used below.
if sampling_method == 'hybrid':
    N = 300
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape)
    c0 = c0.sample(n=N)

# Per-class feature means after resampling.
df = X_resampled.copy()
df['Class'] = y_resampled
display(pd.pivot_table(index='Class', data=df))

X = X_resampled
# The evaluation metric counts classes with np.bincount, which needs integer labels.
y = y_resampled.astype(int)

In [None]:
# to make OOF prediction
# Hold out 15% of the rows, stratified by class, as a validation set.
# NOTE(review): X and y were resampled in the previous cell, so the validation
# rows may include synthetic samples - confirm this is intended.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

##### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: the scaler is fitted on X_train only and then
# applied unchanged to X_val. Both results are rebuilt as DataFrames with the
# original column names (and a fresh default row index).
if is_scaling:
    scaler = StandardScaler()
    X_train = pd.DataFrame(data=scaler.fit_transform(X_train), columns=X_train.columns)
    X_val = pd.DataFrame(data=scaler.transform(X_val), columns=X_val.columns)
    display(X_train)

In [None]:
# Optional PCA keeping enough components for 80% of the variance; fitted on
# X_train and applied to X_val.
if is_pca:
    from sklearn.decomposition import PCA

    pca = PCA(n_components=0.80, random_state=42)
    train_components = pca.fit_transform(X_train)
    pc_names = [f"PC{k}" for k in range(1, train_components.shape[1] + 1)]
    X_train = pd.DataFrame(data=train_components, columns=pc_names)
    X_val = pd.DataFrame(data=pca.transform(X_val), columns=pc_names)

    display(X_train)

In [None]:
# Base models for the ensemble: SVM, Logistic Regression, XGBoost, RandomForest,
# CatBoost, LightGBM and a small MLP.

# probability=True exposes predict_proba on the SVM; the log-loss metric and
# the probability blend need class probabilities, not hard labels.
svm = SVC(probability=True, random_state=42)
lr = LogisticRegression(random_state=42, max_iter=300)
xgb = XGBClassifier(max_depth=3, colsample_bytree=0.8, reg_lambda=1, objective='binary:logistic', random_state=42)
rf = RandomForestClassifier(max_depth=3, max_features=0.8, criterion='log_loss', random_state=42)
catb = cat(iterations=300, depth=3, od_type='Iter', od_wait=15, bootstrap_type='Bayesian', random_state=42)
lgbm = lgb(boosting_type='gbdt', min_child_samples=20, min_child_weight=0.001, n_estimators=20, random_state=42)

# MLP: five hidden Dense/ReLU/Dropout stages narrowing from 30 to 5 units,
# followed by a two-unit softmax output.
nn = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(30), ReLU(), Dropout(0.2),
    Dense(20), ReLU(), Dropout(0.2),
    Dense(10), ReLU(), Dropout(0.1),
    Dense(5), ReLU(), Dropout(0.2),
    Dense(5), ReLU(), Dropout(0.2),
    Dense(2), Softmax()
])
nn.summary()


In [None]:
# Optimizer, loss and callbacks for the MLP.
optimizer = Adam(learning_rate=learning_rate)
# NOTE(review): Keras documents label smoothing as y * (1 - s) + s / num_classes,
# so s = 0.8 would turn a one-hot [1, 0] into [0.6, 0.4] - confirm that 0.8
# (rather than e.g. 0.2) is intended.
loss_fn = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.8)   # [0.8, 0.2] <--> [0.9, 0] // [0, 0.9]
# NOTE(review): with epochs = 10 in the config cell, the patience windows below
# (10 and 20 epochs) appear unable to elapse - confirm.
scheduler = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.5,
                              patience=10,
                              min_lr=1e-6)
earlystopper = EarlyStopping(monitor='val_loss',
                             patience=20,
                             min_delta=1e-2)


# The balanced log loss is tracked as a metric; training minimizes loss_fn.
nn.compile(optimizer=optimizer, loss=loss_fn, metrics=[b_logloss_keras])

# One-hot targets for the two-unit softmax output.
nn_y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
nn_y_val = tf.keras.utils.to_categorical(y_val, num_classes=2)

In [None]:
# Fit the six classical models in a fixed order, announcing each one first.
for label, estimator in (
    ("LogisticRegression", lr),
    ("SVM", svm),
    ("RandomForest", rf),
    ("XGBoost", xgb),
    ("CatBoost", catb),
    ("Light GBM", lgbm),
):
    print("\nFitting %s..." % label)
    estimator.fit(X_train, y_train)

# The MLP trains on the one-hot targets and is validated on the hold-out split.
print("\nFitting MLP...")
history = nn.fit(
    X_train,
    nn_y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=[X_val, nn_y_val],
    callbacks=[scheduler, earlystopper],
)

In [None]:
## loss visualize
import matplotlib.pyplot as plt

# Left panel: training loss; right panel: balanced log-loss metric.
fig, (loss_ax, metric_ax) = plt.subplots(1, 2, figsize=(12, 6))

loss_ax.plot(history.history['loss'], 'b-', label="training")
loss_ax.plot(history.history['val_loss'], 'r:', label="validation")
loss_ax.set_title("model - loss")
loss_ax.legend()

metric_ax.set_title("model - val_logloss")
metric_ax.plot(history.history['b_logloss_keras'], 'b-', label="training")
metric_ax.plot(history.history['val_b_logloss_keras'], 'r:', label="validation")
metric_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# set metric
# Aliases used by the scoring cell and the Optuna objectives below.
evaluation_metric = balance_logloss
evaluation_metric_keras = b_logloss_keras

In [None]:
def _class_probabilities(model, X):
    """Return an (n, 2) array of class probabilities for ``X``.

    Uses ``predict_proba`` when the estimator exposes it. Otherwise the hard
    labels from ``predict`` are one-hot encoded, so the log-loss metric still
    receives one column per class.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)
    return np.eye(2)[np.asarray(model.predict(X)).astype(int)]


def _report_scores(title, model):
    """Print and return train/validation predictions and scores for ``model``.

    Returns:
        ``(pred_train, pred_val, train_score, val_score)``.
    """
    print("--- Prediction with %s ---" % title)
    model_pred_train = _class_probabilities(model, X_train)
    model_pred_val = _class_probabilities(model, X_val)

    model_train_score = evaluation_metric(y_train, model_pred_train)
    model_val_score = evaluation_metric(y_val, model_pred_val)

    print("Train Score : %.4f" % model_train_score)
    # These are scores on the hold-out validation split, so they are labelled as such.
    print("Validation Score : %.4f" % model_val_score)
    return model_pred_train, model_pred_val, model_train_score, model_val_score


lr_pred_train, lr_pred_val, lr_train_score, lr_val_score = _report_scores("LR", lr)
# The SVM is scored on probabilities like every other model, not on hard labels.
svm_pred_train, svm_pred_val, svm_train_score, svm_val_score = _report_scores("SVM", svm)
rf_pred_train, rf_pred_val, rf_train_score, rf_val_score = _report_scores("RF", rf)
# XGBoost is fitted with the other models but was missing from this report.
xgb_pred_train, xgb_pred_val, xgb_train_score, xgb_val_score = _report_scores("XGB", xgb)
cat_pred_train, cat_pred_val, cat_train_score, cat_val_score = _report_scores("CAT", catb)
lgb_pred_train, lgb_pred_val, lgb_train_score, lgb_val_score = _report_scores("LGBM", lgbm)

# The MLP is scored through the Keras wrapper on its one-hot targets.
print("--- Prediction with MLP ---")
pred_train = nn.predict(X_train)
pred_val = nn.predict(X_val)

train_score = evaluation_metric_keras(nn_y_train, pred_train)
val_score = evaluation_metric_keras(nn_y_val, pred_val)

print("Train Score : %.4f" % train_score)
print("Validation Score : %.4f" % val_score)

In [None]:
def rf_optimizer(trial, X, y, K):
    """Optuna objective: mean stratified K-fold score of a Random Forest.

    Args:
        trial: Optuna trial used to sample n_estimators, max_depth and max_features.
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Label Series aligned with ``X`` by position.
        K: Number of stratified folds.

    Returns:
        Mean of the per-fold ``evaluation_metric`` values.
    """
    # define parameter to tune
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 4, 20)
    max_features = trial.suggest_float('max_features', 0.6, 0.8, log=True)

    # set model
    # random_state is fixed, as the SVM, LR, CatBoost and LightGBM objectives
    # do, so a trial's score reflects its hyper-parameters and not the
    # randomness of the forest.
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   max_features=max_features,
                                   criterion='log_loss',
                                   class_weight='balanced',
                                   random_state=42
                                  )

    # K-Fold Cross validation
    folds = StratifiedKFold(n_splits=K)
    losses = []

    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]

        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]

        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)

    # return mean score of CV
    return np.mean(losses)

In [None]:
def svm_optimizer(trial, X, y, K):
    """Optuna objective: mean stratified K-fold score of an SVM.

    Args:
        trial: Optuna trial used to sample C, kernel and gamma.
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Label Series aligned with ``X`` by position.
        K: Number of stratified folds.

    Returns:
        Mean of the per-fold ``evaluation_metric`` values.
    """
    sampled = {
        'C': trial.suggest_int('C', 1, 100),
        'kernel': trial.suggest_categorical('kernel', ['rbf']),
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
    }
    model = SVC(class_weight='balanced',  # if class imbalanced
                probability=True,
                cache_size=1000,
                random_state=42,
                **sampled)

    def fold_loss(fit_rows, holdout_rows):
        # Fit on one side of the split, score the probabilities on the other.
        model.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
        holdout_proba = model.predict_proba(X.iloc[holdout_rows, :])
        return evaluation_metric(y.iloc[holdout_rows], holdout_proba)

    splitter = StratifiedKFold(n_splits=K)
    return np.mean([fold_loss(fit_rows, holdout_rows)
                    for fit_rows, holdout_rows in splitter.split(X, y)])

In [None]:
def lr_optimizer(trial, X, y, K):
    """Optuna objective: mean stratified K-fold score of a logistic regression.

    Args:
        trial: Optuna trial used to sample C and the solver.
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Label Series aligned with ``X`` by position.
        K: Number of stratified folds.

    Returns:
        Mean of the per-fold ``evaluation_metric`` values.
    """
    model = LogisticRegression(
        C=trial.suggest_int('C', 5, 100),
        solver=trial.suggest_categorical(
            'solver', ['liblinear', 'newton-cg', 'newton-cholesky', 'saga']),
        max_iter=500,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
    )

    fold_losses = []
    for fit_rows, holdout_rows in StratifiedKFold(n_splits=K).split(X, y):
        model.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
        holdout_proba = model.predict_proba(X.iloc[holdout_rows, :])
        fold_losses.append(evaluation_metric(y.iloc[holdout_rows], holdout_proba))

    return np.mean(fold_losses)

In [None]:
def xgb_optimizer(trial, X, y, K):
    """Optuna objective: mean stratified K-fold score of an XGBoost classifier.

    Args:
        trial: Optuna trial used to sample n_estimators, max_depth,
            colsample_bytree, learning_rate, reg_lambda and booster.
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Label Series of 0/1 values aligned with ``X`` by position.
        K: Number of stratified folds.

    Returns:
        Mean of the per-fold ``evaluation_metric`` values.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 2000)
    max_depth = trial.suggest_int('max_depth', 4, 20)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 0.8, log=True)
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-2, log=True)
    reg_lambda = trial.suggest_float('reg_lambda', 0.1, 2, log=True)
    booster = trial.suggest_categorical('booster', ['gbtree', 'dart'])

    folds = StratifiedKFold(n_splits=K)
    losses = []

    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]

        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]

        # Weight the positive class by the negative/positive ratio of the rows
        # actually being fitted. The former constant 4.71 is the raw 509/108
        # ratio and no longer matches the data once it has been resampled.
        n_pos = int((y_train == 1).sum())
        n_neg = int((y_train == 0).sum())
        model = XGBClassifier(n_estimators=n_estimators,
                              max_depth=max_depth,
                              booster=booster,
                              colsample_bytree=colsample_bytree,
                              learning_rate=learning_rate,
                              reg_lambda=reg_lambda,
                              scale_pos_weight=(n_neg / n_pos) if n_pos else 1.0,
                              random_state=42)  # fixed so trials are comparable

        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)

    return np.mean(losses)

In [None]:
def cat_optimizer(trial, X, y, K):
    """Optuna objective: mean stratified K-fold score of a CatBoost classifier.

    Args:
        trial: Optuna trial used to sample iterations, depth, bootstrap_type,
            learning_rate, l2_leaf_reg and bagging_temperature.
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Label Series aligned with ``X`` by position.
        K: Number of stratified folds.

    Returns:
        Mean of the per-fold ``evaluation_metric`` values.
    """
    # Not tuned or set at the moment: random_strength, od_type, od_wait, verbose.
    sampled = {
        'iterations': trial.suggest_int('iterations', 50, 200),
        'depth': trial.suggest_int('depth', 4, 16),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian']),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-2, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 100.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 10, log=False),
    }
    model = cat(random_state=42, **sampled)

    fold_losses = []
    for fit_rows, holdout_rows in StratifiedKFold(n_splits=K).split(X, y):
        fit_X, fit_y = X.iloc[fit_rows, :], y.iloc[fit_rows]
        holdout_X, holdout_y = X.iloc[holdout_rows, :], y.iloc[holdout_rows]

        model.fit(fit_X, fit_y)
        fold_losses.append(evaluation_metric(holdout_y, model.predict_proba(holdout_X)))

    return np.mean(fold_losses)

In [None]:
def lgbm_optimizer(trial, X, y, K):
    """Optuna objective: mean stratified K-fold score of a LightGBM classifier.

    Args:
        trial: Optuna trial used to sample num_leaves, max_depth, boosting_type
            and learning_rate.
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Label Series aligned with ``X`` by position.
        K: Number of stratified folds.

    Returns:
        Mean of the per-fold ``evaluation_metric`` values.
    """
    sampled = dict(
        num_leaves=trial.suggest_int('num_leaves', 100, 500),
        max_depth=trial.suggest_int('max_depth', 4, 20),
        boosting_type=trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 1e-2, log=True),
    )
    model = lgb(application='binary',
                metric='binary_logloss',
                random_state=42,
                **sampled)

    def score_split(fit_rows, holdout_rows):
        # Train on the fitting rows, then score the held-out probabilities.
        model.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
        return evaluation_metric(y.iloc[holdout_rows],
                                 model.predict_proba(X.iloc[holdout_rows, :]))

    cv = StratifiedKFold(n_splits=K)
    return np.mean([score_split(fit_rows, holdout_rows)
                    for fit_rows, holdout_rows in cv.split(X, y)])

In [None]:
K = 6 # set K of K-Fold
# Bind the training split and fold count so Optuna only has to pass the trial.
opt_func = partial(rf_optimizer, X=X_train, y=y_train, K=K)

if is_tuning:
    rf_study = optuna.create_study(direction="minimize") # determine minimize or maximize sth
    rf_study.optimize(opt_func, n_trials=n_trials)

In [None]:
K = 6  # number of CV folds
# Logistic-regression objective bound to the training split.
opt_func = partial(lr_optimizer, X=X_train, y=y_train, K=K)

if is_tuning:
    lr_study = optuna.create_study(direction="minimize") 
    lr_study.optimize(opt_func, n_trials=n_trials)

In [None]:
K = 6  # number of CV folds
# SVM objective bound to the training split.
opt_func = partial(svm_optimizer, X=X_train, y=y_train, K=K)

if is_tuning:
    svm_study = optuna.create_study(direction="minimize") 
    svm_study.optimize(opt_func, n_trials=n_trials)

In [None]:
K = 6  # number of CV folds
# XGBoost objective bound to the training split.
opt_func = partial(xgb_optimizer, X=X_train, y=y_train, K=K)

if is_tuning:
    xgb_study = optuna.create_study(direction="minimize")
    xgb_study.optimize(opt_func, n_trials=n_trials)

In [None]:
K = 6  # number of CV folds
# CatBoost objective bound to the training split.
opt_func = partial(cat_optimizer, X=X_train, y=y_train, K=K)

if is_tuning:
    cat_study = optuna.create_study(direction="minimize")
    cat_study.optimize(opt_func, n_trials=n_trials)

In [None]:
K = 6  # number of CV folds
# LightGBM objective bound to the training split.
opt_func = partial(lgbm_optimizer, X=X_train, y=y_train, K=K)

if is_tuning:
    lgbm_study = optuna.create_study(direction="minimize")
    lgbm_study.optimize(opt_func, n_trials=n_trials)


In [None]:
# save all studies
if is_tuning:
    # File stem -> study; each study is pickled to "<stem>_study.pk". The Random
    # Forest study now goes to "rf_study.pk": the former "rm_study.pk" appears
    # to be a typo that broke the naming used by the other five files.
    studies_to_save = {
        "rf": rf_study,
        "lr": lr_study,
        "svm": svm_study,
        "xgb": xgb_study,
        "cat": cat_study,
        "lgbm": lgbm_study,
    }
    for stem, study in studies_to_save.items():
        with open(f"{stem}_study.pk", 'wb') as f:
            pickle.dump(study, f)

    # The trained MLP is stored next to the studies.
    nn.save("./simple_nn_model.keras")


In [None]:
# visualize experiment logs
def display_experiment_log(study):
    """Show the trial table, best score and params, best-trial rows and Optuna plots.

    Args:
        study: Optuna study to summarize.
    """
    trials = study.trials_dataframe()
    display(trials)
    print("Best Score: %.4f" % study.best_value)
    print("Best params: ", study.best_trial.params)
    # Rows whose objective value equals the best value.
    display(trials[trials.value == study.best_value])
    for make_figure in (optuna.visualization.plot_optimization_history,
                        optuna.visualization.plot_param_importances):
        make_figure(study).show()

In [None]:
# Random Forest study report (only exists when tuning ran).
if is_tuning:
    display_experiment_log(rf_study)

In [None]:
# Logistic Regression study report (only exists when tuning ran).
if is_tuning:
    display_experiment_log(lr_study)

In [None]:
# SVM study report (only exists when tuning ran).
if is_tuning:
    display_experiment_log(svm_study)

In [None]:
# XGBoost study report (only exists when tuning ran).
if is_tuning:
    display_experiment_log(xgb_study)

In [None]:
# CatBoost study report (only exists when tuning ran).
if is_tuning:
    display_experiment_log(cat_study)

In [None]:
# LightGBM study report (only exists when tuning ran).
if is_tuning:
    display_experiment_log(lgbm_study)

In [None]:
## preprocessing in same way
# Use exactly the columns the scaler and the models were fitted on
# (selected_cols). train.columns may also hold features that were never
# selected, which scaler.transform rejects.
test_feature_cols = list(selected_cols)
X_test = test[test_feature_cols].copy()

# EJ is text in the raw file; reuse the encoder that was fitted on train.
if 'EJ' in X_test.columns and not pd.api.types.is_numeric_dtype(X_test['EJ']):
    X_test['EJ'] = lb.transform(X_test['EJ'])

# Fill gaps with the training means, so no statistic is estimated from the
# test rows (and no mean is taken over text columns such as Id).
X_test = X_test.fillna(train[test_feature_cols].mean())

if is_scaling:
    # Keep a DataFrame with the fitted column names instead of a bare array.
    X_test = pd.DataFrame(scaler.transform(X_test), columns=test_feature_cols, index=X_test.index)

if is_pca:
    data_ = pca.transform(X_test)
    X_test = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

X_test

In [None]:
# Finalize Models
if is_tuning:
    rf_best_params = rf_study.best_params
    lr_best_params = lr_study.best_params
    xgb_best_params = xgb_study.best_params
    svm_best_params = svm_study.best_params
    lgbm_best_params = lgbm_study.best_params
    cat_best_params = cat_study.best_params
else:
    # No studies were run: fall back to the fixed settings alone so the cells
    # below still have models to fit.
    (rf_best_params, lr_best_params, xgb_best_params,
     svm_best_params, lgbm_best_params, cat_best_params) = ({} for _ in range(6))

# best_params only holds the sampled values. The settings that the Optuna
# objectives keep constant are repeated here; otherwise the final models would
# differ from the configuration that was tuned.
best_rf = RandomForestClassifier(**rf_best_params, criterion='log_loss',
                                 class_weight='balanced', random_state=42)
best_lr = LogisticRegression(**lr_best_params, max_iter=500,
                             class_weight='balanced', random_state=42, n_jobs=-1)
# Positive-class weight taken from the rows the model is fitted on.
n_pos = int((y_train == 1).sum())
n_neg = int((y_train == 0).sum())
best_xgb = XGBClassifier(**xgb_best_params,
                         scale_pos_weight=(n_neg / n_pos) if n_pos else 1.0,
                         random_state=42)
best_svm = SVC(**svm_best_params, class_weight='balanced', probability=True,
               cache_size=1000, random_state=42)
best_lgbm = lgb(**lgbm_best_params, application='binary',
                metric='binary_logloss', random_state=42)
best_cat = cat(**cat_best_params, random_state=42)

In [None]:
# Best parameters and best CV score of every study. The studies only exist
# after tuning, so the report is skipped otherwise instead of raising NameError.
if is_tuning:
    for title, best_params, study in (
        ('RF', rf_best_params, rf_study),
        ('LR', lr_best_params, lr_study),
        ('XGB', xgb_best_params, xgb_study),
        ('SVM', svm_best_params, svm_study),
        ('LGB', lgbm_best_params, lgbm_study),
        ('CAT', cat_best_params, cat_study),
    ):
        print('\n --------- %s ---------' % title)
        print(best_params)
        print(study.best_value)

In [None]:
# Fit every tuned model on the training split, then collect its probabilities
# for the validation split and for the test set.
tuned_models = (best_rf, best_lr, best_xgb, best_svm, best_lgbm, best_cat)
for tuned in tuned_models:
    tuned.fit(X_train, y_train)

# OOF-prediction
v_rf, v_lr, v_xgb, v_svm, v_lgbm, v_cat = [m.predict_proba(X_val) for m in tuned_models]

print(v_rf.shape, v_lr.shape, v_xgb.shape, v_svm.shape, v_lgbm.shape, v_cat.shape)

preds_rf, preds_lr, preds_xgb, preds_svm, preds_lgbm, preds_cat = [
    m.predict_proba(X_test) for m in tuned_models
]
print(preds_rf.shape, preds_lr.shape, preds_xgb.shape, preds_svm.shape, preds_lgbm.shape, preds_cat.shape)

In [None]:
# MLP predictions
# Softmax outputs of the network for the validation split and the test set.
v_nn = nn.predict(X_val)
preds_nn = nn.predict(X_test)
print(v_nn.shape, preds_nn.shape)

In [None]:
# OOF prediction
# Score of an unweighted average over five models' validation probabilities.
# NOTE(review): v_lgbm and v_cat are computed earlier but are not part of this
# average, and the submission blend uses explicit weights - confirm this is
# the comparison intended.
ensembles = np.mean([v_xgb, v_svm, v_rf, v_lr, v_nn], axis=0)
print("OOF prediction logloss : %.4f" % evaluation_metric(y_val, ensembles))

In [None]:
# Reload a fresh submission template from the same location the input CSVs
# were read from at the top of the notebook, so one data directory serves the
# whole run.
submission = pd.read_csv('./sample_submission.csv')
submission

In [None]:
# voting_weights = [0.1, 0.1, 0.25, 0.25, 0.3]
# voting_weights = [0.2, 0.2, 0.2, 0.2, 0.2]
voting_weights = [0.10, 0.10, 0.10, 0.20, 0.20, 0.30]

# Weighted soft vote: each weight is paired with a different model's test
# probabilities. Previously preds_nn appeared in five of the six terms, which
# collapsed the blend to 0.7 * MLP + 0.3 * RF.
# NOTE(review): the weight-to-model pairing is a best guess (MLP first and RF
# last as before; LightGBM left out because there are only six weights) -
# confirm the intended assignment.
blend_members = [preds_nn, preds_lr, preds_svm, preds_xgb, preds_cat, preds_rf]
assert len(voting_weights) == len(blend_members), "one weight per blended model"
assert abs(sum(voting_weights) - 1.0) < 1e-9, "weights must sum to 1"

blended = sum(weight * np.asarray(member) for weight, member in zip(voting_weights, blend_members))
submission['class_0'] = blended[:, 0]
submission['class_1'] = blended[:, 1]
submission

In [None]:
# Write the blended predictions to submission.csv without the index column.
submission.to_csv("submission.csv", index=False)