This code will give you a general idea of how to do a machine learning project using scikit-learn and optuna.

It performs the necessary preprocessing, tunes each model with Optuna, and combines the tuned models into a voting ensemble.

You can submit the finished result without any problem.

The version history is listed below; please refer to it when using this notebook.

## 1. Data and Library Load

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import os
import random
from tqdm.auto import tqdm

# ignore warning
import warnings
warnings.filterwarnings('ignore')

# ML
from sklearn.ensemble import RandomForestClassifier  # Bagging
from xgboost.sklearn import XGBClassifier            # GBM
from sklearn.linear_model import LogisticRegression  # LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC # SVM

# DL
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, ReLU, Softmax, Dropout
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# for checking multi-collinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# KFold(CV), partial : for optuna
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from functools import partial
from imblearn.over_sampling import SMOTE

# AutoML framework
import optuna
from optuna.samplers import TPESampler

In [2]:
# ---- experiment configuration ----
is_tuning = True           # run the Optuna / grid searches below
is_scaling = True          # standardize the features with StandardScaler
is_pca = False             # project the features with PCA
apply_vif = False          # iteratively drop high-VIF features
feature_selection = True   # keep only the top-m features by RandomForest importance
is_cuml = True             # only referenced by the disabled cuml import below
is_debug = True
sampling_method = 'hybrid' # one of 'under', 'over', 'hybrid'

if feature_selection:
    m = 20                 # number of features to keep

if is_tuning:
    # a single seeded TPE sampler is shared by the studies
    sampler = TPESampler(seed=42)
    n_trials = 30          # was 50, lowered to 30

# Disabled: pick the SVC implementation depending on is_cuml.
# if is_cuml:
#     from cuml.svm import SVC, LinearSVC
# else:
#     from sklearn.svm import SVC

# Keras model compile settings
learning_rate, batch_size, epochs = 1e-2, 32, 10

In [3]:
def seed_everything(seed: int = 42):
    """Seed the Python, NumPy and TensorFlow RNGs and set PYTHONHASHSEED."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)


seed_everything()

In [4]:
def balance_logloss(y_true, y_pred):
    """Return the class-balanced binary log loss.

    Each class contributes the mean negative log-probability assigned to it,
    and the per-class means are averaged, so both classes weigh the same
    regardless of how many samples they have.

    Args:
        y_true: 1-D array-like of labels; 0 is the negative class and any
            non-zero value is treated as the positive class.
        y_pred: 2-D array-like of shape (n_samples, 2) holding the predicted
            scores for class 0 (column 0) and class 1 (column 1).

    Returns:
        The balanced log loss as a NumPy float. When only one class is present
        in ``y_true`` (e.g. a small batch), the loss of that class alone is
        returned instead of failing on the missing class.

    Raises:
        ValueError: if ``y_true`` is empty.
    """
    y_true = np.asarray(y_true).ravel()
    if y_true.size == 0:
        raise ValueError("balance_logloss requires at least one sample")

    # Clip away exact 0/1 so log() stays finite, then renormalize each row.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 1e-15, 1 - 1e-15)
    y_pred = y_pred / y_pred.sum(axis=1, keepdims=True)

    is_negative = y_true == 0
    is_positive = ~is_negative

    per_class_loss = []
    if is_negative.any():
        per_class_loss.append(-np.mean(np.log(y_pred[is_negative, 0])))
    if is_positive.any():
        per_class_loss.append(-np.mean(np.log(y_pred[is_positive, 1])))

    return np.mean(per_class_loss)

def b_logloss_keras(y_true, y_pred):
    """Keras-side wrapper around balance_logloss.

    Collapses the two-column ``y_true`` into a single label vector (1 where
    column 1 is set and column 0 is not), then evaluates balance_logloss
    through ``tf.py_function``.
    """
    # (n, 2) -> (n,): inverse of the one-hot encoding
    positive_flag = y_true[:, 1] * (1 - y_true[:, 0])
    labels = tf.cast(positive_flag, tf.int64)
    return tf.py_function(func=balance_logloss, inp=[labels, y_pred], Tout=tf.float32)

In [5]:
# Load the train / test tables and the metadata (greeks) table.
train, test, metadata = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/icr-identify-age-related-conditions/train.csv',
        '../input/icr-identify-age-related-conditions/test.csv',
        '../input/icr-identify-age-related-conditions/greeks.csv',
    )
)
print(train.shape, test.shape)

(617, 58) (5, 57)


## 2. Data Preprocessing

Main path: LabelEncoding -> KNN Imputation -> (optional) calculate VIF -> (optional) apply PCA -> feature scaling

Alternative path: LabelEncoding -> KNN Imputation -> (optional) feature selection -> feature scaling

In [6]:
# Encode the categorical EJ column as integers (A -> 0, B -> 1) and drop the row identifier.
lb = LabelEncoder()
train["EJ"] = lb.fit_transform(train["EJ"])
train = train.drop(columns="Id")

In [7]:
# Set the target aside, impute the feature columns with KNN, then re-attach the target.
labels = train["Class"]
train = train.drop(columns="Class")

imp = KNNImputer()
data = imp.fit_transform(train)

tmp = pd.DataFrame(data, columns=train.columns)
train = pd.concat([tmp, labels], axis=1)
train

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0
2,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,5135.78024,...,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,4157.68439,...,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,0
613,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,5654.07556,...,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,0
614,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,5888.87769,...,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,0
615,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,4517.86560,...,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,0


In [8]:
###############
### z-score ###
###############
# Remove class-0 rows that are z-score outliers (|z| > 3) in any inspected column.
is_z = True
if is_z:
    # separate class 0 and class 1
    train_0 = train[train.Class == 0.0]
    train_1 = train[train.Class == 1.0]

    # Per-column mean and population std (ddof=0) of the class-0 rows.
    # DataFrame methods are used instead of np.mean / np.std: on pandas >= 2
    # np.mean(df) reduces over both axes and returns a single scalar.
    train_0_mean = train_0.mean()
    train_0_std = train_0.std(ddof=0)

    z = (train_0 - train_0_mean) / train_0_std

    # Find outliers based on the z-score.
    # NOTE(review): columns[:-3] skips the last three columns, not only
    # "Class" -- confirm this is intended.
    outlier_rows = (z[train.columns[:-3]].abs() > 3).any(axis=1)
    drop_index_list = list(z.index[outlier_rows])

    # How to create a new z-score column using assign:
    # train = train.assign(z_score = lambda x : x.AB.sub(x.AB.mean()).div(x.AB.std()))
    # train[['AB', 'z_score']]

    # The printed count is the number of rows (not columns) flagged as outliers.
    print(f'train_0에서 z-score로 걸러내는 column의 수 : {len(drop_index_list)}\n')

    print('--------------------------------------------------')
    print('Class_0에 있는 outliars만 제거한 후 Class 0, 1의 수 :')
    print(train.drop(index=drop_index_list).Class.value_counts())

    train.drop(index=drop_index_list, inplace=True)
    # info() prints by itself and returns None, so it is not wrapped in print().
    train.info()

train_0에서 z-score로 걸러내는 column의 수 : 187

--------------------------------------------------
Class_0에 있는 outliars만 제거한 후 Class 0, 1의 수 :
0    322
1    108
Name: Class, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 430 entries, 0 to 615
Data columns (total 57 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AB      430 non-null    float64
 1   AF      430 non-null    float64
 2   AH      430 non-null    float64
 3   AM      430 non-null    float64
 4   AR      430 non-null    float64
 5   AX      430 non-null    float64
 6   AY      430 non-null    float64
 7   AZ      430 non-null    float64
 8   BC      430 non-null    float64
 9   BD      430 non-null    float64
 10  BN      430 non-null    float64
 11  BP      430 non-null    float64
 12  BQ      430 non-null    float64
 13  BR      430 non-null    float64
 14  BZ      430 non-null    float64
 15  CB      430 non-null    float64
 16  CC      430 non-null    float64
 17  CD     

In [9]:
# Show every row that still has at least one missing value.
train.loc[train.isna().any(axis=1)]

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class


In [10]:
def check_vif(df):
    """Compute the variance inflation factor (VIF) of every column of ``df``.

    Args:
        df: DataFrame of numeric feature columns.

    Returns:
        A tuple ``(vif_df, remove_col, top_vif)`` where ``vif_df`` lists each
        feature with its VIF sorted from highest to lowest, ``remove_col`` is
        the name of the feature with the highest VIF and ``top_vif`` is that
        highest VIF value.
    """
    # variance_inflation_factor is documented to take an ndarray; convert once
    # here instead of handing it the DataFrame for every column.
    exog = np.asarray(df, dtype=float)
    vifs = [variance_inflation_factor(exog, i) for i in range(exog.shape[1])]

    vif_df = pd.DataFrame({"features": df.columns, "VIF": vifs})
    vif_df = vif_df.sort_values(by="VIF", ascending=False)

    remove_col = vif_df.iloc[0, 0]
    top_vif = vif_df.iloc[0, 1]
    return vif_df, remove_col, top_vif

In [11]:
# Repeatedly drop the feature with the highest VIF until the highest VIF is below 5.
if apply_vif:
    top_vif = 100

    while top_vif > 5:
        # Compute VIF on the features only: with the target included, "Class"
        # itself could be picked as the column to remove.
        vif_df, remove_col, top_vif = check_vif(train.drop(columns="Class"))
        print(remove_col, top_vif)
        if top_vif < 5:
            break
        train = train.drop(columns=remove_col)

    display(train)

In [12]:
# Feature selection: rank the features by RandomForest importance and keep the top m.
X, y = train.drop(columns="Class"), train["Class"]

if feature_selection:
    rf = RandomForestClassifier()
    rf.fit(X, y)
    print("Train ACC : %.4f" % accuracy_score(y, rf.predict(X)))

    fi_df = pd.DataFrame({
        "feature": X.columns,
        "importance": rf.feature_importances_,
    })
    selected_cols = (
        fi_df.sort_values(by="importance", ascending=False)
        .head(m)["feature"]
        .values
    )
    display(selected_cols)

    X = train[selected_cols]
    display(X)

Train ACC : 1.0000


array(['DU', 'FL', 'DI', 'AB', 'FD ', 'GL', 'DA', 'EH', 'BQ', 'FR', 'EE',
       'DE', 'CR', 'AF', 'BC', 'DF', 'FE', 'CD ', 'AR', 'DH'],
      dtype=object)

Unnamed: 0,DU,FL,DI,AB,FD,GL,DA,EH,BQ,FR,EE,DE,CR,AF,BC,DF,FE,CD,AR,DH
0,5.310690,7.298162,89.245560,0.209377,10.265073,0.120343,69.08340,0.949104,152.707705,1.73855,1.987283,295.570575,0.069225,3109.03329,5.555634,0.238680,9028.291921,23.387600,8.138688,0.284232
1,0.005518,0.173229,110.581815,0.145282,0.296850,21.978000,70.79836,0.003042,14.754720,0.49706,0.858603,178.553100,1.117800,978.76416,1.229900,0.238680,6785.003474,50.628208,8.138688,0.363489
2,1.289739,7.709560,120.056438,0.470030,8.745201,0.196941,70.81970,0.377208,219.320160,0.97556,8.146651,321.426625,0.700350,2635.10654,1.229900,0.238680,8338.906181,85.955376,8.138688,0.210441
4,1.144902,8.153058,97.920120,0.380297,4.274640,0.096614,74.06532,0.164268,149.717165,48.50134,3.490846,200.178160,0.693150,3733.04844,102.151980,0.238680,16198.049590,72.644264,8.138688,0.207708
7,2.117379,6.591896,83.769368,0.269199,4.518057,0.092873,63.21684,0.292032,6.199900,0.49706,3.277203,326.225295,1.109625,966.45483,1.229900,0.238680,18090.349450,71.542272,8.138688,0.325227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608,8.876439,36.144542,523.664437,0.636677,14.872185,0.027231,16.20676,0.358956,49.857095,0.49706,4.994409,321.506455,0.554700,2996.11246,1.229900,1.123200,16739.773980,58.841616,25.533036,0.358023
609,0.531069,4.234008,99.539070,0.367478,3.259413,0.208286,70.40648,0.164268,102.040455,0.49706,2.978909,515.435700,0.584175,4461.60154,5.882436,0.238680,4889.825256,139.279104,8.138688,0.322494
611,0.005518,0.173229,130.468545,0.175193,0.296850,21.978000,48.24780,0.003042,33.122575,1.89486,6.953475,581.175705,0.755850,2607.26686,1.229900,0.238680,3324.847012,71.803896,8.138688,0.334792
612,0.005518,0.173229,176.977590,0.149555,0.296850,21.978000,21.75904,0.003042,27.287375,1.26092,1.354416,355.930925,0.698250,3130.05946,2.804172,0.238680,17167.209610,55.163024,13.020852,0.445479


## 3. Data preprocessing

In [13]:
# class imbalance handling
## 1. undersampling: shrink class 0 to the size of class 1
if sampling_method == 'under':
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape)
    c0 = c0.sample(n=c1.shape[0], random_state=42)
    train = pd.concat([c0, c1])
    print(train.shape)

    # Rebuild X / y from the undersampled frame; the later cells model on
    # X and y, so without this the undersampling would have no effect.
    if feature_selection:
        X = train[selected_cols]
    else:
        X = train.drop(columns="Class")
    y = train["Class"]

In [14]:
## before oversampling
# df = train[selected_cols]
# df["Class"] = train["Class"]
# pd.pivot_table(index="Class", data=df)

In [15]:
## 2. oversampling -> SMOTE
if sampling_method == 'over':
    # Fall back to all features when feature selection is off
    # (selected_cols is only defined when feature_selection is True).
    if feature_selection:
        X = train[selected_cols]
    else:
        X = train.drop(columns="Class")
    y = train['Class']

    smote = SMOTE(k_neighbors=5, random_state=42)
    # fit_resample finds the minority class from y and balances the classes 1:1.
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print(X_resampled.shape, y_resampled.shape)

    train = X_resampled.assign(Class=y_resampled)
    # The later cells model on X and y, so hand them the resampled data.
    X, y = X_resampled, y_resampled

In [16]:
# After SMOTE
# df = X_resampled.copy()
# df["Class"] = y_resampled
# pd.pivot_table(index="Class", data=df)

In [17]:
# 3. hybrid approach
## class 0 is undersampled to at most N rows,
## class 1 is then oversampled with SMOTE to match class 0.
if sampling_method == 'hybrid':
    N = 300
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape)
    # Cap n at the rows available (earlier outlier removal shrinks class 0) so
    # sample() cannot raise, and pin the seed like the 'under' branch does.
    c0 = c0.sample(n=min(N, len(c0)), random_state=42)
    train = pd.concat([c0, c1])
    print(train.shape)

    if feature_selection:
        X = train[selected_cols]
    else:
        X = train.drop("Class", axis=1)
    y = train.Class

    smote = SMOTE(k_neighbors=5, random_state=42)
    # fit_resample finds the minority class from y and balances the classes 1:1.
    X, y = smote.fit_resample(X, y)
    print(X.shape, y.shape)
    display(X)
    display(y)

(108, 57) (322, 57)
(408, 57)
(600, 20) (600,)


Unnamed: 0,DU,FL,DI,AB,FD,GL,DA,EH,BQ,FR,EE,DE,CR,AF,BC,DF,FE,CD,AR,DH
0,0.931095,4.764967,127.985528,0.350386,4.001538,0.171600,48.870540,0.237276,109.582972,0.497060,0.955347,150.224537,0.593100,5431.637970,2.393034,0.238680,6730.643885,90.347488,8.138688,0.418149
1,0.005518,0.173229,202.511820,0.230742,0.296850,21.978000,67.612880,0.003042,44.839865,1.016740,2.450848,706.677335,0.344775,2678.049440,1.229900,0.238680,7112.171618,106.932864,8.138688,0.371688
2,0.662112,2.811204,93.140452,0.452938,0.789621,0.068062,25.150160,0.066924,30.269839,0.497060,4.172085,696.312740,0.867975,2379.684010,1.229900,0.238680,3353.604481,96.237992,8.138688,0.295164
3,1.027653,4.065990,191.828632,0.752048,2.250123,0.107638,61.503820,0.164268,14.431700,2.058130,1.483408,135.870660,0.818775,2939.932160,1.229900,1.224639,8176.856739,111.213984,8.138688,0.267834
4,0.005518,0.173229,201.002055,0.401662,0.296850,21.978000,30.291160,0.003042,33.156440,1.337480,4.909758,253.704175,0.714600,3513.917440,1.229900,0.238680,5356.554898,124.778792,8.138688,0.423615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,49.931099,34.723752,163.739847,0.371324,67.536431,0.094162,46.586181,3.109511,40.688720,2.852441,7.764921,508.820515,0.842498,3971.168281,3.487820,0.238680,6359.188369,137.766909,12.353703,0.334118
596,6.125799,18.651302,285.106099,0.886584,1.287985,0.017754,21.694662,0.154914,164.428878,1.220655,1.314278,187.611813,0.436506,6711.939402,9.961980,1.603025,22351.093502,119.226884,22.482625,0.314401
597,1.278366,3.624042,197.898560,1.303413,5.045195,10.988373,51.882744,0.326516,79.679468,2.538719,3.570547,348.179788,0.727440,6708.886092,6.922661,1.618252,6874.930495,162.578817,13.084650,0.262909
598,5.912827,11.764975,85.284200,0.268591,18.833889,0.127450,59.557003,1.121941,98.462660,1.805864,2.420204,379.511789,0.499198,3056.740771,2.558375,0.238680,9500.854075,59.615483,8.138688,0.342936


0      0
1      0
2      0
3      0
4      0
      ..
595    1
596    1
597    1
598    1
599    1
Name: Class, Length: 600, dtype: int64

In [18]:
## No longer needed: the separate validation set was dropped.

# # to make OOF prediction
# from sklearn.model_selection import train_test_split

# #X = train.drop(columns=["Class"])
# X = train[selected_cols]
# y = train['Class']

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)
# print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

### feature scaling

- Use StandardScaler

In [19]:
from sklearn.preprocessing import StandardScaler

if is_scaling:
    # Fit the scaler on the full training matrix and keep it for the test set.
    scaler = StandardScaler()
    data_ = scaler.fit_transform(X)
    X = pd.DataFrame(data_, columns=X.columns)
    display(X)

# (An earlier variant fitted on X_train and transformed X_val separately;
#  it is unused now that there is no hold-out split.)

Unnamed: 0,DU,FL,DI,AB,FD,GL,DA,EH,BQ,FR,EE,DE,CR,AF,BC,DF,FE,CD,AR,DH
0,-0.201360,-0.204398,-0.345144,-0.371202,-0.085338,-0.763705,0.100603,-0.108619,-0.047019,-0.101862,-0.928050,-0.760046,-0.343974,0.506945,-0.142271,-0.365488,-0.407922,-0.076344,-0.392236,0.715252
1,-0.274696,-0.632788,0.505191,-0.592147,-0.139492,1.509849,1.090823,-0.224957,-0.738691,-0.094458,-0.169169,1.357174,-1.452710,-0.506982,-0.158249,-0.365488,-0.378053,0.252599,-0.392236,0.244875
2,-0.222673,-0.386676,-0.742721,-0.181821,-0.132289,-0.774500,-1.152624,-0.193229,-0.894348,-0.101862,0.704259,1.317738,0.883303,-0.616847,-0.158249,-0.365488,-0.672302,0.040484,-0.392236,-0.529864
3,-0.193710,-0.269610,0.383297,0.370541,-0.110940,-0.770374,0.768061,-0.144881,-1.063552,-0.079623,-0.660089,-0.814661,0.663632,-0.410552,-0.158249,0.679851,-0.294702,0.337508,-0.392236,-0.806557
4,-0.274696,-0.632788,0.487965,-0.276511,-0.139492,1.509849,-0.881007,-0.224957,-0.863509,-0.089889,1.078586,-0.366322,0.198506,-0.199199,-0.158249,-0.365488,-0.515496,0.606544,-0.392236,0.770591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,3.681046,2.590629,0.062808,-0.332537,0.843402,-0.771779,-0.020087,1.317941,-0.783039,-0.068307,2.527416,0.604358,0.769554,-0.030830,-0.127232,-0.365488,-0.437002,0.864142,0.415666,-0.135491
596,0.210231,1.091138,1.447580,0.618986,-0.125004,-0.779745,-1.335189,-0.149526,0.538918,-0.091553,-0.745913,-0.617793,-1.043142,0.978378,-0.038297,1.081025,0.814963,0.496431,2.357099,-0.335107
597,-0.173845,-0.310842,0.452554,1.388737,-0.070082,0.364061,0.259748,-0.064296,-0.366488,-0.072776,0.399013,-0.006856,0.255833,0.977254,-0.080048,1.097169,-0.396626,1.356246,0.555768,-0.856414
598,0.193356,0.448673,-0.832360,-0.522251,0.131478,-0.768308,0.665204,0.330770,-0.165821,-0.083216,-0.184719,0.112358,-0.763233,-0.367541,-0.139999,-0.365488,-0.191049,-0.685863,-0.392236,-0.046217


In [20]:
if is_pca:
    from sklearn.decomposition import PCA

    # n_components=0.90 is passed as a fraction rather than a component count.
    pca = PCA(n_components=0.90, random_state=42)
    data_ = pca.fit_transform(X)
    pc_names = [f"PC{i}" for i in range(1, data_.shape[1]+1)]
    X = pd.DataFrame(data_, columns=pc_names)
    display(X)

# (An earlier variant fitted on X_train and transformed X_val separately;
#  it is unused now that there is no hold-out split.)

## 4. Fitting and Evaluation


- A separate fitting/evaluation step is no longer needed; only the evaluation metric is set here.

In [21]:
# set metric: every CV loop below scores predictions through `evaluation_metric`,
# which is bound here to balance_logloss.
evaluation_metric = balance_logloss
# Keras variant of the metric, currently disabled.
#evaluation_metric_keras = b_logloss_keras

## 5. (Super)Hyper-parameter Tuning and OOF prediction

Let's try hyper-parameter tuning using optuna, an AutoML framework.

Optuna defines a target function to optimize and then optimizes that function.

For each model, we define a separate objective (optimizer) function and then run Optuna on it.

In [22]:
# make oof predictions
# Load the sample submission and start an empty list (final_preds) for predictions.
submission = pd.read_csv('../input/icr-identify-age-related-conditions/sample_submission.csv')
final_preds = []
# Disabled: per-model prediction frames, zero-initialized with the submission's columns.
# rf_preds = pd.DataFrame(columns=submission.columns, data=np.zeros((train.shape[0], submission.shape[1])))
# svm_preds = pd.DataFrame(columns=submission.columns, data=np.zeros((train.shape[0], submission.shape[1])))
# xgb_preds = pd.DataFrame(columns=submission.columns, data=np.zeros((train.shape[0], submission.shape[1])))

In [23]:
# Apply to the test set the same preprocessing that was fitted on the training
# set: label encoding -> KNN imputation -> column selection -> scaling -> PCA.
# Every transformer is only *applied* here (transform), never refitted.
test.EJ = lb.transform(test.EJ)  # A->0, B->1
test = test.drop(columns=["Id"])

X_test = pd.DataFrame(columns=test.columns, data=imp.transform(test))

# Keep the same columns the training matrix X was built from.
if feature_selection:
    X_test = X_test[selected_cols]
elif apply_vif:
    X_test = X_test[train.columns.drop("Class")]

# `scaler` only exists when is_scaling is True.
if is_scaling:
    X_test = pd.DataFrame(columns=X_test.columns, data=scaler.transform(X_test))

# Project with the PCA fitted on the training data; refitting on the test rows
# would produce different components than the ones the models were trained on.
if is_pca:
    data_ = pca.transform(X_test)
    X_test = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

X_test

Unnamed: 0,DU,FL,DI,AB,FD,GL,DA,EH,BQ,FR,EE,DE,CR,AF,BC,DF,FE,CD,AR,DH
0,-0.275134,-0.64895,-1.805441,-1.018254,-0.143831,-0.781596,-2.48139,-0.226468,-1.217731,-0.108943,-1.412833,-1.331628,-2.992081,-1.493095,-0.175144,-0.618542,-0.934847,-1.868239,-1.952196,-3.518145
1,-0.275134,-0.64895,-1.805441,-1.018254,-0.143831,-0.781596,-2.48139,-0.226468,-1.217731,-0.108943,-1.412833,-1.331628,-2.992081,-1.493095,-0.175144,-0.618542,-0.934847,-1.868239,-1.952196,-3.518145
2,-0.275134,-0.64895,-1.805441,-1.018254,-0.143831,-0.781596,-2.48139,-0.226468,-1.217731,-0.108943,-1.412833,-1.331628,-2.992081,-1.493095,-0.175144,-0.618542,-0.934847,-1.868239,-1.952196,-3.518145
3,-0.275134,-0.64895,-1.805441,-1.018254,-0.143831,-0.781596,-2.48139,-0.226468,-1.217731,-0.108943,-1.412833,-1.331628,-2.992081,-1.493095,-0.175144,-0.618542,-0.934847,-1.868239,-1.952196,-3.518145
4,-0.275134,-0.64895,-1.805441,-1.018254,-0.143831,-0.781596,-2.48139,-0.226468,-1.217731,-0.108943,-1.412833,-1.331628,-2.992081,-1.493095,-0.175144,-0.618542,-0.934847,-1.868239,-1.952196,-3.518145


In [24]:
def rf_optimizer(trial, X, y, K):
    """Optuna objective for a RandomForestClassifier.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: feature DataFrame (indexed positionally with ``.iloc``).
        y: label Series aligned positionally with ``X``.
        K: number of stratified CV folds.

    Returns:
        Mean of ``evaluation_metric`` over the K validation folds.
    """
    # hyper-parameters to tune
    n_estimators = trial.suggest_categorical('n_estimators', [50, 100, 200])
    max_depth = trial.suggest_int('max_depth', 4, 10)
    max_features = trial.suggest_categorical('max_features', [0.6, 0.7, 0.8])

    # random_state is fixed (as in the XGB and SVM objectives) so that the
    # score of a trial depends only on its parameters, not on forest randomness.
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   max_features=max_features,
                                   criterion='log_loss',
                                   class_weight='balanced',
                                   random_state=42,
                                  )

    # stratified K-fold cross validation
    folds = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)
    losses = []

    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]

        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]

        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        losses.append(evaluation_metric(y_val, preds))

    # mean CV score
    return np.mean(losses)

In [25]:
def xgb_optimizer(trial, X, y, K):
    """Optuna objective for an XGBClassifier.

    Samples the hyper-parameters from ``trial``, then returns the mean of
    ``evaluation_metric`` over K stratified validation folds of (X, y).
    """
    # Sampled in this order: n_estimators, max_depth, colsample_bytree,
    # learning_rate, reg_lambda.
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [500, 1000, 2000]),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-2),
        'reg_lambda': trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1, 2]),
    }

    # scale_pos_weight is not set: class imbalance is handled by resampling.
    model = XGBClassifier(random_state=42, **params)

    splitter = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)
    fold_losses = []
    for fit_idx, eval_idx in splitter.split(X, y):
        model.fit(X.iloc[fit_idx, :], y.iloc[fit_idx])
        fold_proba = model.predict_proba(X.iloc[eval_idx, :])
        fold_losses.append(evaluation_metric(y.iloc[eval_idx], fold_proba))

    return np.mean(fold_losses)

In [26]:
K = 4 # number of folds for the K-Fold CV inside the objective
# Bind the data and fold count so Optuna only has to pass the trial.
opt_func = partial(rf_optimizer, X=X, y=y, K=K)

if is_tuning:
    rf_study = optuna.create_study(direction="minimize", sampler=sampler) # the objective is a loss, so minimize it
    rf_study.optimize(opt_func, n_trials=n_trials)

[I 2023-06-23 13:33:30,100] A new study created in memory with name: no-name-feb99f62-fa3f-4024-be5e-62183fe70d7c
[I 2023-06-23 13:33:31,666] Trial 0 finished with value: 0.17021103038673863 and parameters: {'n_estimators': 100, 'max_depth': 8, 'max_features': 0.6}. Best is trial 0 with value: 0.17021103038673863.
[I 2023-06-23 13:33:32,380] Trial 1 finished with value: 0.21477104861341967 and parameters: {'n_estimators': 50, 'max_depth': 4, 'max_features': 0.6}. Best is trial 0 with value: 0.17021103038673863.
[I 2023-06-23 13:33:35,858] Trial 2 finished with value: 0.16745397228109066 and parameters: {'n_estimators': 200, 'max_depth': 7, 'max_features': 0.8}. Best is trial 2 with value: 0.16745397228109066.
[I 2023-06-23 13:33:38,880] Trial 3 finished with value: 0.1690878606330276 and parameters: {'n_estimators': 200, 'max_depth': 7, 'max_features': 0.6}. Best is trial 2 with value: 0.16745397228109066.
[I 2023-06-23 13:33:42,222] Trial 4 finished with value: 0.1867139026372588 and 

In [27]:
K = 4

if is_tuning:
    # Coarse grid search over C for an SVC with its default kernel,
    # scored with the same stratified K-fold scheme as the Optuna objectives.
    folds = StratifiedKFold(n_splits=K, random_state=42, shuffle=True)
    best_loss, best_C, kernel = 9999.0, 0, 'linear'

    # (A LinearSVC variant of this search, based on cuml, used to run here
    #  before the RBF search; it is disabled.)

    for C in tqdm([1, 2, 5, 10, 100]):
        r_svm = SVC(C=C, probability=True)
        losses = []

        for train_idx, val_idx in folds.split(X, y):
            X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
            X_val, y_val = X.iloc[val_idx, :], y.iloc[val_idx]

            r_svm.fit(X_train, y_train)
            preds = r_svm.predict_proba(X_val)
            loss = evaluation_metric(y_val, preds)
            losses.append(loss)

        avg_loss = np.mean(losses)
        if avg_loss < best_loss:
            # remember the best C seen so far
            best_loss, best_C, kernel = avg_loss, C, 'rbf'

    print("SVM(%s) log loss : %.4f" % (kernel, best_loss))

  0%|          | 0/5 [00:00<?, ?it/s]

SVM(rbf) log loss : 0.1668


In [28]:
def svm_optimizer(trial, X, y, K):
    """Optuna objective for an SVC with probability estimates.

    Samples ``C`` and ``kernel`` from ``trial``, then returns the mean of
    ``evaluation_metric`` over K stratified validation folds of (X, y).
    """
    params = {
        'C': trial.suggest_float('C', 0.45, 0.55),
        'kernel': trial.suggest_categorical('kernel', ['rbf']),
    }

    # class_weight is left unset: class imbalance is handled by resampling.
    model = SVC(probability=True, random_state=42, **params)

    splitter = StratifiedKFold(n_splits=K, random_state=42, shuffle=True)
    fold_losses = []
    for fit_idx, eval_idx in splitter.split(X, y):
        model.fit(X.iloc[fit_idx, :], y.iloc[fit_idx])
        fold_proba = model.predict_proba(X.iloc[eval_idx, :])
        fold_losses.append(evaluation_metric(y.iloc[eval_idx], fold_proba))

    return np.mean(fold_losses)

In [29]:
K = 4  # number of CV folds used inside the objective
# Bind the data and fold count so Optuna only has to pass the trial.
opt_func = partial(svm_optimizer, X=X, y=y, K=K)

if is_tuning:
    # the objective is a loss, so the study minimizes it
    svm_study = optuna.create_study(direction="minimize", sampler=sampler)
    svm_study.optimize(opt_func, n_trials=n_trials)

[I 2023-06-23 13:34:21,069] A new study created in memory with name: no-name-888e86d4-fb54-4341-b5c4-36b48840f031
[I 2023-06-23 13:34:21,222] Trial 0 finished with value: 0.2529875457873408 and parameters: {'C': 0.5272244769296658, 'kernel': 'rbf'}. Best is trial 0 with value: 0.2529875457873408.
[I 2023-06-23 13:34:21,375] Trial 1 finished with value: 0.25669313519389125 and parameters: {'C': 0.46987156815341724, 'kernel': 'rbf'}. Best is trial 0 with value: 0.2529875457873408.
[I 2023-06-23 13:34:21,533] Trial 2 finished with value: 0.2578670003682941 and parameters: {'C': 0.45055221171236026, 'kernel': 'rbf'}. Best is trial 0 with value: 0.2529875457873408.
[I 2023-06-23 13:34:21,688] Trial 3 finished with value: 0.2527042837900088 and parameters: {'C': 0.5315461428454835, 'kernel': 'rbf'}. Best is trial 3 with value: 0.2527042837900088.
[I 2023-06-23 13:34:21,842] Trial 4 finished with value: 0.25339370785563986 and parameters: {'C': 0.5206857343847617, 'kernel': 'rbf'}. Best is tr

In [30]:
K = 4  # number of CV folds used inside the objective
# Bind the data and fold count so Optuna only has to pass the trial.
opt_func = partial(xgb_optimizer, X=X, y=y, K=K)

if is_tuning:
    # the objective is a loss, so the study minimizes it
    xgb_study = optuna.create_study(direction="minimize", sampler=sampler)
    xgb_study.optimize(opt_func, n_trials=n_trials)

[I 2023-06-23 13:34:25,754] A new study created in memory with name: no-name-0cbd8753-1d90-441b-806d-b80bbd5c717f
[I 2023-06-23 13:34:27,519] Trial 0 finished with value: 0.16166613379338135 and parameters: {'n_estimators': 500, 'max_depth': 4, 'colsample_bytree': 0.7, 'learning_rate': 0.008984914683186941, 'reg_lambda': 2}. Best is trial 0 with value: 0.16166613379338135.
[I 2023-06-23 13:34:31,431] Trial 1 finished with value: 0.12502117797499523 and parameters: {'n_estimators': 1000, 'max_depth': 7, 'colsample_bytree': 0.5, 'learning_rate': 0.006727693701374024, 'reg_lambda': 1}. Best is trial 1 with value: 0.12502117797499523.
[I 2023-06-23 13:34:34,777] Trial 2 finished with value: 0.13536138895937866 and parameters: {'n_estimators': 1000, 'max_depth': 4, 'colsample_bytree': 0.7, 'learning_rate': 0.006700633808593812, 'reg_lambda': 2}. Best is trial 1 with value: 0.12502117797499523.
[I 2023-06-23 13:34:40,527] Trial 3 finished with value: 0.12262967917580683 and parameters: {'n_e

In [31]:
# visualize experiment logs
def display_experiment_log(study):
    """Show the trial log, best result and diagnostic plots of an Optuna study.

    Displays the full trials table, prints the best objective value and its
    parameters, displays the row(s) whose value equals the best value, and
    renders the optimization-history and parameter-importance figures.

    Args:
        study: A finished ``optuna`` study (at least one completed trial is
            needed for ``best_value`` / ``best_trial``).

    Returns:
        None. Everything is emitted through ``display``/``print``/``show``.
    """
    # Build the trials table once and reuse it for both displays instead of
    # asking the study to rebuild it a second time.
    history = study.trials_dataframe()
    display(history)

    print("Best Score: %.4f" % study.best_value)
    print("Best params: ", study.best_trial.params)

    # Exact float comparison is fine here: best_value is one of the stored
    # trial values, not a recomputed number.
    display(history[history.value == study.best_value])

    optuna.visualization.plot_optimization_history(study).show()
    optuna.visualization.plot_param_importances(study).show()

In [32]:
# Inspect the Random Forest tuning run (rf_study only exists when tuning ran).
if is_tuning:
    display_experiment_log(rf_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
0,0,0.170211,2023-06-23 13:33:30.102309,2023-06-23 13:33:31.666189,0 days 00:00:01.563880,8,0.6,100,COMPLETE
1,1,0.214771,2023-06-23 13:33:31.667717,2023-06-23 13:33:32.380352,0 days 00:00:00.712635,4,0.6,50,COMPLETE
2,2,0.167454,2023-06-23 13:33:32.381765,2023-06-23 13:33:35.857759,0 days 00:00:03.475994,7,0.8,200,COMPLETE
3,3,0.169088,2023-06-23 13:33:35.859276,2023-06-23 13:33:38.879780,0 days 00:00:03.020504,7,0.6,200,COMPLETE
4,4,0.186714,2023-06-23 13:33:38.881492,2023-06-23 13:33:42.221781,0 days 00:00:03.340289,5,0.8,200,COMPLETE
5,5,0.220221,2023-06-23 13:33:42.223741,2023-06-23 13:33:43.130526,0 days 00:00:00.906785,8,0.8,50,COMPLETE
6,6,0.166603,2023-06-23 13:33:43.132071,2023-06-23 13:33:44.900184,0 days 00:00:01.768113,8,0.8,100,COMPLETE
7,7,0.163204,2023-06-23 13:33:44.901612,2023-06-23 13:33:46.675836,0 days 00:00:01.774224,10,0.8,100,COMPLETE
8,8,0.172128,2023-06-23 13:33:46.677457,2023-06-23 13:33:48.448068,0 days 00:00:01.770611,6,0.8,100,COMPLETE
9,9,0.208571,2023-06-23 13:33:48.449454,2023-06-23 13:33:51.739642,0 days 00:00:03.290188,4,0.8,200,COMPLETE


Best Score: 0.1611
Best params:  {'n_estimators': 100, 'max_depth': 10, 'max_features': 0.7}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
26,26,0.161088,2023-06-23 13:34:12.797217,2023-06-23 13:34:14.478738,0 days 00:00:01.681521,10,0.7,100,COMPLETE


In [33]:
# Inspect the XGBoost tuning run (xgb_study only exists when tuning ran).
if is_tuning:
    display_experiment_log(xgb_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
0,0,0.161666,2023-06-23 13:34:25.755633,2023-06-23 13:34:27.518891,0 days 00:00:01.763258,0.7,0.008985,4,500,2.0,COMPLETE
1,1,0.125021,2023-06-23 13:34:27.521009,2023-06-23 13:34:31.431281,0 days 00:00:03.910272,0.5,0.006728,7,1000,1.0,COMPLETE
2,2,0.135361,2023-06-23 13:34:31.435211,2023-06-23 13:34:34.776637,0 days 00:00:03.341426,0.7,0.006701,4,1000,2.0,COMPLETE
3,3,0.12263,2023-06-23 13:34:34.778504,2023-06-23 13:34:40.527347,0 days 00:00:05.748843,0.8,0.008747,6,2000,0.5,COMPLETE
4,4,0.122559,2023-06-23 13:34:40.529318,2023-06-23 13:34:46.019316,0 days 00:00:05.489998,0.8,0.009662,6,2000,0.5,COMPLETE
5,5,0.130814,2023-06-23 13:34:46.021324,2023-06-23 13:34:49.203300,0 days 00:00:03.181976,0.6,0.005405,4,1000,0.1,COMPLETE
6,6,0.166605,2023-06-23 13:34:49.205415,2023-06-23 13:34:55.278427,0 days 00:00:06.073012,0.8,0.003887,8,1000,2.0,COMPLETE
7,7,0.211641,2023-06-23 13:34:55.281011,2023-06-23 13:35:00.683767,0 days 00:00:05.402756,0.8,0.002238,8,1000,1.0,COMPLETE
8,8,0.118221,2023-06-23 13:35:00.687080,2023-06-23 13:35:07.167161,0 days 00:00:06.480081,0.8,0.009104,7,2000,2.0,COMPLETE
9,9,0.444676,2023-06-23 13:35:07.169127,2023-06-23 13:35:10.103186,0 days 00:00:02.934059,0.7,0.001083,8,500,0.5,COMPLETE


Best Score: 0.1120
Best params:  {'n_estimators': 2000, 'max_depth': 9, 'colsample_bytree': 0.5, 'learning_rate': 0.009673528321748368, 'reg_lambda': 0.1}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
24,24,0.111999,2023-06-23 13:36:24.479162,2023-06-23 13:36:30.584009,0 days 00:00:06.104847,0.5,0.009674,9,2000,0.1,COMPLETE


## 6. Test Prediction and Make Submission

In [34]:
# Finalize Models
# Build fresh, unfitted RF and XGBoost estimators from the best hyper-parameters
# found by their Optuna studies.
# NOTE(review): nothing is assigned when is_tuning is False, so the later cells
# that reference best_rf / best_xgb / best_svm would raise NameError — confirm
# whether a non-tuning path is meant to be supported.
if is_tuning:
    rf_best_params = rf_study.best_params
    xgb_best_params = xgb_study.best_params

    best_rf = RandomForestClassifier(**rf_best_params)
    best_xgb = XGBClassifier(**xgb_best_params)
    # r_svm is defined in an earlier cell (presumably the RBF-kernel SVM — verify).
    # oof_preds(svm=True) builds its own SVM from `kernel`/`best_C` and does not
    # use this object; it is only used for the ad-hoc predict_proba check below.
    best_svm = r_svm
# Disabled kernel-based selection, kept for reference:
#     if kernel == 'linear':
#         best_svm = l_svm
#     else:
#         best_svm = r_svm

### TODO: Validate the OOF prediction score

- So far, the OOF score does not correlate with the leaderboard (LB) score.

In [35]:
# Make KFold OOF prediction
def oof_preds(best_model, svm=False):
    """Cross-validate one model and collect its per-fold test predictions.

    The model is refit on every stratified fold. The validation-fold
    probabilities are only used to compute that fold's loss; the test-set
    probabilities of each fold are appended to the global ``final_preds``.

    Reads from module scope: ``X``, ``y``, ``X_test``, ``K``,
    ``evaluation_metric`` and, when ``svm`` is true, ``kernel`` and ``best_C``.

    Args:
        best_model: Estimator exposing ``fit``/``predict_proba``. Ignored when
            ``svm`` is true.
        svm: When true, a new ``LinearSVC`` (if ``kernel == 'linear'``) or
            ``SVC`` is created with ``C=best_C, probability=True`` and used
            instead of ``best_model``.

    Returns:
        The mean of the per-fold losses.
    """
    # call global variable
    global final_preds

    # Pick the estimator up front so a single fold loop serves both cases.
    if svm:  # the original note marks these as the cuml SVC classes
        svm_cls = LinearSVC if kernel == 'linear' else SVC
        estimator = svm_cls(C=best_C, probability=True)
    else:
        estimator = best_model

    splitter = StratifiedKFold(n_splits=K, random_state=42, shuffle=True)
    fold_losses = []

    for i, (train_idx, val_idx) in enumerate(splitter.split(X, y)):
        X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx, :], y.iloc[val_idx]

        print(f"========== Fold {i+1} ==========")
        estimator.fit(X_train, y_train)

        val_proba = estimator.predict_proba(X_val)
        # One test-set prediction per fold goes into the shared accumulator.
        final_preds.append(estimator.predict_proba(X_test))

        loss = evaluation_metric(y_val, val_proba)
        print(f"Loss : {loss:.4f}")
        fold_losses.append(loss)

    return np.mean(fold_losses)

In [36]:
# Cross-validate each finalized model. Every oof_preds() call also appends K
# test-set predictions to the global final_preds, so the call order matters:
# the SVM is run last so that its predictions are the trailing entries.
print("Random Forest")
rf_loss = oof_preds(best_model=best_rf, svm=False)
print("Avg Loss : %.4f" % rf_loss)
print("\nXGBoost")
xgb_loss = oof_preds(best_model=best_xgb, svm=False)
print("Avg Loss : %.4f" % xgb_loss)
print("\nSupport Vector Machine")
# With svm=True, oof_preds builds its own SVM; best_svm is passed but not used.
svm_loss = oof_preds(best_model=best_svm, svm=True)
print("Avg Loss : %.4f" % svm_loss)
# NOTE: the figures below are plain averages of the individual models' CV
# losses, not the loss of the blended predictions.
print("\nRF + SVM")
print("Avg Loss : %.4f" % np.mean([rf_loss, svm_loss]))
print("\nRF + XGB")
print("Avg Loss : %.4f" % np.mean([rf_loss, xgb_loss]))
print("\nSVM + XGB")
print("Avg Loss : %.4f" % np.mean([xgb_loss, svm_loss]))
print("\nTotal logloss : %.4f" % np.mean([rf_loss, xgb_loss, svm_loss]))

Random Forest
Loss : 0.2033
Loss : 0.1699
Loss : 0.1488
Loss : 0.1763
Avg Loss : 0.1746

XGBoost
Loss : 0.1898
Loss : 0.1096
Loss : 0.0531
Loss : 0.0993
Avg Loss : 0.1129

Support Vector Machine
Loss : 0.2005
Loss : 0.1770
Loss : 0.0999
Loss : 0.1846
Avg Loss : 0.1655

RF + SVM
Avg Loss : 0.1700

RF + XGB
Avg Loss : 0.1438

SVM + XGB
Avg Loss : 0.1392

Total logloss : 0.1510


In [37]:
# Preview the first rows of the Random Forest class probabilities on X_val.
pd.DataFrame(best_rf.predict_proba(X_val), columns=[0, 1]).head()

Unnamed: 0,0,1
0,0.97,0.03
1,0.93,0.07
2,0.81,0.19
3,1.0,0.0
4,0.99,0.01


In [38]:
# Ad-hoc check of the SVM class probabilities on X_val.
# NOTE(review): in the cells visible here X_val is only assigned inside
# oof_preds() — confirm which validation split this global refers to.
best_svm.predict_proba(X_val)

array([[9.93283130e-01, 6.71686967e-03],
       [9.74733833e-01, 2.52661667e-02],
       [6.83164765e-01, 3.16835235e-01],
       [8.82795775e-01, 1.17204225e-01],
       [9.97575266e-01, 2.42473442e-03],
       [9.96509565e-01, 3.49043538e-03],
       [8.58342596e-01, 1.41657404e-01],
       [9.31122119e-01, 6.88778814e-02],
       [9.84352664e-01, 1.56473362e-02],
       [8.92444260e-01, 1.07555740e-01],
       [4.90113701e-01, 5.09886299e-01],
       [9.43132007e-01, 5.68679932e-02],
       [9.86785722e-01, 1.32142784e-02],
       [9.72749524e-01, 2.72504764e-02],
       [9.66768072e-01, 3.32319285e-02],
       [7.29337964e-01, 2.70662036e-01],
       [3.03774945e-01, 6.96225055e-01],
       [9.78734598e-01, 2.12654022e-02],
       [6.32610831e-01, 3.67389169e-01],
       [8.50433060e-01, 1.49566940e-01],
       [9.88522983e-01, 1.14770171e-02],
       [9.93024988e-01, 6.97501241e-03],
       [7.80156402e-01, 2.19843598e-01],
       [9.76750883e-01, 2.32491174e-02],
       [9.616716

In [39]:
# oof_preds() appends one test-set prediction per fold, and the SVM was the last
# model evaluated, so its predictions are the final K entries. Slice by K rather
# than a hard-coded 4 so this stays correct if the fold count is changed.
# NOTE: not idempotent — re-running this cell drops another K entries.
final_preds = final_preds[:-K]  # remove SVM KFold predictions

In [40]:
# Blend by averaging the remaining per-fold test predictions element-wise.
# assumes every entry of final_preds has shape (n_test_rows, 2) so the result
# lines up with the two class columns — TODO confirm
submission[['class_0', 'class_1']] = np.mean(final_preds, axis=0)
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.84864,0.15136
1,010ebe33f668,0.84864,0.15136
2,02fa521e1838,0.84864,0.15136
3,040e15f562a2,0.84864,0.15136
4,046e85c7cc7f,0.84864,0.15136


In [42]:
# #voting_weights = [0.1, 0.1, 0.25, 0.25, 0.3]
# #voting_weights = [0.2, 0.2, 0.2, 0.2, 0.2]
# #voting_weights = [0.25, 0.25, 0.25, 0.25]
# #voting_weights = [0.5, 0.5]
# voting_weights = [0.35, 0.35, 0.3]
# submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_lr[:, 0] + voting_weights[2]*preds_xgb[:, 0] + voting_weights[3]*preds_nn[:, 0] + voting_weights[4]*preds_svm[:, 0]
# submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_lr[:, 1] + voting_weights[2]*preds_xgb[:, 1] + voting_weights[3]*preds_nn[:, 1] + voting_weights[4]*preds_svm[:, 1]
# submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_lr[:, 0] + voting_weights[2]*preds_xgb[:, 0] + voting_weights[3]*preds_svm[:, 0]
# submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_lr[:, 1] + voting_weights[2]*preds_xgb[:, 1] + voting_weights[3]*preds_svm[:, 1]
# submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_xgb[:, 0] + voting_weights[2]*preds_svm[:, 0]
# submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_xgb[:, 1] + voting_weights[2]*preds_svm[:, 1]
# # submission

In [None]:
# Write the submission file to the working directory without the index column.
submission.to_csv("submission.csv", index=False)