### **[Version 29, 30] Refactor all codes.**

**[Version 28] Not using finalized model, it causes overfitting.**

**[Version 27] Make KFold ensemble.**

**[Version 26] Drop LR, NN model.**

**[Version 25] TPESampler is added for using optuna as reproducible way.**

**[Version 24] cuml LinearSVC is added.**

**[Version 23] cuml SVC is added.**

**[Version 22] Make config variable and refactor code.**

**[Version 21] Sampling method is applied to relieve class imbalance.**

**[~Version 20] Use KNNImputer, Standard Scaling, model ensemble(Random Forest & Support Vector Machine & XGBoost & Shallow Neural Network & Logistic Regression)**

This code will give you a general idea of how to do a machine learning project using scikit-learn and optuna.

It performs the necessary preprocessing, tunes the models with optuna, combines the tuned models, and performs ensemble (voting).

You can submit the finished result without any problem.

Below is the version history, so please refer to it when using it.

## 1. Data and Library Load

In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import os
import random
from tqdm.auto import tqdm

# ignore warning
import warnings
warnings.filterwarnings('ignore')

# ML
from sklearn.ensemble import RandomForestClassifier  # Bagging
from sklearn.linear_model import LogisticRegression  # LogisticRegression
# from sklearn.svm import SVC                          # SVM

from xgboost.sklearn import XGBClassifier            # GBM
from lightgbm.sklearn import LGBMClassifier          # LGBM
from catboost import CatBoostClassifier              # CatBoost

# train_test_split
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

# DL
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, ReLU, Softmax, Dropout
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# for checking multi-collinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# KFold(CV), partial : for optuna
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from functools import partial
from imblearn.over_sampling import SMOTE

# AutoML framework
import optuna
from optuna.samplers import TPESampler

In [76]:
###################
### set configs ###
###################
is_debug = True # ???
# ======= set visualizing =======
is_visualizing = False
# ======= set optuna tuning =======
is_tuning = True
if is_tuning:
    sampler = TPESampler(seed=42)  # Make the sampler behave in a deterministic way.
    n_trials=50 # set n_trials to optuna
# ======= select scaler =======
is_scaling = 'standard' # ['standard', 'minmax']
# ======= remove outliars methods =======
is_IQR = False
is_z = True
is_iForest = False
is_mahala = False
# ======= reduce columns =======
# !!!Do Not Use Both 'VIF'/ 'feature-selection' And 'PCA'!!!
apply_vif = False
feature_selection = True
if feature_selection:
    m = 20 # set number of columns to use

is_pca = False
if is_pca:
    n_components = 0.90
    
if (apply_vif or feature_selection) and is_pca:
    raise Exception('Select one method only. (vif,feature-selection / pca)')
# ======= select sampling method =======
sampling_method = 'under' # ['hybrid', 'under', 'over']
# ======= select OOF / K-fold =======
is_oof = False
# ======= Set K of K-fold =======
K = 4
# ======= import SVC =======
is_cuml = True

if is_cuml:
    from cuml.svm import SVC, LinearSVC
else:
    from sklearn.svm import SVC
# ======= Keras model compile =======
learning_rate = 1e-2
batch_size = 32
epochs = 10

In [77]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.random.set_seed(seed)
    
seed_everything()

In [78]:
def balance_logloss(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
    y_pred /= np.sum(y_pred, axis=1)[:, None]
    nc = np.bincount(y_true)
    
    logloss = (-1/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(y_pred[:,0]))) - 1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred[:,1])))) / 2
    
    return logloss

def b_logloss_keras(y_true, y_pred):
    #(2, ) -> (1, ) inverse_one_hot encoding
    #y_true = y_true[:, 0] * (1 - y_true[:, 1]) + y_true[:, 1] * (1 - y_true[:, 0])
    y_true = y_true[:, 1] * (1 - y_true[:, 0])
    y_true = tf.cast(y_true, tf.int64)
    score = tf.py_function(func=balance_logloss, inp=[y_true, y_pred], Tout=tf.float32)
    return score

In [79]:
train = pd.read_csv('../input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('../input/icr-identify-age-related-conditions/test.csv')
greeks = pd.read_csv('../input/icr-identify-age-related-conditions/greeks.csv')
print(train.shape, test.shape)

(617, 58) (5, 57)


In [80]:
if is_visualizing:
    cylinder_size = train.AB / train.AB.max() * 300
    train.plot(kind='scatter', x='Class', y='AB', s=cylinder_size, color='coral', alpha=0.3, figsize=(1, 5), xticks=[1, 0])
else:
    print('Use Visualization: False')

Use Visualization: False


## *Simple Greeks EDA

In [81]:
greeks

Unnamed: 0,Id,Alpha,Beta,Gamma,Delta,Epsilon
0,000ff2bfdfe9,B,C,G,D,3/19/2019
1,007255e47698,A,C,M,B,Unknown
2,013f2bd269f5,A,C,M,B,Unknown
3,043ac50845d5,A,C,M,B,Unknown
4,044fb8a146ec,D,B,F,B,3/25/2020
...,...,...,...,...,...,...
612,fd3dafe738fd,A,B,M,B,9/13/2020
613,fd895603f071,A,B,M,B,9/8/2020
614,fd8ef6377f76,A,C,M,B,7/24/2019
615,fe1942975e40,A,C,M,B,1/31/2019


In [82]:
greeks['test'] = greeks.Alpha + greeks.Beta + greeks.Gamma + greeks.Delta
greeks.test.nunique()

29

In [83]:
display(greeks.groupby(['Alpha']).nunique())
display(greeks.groupby(['Beta']).nunique())
display(greeks.groupby(['Gamma']).nunique())
display(greeks.groupby(['Delta']).nunique())

Unnamed: 0_level_0,Id,Beta,Gamma,Delta,Epsilon,test
Alpha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,509,2,2,4,130,7
B,61,3,2,4,57,8
D,18,3,2,3,17,6
G,29,3,2,3,25,8


Unnamed: 0_level_0,Id,Alpha,Gamma,Delta,Epsilon,test
Beta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,8,3,4,1,8,4
B,202,4,6,2,82,10
C,407,4,7,4,126,15


Unnamed: 0_level_0,Id,Alpha,Beta,Delta,Epsilon,test
Gamma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,11,1,3,3,10,4
B,18,1,3,3,15,4
E,8,1,2,3,7,3
F,10,1,2,2,10,3
G,8,1,1,2,8,2
H,53,1,3,4,50,6
M,445,1,2,4,90,5
N,64,1,1,2,44,2


Unnamed: 0_level_0,Id,Alpha,Beta,Gamma,Epsilon,test
Delta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,75,4,3,7,43,9
B,456,4,2,7,115,11
C,64,4,1,6,39,6
D,22,2,1,3,20,3


In [84]:
# remove outliars => scaling => KMM-imputer

## 2. Data Preprocessing

LabelEncoding -> KNN Imputation -> (optional)calculate VIF -> (optional)apply PCA -> feature Scaling


                           -> (optional)feature selection  ------->

In [85]:
lb = LabelEncoder()
train.EJ = lb.fit_transform(train.EJ)  # A->0, B->1
train = train.drop(columns=["Id"])

In [86]:
imp = KNNImputer()
labels = train["Class"]
train = train.drop(columns="Class")
data = imp.fit_transform(train)
tmp = pd.DataFrame(columns=train.columns, data=data)
train = pd.concat([tmp, labels], axis=1)
train

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0
2,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,5135.78024,...,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,4157.68439,...,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,0
613,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,5654.07556,...,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,0
614,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,5888.87769,...,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,0
615,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,4517.86560,...,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,0


In [87]:
# check missing data
train[train.isnull().any(axis=1)]

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class


In [88]:
# 상관 관계 히트맵 그리기
if is_visualizing:
    # 상관 관계 행렬 생성
    correlation_matrix = train.corr()

    plt.figure(figsize=(30, 30))

    # 위쪽 영역 마스킹
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

    # 상관 관계 히트맵 그리기
    sns.heatmap(correlation_matrix, 
                annot=True, 
                fmt='.2f', 
                vmin=-1, 
                annot_kws={'color': 'black', 'size': 5, 'weight': 'bold'}, 
                cmap='coolwarm', 
                mask=mask
               )

    # 그래프 출력
    plt.title('Correlation Heatmap')
    plt.show()
else:
    print('Use Visualization: False')

Use Visualization: False


In [89]:
# 조건에 맞는 컬럼 조합 찾기
correlation_matrix = train.corr()
columns = correlation_matrix.columns
combinations = []

for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        if abs(correlation_matrix.iloc[i, j]) > 0.7:
            combinations.append([columns[i], columns[j]])

# 조건에 맞는 컬럼 조합 출력
for combination in combinations:
    print(combination)

['AH', 'AR']
['AH', 'DV']
['AH', 'EB']
['AR', 'CL']
['AR', 'CS']
['AR', 'DV']
['AR', 'EB']
['AR', 'EP']
['BC', 'BD ']
['BC', 'BZ']
['CL', 'DV']
['CS', 'EP']
['DU', 'EH']
['DU', 'FD ']
['DV', 'EP']
['EB', 'EP']
['EH', 'FD ']
['EJ', 'GL']


## Anomaly detection - 이상치 탐지

- 간단한 방법들
    - IQR
    - z-score

    
- 변수가 많은 다차원의 경우
    - IsolationForest
    - Mahalanobis distance

In [90]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617 entries, 0 to 616
Data columns (total 57 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AB      617 non-null    float64
 1   AF      617 non-null    float64
 2   AH      617 non-null    float64
 3   AM      617 non-null    float64
 4   AR      617 non-null    float64
 5   AX      617 non-null    float64
 6   AY      617 non-null    float64
 7   AZ      617 non-null    float64
 8   BC      617 non-null    float64
 9   BD      617 non-null    float64
 10  BN      617 non-null    float64
 11  BP      617 non-null    float64
 12  BQ      617 non-null    float64
 13  BR      617 non-null    float64
 14  BZ      617 non-null    float64
 15  CB      617 non-null    float64
 16  CC      617 non-null    float64
 17  CD      617 non-null    float64
 18  CF      617 non-null    float64
 19  CH      617 non-null    float64
 20  CL      617 non-null    float64
 21  CR      617 non-null    float64
 22  CS

In [91]:
###################################
### check outliars with boxplot ###
###################################
# Original train data

if is_visualizing:
    fig, axes = plt.subplots(7, 8, figsize=(24, 32))

    col_list = list(train.columns[:-1])
    col_index = 0

    for i in range(7):
        for j in range(8):
            axes[i][j].boxplot(train[col_list[col_index]])
            axes[i][j].set_title(col_list[col_index])
            col_index += 1

    plt.show()
else:
    print('Use Visualization: False')

Use Visualization: False


## Remove Outliars Methods

In [92]:
df = train
df.columns[:-1]

Index(['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN', 'BP',
       'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU',
       'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY', 'EB',
       'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI', 'FL',
       'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL'],
      dtype='object')

In [93]:
###########
### IQR ###
###########

if is_IQR:
    def get_outlier(df=None, column=None, weight=1.5):
        # target 값과 상관관계가 높은 열을 우선적으로 진행
        quantile_25 = np.percentile(df[column].values, 25)
        quantile_75 = np.percentile(df[column].values, 75)

        IQR = quantile_75 - quantile_25
        IQR_weight = IQR*weight

        lowest = quantile_25 - IQR_weight
        highest = quantile_75 + IQR_weight

        outlier_idx = df[column][ (df[column] < lowest) | (df[column] > highest) ].index
        return outlier_idx

    # 함수 사용해서 이상치 값 삭제

    outlier_index = []
    weight = 1.5

    for i in train.columns[:-1]:
        outlier_index.append(list(get_outlier(df=df, column=i, weight=weight)))


    print(f'weight를 {weight}로 줬을 때 IQR로 인해 제거되는 sample의 수 : {len(list(set(sum(outlier_index, []))))}')

else:
    print('Use IQR: False')
    
# outlier_idx = get_outlier(df=df, column='AB', weight=1.5)
# train.drop(outlier_idx, axis=0, inplace=True)
# train

weight를 1.5로 줬을 때 IQR로 인해 제거되는 sample의 수 : 551


In [94]:
###############
### Z-score ###
###############
if is_z:
    # seperate class 0,1
    train_0 = train[train.Class == 0.0]
    train_1 = train[train.Class == 1.0]

    # calculate z-score
    train_0_mean = np.mean(train_0)
    train_0_std = np.std(train_0)

    z = (train_0 - train_0_mean) / train_0_std

    # find outliars base on z-score
    drop_index_list = list(set(sum([list(z[(z[i] > 3) | (z[i] < -3)].index) for i in train.columns[:-3]], [])))

    # assign을 이용해 z-score column을 새로 만드는 법.
    # train = train.assign(z_score = lambda x : x.AB.sub(x.AB.mean()).div(x.AB.std()))
    # train[['AB', 'z_score']]
    print('## Use Z-score: True ##\n')
    print(f'- train_0에서 z-score로 걸러내는 column의 수 : {len(drop_index_list)}\n')

    print('--------------------------------------------------')
    print('Class_0에 있는 outliars만 제거한 후 Class 0, 1의 수 :')
    print(train.drop(index=drop_index_list).Class.value_counts())
    
    train.drop(index=drop_index_list, inplace=True)
    print(train.columns[:-1])

In [95]:
#######################
### IsolationForest ###
#######################

if is_iForest:
    from sklearn.ensemble import IsolationForest

    iForest = IsolationForest(n_estimators = 100,
                              contamination = 0.1,
                              # contamination = 'auto',
                              max_samples = 30,
                              bootstrap = False,
                              max_features = 5,
                              random_state = 42
                             )

    iForest.fit(train)

    y_pred = iForest.predict(train)
    y_score = iForest.score_samples(train)
    train['anomaly_label'] = y_pred
    train['anomaly_score'] = y_score


    train[train['anomaly_label'] == -1]
else:
    print('Use IsolationForest: False')

Use IsolationForest: False


In [96]:
############################
### Mahalanobis distance ###
############################

# - 데이터의 분포를 고려한 거리 측도로, 관측치가 평균으로부터 벗어난 정도를 측정
# - 마할라노비스 거리를 이용하여 평균으로부터 벗어난 이상값을 검출
# - 모든 변수 간 선형관계를 만족하고, 각 변수가 정규분포를 따르는 경우 적용 가능

if is_mahala:
    # StandardScaler를 이용한 정규분포화
    # from sklearn.preprocessing import StandardScaler

    # if is_scaling == 'standard':
    #     scaler = StandardScaler()
    #     data_ = scaler.fit_transform(X_train)
    #     X_train = pd.DataFrame(data=data_, columns=X_train.columns)
    #     data_ = scaler.transform(X_val)
    #     X_val = pd.DataFrame(data=data_, columns=X_val.columns)
    #     display(X_train)


    from scipy.spatial.distance import cdist

    x = np.array([[[1,2,3],
                   [3,4,5],
                   [5,6,7]],
                  [[5,6,7],
                   [7,8,9],
                   [9,0,1]]])

    i,j,k = x.shape

    xx = x.reshape(i,j*k).T
    print(xx)


    y = np.array([[[8,7,6],
                   [6,5,4],
                   [4,3,2]],
                  [[4,3,2],
                   [2,1,0],
                   [0,1,2]]])


    yy = y.reshape(i,j*k).T

    results =  cdist(xx,yy,'mahalanobis')

    results = np.diag(results)
    print (results)
else:
    print('Use Mahalanobis distance: False')

Use Mahalanobis distance: False


## Use VIF to remove multicollinearity

In [97]:
def check_vif(df):
    vifs = [variance_inflation_factor(df, i) for i in range(df.shape[1])]
    vif_df = pd.DataFrame({"features":df.columns, "VIF" : vifs})
    vif_df = vif_df.sort_values(by="VIF", ascending=False)
    remove_col = vif_df.iloc[0, 0]
    top_vif = vif_df.iloc[0, 1]
    return vif_df, remove_col, top_vif

In [98]:
# remove all features when VIF is over 10.

##### for test #####
# apply_vif = True

if apply_vif:
    top_vif = 100
    vif_dict = {}
    
    while(top_vif > 10):
        vif_df, remove_col, top_vif = check_vif(train)
        print(remove_col, top_vif)
        
        vif_dict[remove_col] = top_vif
        
        if top_vif < 10:
            break
        train = train.drop(columns=remove_col)

    display(train)
else:
    print('Use VIF: False')

Use VIF: False


In [99]:
# vif => train
# feature selection => X = train[selected_cols]

In [100]:
# feature selection via Feature Importance
X = train.drop(columns=["Class"])
y = train['Class']

if feature_selection:
    rf = RandomForestClassifier()
    rf.fit(X, y)
    print("Train ACC : %.4f" % accuracy_score(y, rf.predict(X)))
    fi_df = pd.DataFrame({'feature':X.columns, 'importance':rf.feature_importances_})
    selected_cols = fi_df.sort_values(by="importance", ascending=False)[:m]["feature"].values
    
    display(selected_cols)
    
    X = train[selected_cols]
    display(X)

Train ACC : 1.0000


array(['DU', 'FL', 'GL', 'DA', 'CR', 'DI', 'AF', 'AB', 'FD ', 'BC', 'FR',
       'EH', 'DE', 'EE', 'CC', 'FE', 'BQ', 'DH', 'FI', 'EB'], dtype=object)

Unnamed: 0,DU,FL,GL,DA,CR,DI,AF,AB,FD,BC,FR,EH,DE,EE,CC,FE,BQ,DH,FI,EB
0,5.310690,7.298162,0.120343,69.08340,0.069225,89.245560,3109.03329,0.209377,10.265073,5.555634,1.73855,0.949104,295.570575,1.987283,0.563481,9028.291921,152.707705,0.284232,3.583450,7.294176
1,0.005518,0.173229,21.978000,70.79836,1.117800,110.581815,978.76416,0.145282,0.296850,1.229900,0.49706,0.003042,178.553100,0.858603,0.484710,6785.003474,14.754720,0.363489,10.358927,4.926396
2,1.289739,7.709560,0.196941,70.81970,0.700350,120.056438,2635.10654,0.470030,8.745201,1.229900,0.97556,0.377208,321.426625,8.146651,0.495852,8338.906181,219.320160,0.210441,11.626917,7.813674
3,2.655345,6.122162,0.155829,47.27586,0.636075,139.824570,3819.65177,0.252107,7.884336,1.229900,0.49706,0.614484,196.607985,3.813326,0.717882,10965.766040,11.050410,0.292431,14.852022,7.386060
4,1.144902,8.153058,0.096614,74.06532,0.693150,97.920120,3733.04844,0.380297,4.274640,102.151980,48.50134,0.164268,200.178160,3.490846,0.536467,16198.049590,149.717165,0.207708,13.666727,7.350720
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.005518,0.173229,21.978000,21.75904,0.698250,176.977590,3130.05946,0.149555,0.296850,2.804172,1.26092,0.003042,355.930925,1.354416,0.691257,17167.209610,27.287375,0.445479,9.879296,8.015112
613,0.648318,10.223150,0.145340,43.90996,0.761025,192.598575,5462.03438,0.435846,6.067614,3.777550,1.24236,0.139932,157.393715,0.753797,0.772304,18460.330020,344.644105,0.437280,10.910227,8.976360
614,0.005518,0.173229,21.978000,104.62032,0.879825,218.915925,2459.10720,0.427300,0.296850,1.229900,0.49706,0.003042,223.209115,2.225112,0.708616,5088.922912,103.988995,0.382620,12.029366,9.478188
615,0.510378,9.256996,0.184622,51.04140,0.583125,113.526045,1263.53524,0.363205,6.192291,1.229900,0.78764,0.139932,112.196630,1.628524,0.602254,6464.250832,82.512333,0.549333,8.026928,10.078968


In [101]:
# corr과 VIF와의 관계 파악
# 1:1의 상관관계를 파악하고 싶으면 corr
# 다:1의 상관관계를 파악하고 싶으면 VIF

if is_visualizing:
    from collections import Counter

    corr_dict = Counter(sum(combinations, []))

    # pd.DataFrame(X=corr_dict.keys())

    corr_df = pd.DataFrame(index=corr_dict.keys(), data=corr_dict.values(), columns=['corr'])
    vif_df = pd.DataFrame(data=vif_dict.values(), index=vif_dict.keys(), columns=['vif'])

    display(corr_df.join(vif_df, how='outer').sort_values(by='corr', ascending=False).fillna('-'))
else:
    print('Use Visualization: False')

Use Visualization: False


## 3. Data preprocessing

In [102]:
# class imbalance handling
## 1. undersampling

##### for test #####
# sampling_method = 'under'


if sampling_method == 'under':
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape) # 108, 509 -> 108, 108
    c0 = c0.sample(n=c1.shape[0]) # 509 -> 108
    train = pd.concat([c0, c1])
    print(train.shape)

(108, 57) (509, 57)
(216, 57)


In [103]:
## before oversampling
# df = train[selected_cols]
# df["Class"] = train["Class"]
# pd.pivot_table(index="Class", data=df)

In [104]:
## 2. oversampling -> SMOTE

##### for test #####
# sampling_method = 'over'

if sampling_method == 'over':
    X = train[selected_cols]
    y = train['Class']

    smote = SMOTE(k_neighbors=5)
    # fit_resample 함수가 자동으로 minority class를 y 기준으로 찾아서, 1:1로 맞춰줌.
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print(X_resampled.shape, y_resampled.shape)
    X_resampled["Class"] = y_resampled
    train = X_resampled

In [105]:
# After SMOTE
# df = X_resampled.copy()
# df["Class"] = y_resampled
# pd.pivot_table(index="Class", data=df)

In [106]:
# 3. hybrid approach
## class0 : 509 -> 300
## class1 : 108 -> 300
if sampling_method == 'hybrid':
    N = 300
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape)
    c0 = c0.sample(n=N) # 509 -> 300
    train = pd.concat([c0, c1])
    print(train.shape)
    
    if feature_selection:
        X = train[selected_cols]
    else:
        X = train.drop("Class", axis=1)
    y = train.Class

    smote = SMOTE(k_neighbors=5)
    # The fit_resample function automatically finds the minority class by y and fits it 1:1.
    X, y = smote.fit_resample(X, y) # 300, 108 --> 300, 300
    print(X.shape, y.shape)
    display(X), display(y)

In [107]:
## we don't need anymore because we dropped, valid set.

# # to make OOF prediction
# from sklearn.model_selection import train_test_split

# #X = train.drop(columns=["Class"])
# X = train[selected_cols]
# y = train['Class']

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)
# print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

### feature scaling

- Use StandardScaler

In [108]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

if is_scaling == 'standard':
    print('== Use StandardScaler ==')
    scaler = StandardScaler()
    data_ = scaler.fit_transform(X)
    X = pd.DataFrame(data=data_, columns=X.columns)
    display(X)
    
#     data_ = scaler.fit_transform(X_train)
#     X_train = pd.DataFrame(data=data_, columns=X_train.columns)
#     data_ = scaler.transform(X_val)
#     X_val = pd.DataFrame(data=data_, columns=X_val.columns)
#     display(X_train)

if is_scaling == 'minmax':
    print('== Use MinMaxScaler ==')
    scaler = MinMaxScaler()
    data_ = scaler.fit_transform(X_train)
    X_train = pd.DataFrame(data=data_, columns=X_train.columns)
    display(X_train)

== Use StandardScaler ==


Unnamed: 0,DU,FL,GL,DA,CR,DI,AF,AB,FD,BC,FR,EH,DE,EE,CC,FE,BQ,DH,FI,EB
0,0.388996,0.162497,-0.814910,0.847190,-2.395430,-0.671125,-0.170975,-0.572153,0.051544,-0.038354,-0.035806,0.348860,-0.334913,-0.523902,-0.477737,-0.112922,0.589077,-0.733141,-2.226608,-0.287078
1,-0.199154,-0.458271,1.304748,0.928108,1.336593,-0.423071,-1.097801,-0.709105,-0.102520,-0.104787,-0.060566,-0.163632,-0.703485,-1.072690,-0.776607,-0.311056,-0.899462,-0.031113,0.084542,-0.669271
2,-0.056781,0.198341,-0.807481,0.929115,-0.149169,-0.312920,-0.377169,-0.015212,0.028054,-0.104787,-0.051023,0.039058,-0.253473,2.470916,-0.734334,-0.173811,1.307838,-1.386753,0.517060,-0.203224
3,0.094615,0.060037,-0.811468,-0.181774,-0.377933,-0.083097,0.138196,-0.480851,0.014748,-0.104787,-0.060566,0.167593,-0.646618,0.363960,0.108090,0.058201,-0.939432,-0.660517,1.617160,-0.272247
4,-0.072838,0.236981,-0.817211,1.082256,-0.174795,-0.570275,0.100517,-0.206946,-0.041041,1.445139,0.896815,-0.076294,-0.635372,0.207163,-0.580232,0.520331,0.556808,-1.410960,1.212850,-0.277951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,-0.199154,-0.458271,1.304748,-1.385756,-0.156643,0.348842,-0.161828,-0.699975,-0.102520,-0.080610,-0.045332,-0.163632,-0.144794,-0.831615,0.007070,0.605929,-0.764233,0.695122,-0.079062,-0.170709
613,-0.127891,0.417340,-0.812485,-0.340590,0.066781,0.530450,0.852755,-0.088253,-0.013330,-0.065661,-0.045702,-0.089477,-0.770132,-1.123649,0.314578,0.720141,2.660107,0.622499,0.272594,-0.015551
614,-0.199154,-0.458271,1.304748,2.523958,0.489607,0.836414,-0.453742,-0.106514,-0.102520,-0.104787,-0.060566,-0.163632,-0.562831,-0.408264,0.072935,-0.460858,0.063393,0.138342,0.654338,0.065451
615,-0.143183,0.333163,-0.808676,-0.004101,-0.566389,-0.388841,-0.973904,-0.243466,-0.011403,-0.104787,-0.054771,-0.089477,-0.912490,-0.698338,-0.330624,-0.339386,-0.168345,1.615020,-0.710915,0.162426


In [109]:
if is_pca:
    from sklearn.decomposition import PCA
    
    pca = PCA(n_components=n_components, random_state=42)
    data_ = pca.fit_transform(X)
    X = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])
    display(X)

#     data_ = pca.fit_transform(X_train)
#     X_train = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])
#     data_ = pca.transform(X_val)
#     X_val = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

#     display(X_train)

## 4. Fitting and Evaluation


- does not need anymore.

In [110]:
##################
### set metric ###
##################

evaluation_metric = balance_logloss
#evaluation_metric_keras = b_logloss_keras

## 5. (Super)Hyper-parameter Tuning and OOF prediction

Let's try hyper-parameter tuning using optuna, an AutoML framework.

Optuna defines a target function to optimize and then optimizes that function.

For each model, we define an optimizer function separately and then burn optuna.

In [111]:
# make oof predictions
submission = pd.read_csv('../input/icr-identify-age-related-conditions/sample_submission.csv')
final_preds = []
# rf_preds = pd.DataFrame(columns=submission.columns, data=np.zeros((train.shape[0], submission.shape[1])))
# svm_preds = pd.DataFrame(columns=submission.columns, data=np.zeros((train.shape[0], submission.shape[1])))
# xgb_preds = pd.DataFrame(columns=submission.columns, data=np.zeros((train.shape[0], submission.shape[1])))

In [112]:
# vif => train
# feature selection => X = train[selected_cols]

In [113]:
# apply same preprocessing in Testset
test.EJ = lb.transform(test.EJ)  # A->0, B->1
test = test.drop(columns=["Id"])

if apply_vif and feature_selection: # 둘 다 True: O
    print('=== test: VIF + feature-selection ===\n')
    X_test = pd.DataFrame(columns=test.columns, data=imp.transform(test))
    X_test = X_test[selected_cols]
    X_test = pd.DataFrame(columns=X_test.columns, data=scaler.transform(X_test))    
    
elif apply_vif or feature_selection: # 둘 중 하나만 True
    X_test = pd.DataFrame(columns=test.columns, data=imp.transform(test)) # imp(knn-imputer) : train 모든 column들
    if apply_vif: # vif만 Ture
        print('=== Use VIF only ===\n')
        X_test = X_test[train.columns.drop("Class")]
    else: # feature-selection만 True
        print('=== Use feature-selection only ===\n')
        X_test = X_test[selected_cols]
    X_test = pd.DataFrame(columns=X_test.columns, data=scaler.transform(X_test)) # scaler(standard/minmax) column 제거 후
    
elif is_pca: # PCA만 쓸 경우
    print('=== Use PCA only ===\n')
    X_test = pd.DataFrame(columns=test.columns, data=scaler.transform(imp.transform(test)))
    data_ = pca.fit_transform(X_test)
    X_test = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])
else: # 둘 다 False: O
    print('=== Not use VIF & feature-selection ===\n')
    X_test = pd.DataFrame(columns=test.columns, data=scaler.transform(imp.transform(test)))

X_test

=== Use feature-selection only ===



Unnamed: 0,DU,FL,GL,DA,CR,DI,AF,AB,FD,BC,FR,EH,DE,EE,CC,FE,BQ,DH,FI,EB
0,-0.199766,-0.473364,-0.82658,-2.412431,-2.641811,-1.708687,-1.523636,-1.019531,-0.107108,-0.123675,-0.070479,-0.16528,-1.265879,-1.490161,-2.615686,-0.910326,-1.058669,-3.250757,-3.448942,-1.46446
1,-0.199766,-0.473364,-0.82658,-2.412431,-2.641811,-1.708687,-1.523636,-1.019531,-0.107108,-0.123675,-0.070479,-0.16528,-1.265879,-1.490161,-2.615686,-0.910326,-1.058669,-3.250757,-3.448942,-1.46446
2,-0.199766,-0.473364,-0.82658,-2.412431,-2.641811,-1.708687,-1.523636,-1.019531,-0.107108,-0.123675,-0.070479,-0.16528,-1.265879,-1.490161,-2.615686,-0.910326,-1.058669,-3.250757,-3.448942,-1.46446
3,-0.199766,-0.473364,-0.82658,-2.412431,-2.641811,-1.708687,-1.523636,-1.019531,-0.107108,-0.123675,-0.070479,-0.16528,-1.265879,-1.490161,-2.615686,-0.910326,-1.058669,-3.250757,-3.448942,-1.46446
4,-0.199766,-0.473364,-0.82658,-2.412431,-2.641811,-1.708687,-1.523636,-1.019531,-0.107108,-0.123675,-0.070479,-0.16528,-1.265879,-1.490161,-2.615686,-0.910326,-1.058669,-3.250757,-3.448942,-1.46446


In [114]:
def rf_optimizer(trial, X, y, K):
    # define parameter to tune    
    n_estimators = trial.suggest_categorical('n_estimators', [50, 100, 200])
    max_depth = trial.suggest_int('max_depth', 4, 10)
    max_features = trial.suggest_categorical('max_features', [0.6, 0.7, 0.8])
    
    
    # set model
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   max_features=max_features,
                                   criterion='log_loss',
                                   class_weight='balanced'
                                  )
    
    # K-Fold Cross validation
    folds = StratifiedKFold(n_splits=K, shuffle=True)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    # return mean score of CV
    return np.mean(losses)

In [115]:
def xgb_optimizer(trial, X, y, K):
    
    n_estimators = trial.suggest_categorical('n_estimators', [500, 1000, 2000])
    max_depth = trial.suggest_int('max_depth', 4, 10)
    colsample_bytree = trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8])
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-2)
    reg_lambda = trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1, 2])
    
    
    model = XGBClassifier(n_estimators=n_estimators,
                          max_depth=max_depth,
                          colsample_bytree=colsample_bytree,
                          learning_rate=learning_rate,
                          reg_lambda=reg_lambda)
#                          scale_pos_weight=4.71)  ## we set class imbalance by using sampling method.
    
    
    folds = StratifiedKFold(n_splits=K, shuffle=True)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    return np.mean(losses)

In [116]:
############################
### Tunning RandomForest ###
############################

opt_func = partial(rf_optimizer, X=X, y=y, K=K)

if is_tuning:
    rf_study = optuna.create_study(direction="minimize", sampler=sampler) # determine minimize or maximize sth
    rf_study.optimize(opt_func, n_trials=n_trials)

[32m[I 2023-06-23 13:23:51,785][0m A new study created in memory with name: no-name-635c27f8-42ad-4f32-b69c-50500d34335e[0m
[32m[I 2023-06-23 13:23:53,467][0m Trial 0 finished with value: 0.5685013153212268 and parameters: {'n_estimators': 100, 'max_depth': 8, 'max_features': 0.6}. Best is trial 0 with value: 0.5685013153212268.[0m
[32m[I 2023-06-23 13:23:54,192][0m Trial 1 finished with value: 0.38244715019924885 and parameters: {'n_estimators': 50, 'max_depth': 4, 'max_features': 0.6}. Best is trial 1 with value: 0.38244715019924885.[0m
[32m[I 2023-06-23 13:23:57,637][0m Trial 2 finished with value: 0.5699926452432452 and parameters: {'n_estimators': 200, 'max_depth': 7, 'max_features': 0.8}. Best is trial 1 with value: 0.38244715019924885.[0m
[32m[I 2023-06-23 13:24:00,812][0m Trial 3 finished with value: 0.40293933087725964 and parameters: {'n_estimators': 200, 'max_depth': 7, 'max_features': 0.6}. Best is trial 1 with value: 0.38244715019924885.[0m
[32m[I 2023-06-2

In [117]:
### SVM

if is_tuning:
    best_loss = 9999.0
    best_C = 0
    kernel = 'linear'
    folds = StratifiedKFold(n_splits=K, random_state=42, shuffle=True)

    # for Linear SVM
    for C in tqdm([1, 2, 5, 10, 100]):
        losses = []
        l_svm = LinearSVC(C=C, probability=True) ## cuml version. (faster model)

        for train_idx, val_idx in folds.split(X, y):
            X_train = X.iloc[train_idx, :]
            y_train = y.iloc[train_idx]
            X_val = X.iloc[val_idx, :]
            y_val = y.iloc[val_idx]

            l_svm.fit(X_train, y_train)
            preds = l_svm.predict_proba(X_val).values
            loss = evaluation_metric(y_val, preds)
            losses.append(loss)

        avg_loss = np.mean(losses)
        if avg_loss < best_loss:
            best_loss = avg_loss
            best_C = C

    # for SVM with RBF kernel.
    for C in tqdm([1, 2, 5, 10, 100]):
        losses = []
        r_svm = SVC(C=C, probability=True) ## cuml version. (with rbf kernel)

        for train_idx, val_idx in folds.split(X, y):
            X_train = X.iloc[train_idx, :]
            y_train = y.iloc[train_idx]
            X_val = X.iloc[val_idx, :]
            y_val = y.iloc[val_idx]

            r_svm.fit(X_train, y_train)
            preds = r_svm.predict_proba(X_val).values
            loss = evaluation_metric(y_val, preds)
            losses.append(loss)

        avg_loss = np.mean(losses)
        if avg_loss < best_loss:
            best_loss = avg_loss
            best_C = C
            kernel = 'rbf'

    print("SVM(%s) log loss : %.4f" % (kernel, best_loss))

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

SVM(linear) log loss : 0.5015


In [118]:
#######################
### Tunning XGBoost ###
#######################

opt_func = partial(xgb_optimizer, X=X, y=y, K=K)

if is_tuning:
    xgb_study = optuna.create_study(direction="minimize", sampler=sampler)
    xgb_study.optimize(opt_func, n_trials=n_trials)

[32m[I 2023-06-23 13:24:17,961][0m A new study created in memory with name: no-name-b57ca2d0-1dcd-412c-9646-fa55e82f330b[0m
[32m[I 2023-06-23 13:24:22,187][0m Trial 0 finished with value: 0.4002839915394881 and parameters: {'n_estimators': 500, 'max_depth': 9, 'colsample_bytree': 0.7, 'learning_rate': 0.004226191556898453, 'reg_lambda': 0.5}. Best is trial 0 with value: 0.4002839915394881.[0m
[32m[I 2023-06-23 13:24:29,675][0m Trial 1 finished with value: 0.5160900528671231 and parameters: {'n_estimators': 2000, 'max_depth': 9, 'colsample_bytree': 0.6, 'learning_rate': 0.0074192030850069556, 'reg_lambda': 1}. Best is trial 0 with value: 0.4002839915394881.[0m
[32m[I 2023-06-23 13:24:31,697][0m Trial 2 finished with value: 0.41343229231755024 and parameters: {'n_estimators': 500, 'max_depth': 4, 'colsample_bytree': 0.6, 'learning_rate': 0.009168098265334837, 'reg_lambda': 1}. Best is trial 0 with value: 0.4002839915394881.[0m
[32m[I 2023-06-23 13:24:37,390][0m Trial 3 fini

In [119]:
# visualize experiment logs
def display_experiment_log(study):
    display(study.trials_dataframe())
    print("Best Score: %.4f" % study.best_value)
    print("Best params: ", study.best_trial.params)
    history = study.trials_dataframe()
    display(history[history.value == study.best_value])
    optuna.visualization.plot_optimization_history(study).show()
    optuna.visualization.plot_param_importances(study).show()

In [120]:
### RandomForest ###
if is_tuning:
    display_experiment_log(rf_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
0,0,0.568501,2023-06-23 13:23:51.789150,2023-06-23 13:23:53.467509,0 days 00:00:01.678359,8,0.6,100,COMPLETE
1,1,0.382447,2023-06-23 13:23:53.468937,2023-06-23 13:23:54.192062,0 days 00:00:00.723125,4,0.6,50,COMPLETE
2,2,0.569993,2023-06-23 13:23:54.193395,2023-06-23 13:23:57.636963,0 days 00:00:03.443568,7,0.8,200,COMPLETE
3,3,0.402939,2023-06-23 13:23:57.638908,2023-06-23 13:24:00.812173,0 days 00:00:03.173265,7,0.6,200,COMPLETE
4,4,0.400583,2023-06-23 13:24:00.813533,2023-06-23 13:24:04.096919,0 days 00:00:03.283386,5,0.8,200,COMPLETE
5,5,0.581279,2023-06-23 13:24:04.098867,2023-06-23 13:24:04.969465,0 days 00:00:00.870598,8,0.8,50,COMPLETE
6,6,0.576337,2023-06-23 13:24:04.970803,2023-06-23 13:24:06.704622,0 days 00:00:01.733819,8,0.8,100,COMPLETE
7,7,0.693783,2023-06-23 13:24:06.706117,2023-06-23 13:24:08.451748,0 days 00:00:01.745631,10,0.8,100,COMPLETE
8,8,0.725389,2023-06-23 13:24:08.453015,2023-06-23 13:24:10.148218,0 days 00:00:01.695203,6,0.8,100,COMPLETE
9,9,0.4047,2023-06-23 13:24:10.149457,2023-06-23 13:24:13.458749,0 days 00:00:03.309292,4,0.8,200,COMPLETE


Best Score: 0.3824
Best params:  {'n_estimators': 50, 'max_depth': 4, 'max_features': 0.6}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
1,1,0.382447,2023-06-23 13:23:53.468937,2023-06-23 13:23:54.192062,0 days 00:00:00.723125,4,0.6,50,COMPLETE


In [121]:
if is_tuning:
    display_experiment_log(xgb_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
0,0,0.400284,2023-06-23 13:24:17.965296,2023-06-23 13:24:22.186742,0 days 00:00:04.221446,0.7,0.004226,9,500,0.5,COMPLETE
1,1,0.51609,2023-06-23 13:24:22.188514,2023-06-23 13:24:29.674855,0 days 00:00:07.486341,0.6,0.007419,9,2000,1.0,COMPLETE
2,2,0.413432,2023-06-23 13:24:29.676748,2023-06-23 13:24:31.697379,0 days 00:00:02.020631,0.6,0.009168,4,500,1.0,COMPLETE
3,3,0.435309,2023-06-23 13:24:31.701020,2023-06-23 13:24:37.390338,0 days 00:00:05.689318,0.7,0.002679,10,1000,2.0,COMPLETE
4,4,0.436952,2023-06-23 13:24:37.392201,2023-06-23 13:24:39.885786,0 days 00:00:02.493585,0.6,0.004757,6,500,2.0,COMPLETE
5,5,0.456133,2023-06-23 13:24:39.887780,2023-06-23 13:24:48.018040,0 days 00:00:08.130260,0.5,0.003708,6,2000,1.0,COMPLETE
6,6,0.460817,2023-06-23 13:24:48.019922,2023-06-23 13:24:55.882977,0 days 00:00:07.863055,0.7,0.007049,5,2000,0.1,COMPLETE
7,7,0.411445,2023-06-23 13:24:55.886693,2023-06-23 13:24:59.171786,0 days 00:00:03.285093,0.5,0.006318,4,1000,0.1,COMPLETE
8,8,0.495189,2023-06-23 13:24:59.173717,2023-06-23 13:25:04.983362,0 days 00:00:05.809645,0.5,0.009322,6,2000,0.1,COMPLETE
9,9,0.436999,2023-06-23 13:25:04.987277,2023-06-23 13:25:06.929832,0 days 00:00:01.942555,0.6,0.004143,4,500,0.5,COMPLETE


Best Score: 0.4003
Best params:  {'n_estimators': 500, 'max_depth': 9, 'colsample_bytree': 0.7, 'learning_rate': 0.004226191556898453, 'reg_lambda': 0.5}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
0,0,0.400284,2023-06-23 13:24:17.965296,2023-06-23 13:24:22.186742,0 days 00:00:04.221446,0.7,0.004226,9,500,0.5,COMPLETE


## 6. Test Prediction and Make Submission

In [122]:
# Finalize Models
if is_tuning:
    rf_best_params = rf_study.best_params
    xgb_best_params = xgb_study.best_params

    best_rf = RandomForestClassifier(**rf_best_params)
    best_xgb = XGBClassifier(**xgb_best_params)
    if kernel == 'linear':
        best_svm = l_svm
    else:
        best_svm = r_svm

### Need to validate OOF prediction score.

- OOF score is not correlated with LB score.

In [133]:
# Make KFold OOF prediction
def oof_preds(best_model, svm=False):
    # call global variable
    global final_preds
    print(K)
    # make KFold
    folds = StratifiedKFold(n_splits=K, random_state=42, shuffle=True)
    losses = []
            
    if svm: # cuml SVC
        if kernel == 'linear':
            svm = LinearSVC(C=best_C, probability=True)
        else:
            svm = SVC(C=best_C, probability=True)
            

        for i, (train_idx, val_idx) in enumerate(folds.split(X, y)):
            X_train = X.iloc[train_idx, :]
            y_train = y.iloc[train_idx]
            X_val = X.iloc[val_idx, :]
            y_val = y.iloc[val_idx]

            print(f"========== Fold {i+1} ==========")
            svm.fit(X_train, y_train)
            preds = svm.predict_proba(X_val).values
            test_preds = svm.predict_proba(X_test)
            final_preds.append(test_preds)
            loss = evaluation_metric(y_val, preds)
            print(f"Loss : {loss:.4f}")
            losses.append(loss)
    
    else:
        # fitting with best_model
        for i, (train_idx, val_idx) in enumerate(folds.split(X, y)):
            X_train = X.iloc[train_idx, :]
            y_train = y.iloc[train_idx]
            X_val = X.iloc[val_idx, :]
            y_val = y.iloc[val_idx]

            print(f"========== Fold {i+1} ==========")
            best_model.fit(X_train, y_train)
            preds = best_model.predict_proba(X_val)
            test_preds = best_model.predict_proba(X_test)
            final_preds.append(test_preds)
            loss = evaluation_metric(y_val, preds)
            print(f"Loss : {loss:.4f}")
            losses.append(loss)

    avg_loss = np.mean(losses)
    return avg_loss

In [134]:
print("Random Forest")
rf_loss = oof_preds(best_model=best_rf, svm=False)
print("Avg Loss : %.4f" % rf_loss)
print("\nXGBoost")
xgb_loss = oof_preds(best_model=best_xgb, svm=False)
print("Avg Loss : %.4f" % xgb_loss)
print("\nSupport Vector Machine")
svm_loss = oof_preds(best_model=best_svm, svm=True)
print("Avg Loss : %.4f" % svm_loss)

print("Total logloss : %.4f" % np.mean([rf_loss, xgb_loss, svm_loss]))

Random Forest
4
Loss : 0.4672
Loss : 0.4557
Loss : 0.6924
Loss : 0.4032
Avg Loss : 0.5046

XGBoost
4
Loss : 0.3807
Loss : 0.3759
Loss : 0.5386
Loss : 0.3625
Avg Loss : 0.4144

Support Vector Machine
4
Loss : 0.3105
Loss : 0.5889
Loss : 0.7180
Loss : 0.3887
Avg Loss : 0.5015
Total logloss : 0.4735


In [125]:
pd.DataFrame(columns=[0, 1], data=best_rf.predict_proba(X_val)).head()

Unnamed: 0,0,1
0,0.956666,0.043334
1,0.977951,0.022049
2,0.416585,0.583415
3,0.977951,0.022049
4,0.79365,0.20635


In [126]:
best_svm.predict_proba(X_val).head()

Unnamed: 0,0,1
0,0.973764,0.026236
1,0.952934,0.047066
2,0.089028,0.910972
3,0.977911,0.022089
4,0.910889,0.089111


In [127]:
final_preds = final_preds[:-K] # remove SVM KFold predictions

In [132]:
final_preds

[array([[0.54803287, 0.45196713],
        [0.54803287, 0.45196713],
        [0.54803287, 0.45196713],
        [0.54803287, 0.45196713],
        [0.54803287, 0.45196713]]),
 array([[0.60464141, 0.39535859],
        [0.60464141, 0.39535859],
        [0.60464141, 0.39535859],
        [0.60464141, 0.39535859],
        [0.60464141, 0.39535859]]),
 array([[0.57023103, 0.42976897],
        [0.57023103, 0.42976897],
        [0.57023103, 0.42976897],
        [0.57023103, 0.42976897],
        [0.57023103, 0.42976897]]),
 array([[0.52675195, 0.47324805],
        [0.52675195, 0.47324805],
        [0.52675195, 0.47324805],
        [0.52675195, 0.47324805],
        [0.52675195, 0.47324805]]),
 array([[0.575266  , 0.42473397],
        [0.575266  , 0.42473397],
        [0.575266  , 0.42473397],
        [0.575266  , 0.42473397],
        [0.575266  , 0.42473397]], dtype=float32),
 array([[0.8439357, 0.1560643],
        [0.8439357, 0.1560643],
        [0.8439357, 0.1560643],
        [0.8439357, 0.1560643

In [128]:
submission[['class_0', 'class_1']] = np.mean(final_preds, axis=0)
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.606617,0.393383
1,010ebe33f668,0.606617,0.393383
2,02fa521e1838,0.606617,0.393383
3,040e15f562a2,0.606617,0.393383
4,046e85c7cc7f,0.606617,0.393383


In [131]:
# #voting_weights = [0.1, 0.1, 0.25, 0.25, 0.3]
# #voting_weights = [0.2, 0.2, 0.2, 0.2, 0.2]
# #voting_weights = [0.25, 0.25, 0.25, 0.25]
# #voting_weights = [0.5, 0.5]
# voting_weights = [0.35, 0.35, 0.3]
# # submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_lr[:, 0] + voting_weights[2]*preds_xgb[:, 0] + voting_weights[3]*preds_nn[:, 0] + voting_weights[4]*preds_svm[:, 0]
# # submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_lr[:, 1] + voting_weights[2]*preds_xgb[:, 1] + voting_weights[3]*preds_nn[:, 1] + voting_weights[4]*preds_svm[:, 1]
# # submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_lr[:, 0] + voting_weights[2]*preds_xgb[:, 0] + voting_weights[3]*preds_svm[:, 0]
# # submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_lr[:, 1] + voting_weights[2]*preds_xgb[:, 1] + voting_weights[3]*preds_svm[:, 1]
# submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_xgb[:, 0] + voting_weights[2]*preds_svm[:, 0]
# submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_xgb[:, 1] + voting_weights[2]*preds_svm[:, 1]
# submission

In [None]:
submission.to_csv("submission.csv", index=False)