Shout-out to YONDAM KIM (my teacher).

### 00. Data and Library Load

In [1]:
# If this is your first time using TabPFN, add the data (pip-packages-icr) to your notebook and run this cell
# !pip download tabpfn --no-deps -d pip-packages
# !pip install tabpfn

In [2]:
# If you have already installed tabpfn, turn off the internet and run this cell
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages

Looking in links: file:///kaggle/input/pip-packages-icr/pip-packages
Processing /kaggle/input/pip-packages-icr/pip-packages/tabpfn-0.1.9-py3-none-any.whl
Installing collected packages: tabpfn
Successfully installed tabpfn-0.1.9
[0m

In [3]:
# Same as the previous cell
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [4]:
# basic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import QuantileTransformer

# ML
from sklearn.ensemble import RandomForestClassifier  # Bagging
from xgboost.sklearn import XGBClassifier            # GBM
from sklearn.svm import SVC                          # SVM
from sklearn.utils.class_weight import compute_sample_weight

# NN
from sklearn.neural_network import MLPClassifier
from tabpfn import TabPFNClassifier

# KFold(CV), partial
from sklearn.model_selection import KFold, StratifiedKFold
from functools import partial
from sklearn.ensemble import VotingClassifier
from random import *
from sklearn.model_selection import train_test_split

# AutoML framework
import optuna

import warnings
warnings.filterwarnings("ignore")

In [5]:
def balance_logloss(y_true, y_pred):
    """Balanced binary log loss: the mean of the per-class average log losses.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Class labels. 0 is the negative class; any non-zero label is treated
        as the positive class.
    y_pred : array-like of shape (n_samples, 2)
        Predicted probabilities; column 0 is class 0, column 1 is class 1.

    Returns
    -------
    float
        Average of the per-class mean negative log-likelihoods. When only one
        class is present in ``y_true`` (e.g. a degenerate CV fold) the loss of
        that single class is returned instead of raising or producing NaN.

    Raises
    ------
    ValueError
        If ``y_true`` is empty.
    """
    labels = np.asarray(y_true)
    if labels.size == 0:
        raise ValueError("balance_logloss requires at least one sample")

    # Clip away exact 0/1 so the logs stay finite, then renormalise each row.
    # np.clip returns a new array, so the caller's predictions are not modified.
    probs = np.clip(np.asarray(y_pred, dtype=float), 1e-15, 1 - 1e-15)
    probs = probs / probs.sum(axis=1, keepdims=True)

    # Masks (rather than np.bincount) work for float labels and keep the
    # "positive" count consistent with the "label != 0" selection.
    is_negative = labels == 0
    is_positive = ~is_negative

    per_class_losses = []
    for mask, column in ((is_negative, 0), (is_positive, 1)):
        n_in_class = mask.sum()
        if n_in_class:  # skip a class that is absent from y_true
            per_class_losses.append(-np.log(probs[mask, column]).sum() / n_in_class)

    return np.mean(per_class_losses)

In [6]:
def check_vif(df):
    """Rank the columns of ``df`` by variance inflation factor (VIF).

    Returns a tuple ``(vif_table, worst_column, worst_vif)`` where ``vif_table``
    lists every column with its VIF sorted from highest to lowest, and the
    other two values describe the first (highest-VIF) row of that table.
    """
    vif_values = []
    for position in range(df.shape[1]):
        vif_values.append(variance_inflation_factor(df, position))

    ranking = pd.DataFrame({"features": df.columns, "VIF": vif_values})
    ranking = ranking.sort_values(by="VIF", ascending=False)

    worst_column, worst_vif = ranking.iloc[0, 0], ranking.iloc[0, 1]
    return ranking, worst_column, worst_vif

In [7]:
# visualize experiment logs
def display_experiment_log(study):
    """Summarise an Optuna study in the notebook.

    Shows the table of all trials, prints the best score and parameters,
    shows the row(s) whose value equals the best score, and renders the
    optimization-history and parameter-importance plots.
    """
    trials = study.trials_dataframe()
    display(trials)

    best_score = study.best_value
    print("Best Score: %.4f" % best_score)
    print("Best params: ", study.best_trial.params)

    # Row(s) of the trial table that achieved the best score
    display(trials[trials.value == best_score])

    for make_plot in (optuna.visualization.plot_optimization_history,
                      optuna.visualization.plot_param_importances):
        make_plot(study).show()

In [8]:
# Experiment configuration: every switch below is read by a later cell.

# preprocessing
scaling = 'Quantile'                   # ['Standard','Robust','MinMax','Quantile'] or False
integerized = False                    # True: divide selected columns by fixed denominators and round
sampling_method = 'hybrid'             # ['under','SMOTE','ADASYN','ran_over','hybrid'] or False


# dimension reduction
feature_extraction = False             # ['PCA','LDA'] or False
apply_vif = False                      # True: iteratively drop the highest-VIF feature
feature_selection = False              # ['permutation','rf_model'] or False

# ensemble
models = ['RF','XGB','SVM','TabPFN']
weights= [0.15, 0.35, 0, 0.5]         # soft-voting weights, one per model in the order above, or None
n_trials = 50                          # number of Optuna trials per tuned model
evaluation_metric = balance_logloss    # metric minimised during tuning and reported in CV

# postprocessing
post = True                            # True: snap confident class-0 probabilities to 0 or 1

# CV
bag_num = 4                            # number of repeated K-fold rounds ("bags") in oof_preds

In [9]:
# Load the competition files: training set, test set and the supplementary "greeks" table
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
greek = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')
# Quick shape check of the two main frames
print(train.shape, test.shape)

(617, 58) (5, 57)


### 01. Preprocessing

In [10]:
# Integer-encode the categorical columns. The same encoder object is re-fitted
# for each column, so afterwards it only remembers the classes of the last one.
lb = LabelEncoder()
train["EJ"] = lb.fit_transform(train["EJ"])  # A->0, B->1
greek["Alpha"] = lb.fit_transform(greek["Alpha"])
greek["Gamma"] = lb.fit_transform(greek["Gamma"])

# The row identifier carries no predictive signal
train = train.drop("Id", axis=1)

In [11]:
# Optional "integerization": divide each listed column by a fixed per-column
# denominator and round to one decimal place.
# Several keys ('BD ', 'CD ', 'CW ', 'FD ') end with a space; they are used
# verbatim as column labels of `train`, so they must be kept exactly as written.
# NOTE(review): presumably the denominators are the measurement step of each
# column, so the result is close to an integer scale; confirm the source.
if integerized:
    int_denominators = {
     'AB': 0.004273,
     'AF': 0.00242,
     'AH': 0.008709,
     'AM': 0.003097,
     'AR': 0.005244,
     'AX': 0.008859,
     'AY': 0.000609,
     'AZ': 0.006302,
     'BC': 0.007028,
     'BD ': 0.00799,
     'BN': 0.3531,
     'BP': 0.004239,
     'BQ': 0.002605,
     'BR': 0.006049,
     'BZ': 0.004267,
     'CB': 0.009191,
     'CC': 6.12e-06,
     'CD ': 0.007928,
     'CF': 0.003041,
     'CH': 0.000398,
     'CL': 0.006365,
     'CR': 7.5e-05,
     'CS': 0.003487,
     'CU': 0.005517,
     'CW ': 9.2e-05,
     'DA': 0.00388,
     'DE': 0.004435,
     'DF': 0.000351,
     'DH': 0.002733,
     'DI': 0.003765,
     'DL': 0.00212,
     'DN': 0.003412,
     'DU': 0.0013794,
     'DV': 0.00259,
     'DY': 0.004492,
     'EB': 0.007068,
     'EE': 0.004031,
     'EG': 0.006025,
     'EH': 0.006084,
     'EL': 0.000429,
     'EP': 0.009269,
     'EU': 0.005064,
     'FC': 0.005712,
     'FD ': 0.005937,
     'FE': 0.007486,
     'FI': 0.005513,
     'FR': 0.00058,
     'FS': 0.006773,
     'GB': 0.009302,
     'GE': 0.004417,
     'GF': 0.004374,
     'GH': 0.003721,
     'GI': 0.002572
    }
    # Rescale every listed training column in place
    for k, v in int_denominators.items():
        train[k] = np.round(train[k]/v,1)

In [12]:
# Impute missing values with the median of each column
train = train.fillna(train.median())

# Sanity check: list the rows that still contain a missing value (expected: none)
train.loc[train.isna().any(axis="columns")]

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class


In [13]:
# Separate the target column from the feature matrix
y = train["Class"]
X = train.drop("Class", axis=1)

In [14]:
# Repeatedly drop the feature with the highest VIF until the highest VIF falls below 10
if apply_vif:
    top_vif = 100  # sentinel so the loop body runs at least once

    while top_vif > 10:
        vif_df, remove_col, top_vif = check_vif(X)
        print(remove_col, top_vif)
        if not top_vif < 10:
            X = X.drop(columns=remove_col)
        else:
            break

In [15]:
# Feature selection: keep only the columns judged useful by a random forest
if feature_selection == 'rf_model':
    # Impurity-based importances: keep the 20 most important features
    rf = RandomForestClassifier()
    rf.fit(X, y)
    print("Train ACC : %.4f" % accuracy_score(y, rf.predict(X)))
    fi_df = pd.DataFrame({'feature': X.columns, 'importance': rf.feature_importances_})
    selected_cols = (
        fi_df.sort_values(by="importance", ascending=False)
             .head(20)["feature"]
             .values
    )
    X = X[selected_cols]

elif feature_selection == 'permutation':
    # Permutation importances: keep every feature whose mean importance is positive
    rf = RandomForestClassifier(n_estimators=300, random_state=42)
    rf.fit(X, y)
    result = permutation_importance(
        rf, X, y,
        n_repeats=30, random_state=42, n_jobs=-1,
    )

    # Column labels and the ordering of the mean importances (ascending)
    Feature = X.columns
    sorted_result = result.importances_mean.argsort()

    # One-column frame (column 0 = mean importance), highest importance first
    importances = pd.DataFrame(
        result.importances_mean[sorted_result],
        index=Feature[sorted_result],
    ).sort_values(0, ascending=False)

    selected_cols = importances.loc[importances[0] > 0].index.tolist()
    X = X[selected_cols]


In [16]:
# Class imbalance handling: exactly one branch runs, chosen by `sampling_method`.
# Every sampler is seeded with 42 so that a re-run reproduces the same data.

## 1. undersampling: shrink the majority class to the size of the minority class
if sampling_method == 'under':
    c1 = X[y == 1]
    c0 = X[y == 0]
    print(c1.shape, c0.shape) # 108, 509 -> 108, 108
    c0 = c0.sample(n=c1.shape[0], random_state=42) # 509 -> 108
    X = pd.concat([c0, c1])
    # Keep the labels aligned with the rows that survived; without this `y`
    # still holds all original rows and no longer matches `X` row-for-row.
    y = y.loc[X.index]
    print(X.shape)

## 2. oversampling -> SMOTE
if sampling_method == 'SMOTE':
    smote = SMOTE(k_neighbors=5, random_state=42)
    # The fit_resample function automatically finds the minority class by y and fits it 1:1.
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print(X_resampled.shape, y_resampled.shape)
    y = y_resampled
    X = X_resampled


if sampling_method == 'ran_over':
    ros = RandomOverSampler(random_state=42)
    X, y = ros.fit_resample(X, y)


if sampling_method == 'ADASYN':
    ros = ADASYN(random_state=42)
    X, y = ros.fit_resample(X, y)



# 3. hybrid approach: undersample class 0, then SMOTE class 1 up to the same size
## class0 : 509 -> 300
## class1 : 108 -> 300
if sampling_method == 'hybrid':
    L = X.columns  # feature columns that survived the earlier selection steps
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape)
    # min() keeps the cell re-runnable and safe if fewer than 300 class-0 rows exist
    c0 = c0.sample(n=min(300, len(c0)), random_state=42) # 509 -> 300
    # NOTE: `train` is replaced by the undersampled frame; later cells that
    # read `train` (e.g. the medians used to fill the test set) see this version.
    train = pd.concat([c0, c1])
    print(train.shape)

    X = train[L]
    y = train['Class']

    smote = SMOTE(k_neighbors=5, random_state=42)
    # The fit_resample function automatically finds the minority class by y and fits it 1:1.
    X, y = smote.fit_resample(X, y) # 300, 108 --> 300, 300
    print(X.shape, y.shape)

(108, 57) (509, 57)
(408, 57)
(600, 56) (600,)


In [17]:
# scaling
# Map each supported `scaling` option to a factory that builds its scaler.
# 'MinMax' previously built a RobustScaler by mistake; it now uses MinMaxScaler.
_SCALER_FACTORIES = {
    'Standard': StandardScaler,
    'Robust': RobustScaler,
    'MinMax': MinMaxScaler,
    'Quantile': lambda: QuantileTransformer(n_quantiles=50, random_state=42, output_distribution="normal"),
}

if scaling:
    if scaling not in _SCALER_FACTORIES:
        # Fail here with a clear message instead of later, when the test-set
        # cell would hit an undefined `scaler`.
        raise ValueError(
            f"Unknown scaling option {scaling!r}; expected one of {list(_SCALER_FACTORIES)} or False"
        )
    L = X.columns  # remembered so the test set can be scaled with the same columns
    scaler = _SCALER_FACTORIES[scaling]()
    data_ = scaler.fit_transform(X)
    X = pd.DataFrame(data=data_, columns=L)
    display(X)

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,-1.248030,0.895317,0.499118,-0.064908,-5.199338,-1.125366,1.149336,-5.199338,-5.199338,-0.153340,...,-0.732101,0.532581,-0.551971,-5.199338,0.694324,1.350170,-0.754163,-0.667937,-0.776599,-0.876728
1,-0.354624,-0.191569,-5.199338,0.161814,0.986530,0.770953,1.612539,-0.966309,0.846401,-0.461545,...,0.339069,0.710390,-0.732101,-0.115334,-1.133284,1.339830,0.325802,-0.979745,2.575252,-0.081692
2,0.336854,-0.858437,0.518169,0.858437,-5.199338,0.043953,0.782269,0.721203,0.073298,1.071808,...,0.149107,-5.199338,-5.199338,-0.461545,0.130091,0.699659,0.776599,0.452255,-0.090093,5.199338
3,-0.907883,0.787964,-5.199338,-0.652327,-5.199338,-0.303815,2.053060,-0.014647,0.993361,0.721203,...,-0.238682,-5.199338,-0.616503,-5.199338,-0.759736,-5.199338,1.404174,1.350170,0.166055,5.199338
4,-0.238682,0.204371,-5.199338,-0.503864,-5.199338,0.636874,0.840429,-0.811006,0.551971,-1.677273,...,0.308200,1.079270,-5.199338,-5.199338,-0.161814,1.371292,-1.094377,0.986530,0.518169,-0.840429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.561744,-0.312592,1.462615,-1.339830,-5.199338,-1.612539,0.551971,-0.204371,-0.178797,-0.234378,...,-0.726641,-0.208646,0.715786,-0.631757,-0.591393,-5.199338,1.393046,0.715786,-1.252602,0.225785
596,1.141273,-0.060715,-5.199338,1.789989,0.828574,0.356853,-5.199338,0.321392,1.880058,-1.880058,...,-1.500226,-0.374750,-5.199338,-5.199338,-2.088776,2.127374,-0.907883,0.683712,-0.466205,0.480247
597,1.117518,1.880058,-5.199338,0.715786,1.216732,1.234468,-5.199338,1.243484,-0.170299,-0.993361,...,-0.356853,-0.406365,-0.657513,-0.420038,-0.388252,0.678435,-1.125366,-0.191569,1.141273,0.561744
598,0.401825,0.035577,0.596385,0.383743,-5.199338,0.039765,-5.199338,0.356853,0.631757,0.732101,...,0.234378,1.000238,0.864502,0.494384,0.174547,0.200100,-1.190824,1.280601,-0.611450,-1.190824


In [18]:
# feature extraction: replace the features by a lower-dimensional projection
if feature_extraction == 'PCA':
    # Keep as many principal components as needed to explain 90% of the variance
    dim = PCA(n_components=0.90, random_state=42)
    data_ = dim.fit_transform(X)
    X = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])
    # A bare `X` inside an `if` block is not rendered by the notebook; display it explicitly
    display(X)

if feature_extraction == 'LDA':
    # LDA is supervised by greek["Gamma"], which has one row per ORIGINAL
    # training sample. After any resampling step X has a different number of
    # rows and the labels can no longer be matched to it.
    if len(greek) != len(X):
        raise ValueError(
            "feature_extraction='LDA' needs one greek['Gamma'] label per row of X "
            f"(got {len(greek)} labels for {len(X)} rows); disable sampling_method to use LDA"
        )
    dim = LinearDiscriminantAnalysis(n_components=6)
    data_ = dim.fit_transform(X, greek["Gamma"])
    X = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])
    display(X)

### 02. Hyper-parameter Tuning

In [19]:
def rf_optimizer(trial, X, y, K):
    """Optuna objective for the random forest.

    Samples one hyper-parameter combination from ``trial``, evaluates it with
    shuffled stratified K-fold cross validation on ``X``/``y`` and returns the
    mean of the per-fold ``evaluation_metric`` values (lower is better).
    """
    # Search space (the suggestion order matches the parameter names below)
    sampled = {
        'n_estimators': trial.suggest_categorical('n_estimators', [50, 100, 200]),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'max_features': trial.suggest_categorical('max_features', [0.6, 0.7, 0.8]),
    }

    # Class imbalance is handled by the sampling step, so no class_weight here
    model = RandomForestClassifier(criterion='log_loss', **sampled)

    splitter = StratifiedKFold(n_splits=K, shuffle=True)
    fold_losses = []
    for fit_idx, holdout_idx in splitter.split(X, y):
        model.fit(X.iloc[fit_idx, :], y.iloc[fit_idx])
        holdout_proba = model.predict_proba(X.iloc[holdout_idx, :])
        fold_losses.append(evaluation_metric(y.iloc[holdout_idx], holdout_proba))

    # Mean score over the K folds
    return np.mean(fold_losses)

In [20]:
# Tune the random forest with Optuna and build the final estimator from the best trial
if 'RF' in models:

    K = 5 # set K of K-Fold (also read as a global by oof_preds)
    opt_func = partial(rf_optimizer, X=X, y=y, K=K)


    rf_study = optuna.create_study(direction="minimize") # determine minimize or maximize sth
    rf_study.optimize(opt_func, n_trials=n_trials)
    rf_best_params = rf_study.best_params
    # rf_optimizer scores every trial with criterion='log_loss', which is not
    # part of best_params; pass it explicitly so the final model matches the
    # configuration that was actually tuned.
    best_rf = RandomForestClassifier(criterion='log_loss', **rf_best_params)

    display_experiment_log(rf_study)

[I 2023-06-24 01:42:08,098] A new study created in memory with name: no-name-2a7d96d8-d04d-4b8c-85ee-e5532a0f152e
[I 2023-06-24 01:42:16,023] Trial 0 finished with value: 0.21945815029715104 and parameters: {'n_estimators': 200, 'max_depth': 6, 'max_features': 0.7}. Best is trial 0 with value: 0.21945815029715104.
[I 2023-06-24 01:42:17,643] Trial 1 finished with value: 0.2672615639348874 and parameters: {'n_estimators': 50, 'max_depth': 4, 'max_features': 0.6}. Best is trial 0 with value: 0.21945815029715104.
[I 2023-06-24 01:42:24,754] Trial 2 finished with value: 0.3006041645892724 and parameters: {'n_estimators': 200, 'max_depth': 3, 'max_features': 0.8}. Best is trial 0 with value: 0.21945815029715104.
[I 2023-06-24 01:42:28,278] Trial 3 finished with value: 0.219284552429483 and parameters: {'n_estimators': 100, 'max_depth': 7, 'max_features': 0.6}. Best is trial 3 with value: 0.219284552429483.
[I 2023-06-24 01:42:30,247] Trial 4 finished with value: 0.21680158318068607 and para

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
0,0,0.219458,2023-06-24 01:42:08.101485,2023-06-24 01:42:16.022859,0 days 00:00:07.921374,6,0.7,200,COMPLETE
1,1,0.267262,2023-06-24 01:42:16.024885,2023-06-24 01:42:17.642831,0 days 00:00:01.617946,4,0.6,50,COMPLETE
2,2,0.300604,2023-06-24 01:42:17.644252,2023-06-24 01:42:24.753939,0 days 00:00:07.109687,3,0.8,200,COMPLETE
3,3,0.219285,2023-06-24 01:42:24.755342,2023-06-24 01:42:28.278450,0 days 00:00:03.523108,7,0.6,100,COMPLETE
4,4,0.216802,2023-06-24 01:42:28.279722,2023-06-24 01:42:30.246716,0 days 00:00:01.966994,6,0.7,50,COMPLETE
5,5,0.267035,2023-06-24 01:42:30.248084,2023-06-24 01:42:36.767909,0 days 00:00:06.519825,4,0.6,200,COMPLETE
6,6,0.224339,2023-06-24 01:42:36.769251,2023-06-24 01:42:44.390283,0 days 00:00:07.621032,5,0.7,200,COMPLETE
7,7,0.219485,2023-06-24 01:42:44.392242,2023-06-24 01:42:48.356824,0 days 00:00:03.964582,8,0.7,100,COMPLETE
8,8,0.209078,2023-06-24 01:42:48.358251,2023-06-24 01:42:56.084980,0 days 00:00:07.726729,7,0.6,200,COMPLETE
9,9,0.22098,2023-06-24 01:42:56.086302,2023-06-24 01:42:57.902864,0 days 00:00:01.816562,7,0.6,50,COMPLETE


Best Score: 0.2091
Best params:  {'n_estimators': 200, 'max_depth': 7, 'max_features': 0.6}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
8,8,0.209078,2023-06-24 01:42:48.358251,2023-06-24 01:42:56.084980,0 days 00:00:07.726729,7,0.6,200,COMPLETE


In [21]:
def xgb_optimizer(trial, X, y, K):
    """Optuna objective for XGBoost.

    Samples one hyper-parameter combination from ``trial``, evaluates it with
    shuffled stratified K-fold cross validation on ``X``/``y`` and returns the
    mean of the per-fold ``evaluation_metric`` values (lower is better).
    """
    n_estimators = trial.suggest_categorical('n_estimators', [300, 500, 700])
    max_depth = trial.suggest_int('max_depth', 4, 10)
    colsample_bytree = trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8])
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 5e-2)
    reg_lambda = trial.suggest_categorical('reg_lambda', [0.5, 1, 2, 4])

    # `eval_metric=evaluation_metric` was removed: no eval_set is passed to
    # fit(), so the metric was never evaluated, and its (y_true, 2-column
    # probabilities) signature does not match what XGBoost passes to a custom
    # metric. The final XGBClassifier is also built without it.
    # Class imbalance is handled by the sampling step, so no scale_pos_weight.
    model = XGBClassifier(n_estimators=n_estimators,
                          max_depth=max_depth,
                          colsample_bytree=colsample_bytree,
                          learning_rate=learning_rate,
                          reg_lambda=reg_lambda)

    folds = StratifiedKFold(n_splits=K, shuffle=True)
    losses = []

    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]

        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]

        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        losses.append(evaluation_metric(y_val, preds))

    # Mean score over the K folds
    return np.mean(losses)

In [22]:
# Tune XGBoost with Optuna and build the final estimator from the best trial.
# `best_xgb` and `K` are only defined when 'XGB' is listed in `models`.
if 'XGB' in models:

    K = 5  # number of CV folds (also read as a global by oof_preds)
    opt_func = partial(xgb_optimizer, X=X, y=y, K=K)


    xgb_study = optuna.create_study(direction="minimize")
    xgb_study.optimize(opt_func, n_trials=n_trials)
    xgb_best_params = xgb_study.best_params
    best_xgb = XGBClassifier(**xgb_best_params)

    display_experiment_log(xgb_study)

[I 2023-06-24 01:46:27,543] A new study created in memory with name: no-name-6d648346-382c-4652-9bb0-e70af5680b9a
[I 2023-06-24 01:46:33,152] Trial 0 finished with value: 0.20933465160119039 and parameters: {'n_estimators': 500, 'max_depth': 5, 'colsample_bytree': 0.5, 'learning_rate': 0.007308601187886229, 'reg_lambda': 2}. Best is trial 0 with value: 0.20933465160119039.
[I 2023-06-24 01:46:38,284] Trial 1 finished with value: 0.15551031975182314 and parameters: {'n_estimators': 300, 'max_depth': 7, 'colsample_bytree': 0.8, 'learning_rate': 0.04904753575150251, 'reg_lambda': 4}. Best is trial 1 with value: 0.15551031975182314.
[I 2023-06-24 01:46:43,527] Trial 2 finished with value: 0.18917822959832847 and parameters: {'n_estimators': 300, 'max_depth': 10, 'colsample_bytree': 0.8, 'learning_rate': 0.013027043301476363, 'reg_lambda': 0.5}. Best is trial 1 with value: 0.15551031975182314.
[I 2023-06-24 01:46:51,114] Trial 3 finished with value: 0.2361917242842416 and parameters: {'n_es

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
0,0,0.209335,2023-06-24 01:46:27.546371,2023-06-24 01:46:33.151973,0 days 00:00:05.605602,0.5,0.007309,5,500,2.0,COMPLETE
1,1,0.15551,2023-06-24 01:46:33.153692,2023-06-24 01:46:38.283466,0 days 00:00:05.129774,0.8,0.049048,7,300,4.0,COMPLETE
2,2,0.189178,2023-06-24 01:46:38.285141,2023-06-24 01:46:43.527303,0 days 00:00:05.242162,0.8,0.013027,10,300,0.5,COMPLETE
3,3,0.236192,2023-06-24 01:46:43.528992,2023-06-24 01:46:51.114089,0 days 00:00:07.585097,0.6,0.006336,7,500,4.0,COMPLETE
4,4,0.181412,2023-06-24 01:46:51.115756,2023-06-24 01:46:55.602835,0 days 00:00:04.487079,0.6,0.013175,8,300,0.5,COMPLETE
5,5,0.189485,2023-06-24 01:46:55.604503,2023-06-24 01:47:01.942573,0 days 00:00:06.338070,0.5,0.012834,10,500,4.0,COMPLETE
6,6,0.14742,2023-06-24 01:47:01.944164,2023-06-24 01:47:04.933184,0 days 00:00:02.989020,0.6,0.043423,5,300,1.0,COMPLETE
7,7,0.168119,2023-06-24 01:47:04.934764,2023-06-24 01:47:11.676452,0 days 00:00:06.741688,0.8,0.025137,10,500,0.5,COMPLETE
8,8,0.130376,2023-06-24 01:47:11.680087,2023-06-24 01:47:17.997783,0 days 00:00:06.317696,0.8,0.031559,10,700,0.5,COMPLETE
9,9,0.351461,2023-06-24 01:47:18.001228,2023-06-24 01:47:26.932029,0 days 00:00:08.930801,0.8,0.002299,6,500,1.0,COMPLETE


Best Score: 0.1262
Best params:  {'n_estimators': 700, 'max_depth': 7, 'colsample_bytree': 0.7, 'learning_rate': 0.037208834007683586, 'reg_lambda': 2}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
28,28,0.126187,2023-06-24 01:49:22.683746,2023-06-24 01:49:29.071603,0 days 00:00:06.387857,0.7,0.037209,7,700,2.0,COMPLETE


In [23]:
def svm_optimizer(trial, X, y, K):
    """Optuna objective for the RBF-kernel SVM.

    Samples ``C`` and ``gamma`` from ``trial``, evaluates them with shuffled
    stratified K-fold cross validation on ``X``/``y`` and returns the mean of
    the per-fold ``evaluation_metric`` values (lower is better).
    """
    sampled = {
        'C': trial.suggest_int('C', 1, 30),
        'gamma': trial.suggest_categorical('gamma', [0.1, 0.3, 0.5, 0.7, 1]),
    }

    # probability=True is needed because the metric consumes predict_proba output
    model = SVC(kernel='rbf', probability=True, random_state=42, **sampled)

    splitter = StratifiedKFold(n_splits=K, shuffle=True)
    fold_losses = []
    for fit_idx, holdout_idx in splitter.split(X, y):
        model.fit(X.iloc[fit_idx, :], y.iloc[fit_idx])
        holdout_proba = model.predict_proba(X.iloc[holdout_idx, :])
        fold_losses.append(evaluation_metric(y.iloc[holdout_idx], holdout_proba))

    # Mean score over the K folds
    return np.mean(fold_losses)

In [24]:
# Tune the SVM with Optuna and build the final estimator from the best trial.
# `best_svm` and `K` are only defined when 'SVM' is listed in `models`.
if 'SVM' in models:
    K = 5  # number of CV folds (also read as a global by oof_preds)
    opt_func = partial(svm_optimizer, X=X, y=y, K=K)
    svm_study = optuna.create_study(direction="minimize")
    svm_study.optimize(opt_func, n_trials=n_trials)
    svm_best_params = svm_study.best_params
    # Same fixed settings as in svm_optimizer (probability=True, random_state=42)
    best_svm = SVC(random_state=42, **svm_best_params, probability=True)

    display_experiment_log(svm_study)

[I 2023-06-24 01:51:17,244] A new study created in memory with name: no-name-5a6b1ce0-12a1-4bcf-acba-3448429825db
[I 2023-06-24 01:51:17,721] Trial 0 finished with value: 0.576356247967119 and parameters: {'C': 6, 'gamma': 0.1}. Best is trial 0 with value: 0.576356247967119.
[I 2023-06-24 01:51:18,464] Trial 1 finished with value: 0.5802887599911254 and parameters: {'C': 8, 'gamma': 0.1}. Best is trial 0 with value: 0.576356247967119.
[I 2023-06-24 01:51:19,197] Trial 2 finished with value: 0.6271686541860777 and parameters: {'C': 12, 'gamma': 0.3}. Best is trial 0 with value: 0.576356247967119.
[I 2023-06-24 01:51:19,735] Trial 3 finished with value: 0.5815105847333552 and parameters: {'C': 7, 'gamma': 0.1}. Best is trial 0 with value: 0.576356247967119.
[I 2023-06-24 01:51:20,231] Trial 4 finished with value: 0.6278994796040237 and parameters: {'C': 29, 'gamma': 0.3}. Best is trial 0 with value: 0.576356247967119.
[I 2023-06-24 01:51:20,724] Trial 5 finished with value: 0.57336808472

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_gamma,state
0,0,0.576356,2023-06-24 01:51:17.246476,2023-06-24 01:51:17.721448,0 days 00:00:00.474972,6,0.1,COMPLETE
1,1,0.580289,2023-06-24 01:51:17.722601,2023-06-24 01:51:18.463989,0 days 00:00:00.741388,8,0.1,COMPLETE
2,2,0.627169,2023-06-24 01:51:18.465871,2023-06-24 01:51:19.197298,0 days 00:00:00.731427,12,0.3,COMPLETE
3,3,0.581511,2023-06-24 01:51:19.198894,2023-06-24 01:51:19.734872,0 days 00:00:00.535978,7,0.1,COMPLETE
4,4,0.627899,2023-06-24 01:51:19.736389,2023-06-24 01:51:20.231442,0 days 00:00:00.495053,29,0.3,COMPLETE
5,5,0.573368,2023-06-24 01:51:20.232715,2023-06-24 01:51:20.724024,0 days 00:00:00.491309,26,0.1,COMPLETE
6,6,0.666768,2023-06-24 01:51:20.725244,2023-06-24 01:51:21.312144,0 days 00:00:00.586900,14,1.0,COMPLETE
7,7,0.651378,2023-06-24 01:51:21.313445,2023-06-24 01:51:21.828769,0 days 00:00:00.515324,15,0.7,COMPLETE
8,8,0.668402,2023-06-24 01:51:21.830012,2023-06-24 01:51:22.402471,0 days 00:00:00.572459,9,1.0,COMPLETE
9,9,0.645862,2023-06-24 01:51:22.403725,2023-06-24 01:51:22.880212,0 days 00:00:00.476487,27,0.5,COMPLETE


Best Score: 0.5634
Best params:  {'C': 12, 'gamma': 0.1}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_gamma,state
26,26,0.56343,2023-06-24 01:51:31.280063,2023-06-24 01:51:31.771765,0 days 00:00:00.491702,12,0.1,COMPLETE


In [25]:
# TabPFN: used with a fixed configuration (no Optuna search is run for it).
# NOTE(review): the device is hard-coded to 'cuda:0'; presumably this fails on
# a machine without a GPU. Confirm before running elsewhere.
tP = TabPFNClassifier(N_ensemble_configurations=64,device='cuda:0')

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


### 03. Test Prediction

In [26]:
## Apply to the test set the same preprocessing that was applied to the training set.
# Build X_test from a new frame so the raw `test` frame stays untouched and
# this cell can be re-run safely.
X_test = test.drop(columns=["Id"])

# Use the fixed training mapping (A->0, B->1) instead of re-fitting the encoder
# on the test data: a re-fit assigns codes from the values present in test
# only, so e.g. a test set containing only 'B' would encode it as 0.
# Unknown or missing categories become NaN and are filled below.
X_test["EJ"] = X_test["EJ"].map({"A": 0, "B": 1})

# Mirror the optional integerization that was applied to the training columns
if integerized:
    for k, v in int_denominators.items():
        X_test[k] = np.round(X_test[k]/v, 1)

# Fill gaps with the training medians (columns absent from X_test are ignored)
X_test = X_test.fillna(train.median())

if scaling != False:
    # `L` holds the columns the scaler was fitted on
    X_test = scaler.transform(X_test[L])
    X_test = pd.DataFrame(data=X_test, columns=L)

if feature_extraction != False:
    # Feed exactly the columns the projection was fitted on, in the same order
    data_ = dim.transform(X_test[dim.feature_names_in_])
    X_test = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

# Final column selection/order must match the training matrix
X_test = X_test[X.columns]
X_test

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
1,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
2,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
3,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
4,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338


### 04. Fitting and Evaluation (w/ OOF)

In [27]:
# Soft-voting ensemble: averages the predicted probabilities of the three tuned
# models and TabPFN, weighted by the configured `weights` (same order as below).
voting_clf = VotingClassifier(
    estimators=[
        ("rf", best_rf),
        ("xgb", best_xgb),
        ("svc", best_svm),
        ("TabPFN", tP),
    ],
    weights=weights,
    voting="soft",
)

In [28]:
def oof_preds(best_model):
    """Estimate train/validation loss of ``best_model`` with repeated K-fold CV.

    Runs ``bag_num`` rounds ("bags") of shuffled stratified K-fold cross
    validation on the module-level ``X``/``y`` (``K`` folds, a different
    ``random_state`` per bag). In every fold the model is re-fitted and scored
    with ``evaluation_metric`` on both the training and the held-out part.

    Parameters
    ----------
    best_model : estimator
        Object providing ``fit`` and ``predict_proba``; it is re-fitted in place.

    Returns
    -------
    (float, float)
        Mean training loss and mean validation loss, averaged over folds and
        then over bags.
    """
    avg_tlosses = []
    avg_vlosses = []

    for bag in range(bag_num):

        print(f'########################## bag: {bag} ##########################')

        folds = StratifiedKFold(n_splits=K, random_state=42 * bag, shuffle=True)
        train_scores = []
        val_scores = []
        # The former `models[bag] = []` was removed: `models` is the global list
        # of model names from the config cell, and that assignment overwrote its
        # entries with empty lists (nothing was ever stored in them).

        for train_idx, val_idx in folds.split(X, y):
            X_train = X.iloc[train_idx, :]
            y_train = y.iloc[train_idx]
            X_val = X.iloc[val_idx, :]
            y_val = y.iloc[val_idx]

            best_model.fit(X_train, y_train)
            pred_train = best_model.predict_proba(X_train)
            pred_val = best_model.predict_proba(X_val)
            train_scores.append(evaluation_metric(y_train, pred_train))
            val_scores.append(evaluation_metric(y_val, pred_val))

        avg_tloss = np.mean(train_scores)
        avg_vloss = np.mean(val_scores)
        print(f"Bags: {bag}, Train Score: {round(avg_tloss, 3)}")
        # "Test Score" here is the score on the held-out validation folds
        print(f"Bags: {bag}, Test Score: {round(avg_vloss, 3)}")

        avg_tlosses.append(avg_tloss)
        avg_vlosses.append(avg_vloss)

    fi_tloss = np.mean(avg_tlosses)
    fi_vloss = np.mean(avg_vlosses)

    return fi_tloss, fi_vloss



In [29]:
def _report_cv_scores(title, estimator):
    """Print ``title``, cross-validate ``estimator`` with oof_preds and print both mean losses."""
    print(title)
    tloss, vloss = oof_preds(best_model=estimator)
    print("--- Prediction Score ---")
    print("Train Score : %.4f" % tloss)
    print("Test Score : %.4f" % vloss)
    return tloss, vloss


# Cross-validated scores of every individual model, then of the weighted ensemble
rf_tloss, rf_vloss = _report_cv_scores("Random Forest", best_rf)
xgb_tloss, xgb_vloss = _report_cv_scores("\nXGBoost", best_xgb)
svm_tloss, svm_vloss = _report_cv_scores("\nSupport Vector Machine", best_svm)
tP_tloss, tP_vloss = _report_cv_scores("\nTabPFN", tP)
ens_tloss, ens_vloss = _report_cv_scores("\nEnsemble model", voting_clf)

Random Forest
########################## bag: 0 ##########################
Bags: 0, Train Score: 0.078
Bags: 0, Test Score: 0.228
########################## bag: 1 ##########################
Bags: 1, Train Score: 0.078
Bags: 1, Test Score: 0.22
########################## bag: 2 ##########################
Bags: 2, Train Score: 0.078
Bags: 2, Test Score: 0.218
########################## bag: 3 ##########################
Bags: 3, Train Score: 0.077
Bags: 3, Test Score: 0.23
--- Prediction Score ---
Train Score : 0.0775
Test Score : 0.2242

XGBoost
########################## bag: 0 ##########################
Bags: 0, Train Score: 0.008
Bags: 0, Test Score: 0.141
########################## bag: 1 ##########################
Bags: 1, Train Score: 0.008
Bags: 1, Test Score: 0.13
########################## bag: 2 ##########################
Bags: 2, Train Score: 0.008
Bags: 2, Test Score: 0.134
########################## bag: 3 ##########################
Bags: 3, Train Score: 0.008
Bags: 3, Test

### 05. Submission

In [30]:
# Fit the ensemble on the full (resampled, preprocessed) training data,
# then predict class probabilities for the preprocessed test set.
voting_clf.fit(X, y)
preds = voting_clf.predict_proba(X_test)

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


In [31]:
# Post-processing: optionally snap confident class-0 probabilities to exactly 1 or 0
p0 = preds[:, 0]  # column 0 = probability of class 0 (a slice of `preds`, edited in place below)
if post:
    p0[p0 > 0.85] = 1
    p0[p0 < 0.15] = 0

In [32]:
# Load the submission template. Use the same absolute input directory as the
# other read_csv calls, so the path does not depend on the working directory.
submission = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5


In [33]:
# Write the class-0 probability and its complement into the submission template
submission = submission.assign(class_0=p0, class_1=1 - p0)
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.846335,0.153665
1,010ebe33f668,0.846335,0.153665
2,02fa521e1838,0.846335,0.153665
3,040e15f562a2,0.846335,0.153665
4,046e85c7cc7f,0.846335,0.153665


In [34]:
# Save the predictions as submission.csv, without the DataFrame index column
submission.to_csv("submission.csv", index=False)