In [1]:
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
!pip install ipywidgets
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

Looking in links: file:///kaggle/input/pip-packages-icr/pip-packages
mkdir: /opt/conda: Permission denied
cp: directory /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff does not exist


In [2]:
# Import Joblib Module from Scikit Learn
import joblib

import numpy as np                       # NumPy for numerical computations
import pandas as pd                      # Pandas for data manipulation and analysis
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, normalize   # LabelEncoder for encoding categorical variables, normalize for feature scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier   # GradientBoostingClassifier and RandomForestClassifier for classification models
from sklearn.metrics import accuracy_score   # accuracy_score for evaluating model performance
from sklearn.impute import SimpleImputer   # SimpleImputer for handling missing values
import imblearn   # imblearn for imbalanced dataset handling
from imblearn.over_sampling import RandomOverSampler   # RandomOverSampler for oversampling minority class
from imblearn.under_sampling import RandomUnderSampler   # RandomUnderSampler for undersampling majority class
import xgboost   # XGBoost for gradient boosting models
import inspect   # inspect for retrieving information about live objects
from collections import defaultdict   # defaultdict for creating a dictionary with default values
from tabpfn import TabPFNClassifier   # TabPFNClassifier for a specific classification model
import warnings   # warnings for ignoring warnings during runtime
from sklearn.model_selection import KFold as KF
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import sys
import os

  from pandas import MultiIndex, Int64Index


In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Load Dataset

In [4]:
train = pd.read_csv('../datasets/icr-identify-age-related-conditions/train.csv')
train = train.drop(['Id','EJ'], axis=1)
columns = train.columns
imputer = SimpleImputer(missing_values = np.nan, strategy ='median')
imputer = imputer.fit(train)
train = imputer.transform(train)
train = pd.DataFrame(train, columns = columns)
train.isnull().sum()

AB       0
AF       0
AH       0
AM       0
AR       0
AX       0
AY       0
AZ       0
BC       0
BD       0
BN       0
BP       0
BQ       0
BR       0
BZ       0
CB       0
CC       0
CD       0
CF       0
CH       0
CL       0
CR       0
CS       0
CU       0
CW       0
DA       0
DE       0
DF       0
DH       0
DI       0
DL       0
DN       0
DU       0
DV       0
DY       0
EB       0
EE       0
EG       0
EH       0
EL       0
EP       0
EU       0
FC       0
FD       0
FE       0
FI       0
FL       0
FR       0
FS       0
GB       0
GE       0
GF       0
GH       0
GI       0
GL       0
Class    0
dtype: int64

## Cross Validation

In [5]:
NUM_FOLDS = 5
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
splitter = skf.split(train, train.Class)

for fold_idx, (train_idx, val_idx) in enumerate(splitter):
    print(f'Getting fold number {fold_idx}')
    df_train = train.iloc[train_idx]
    df_val = train.iloc[val_idx]
    
    #drop column Id & reset index
    
#     df_train = df_train.drop(['Id'], axis=1)
    df_train = df_train.reset_index(drop = True)
#     df_val = df_val.drop(['Id'], axis=1)
    df_val = df_val.reset_index(drop = True)
    
    #kfold path
    save_dir = f'../datasets/kfold/fold{fold_idx}'
    os.makedirs(save_dir, exist_ok = True)
    
    #saving
    df_train.to_csv(os.path.join(save_dir, 'train.csv'), index = False)
    df_val.to_csv(os.path.join(save_dir, 'val.csv'), index = False)
    
    # for testing
    save_dir1 = f'../datasets/kfold1/fold{fold_idx}'
    os.makedirs(save_dir1, exist_ok = True)
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_train.to_csv(os.path.join(save_dir1, 'train.csv'))
    df_val.to_csv(os.path.join(save_dir1, 'val.csv'))

Getting fold number 0
Getting fold number 1
Getting fold number 2
Getting fold number 3
Getting fold number 4


## Pre-processing

In [6]:
def prepair_input(df, classi):
    columns = df.columns
    
    # Convert the values in the 'EJ' column of the 'test' dataframe to binary values (0 or 1),
    # based on the occurrence of the 'first_category' in the 'train' dataframe
#     first_category = df.EJ.unique()[0]
#     df.EJ = df.EJ.eq(first_category).astype('int')

    df = df.rename(columns={'BD ': 'BD', 'CD ': 'CD', 'CW ': 'CW', 'FD ': 'FD'})
    
    imputer = SimpleImputer(missing_values = np.nan, strategy ='median')
    imputer = imputer.fit(df)
    df = imputer.transform(df)
    df = pd.DataFrame(df, columns = columns)
    
    # Create a RandomOverSampler object with a random state of 42
    ros = RandomOverSampler(random_state=42)

    # Resample the 'train_pred_and_time' dataframe and 'greeks.Alpha' series using RandomOverSampler
    # The resampled data is assigned to 'train_ros' and 'y_ros' respectively
    x_ros, y_ros = ros.fit_resample(df, classi)
    return x_ros, y_ros

def normolized(df):
    columns = df.columns
    
    scaler = StandardScaler()
    model = scaler.fit(df)
    scaled_df = model.transform(df)
    
    scaled_df = pd.DataFrame(scaled_df, columns = columns)
    return scaled_df

## Balanced Log Loss

In [7]:
def balanced_log_loss(y_true, y_pred):
    # y_true: correct labels 0, 1
    # y_pred: predicted probabilities of class=1
    # calculate the number of observations for each class
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    # calculate the weights for each class to balance classes
    w_0 = 1 / N_0
    w_1 = 1 / N_1
    # calculate the predicted probabilities for each class
    p_0 = np.clip(y_pred[:, 0], 1e-15, 1 - 1e-15)
    print(p_0)
    p_1 = np.clip(y_pred[:, 1], 1e-15, 1 - 1e-15)
    print(p_1)
    # calculate the summed log loss for each class
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))
    log_loss_1 = -np.sum(y_true * np.log(p_1))
    # calculate the weighted summed logarithmic loss
    # (factgor of 2 included to give same result as LL with balanced input)
    balanced_log_loss = 2*(w_0 * log_loss_0 + w_1 * log_loss_1) / (w_0 + w_1)
    # return the average log loss
    return balanced_log_loss/(N_0+N_1)

In [8]:
# def balanced_log_loss(y_true, y_pred):
#   # calculate the number of observations for each class
#     N_0 = np.sum(1 - y_true)
#     N_1 = np.sum(y_true)
#    # calculate the weights for each class
#     w_0 = 1 / N_0
#     w_1 = 1 / N_1
#    # calculate the predicted probabilities for each class
#     p_0 = np.clip(y_pred[:, 0], 1e-15, 1 - 1e-15)
#     p_1 = np.clip(y_pred[:, 1], 1e-15, 1 - 1e-15)
#    # calculate the log loss for each class
#     log_loss_0 = -np.sum((1-y_true) * np.log(p_0))
#     log_loss_1 = -np.sum(y_true * np.log(p_1))
#    # calculate the balanced logarithmic loss
#     balanced_log_loss = 2*(log_loss_0 + log_loss_1) / (w_0 + w_1)
#     return balanced_log_loss/ (N_0+N_1)

## Model

In [9]:
class Ensemble():
    def __init__(self):
        self.classifiers =[xgboost.XGBClassifier(n_estimators=100,max_depth=3,learning_rate=0.2,subsample=0.9,colsample_bytree=0.85),
                           xgboost.XGBClassifier(),
                           TabPFNClassifier(N_ensemble_configurations=12),
                           TabPFNClassifier(N_ensemble_configurations=12),
                           RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)]
    def fit(self,X,y):
        for i, classifier in enumerate(self.classifiers):
            if i in [2, 3]:
                classifier.fit(X,y,overwrite_warning =True)
            else :
                classifier.fit(X, y)
     
    def predict_proba(self, x):
        # N_models * N_rows * N_classes (4 * 5 * 2)
        probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])
        averaged_probabilities = np.mean(probabilities, axis=0) # N_rows * N_classes
        return averaged_probabilities

# Post processing

In [10]:
def calibrate_prob(probs):
    print('TYPE:', probs.shape, type(probs))
    ret = probs.copy()
    ret[probs > 0.80] = 1.0
    ret[probs < 0.20] = 0.0
    return ret

## Training

In [12]:
def training(model):
    outer_results = list()   # List to store the loss results of each outer fold
    best_loss = np.inf   # Variable to track the best loss achieved during training
    split = 0   # Counter for the current split
    splits = 5   # Total number of splits for the inner cross-validation
    models = []   # List to store the trained models for each inner fold

    # Loop over the splits of the inner cross-validation using tqdm for progress visualization
    for split in range(splits):
        print('fold', split)
        #loading train & test dataset for each fold
        save_dir = f'../datasets/kfold/fold{split}'
        
        # x_train & y_train
        df_train = pd.read_csv(os.path.join(save_dir, 'train.csv'))
        x_train = df_train.drop(['Class'], axis=1)
        y_train = df_train.Class
        # pre-processing
        x_train, y_train = prepair_input(x_train, y_train)

        # x_val & y_val
        df_val = pd.read_csv(os.path.join(save_dir, 'val.csv'))
        x_val = df_val.drop(['Class'], axis=1)
        y_val = df_val.Class

        #fitting model
        model.fit(x_train, y_train)   # Fit the model on the training data
        models.append(model)   # Append the trained model to the list of models

        y_pred = model.predict_proba(x_val)   # Predict probabilities for the validation set   # Calculate class probabilities
        y_p = np.empty((y_pred.shape[0],y_pred.shape[1]))   # Create an empty array to store predicted labels
                
        y_p = calibrate_prob(y_pred)
        
        loss = balanced_log_loss(y_val, y_p)  # Calculate the balanced log loss between the predicted labels and the true labels

        # checking
        y_val = y_val.to_frame()
        y_val.rename(columns = {'Class': 'gt'}, inplace = True)
        y_val['pred'] = y_p[:, 1]
#         print(type(y_val['gt']), type(y_val.loc[0, 'gt']), type(y_val['pred']), type(y_val.loc[0, 'pred']))
        p00 = y_pred[:, 1]
        p00 = p00.flatten()
        y_val['prob'] = p00
        display(y_val)

        if loss < best_loss:
            best_model = model   # Save the best model based on the lowest loss
            best_loss = loss   # Update the best loss
            print('best_model_saved')

        outer_results.append(loss)   # Append the loss to the list of outer results
        print('>val_loss=%.5f, split = %.1f' % (loss, split))

        split += 1

    print('LOSS: %.5f' % (np.mean(outer_results)))  # Print the average loss across all outer folds
    print('Best loss', best_loss)
    return best_model, models   # Return the best model and the list of trained models

In [13]:
yt = Ensemble()

m, models = training(yt)

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold 0


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


TYPE: (124, 2) <class 'numpy.ndarray'>
[6.92910461e-01 1.00000000e+00 1.00000000e+00 3.22527471e-01
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 4.68316357e-01 1.00000000e+00 1.00000000e+00
 1.00000000e+00 2.50907243e-01 5.28551325e-01 1.00000000e+00
 1.00000000e-15 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 2.33233495e-01 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e-15
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 6.94164931e-01 7.07644317e-01 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 3.38439424e-01 1.00000000e+00 1.00000000e+00 2.57021056e-01
 1.00000000e+00 4.14101783e-01 7.58035929e-01 

Unnamed: 0,gt,pred,prob
0,1.0,0.30709,0.30709
1,0.0,0.0,0.025927
2,0.0,0.0,0.059935
3,1.0,0.677473,0.677473
4,0.0,0.0,0.082901
5,0.0,0.0,0.094174
6,0.0,0.0,0.099339
7,0.0,0.0,0.083961
8,0.0,0.0,0.037889
9,0.0,0.0,0.064395


best_model_saved
>val_loss=0.21344, split = 0.0
fold 1


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


TYPE: (124, 2) <class 'numpy.ndarray'>
[1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 2.84714147e-01
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 4.89481140e-01
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 7.57800769e-01 1.00000000e+00 4.10830525e-01 1.00000000e+00
 2.95190623e-01 1.00000000e+00 1.00000000e+00 6.18904261e-01
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e-15
 1.00000000e+00 2.09550590e-01 5.65321382e-01 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 6.20290758e-01 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e-15 1.00000000e+00
 1.00000000e+00 5.34582286e-01 1.00000000e+00 1.00000000e-15
 1.00000000e+00 1.00000000e+00 1.00000000e+00 3.04123638e-01
 1.00000000e-15 1.00000000e+00 1.00000000e-15 1.00000000e-15
 6.36711572e-01 1.00000000e+00 1.00000000e+00 

Unnamed: 0,gt,pred,prob
0,0.0,0.0,0.045194
1,0.0,0.0,0.056743
2,0.0,0.0,0.107301
3,0.0,0.0,0.064243
4,0.0,0.0,0.057091
5,0.0,0.0,0.028946
6,0.0,0.0,0.058669
7,1.0,0.715286,0.715286
8,0.0,0.0,0.036295
9,0.0,0.0,0.032243


>val_loss=1.46993, split = 1.0
fold 2


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


TYPE: (123, 2) <class 'numpy.ndarray'>
[1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 7.96656416e-01 6.48407778e-01 1.00000000e+00
 1.00000000e-15 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 2.54996053e-01
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 2.57042624e-01
 1.00000000e+00 2.80274294e-01 1.00000000e+00 2.50311175e-01
 1.00000000e+00 1.00000000e+00 4.91128496e-01 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 3.81539405e-01 1.00000000e+00 1.00000000e+00
 4.13876660e-01 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e-15 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e-15 1.00000000e+00 

Unnamed: 0,gt,pred,prob
0,0.0,0.0,0.127913
1,0.0,0.0,0.097909
2,0.0,0.0,0.07985
3,0.0,0.0,0.034507
4,0.0,0.0,0.057018
5,0.0,0.203344,0.203344
6,1.0,0.351592,0.351592
7,0.0,0.0,0.026622
8,1.0,1.0,0.951597
9,0.0,0.0,0.040422


>val_loss=0.56994, split = 2.0
fold 3


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


TYPE: (123, 2) <class 'numpy.ndarray'>
[2.27375344e-01 1.00000000e+00 1.00000000e-15 1.00000000e+00
 1.00000000e+00 7.99178781e-01 3.33552160e-01 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e-15
 1.00000000e-15 1.00000000e+00 1.00000000e+00 5.08837769e-01
 1.00000000e+00 7.47882206e-01 1.00000000e+00 1.00000000e+00
 1.00000000e-15 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 7.96907909e-01 1.00000000e+00
 7.45858649e-01 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 6.98809502e-01 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 4.04940522e-01 1.00000000e+00
 6.49550109e-01 1.00000000e-15 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 7.92359116e-01 1.00000000e+00 7.57668570e-01 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 

Unnamed: 0,gt,pred,prob
0,1.0,0.772625,0.772625
1,0.0,0.0,0.10369
2,1.0,1.0,0.900497
3,0.0,0.0,0.056469
4,0.0,0.0,0.051167
5,0.0,0.200821,0.200821
6,1.0,0.666448,0.666448
7,0.0,0.0,0.057619
8,0.0,0.0,0.154653
9,0.0,0.0,0.170202


>val_loss=1.60513, split = 3.0
fold 4


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


TYPE: (123, 2) <class 'numpy.ndarray'>
[1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e-15 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e-15
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 2.32964656e-01 1.00000000e+00 1.00000000e+00 1.00000000e+00
 6.68255575e-01 1.00000000e+00 3.17652501e-01 1.00000000e+00
 1.00000000e+00 1.00000000e+00 7.83384844e-01 7.40359029e-01
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 4.90476307e-01 4.56163971e-01 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e-15
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 2.67962932e-01 1.00000000e+00
 1.00000000e+00 1.00000000e+00 5.15952118e-01 2.42515468e-01
 1.00000000e+00 1.00000000e+00 1.00000000e+00 

Unnamed: 0,gt,pred,prob
0,0.0,0.0,0.049766
1,0.0,0.0,0.045279
2,0.0,0.0,0.034758
3,0.0,0.0,0.072465
4,0.0,0.0,0.069091
5,1.0,1.0,0.854817
6,0.0,0.0,0.109823
7,0.0,0.0,0.12987
8,0.0,0.0,0.090798
9,0.0,0.0,0.189898


best_model_saved
>val_loss=0.10561, split = 4.0
LOSS: 0.79281
Best loss 0.10561440489188031
