In [1]:
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
!pip install ipywidgets
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

Looking in links: file:///kaggle/input/pip-packages-icr/pip-packages
mkdir: /opt/conda: Permission denied
cp: directory /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff does not exist


In [2]:
# Import Joblib Module from Scikit Learn
import joblib

import numpy as np                       # NumPy for numerical computations
import pandas as pd                      # Pandas for data manipulation and analysis
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, normalize   # LabelEncoder for encoding categorical variables, normalize for feature scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier   # GradientBoostingClassifier and RandomForestClassifier for classification models
from sklearn.metrics import accuracy_score   # accuracy_score for evaluating model performance
from sklearn.impute import SimpleImputer   # SimpleImputer for handling missing values
import imblearn   # imblearn for imbalanced dataset handling
from imblearn.over_sampling import RandomOverSampler   # RandomOverSampler for oversampling minority class
from imblearn.under_sampling import RandomUnderSampler   # RandomUnderSampler for undersampling majority class
import xgboost   # XGBoost for gradient boosting models
import inspect   # inspect for retrieving information about live objects
from collections import defaultdict   # defaultdict for creating a dictionary with default values
from tabpfn import TabPFNClassifier   # TabPFNClassifier for a specific classification model
import warnings   # warnings for ignoring warnings during runtime
from sklearn.model_selection import KFold as KF
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import sys
import os

  from pandas import MultiIndex, Int64Index


In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Load Dataset

In [4]:
train = pd.read_csv('../datasets/icr-identify-age-related-conditions/train.csv')
train = train.drop(['Id','EJ'], axis=1)
columns = train.columns
imputer = SimpleImputer(missing_values = np.nan, strategy ='median')
imputer = imputer.fit(train)
train = imputer.transform(train)
train = pd.DataFrame(train, columns = columns)
train.isnull().sum()

AB       0
AF       0
AH       0
AM       0
AR       0
AX       0
AY       0
AZ       0
BC       0
BD       0
BN       0
BP       0
BQ       0
BR       0
BZ       0
CB       0
CC       0
CD       0
CF       0
CH       0
CL       0
CR       0
CS       0
CU       0
CW       0
DA       0
DE       0
DF       0
DH       0
DI       0
DL       0
DN       0
DU       0
DV       0
DY       0
EB       0
EE       0
EG       0
EH       0
EL       0
EP       0
EU       0
FC       0
FD       0
FE       0
FI       0
FL       0
FR       0
FS       0
GB       0
GE       0
GF       0
GH       0
GI       0
GL       0
Class    0
dtype: int64

## Cross Validation

In [20]:
NUM_FOLDS = 5
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
splitter = skf.split(train, train.Class)

for fold_idx, (train_idx, val_idx) in enumerate(splitter):
    print(f'Getting fold number {fold_idx}')
    df_train = train.iloc[train_idx]
    df_val = train.iloc[val_idx]
    
    #drop column Id & reset index
    
#     df_train = df_train.drop(['Id'], axis=1)
    df_train = df_train.reset_index(drop = True)
#     df_val = df_val.drop(['Id'], axis=1)
    df_val = df_val.reset_index(drop = True)
    
    #kfold path
    save_dir = f'../datasets/kfold/fold{fold_idx}'
    os.makedirs(save_dir, exist_ok = True)
    
    #saving
    df_train.to_csv(os.path.join(save_dir, 'train.csv'), index = False)
    df_val.to_csv(os.path.join(save_dir, 'val.csv'), index = False)
    
    # for testing
    save_dir1 = f'../datasets/kfold1/fold{fold_idx}'
    os.makedirs(save_dir1, exist_ok = True)
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_train.to_csv(os.path.join(save_dir1, 'train.csv'))
    df_val.to_csv(os.path.join(save_dir1, 'val.csv'))

Getting fold number 0
Getting fold number 1
Getting fold number 2
Getting fold number 3
Getting fold number 4


## Pre-processing

In [21]:
def prepair_input(df, classi):
    columns = df.columns
    
    # Convert the values in the 'EJ' column of the 'test' dataframe to binary values (0 or 1),
    # based on the occurrence of the 'first_category' in the 'train' dataframe
#     first_category = df.EJ.unique()[0]
#     df.EJ = df.EJ.eq(first_category).astype('int')

    df = df.rename(columns={'BD ': 'BD', 'CD ': 'CD', 'CW ': 'CW', 'FD ': 'FD'})
    
    imputer = SimpleImputer(missing_values = np.nan, strategy ='median')
    imputer = imputer.fit(df)
    df = imputer.transform(df)
    df = pd.DataFrame(df, columns = columns)
    
    # Create a RandomOverSampler object with a random state of 42
    ros = RandomOverSampler(random_state=42)

    # Resample the 'train_pred_and_time' dataframe and 'greeks.Alpha' series using RandomOverSampler
    # The resampled data is assigned to 'train_ros' and 'y_ros' respectively
    x_ros, y_ros = ros.fit_resample(df, classi)
    return x_ros, y_ros

def normolized(df):
    columns = df.columns
    
    scaler = StandardScaler()
    model = scaler.fit(df)
    scaled_df = model.transform(df)
    
    scaled_df = pd.DataFrame(scaled_df, columns = columns)
    return scaled_df

## Balanced Log Loss

In [22]:
def balanced_log_loss(y_true, y_pred):
    # y_true: correct labels 0, 1
    # y_pred: predicted probabilities of class=1
    # calculate the number of observations for each class
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    # calculate the weights for each class to balance classes
    w_0 = 1 / N_0
    w_1 = 1 / N_1
    # calculate the predicted probabilities for each class
    p_0 = np.clip(y_pred[:, 0], 1e-15, 1 - 1e-15)
    p_1 = np.clip(y_pred[:, 1], 1e-15, 1 - 1e-15)
    # calculate the summed log loss for each class
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))
    log_loss_1 = -np.sum(y_true * np.log(p_1))
    # calculate the weighted summed logarithmic loss
    # (factgor of 2 included to give same result as LL with balanced input)
    balanced_log_loss = 2*(w_0 * log_loss_0 + w_1 * log_loss_1) / (w_0 + w_1)
    # return the average log loss
    return balanced_log_loss/(N_0+N_1)

## Model

In [23]:
class Ensemble():
    def __init__(self):
        self.classifiers =[xgboost.XGBClassifier(n_estimators=100,max_depth=3,learning_rate=0.2,subsample=0.9,colsample_bytree=0.85),
                           xgboost.XGBClassifier(),
                           TabPFNClassifier(N_ensemble_configurations=12),
                           TabPFNClassifier(N_ensemble_configurations=12),
                           RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)]
    def fit(self,X,y):
        for i, classifier in enumerate(self.classifiers):
            if i in [2, 3]:
                classifier.fit(X,y,overwrite_warning =True)
            else :
                classifier.fit(X, y)
     
    def predict_proba(self, x):
        # N_models * N_rows * N_classes (4 * 5 * 2)
        probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])
        averaged_probabilities = np.mean(probabilities, axis=0) # N_rows * N_classes
        return averaged_probabilities

# Post processing

In [24]:
def calibrate_prob(probs):
    print('TYPE:', probs.shape, type(probs))
    ret = probs.copy()
    ret[probs > 0.80] = 1.0
    ret[probs < 0.20] = 0.0
    return ret

## Training

In [41]:
def training(model):
    outer_results = list()   # List to store the loss results of each outer fold
    best_loss = np.inf   # Variable to track the best loss achieved during training
    split = 0   # Counter for the current split
    splits = 5   # Total number of splits for the inner cross-validation
    models = []   # List to store the trained models for each inner fold

    # Loop over the splits of the inner cross-validation using tqdm for progress visualization
    for split in range(splits):
        print('fold', split)
        #loading train & test dataset for each fold
        save_dir = f'../datasets/kfold/fold{split}'
        
        # x_train & y_train
        df_train = pd.read_csv(os.path.join(save_dir, 'train.csv'))
        x_train = df_train.drop(['Class'], axis=1)
        y_train = df_train.Class
        # pre-processing
        x_train, y_train = prepair_input(x_train, y_train)

        # x_val & y_val
        df_val = pd.read_csv(os.path.join(save_dir, 'val.csv'))
        x_val = df_val.drop(['Class'], axis=1)
        y_val = df_val.Class

        #fitting model
        model.fit(x_train, y_train)   # Fit the model on the training data
        models.append(model)   # Append the trained model to the list of models

        y_pred = model.predict_proba(x_val)   # Predict probabilities for the validation set   # Calculate class probabilities
        y_p = np.empty((y_pred.shape[0],y_pred.shape[1]))   # Create an empty array to store predicted labels
        
        print(y_pred)
        print('sub', y_pred[:, 0:].sum(axis=1))
        
        y_p = calibrate_prob(y_pred)
        
        loss = balanced_log_loss(y_val, y_p)  # Calculate the balanced log loss between the predicted labels and the true labels

        # checking
        y_val = y_val.to_frame()
        y_val.rename(columns = {'Class': 'gt'}, inplace = True)
        y_val['pred'] = y_p[:, 1]
#         print(type(y_val['gt']), type(y_val.loc[0, 'gt']), type(y_val['pred']), type(y_val.loc[0, 'pred']))
        p00 = y_pred[:, 1]
        p00 = p00.flatten()
        y_val['prob'] = p00
        display(y_val)

        if loss < best_loss:
            best_model = model   # Save the best model based on the lowest loss
            best_loss = loss   # Update the best loss
            print('best_model_saved')

        outer_results.append(loss)   # Append the loss to the list of outer results
        print('>val_loss=%.5f, split = %.1f' % (loss, split))

        split += 1

    print('LOSS: %.5f' % (np.mean(outer_results)))  # Print the average loss across all outer folds
    print('Best loss', best_loss)
    return best_model, models   # Return the best model and the list of trained models

In [42]:
yt = Ensemble()

m, models = training(yt)

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold 0


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[[0.69291046 0.30708957]
 [0.9740727  0.02592732]
 [0.94006551 0.0599345 ]
 [0.32252747 0.67747255]
 [0.9170994  0.08290062]
 [0.90582596 0.09417402]
 [0.90066061 0.09933942]
 [0.91603931 0.08396069]
 [0.96211108 0.03788894]
 [0.93560454 0.06439548]
 [0.95167473 0.04832526]
 [0.90592486 0.09407515]
 [0.97754406 0.02245593]
 [0.90971462 0.09028537]
 [0.96720359 0.03279642]
 [0.86784238 0.13215762]
 [0.96135796 0.03864205]
 [0.96017498 0.03982501]
 [0.90974692 0.09025307]
 [0.95809404 0.04190596]
 [0.96443021 0.03556979]
 [0.46831636 0.53168363]
 [0.9578133  0.04218668]
 [0.94017593 0.05982407]
 [0.95002923 0.04997076]
 [0.25090724 0.74909276]
 [0.52855132 0.47144869]
 [0.87944081 0.12055921]
 [0.1171993  0.88280071]
 [0.95018624 0.04981381]
 [0.90288671 0.09711328]
 [0.96157677 0.03842325]
 [0.95930863 0.04069141]
 [0.84891057 0.15108941]
 [0.2332335  0.76676651]
 [0.81551972 0.18448029]
 [0.92587274 0.07412726]
 [0.93142077 0.06857921]
 [0.95805044 0.04194957]
 [0.02956497 0.97043504]


Unnamed: 0,gt,pred,prob
0,1.0,0.30709,0.30709
1,0.0,0.0,0.025927
2,0.0,0.0,0.059935
3,1.0,0.677473,0.677473
4,0.0,0.0,0.082901
5,0.0,0.0,0.094174
6,0.0,0.0,0.099339
7,0.0,0.0,0.083961
8,0.0,0.0,0.037889
9,0.0,0.0,0.064395


best_model_saved
>val_loss=0.21344, split = 0.0
fold 1


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[[0.95480624 0.04519376]
 [0.9432566  0.05674341]
 [0.89269863 0.10730135]
 [0.93575747 0.06424252]
 [0.94290881 0.05709121]
 [0.97105361 0.02894641]
 [0.94133053 0.0586695 ]
 [0.28471415 0.71528586]
 [0.96370515 0.03629486]
 [0.96775658 0.03224341]
 [0.96785834 0.03214166]
 [0.89759712 0.10240289]
 [0.95465342 0.04534656]
 [0.92013707 0.07986292]
 [0.926839   0.07316098]
 [0.48948114 0.51051884]
 [0.9538173  0.04618271]
 [0.94015085 0.05984913]
 [0.96348369 0.03651629]
 [0.96383941 0.0361606 ]
 [0.75780077 0.24219926]
 [0.93009573 0.06990427]
 [0.41083052 0.5891695 ]
 [0.95054814 0.04945184]
 [0.29519062 0.70480938]
 [0.94992608 0.05007392]
 [0.91312863 0.0868714 ]
 [0.61890426 0.38109571]
 [0.93902647 0.06097354]
 [0.94696641 0.05303358]
 [0.95515991 0.04484009]
 [0.12490334 0.87509666]
 [0.95495275 0.04504724]
 [0.20955059 0.79044941]
 [0.56532138 0.4346786 ]
 [0.94762898 0.052371  ]
 [0.9575423  0.04245769]
 [0.9268314  0.07316861]
 [0.90938439 0.09061559]
 [0.93918297 0.06081701]


Unnamed: 0,gt,pred,prob
0,0.0,0.0,0.045194
1,0.0,0.0,0.056743
2,0.0,0.0,0.107301
3,0.0,0.0,0.064243
4,0.0,0.0,0.057091
5,0.0,0.0,0.028946
6,0.0,0.0,0.058669
7,1.0,0.715286,0.715286
8,0.0,0.0,0.036295
9,0.0,0.0,0.032243


>val_loss=1.46993, split = 1.0
fold 2


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[[0.87208738 0.12791261]
 [0.90209133 0.09790866]
 [0.92014967 0.07985032]
 [0.96549339 0.0345066 ]
 [0.94298172 0.05701829]
 [0.79665642 0.20334357]
 [0.64840778 0.35159223]
 [0.97337763 0.02662238]
 [0.0484029  0.95159711]
 [0.95957847 0.04042151]
 [0.96031161 0.03968838]
 [0.96162036 0.03837967]
 [0.93524406 0.06475596]
 [0.94797079 0.05202923]
 [0.95383969 0.04616032]
 [0.86982146 0.13017853]
 [0.94616866 0.05383136]
 [0.97207559 0.02792439]
 [0.94132147 0.05867854]
 [0.25499605 0.74500394]
 [0.93489601 0.06510396]
 [0.92922634 0.07077368]
 [0.96044995 0.03955007]
 [0.93347901 0.06652098]
 [0.96278479 0.03721519]
 [0.92000581 0.07999417]
 [0.94701332 0.05298668]
 [0.94163305 0.05836695]
 [0.96693204 0.03306795]
 [0.89441827 0.10558173]
 [0.92832124 0.07167876]
 [0.95829083 0.04170916]
 [0.95620396 0.04379606]
 [0.90359128 0.09640873]
 [0.95718632 0.04281369]
 [0.25704262 0.74295737]
 [0.89758395 0.10241605]
 [0.28027429 0.71972572]
 [0.93817036 0.06182962]
 [0.25031118 0.74968883]


Unnamed: 0,gt,pred,prob
0,0.0,0.0,0.127913
1,0.0,0.0,0.097909
2,0.0,0.0,0.07985
3,0.0,0.0,0.034507
4,0.0,0.0,0.057018
5,0.0,0.203344,0.203344
6,1.0,0.351592,0.351592
7,0.0,0.0,0.026622
8,1.0,1.0,0.951597
9,0.0,0.0,0.040422


>val_loss=0.56994, split = 2.0
fold 3


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[[0.22737534 0.77262467]
 [0.89630953 0.10369047]
 [0.09950348 0.90049653]
 [0.94353072 0.05646928]
 [0.94883315 0.05116684]
 [0.79917878 0.20082121]
 [0.33355216 0.66644782]
 [0.94238104 0.05761898]
 [0.84534745 0.15465254]
 [0.82979829 0.17020168]
 [0.94771993 0.05228007]
 [0.08170184 0.91829813]
 [0.10674955 0.89325044]
 [0.97790474 0.02209525]
 [0.9640866  0.0359134 ]
 [0.50883777 0.49116221]
 [0.95658733 0.04341264]
 [0.74788221 0.25211777]
 [0.95621743 0.04378255]
 [0.93379727 0.06620271]
 [0.17594421 0.82405577]
 [0.96257763 0.03742236]
 [0.89129275 0.10870728]
 [0.92205963 0.07794039]
 [0.9611347  0.03886529]
 [0.96250856 0.0374914 ]
 [0.79690791 0.20309208]
 [0.96414592 0.03585408]
 [0.74585865 0.25414132]
 [0.97400466 0.02599536]
 [0.96215668 0.03784333]
 [0.96761264 0.03238734]
 [0.97149608 0.02850394]
 [0.94659836 0.05340165]
 [0.6988095  0.3011905 ]
 [0.94115427 0.0588457 ]
 [0.9370731  0.06292689]
 [0.95250441 0.04749558]
 [0.96428909 0.0357109 ]
 [0.90887392 0.09112611]


Unnamed: 0,gt,pred,prob
0,1.0,0.772625,0.772625
1,0.0,0.0,0.10369
2,1.0,1.0,0.900497
3,0.0,0.0,0.056469
4,0.0,0.0,0.051167
5,0.0,0.200821,0.200821
6,1.0,0.666448,0.666448
7,0.0,0.0,0.057619
8,0.0,0.0,0.154653
9,0.0,0.0,0.170202


>val_loss=1.60513, split = 3.0
fold 4


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[[0.95023419 0.0497658 ]
 [0.95472109 0.04527892]
 [0.96524161 0.03475836]
 [0.92753473 0.0724653 ]
 [0.93090871 0.06909129]
 [0.14518335 0.85481666]
 [0.89017739 0.10982259]
 [0.8701304  0.1298696 ]
 [0.90920162 0.09079837]
 [0.81010187 0.18989815]
 [0.94746665 0.05253336]
 [0.95611836 0.04388167]
 [0.95443162 0.04556839]
 [0.93192106 0.06807893]
 [0.95525915 0.04474082]
 [0.07340821 0.92659179]
 [0.95766476 0.04233526]
 [0.93263615 0.06736386]
 [0.92462372 0.07537627]
 [0.92757154 0.07242845]
 [0.94895019 0.05104983]
 [0.97313567 0.02686433]
 [0.97383313 0.02616687]
 [0.95303029 0.04696971]
 [0.23296466 0.76703533]
 [0.94431663 0.05568334]
 [0.80085485 0.19914517]
 [0.93319735 0.06680267]
 [0.66825557 0.33174443]
 [0.97029696 0.02970304]
 [0.3176525  0.68234746]
 [0.85004656 0.14995342]
 [0.95037086 0.04962915]
 [0.94797512 0.0520249 ]
 [0.78338484 0.21661516]
 [0.74035903 0.25964097]
 [0.94110528 0.05889471]
 [0.95367319 0.0463268 ]
 [0.93767776 0.06232226]
 [0.95478488 0.04521512]


Unnamed: 0,gt,pred,prob
0,0.0,0.0,0.049766
1,0.0,0.0,0.045279
2,0.0,0.0,0.034758
3,0.0,0.0,0.072465
4,0.0,0.0,0.069091
5,1.0,1.0,0.854817
6,0.0,0.0,0.109823
7,0.0,0.0,0.12987
8,0.0,0.0,0.090798
9,0.0,0.0,0.189898


best_model_saved
>val_loss=0.10561, split = 4.0
LOSS: 0.79281
Best loss 0.10561440489188031
