In [1]:
# Offline install: --no-index makes pip ignore the package index and resolve
# tabpfn only from the wheel directory of the attached Kaggle dataset.
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
# Copy the bundled TabPFN checkpoint into the installed package's models_diff
# directory (created first in case it does not exist).
# NOTE(review): the site-packages path is hard-coded for Python 3.10 — confirm
# it matches the kernel's interpreter.
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

Looking in links: file:///kaggle/input/pip-packages-icr/pip-packages
Processing /kaggle/input/pip-packages-icr/pip-packages/tabpfn-0.1.9-py3-none-any.whl
Installing collected packages: tabpfn
Successfully installed tabpfn-0.1.9


In [2]:
# joblib: standalone persistence/parallelism library (a scikit-learn dependency, not part of it)
import joblib

import numpy as np                       # NumPy for numerical computations
import pandas as pd                      # Pandas for data manipulation and analysis
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, normalize   # LabelEncoder for encoding categorical variables, normalize for feature scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier   # GradientBoostingClassifier and RandomForestClassifier for classification models
from tabpfn import TabPFNClassifier 
import xgboost   # XGBoost for gradient boosting models
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score   # accuracy_score for evaluating model performance
from sklearn.impute import SimpleImputer   # SimpleImputer for handling missing values
import imblearn   # imblearn for imbalanced dataset handling
from imblearn.over_sampling import RandomOverSampler   # RandomOverSampler for oversampling minority class
from imblearn.under_sampling import RandomUnderSampler   # RandomUnderSampler for undersampling majority class
import inspect   # inspect for retrieving information about live objects
from collections import defaultdict   # defaultdict for creating a dictionary with default values
import warnings   # warnings for ignoring warnings during runtime
from sklearn.model_selection import KFold as KF
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import sys
import os
import joblib
import pickle



## Pre-processing

In [3]:
def prepair_test(df):
    """Split the raw test frame into ids and median-imputed numeric features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test data. Must contain the columns 'Id' and 'EJ'; every other
        column must be convertible to float.

    Returns
    -------
    Id : pandas.Series
        The untouched 'Id' column of ``df``.
    test : pandas.DataFrame
        All columns except 'Id' and 'EJ', as floats on a fresh 0..n-1 index,
        with each missing value replaced by the median of its own column.

    Raises
    ------
    KeyError
        If 'Id' or 'EJ' is missing from ``df``.
    ValueError
        If a remaining column cannot be converted to float.

    Notes
    -----
    Medians are computed from ``df`` itself, not from the training data.
    NOTE(review): reusing the training-set medians would be more consistent
    with how the models were fitted — confirm against the training notebook.
    """
    # Take the 'Id' column, then drop 'Id' and 'EJ' from the feature frame.
    Id = df['Id']
    features = df.drop(['Id', 'EJ'], axis=1)

    # Float values on a positional index, matching the frame that used to be
    # rebuilt from the imputer's array output.
    features = features.astype(float).reset_index(drop=True)

    # Column-wise median, ignoring NaN. A column that is entirely NaN has a NaN
    # median; it is kept and filled with 0.0 rather than being dropped, so the
    # feature layout never shifts (a dropped column previously made the
    # DataFrame reconstruction fail with a shape mismatch).
    medians = features.median()
    test = features.fillna(medians).fillna(0.0)
    return Id, test

## Ensemble

In [4]:
class Ensemble():
    """Soft-voting ensemble: averages member probabilities, then rebalances them.

    The class has to be defined under this exact name before the saved models
    are unpickled, because the pickled objects are instances of it.
    """

    def __init__(self):
        # NOTE(review): `config_classifiers` and `config` are not defined in this
        # notebook — presumably they live in the training notebook. Unpickling
        # does not call __init__, so loading saved ensembles does not need them.
        self.classifiers = config_classifiers(config)
        print(self.classifiers)

    def fit(self,X,y):
        """Fit every member classifier on the same (X, y)."""
        for classifier in self.classifiers:
            print(classifier)
            # Compare against the imported class: the previous check referenced
            # a global `tabpfn` that is never bound in this notebook (NameError).
            if isinstance(classifier, TabPFNClassifier):
                # Only TabPFN receives this extra flag.
                # NOTE(review): presumably it silences TabPFN's training-size
                # guard — confirm against the installed tabpfn version.
                classifier.fit(X, y, overwrite_warning=True)
            else :
                classifier.fit(X, y)

    def predict_proba(self, x):
        """Return rebalanced, row-normalised class probabilities for ``x``.

        Member probabilities are averaged, class 0 is divided by its total
        predicted mass over all rows, every other class is divided by the
        combined mass of the non-zero classes, and each row is renormalised to
        sum to 1.

        Returns an array of shape (N_rows, N_classes).
        """
        # N_models * N_rows * N_classes
        probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])
        averaged_probabilities = np.mean(probabilities, axis=0) # N_rows * N_classes

        # Scalars: estimated number of class-0 rows and of all other rows.
        class_0_est_instances = averaged_probabilities[:, 0].sum()
        others_est_instances = averaged_probabilities[:, 1:].sum()

        # A zero total means the matching column(s) are all zero already, so a
        # neutral scale of 1 avoids 0 * inf = nan without changing any value.
        class_0_scale = 1 / class_0_est_instances if class_0_est_instances > 0 else 1.0
        others_scale = 1 / others_est_instances if others_est_instances > 0 else 1.0

        # Weighted probabilities based on class imbalance
        n_classes = averaged_probabilities.shape[1]
        weights = np.array([[class_0_scale if i == 0 else others_scale for i in range(n_classes)]])
        new_probabilities = averaged_probabilities * weights
        return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=True)

## Post processing

In [5]:
def calibrate_prob(probs, shape, thres_1, thres_0):
    """Reduce class probabilities to two columns and snap confident rows to 0/1.

    Parameters
    ----------
    probs : numpy.ndarray
        2-D array of per-row class probabilities; column 0 is class 0.
    shape : int
        Number of rows in ``probs``.
    thres_1 : float
        Class-0 probabilities strictly below this become 0.0.
    thres_0 : float
        Class-0 probabilities strictly above this become 1.0.

    Returns
    -------
    numpy.ndarray
        Array of shape (shape, 2): the adjusted class-0 probability and its
        complement ``1 - class_0``.
    """
    # Class-0 probability as a column vector, one row per sample.
    p0 = probs[:, 0].reshape((shape, 1))

    # Both masks are taken from the untouched probabilities.
    below = p0 < thres_1
    above = p0 > thres_0

    snapped = p0.copy()
    snapped[below] = 0.0
    snapped[above] = 1.0  # applied last, so it wins if both masks select a row

    # The second column is the complement, not the summed mass of other classes.
    return np.hstack((snapped, 1.0 - snapped))

## Load Model

In [6]:
# Path of the pickled list of fitted Ensemble objects (a file, despite the name).
save_dir = '/kaggle/input/save-models/ensemble.sav'
# NOTE: pickle.load can execute arbitrary code; only load files you produced
# yourself. The Ensemble class must already be defined for this to succeed.
# The context manager closes the file handle once loading is done.
with open(save_dir, 'rb') as model_file:
    models = pickle.load(model_file)

In [7]:
# Show the unpickled object: a list of Ensemble instances (see the output below).
models

[<__main__.Ensemble at 0x7db64acda0b0>,
 <__main__.Ensemble at 0x7db5964cfd60>,
 <__main__.Ensemble at 0x7db595d39660>,
 <__main__.Ensemble at 0x7db595c84520>,
 <__main__.Ensemble at 0x7db595baf970>]

## Load Thresholds

In [8]:
# Load the saved thresholds; thres[0] and thres[1] are later passed to
# calibrate_prob as the lower (snap-to-0) and upper (snap-to-1) cut-offs.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle once loading is done.
with open('/kaggle/input/save-models/thres.sav', 'rb') as thres_file:
    thres = pickle.load(thres_file)
thres

[0.02, 0.7799999999999999, 0.17258183678220163]

## Infer

In [9]:
# Read the competition test set and pre-process it in one step, so `test` is
# only ever bound to the imputed feature frame (never to the raw CSV).
Id, test = prepair_test(
    pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
)

In [10]:
# Post-processing: every saved ensemble predicts on the test features and its
# probabilities are thresholded with calibrate_prob.
num_ensembles = len(models)
shape = test.shape[0]  # number of test rows; identical for every ensemble

y_p_lst = [
    calibrate_prob(ensemble.predict_proba(test), shape, thres[0], thres[1])
    for ensemble in models
]

# Final prediction: element-wise average over the ensembles.
y_p = np.array(y_p_lst).sum(axis=0) / num_ensembles

In [11]:
# Number of test rows (kept as a notebook-level variable).
shape = test.shape[0]

# Submission: one row per Id with the averaged two-class probabilities.
submission = pd.DataFrame(Id, columns=['Id']).assign(
    class_0=y_p[:, 0],
    class_1=y_p[:, 1],
)
submission.to_csv('submission.csv', index=False)

In [12]:
# Display the submission frame that was just written to submission.csv.
submission


Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5
