In [None]:
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
!pip install ipywidgets
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [None]:
# joblib is a standalone package, used here to save and load trained models
import joblib

import numpy as np                       # NumPy for numerical computations
import pandas as pd                      # Pandas for data manipulation and analysis
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, normalize   # LabelEncoder for encoding categorical variables, normalize for feature scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier   # GradientBoostingClassifier and RandomForestClassifier for classification models
from tabpfn import TabPFNClassifier 
import xgboost   # XGBoost for gradient boosting models
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score   # accuracy_score for evaluating model performance
from sklearn.impute import SimpleImputer   # SimpleImputer for handling missing values
import imblearn   # imblearn for imbalanced dataset handling
from imblearn.over_sampling import RandomOverSampler   # RandomOverSampler for oversampling minority class
from imblearn.under_sampling import RandomUnderSampler   # RandomUnderSampler for undersampling majority class
import inspect   # inspect for retrieving information about live objects
from collections import defaultdict   # defaultdict for creating a dictionary with default values
import warnings   # warnings for ignoring warnings during runtime
from sklearn.model_selection import KFold as KF
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import sys
import os

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Load Dataset

In [None]:
train = pd.read_csv('../datasets/icr-identify-age-related-conditions/train.csv')
greeks = pd.read_csv('../datasets/icr-identify-age-related-conditions/greeks.csv')
test = pd.read_csv('../datasets/icr-identify-age-related-conditions/test.csv')

In [None]:
# Remove the 'Id' and 'EJ' columns from the training frame.
train = train.drop(columns=['Id', 'EJ'])

# Fill missing values with each column's median. The transformed values are
# re-wrapped in a DataFrame carrying the same column labels as before.
columns = train.columns
imputer = SimpleImputer(missing_values=np.nan, strategy='median').fit(train)
train = pd.DataFrame(imputer.transform(train), columns=columns)

## Cross Validation

In [None]:
NUM_FOLDS = 5
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
splitter = skf.split(train, train.Class)  # stratified on the binary 'Class' column

for fold_idx, (train_idx, val_idx) in enumerate(splitter):
    print(f'Getting fold number {fold_idx}')

    # Training part: features (+ Class) from `train`, plus the 'Alpha' label
    # taken from `greeks` at the same row positions.
    x_train = train.iloc[train_idx]
    y_train = greeks.Alpha.iloc[train_idx]
    df_train = pd.concat((x_train, y_train), axis=1).reset_index(drop=True)

    # Validation part: rows of `train` only (no 'Alpha' column).
    df_val = train.iloc[val_idx].reset_index(drop=True)

    # Primary copy, written without the index column.
    save_dir = f'../datasets/kfold/fold{fold_idx}'
    os.makedirs(save_dir, exist_ok=True)
    df_train.to_csv(os.path.join(save_dir, 'train.csv'), index=False)
    df_val.to_csv(os.path.join(save_dir, 'val.csv'), index=False)

    # Second copy "for testing", written with the (already reset) index column.
    save_dir1 = f'../datasets/kfold1/fold{fold_idx}'
    os.makedirs(save_dir1, exist_ok=True)
    df_train.to_csv(os.path.join(save_dir1, 'train.csv'))
    df_val.to_csv(os.path.join(save_dir1, 'val.csv'))

## Pre-processing

In [None]:
def prepair_input(df, classi):
    """Median-impute a feature frame and oversample it with RandomOverSampler.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns only.
    classi : array-like
        Labels aligned with the rows of ``df``.

    Returns
    -------
    (x_ros, y_ros)
        The resampled features and labels produced by
        ``RandomOverSampler(random_state=42).fit_resample``.

    Notes
    -----
    The trailing-space column names ('BD ', 'CD ', 'CW ', 'FD ') are renamed
    before imputation, but the column labels captured *before* the rename are
    re-applied afterwards, so the returned frame keeps the incoming names.
    """
    original_columns = df.columns

    renamed = df.rename(columns={'BD ': 'BD', 'CD ': 'CD', 'CW ': 'CW', 'FD ': 'FD'})

    # Fit the median imputer on this very frame, then fill its missing values.
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    median_imputer = median_imputer.fit(renamed)
    imputed = pd.DataFrame(median_imputer.transform(renamed), columns=original_columns)

    # Fixed seed so the duplicated rows are the same on every run.
    oversampler = RandomOverSampler(random_state=42)
    x_ros, y_ros = oversampler.fit_resample(imputed, classi)
    return x_ros, y_ros

def normolized(df):
    """Return ``df`` transformed by a StandardScaler fitted on ``df`` itself.

    The result is a new DataFrame with the same column labels as the input.
    """
    standardizer = StandardScaler().fit(df)
    return pd.DataFrame(standardizer.transform(df), columns=df.columns)

## Balanced Log Loss

In [None]:
def balanced_log_loss(y_true, y_pred):
    """Class-weighted binary log loss.

    Parameters
    ----------
    y_true : array-like of 0/1
        True labels (NumPy array or pandas Series).
    y_pred : 2-D array
        Column 0 is the predicted probability of class 0, column 1 of class 1.
        Probabilities are clipped to [1e-15, 1 - 1e-15] before taking logs.

    Returns
    -------
    float
        ``2 * (w0*L0 + w1*L1) / (w0 + w1) / (N0 + N1)`` where ``N_c`` is the
        number of true samples of class c, ``w_c = 1 / N_c`` and ``L_c`` is the
        summed negative log-probability over the true samples of class c.
        The factor of 2 makes the value equal to the ordinary mean log loss
        when both classes are equally represented.
    """
    eps = 1e-15
    negatives = 1 - y_true

    # Per-class sample counts and their inverse-frequency weights.
    n_neg = np.sum(negatives)
    n_pos = np.sum(y_true)
    weight_neg = 1 / n_neg
    weight_pos = 1 / n_pos

    # Clip so that log() never sees exactly 0 or 1.
    prob_neg = np.clip(y_pred[:, 0], eps, 1 - eps)
    prob_pos = np.clip(y_pred[:, 1], eps, 1 - eps)

    # Summed negative log-likelihood contributed by each class.
    loss_neg = -np.sum(negatives * np.log(prob_neg))
    loss_pos = -np.sum(y_true * np.log(prob_pos))

    weighted_total = 2 * (weight_neg * loss_neg + weight_pos * loss_pos) / (weight_neg + weight_pos)
    return weighted_total / (n_neg + n_pos)

In [None]:
# Sanity check for balanced_log_loss on a tiny hand-made example.
_tmp_true = np.array([0, 0, 1, 0, 1])
# One-hot rows that put all probability on the WRONG class for each label ...
_tmp_pred = np.array([
    [0, 1],
    [0, 1],
    [1, 0],
    [0, 1],
    [1, 0]
], dtype = np.float32)
# ... flipped here, so every row assigns probability 1.0 to its true class.
_tmp_pred = 1.0 - _tmp_pred
print(_tmp_pred)


# With predictions this confident and correct, the displayed loss should be ~0.
balanced_log_loss(_tmp_true, _tmp_pred)

## Model

In [None]:
# Scratch cell: build a TabPFN classifier and inspect its type.
tab = TabPFNClassifier(N_ensemble_configurations=12)
type(tab)
# NOTE(review): `tabpfn` is bound to a list here but is rebound to a single
# TabPFNClassifier in the next code cell; running the cells out of order
# changes what `type(tabpfn)` is.
tabpfn = [TabPFNClassifier(N_ensemble_configurations=12),
          TabPFNClassifier(N_ensemble_configurations=24)]

type(tab)

In [None]:
# Module-level TabPFN instance.
tabpfn = TabPFNClassifier(N_ensemble_configurations=12)

# Each dict describes one ensemble member: 'name' selects the classifier class
# (see config_classifiers) and the remaining keys are passed to its constructor.
# The trailing `* 2` repeats the whole list, so every config1 entry appears twice.
# NOTE(review): the two 'randomforest' entries in each list are identical
# (same random_state) — presumably intentional up-weighting; confirm.
config1 = [
    {'name': 'xgb', 'n_estimators': 100 , 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 0.9, 'colsample_bytree': 0.85},
    {'name': 'xgb', 'n_estimators': 200 , 'max_depth': 3, 'learning_rate': 0.2, 'subsample': 0.8, 'colsample_bytree': 0.85},
    {'name': 'tabpfn', 'N_ensemble_configurations': 12},
    {'name': 'tabpfn', 'N_ensemble_configurations': 24},
    {'name': 'randomforest', 'n_estimators': 200, 'max_depth': 3, 'random_state': 42},
    {'name': 'randomforest', 'n_estimators': 200, 'max_depth': 3, 'random_state': 42},
    {'name': 'gaussiannb'},
    {'name': 'multinomialnb'},
    {'name': 'kneighbors'},
    {'name': 'gradientboosting'}
] * 2

# Same families as config1 with max_depth 5 for the tree models (listed once).
config2 = [
    {'name': 'xgb', 'n_estimators': 100 , 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.9, 'colsample_bytree': 0.85},
    {'name': 'xgb', 'n_estimators': 100 , 'max_depth': 5, 'learning_rate': 0.2, 'subsample': 0.8, 'colsample_bytree': 0.85},
    {'name': 'tabpfn', 'N_ensemble_configurations': 12},
    {'name': 'randomforest', 'n_estimators': 100, 'max_depth': 5, 'random_state': 50},
    {'name': 'randomforest', 'n_estimators': 100, 'max_depth': 5, 'random_state': 50},
    {'name': 'gaussiannb'},
    {'name': 'multinomialnb'},
    {'name': 'kneighbors'},
    {'name': 'gradientboosting'}
]

# Tree models only, with max_depth 7.
config3 = [
    {'name': 'xgb', 'n_estimators': 100 , 'max_depth': 7, 'learning_rate': 0.1, 'subsample': 0.9, 'colsample_bytree': 0.85},
    {'name': 'randomforest', 'n_estimators': 100, 'max_depth': 7, 'random_state': 30},
    {'name': 'randomforest', 'n_estimators': 100, 'max_depth': 7, 'random_state': 30}
]

# Full member list for the ensemble: all three groups concatenated in order.
config = config1+config2+config3

def config_classifiers(config):
    """Instantiate one classifier per configuration entry.

    Parameters
    ----------
    config : iterable of dict
        Each dict must contain a 'name' key selecting the classifier class;
        every other key/value pair is forwarded to that class's constructor.

    Returns
    -------
    list
        Un-fitted classifier instances, in the same order as ``config``.

    Raises
    ------
    KeyError
        If an entry has no 'name' key or names an unknown classifier.
    """
    registry = {
        'xgb': XGBClassifier,
        'tabpfn': TabPFNClassifier,
        'randomforest': RandomForestClassifier,
        'gaussiannb': GaussianNB,
        'multinomialnb': MultinomialNB,
        'kneighbors': KNeighborsClassifier,
        'gradientboosting': GradientBoostingClassifier
    }

    def _build(entry):
        # Resolve the class first, then pass everything except 'name' to it.
        factory = registry[entry['name']]
        params = {key: value for key, value in entry.items() if key != 'name'}
        return factory(**params)

    return [_build(entry) for entry in config]

In [None]:
class Ensemble():
    """Soft-voting ensemble over the classifiers described by the global ``config``.

    All members are fitted on the same data; ``predict_proba`` averages their
    class probabilities and then re-weights class 0 against all other classes.
    """

    def __init__(self):
        # Members are built from the module-level `config` list.
        self.classifiers = config_classifiers(config)
        print(self.classifiers)

    def fit(self, X, y):
        """Fit every member classifier on ``(X, y)``."""
        for classifier in self.classifiers:
            print(classifier)
            # Check against the class itself rather than the type of a
            # module-level `tabpfn` variable, which other cells may rebind.
            if isinstance(classifier, TabPFNClassifier):
                # NOTE(review): presumably bypasses TabPFN's training-size
                # guard — confirm against the installed tabpfn version.
                classifier.fit(X, y, overwrite_warning=True)
            else:
                classifier.fit(X, y)

    def predict_proba(self, x):
        """Return re-balanced, row-normalised class probabilities for ``x``.

        Steps:
        1. Stack every member's ``predict_proba`` output
           (N_models x N_rows x N_classes) and average over models.
        2. Sum the averaged probability mass of class 0 and of all other
           classes over *all* rows (two scalars).
        3. Divide column 0 by the class-0 mass and every other column by the
           "others" mass, then renormalise each row to sum to 1.
        """
        probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])
        averaged_probabilities = np.mean(probabilities, axis=0)  # N_rows x N_classes

        # Scalars: estimated number of class-0 / non-class-0 instances in x.
        class_0_est_instances = averaged_probabilities[:, 0].sum()
        others_est_instances = averaged_probabilities[:, 1:].sum()

        # One weight per class column (shape 1 x N_classes, broadcast over rows).
        n_classes = averaged_probabilities.shape[1]
        class_weights = np.array([[
            1 / (class_0_est_instances if i == 0 else others_est_instances)
            for i in range(n_classes)
        ]])

        new_probabilities = averaged_probabilities * class_weights
        return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=True)

## Post-processing

In [None]:
def calibrate_prob(probs, shape, thres_1, thres_0):
    """Collapse class probabilities to two columns and snap confident rows to 0/1.

    Parameters
    ----------
    probs : 2-D numpy array
        Per-row class probabilities; column 0 is class 0.
    shape : int
        Number of rows in ``probs`` (used to reshape column 0 to ``(shape, 1)``).
    thres_1 : float
        Rows whose class-0 probability is strictly below this value get
        class-0 probability 0.0 (i.e. a hard class-1 prediction).
    thres_0 : float
        Rows whose class-0 probability is strictly above this value get
        class-0 probability 1.0. Applied after ``thres_1``, so it wins if
        both conditions hold.

    Returns
    -------
    numpy array of shape ``(shape, 2)``
        Column 0 is the calibrated class-0 probability, column 1 is its
        complement. ``probs`` itself is not modified.
    """
    class_0_prob = probs[:, 0].reshape((shape, 1))

    # Work on a copy; both masks are evaluated against the untouched values.
    col_0 = class_0_prob.copy()
    col_0[class_0_prob < thres_1] = 0.0
    col_0[class_0_prob > thres_0] = 1.0

    return np.concatenate([col_0, 1.0 - col_0], axis=-1)

## Training

In [None]:
def training(splits=5, data_dir='../datasets/kfold'):
    """Train one Ensemble per saved fold and collect out-of-fold predictions.

    Parameters
    ----------
    splits : int, default 5
        Number of folds to load (``fold0`` .. ``fold{splits-1}``).
    data_dir : str, default '../datasets/kfold'
        Directory containing the ``fold{i}/train.csv`` and ``fold{i}/val.csv``
        files.

    Returns
    -------
    models : list of Ensemble
        The fitted ensemble of each fold, in fold order.
    y_pred : numpy array
        Validation probabilities of every fold stacked row-wise, in fold order.
    y_val : pandas.Series
        The matching true 'Class' values, with a fresh 0..N-1 index.

    Raises
    ------
    ValueError
        If ``splits`` is smaller than 1.
    """
    if splits < 1:
        raise ValueError('splits must be at least 1')

    models = []      # fitted ensemble per fold
    pred_sets = []   # per-fold validation probabilities
    true_sets = []   # per-fold true 'Class' values

    for split in range(splits):
        model = Ensemble()
        print('fold', split)
        save_dir = f'{data_dir}/fold{split}'

        # Training data: the model is fitted on the multi-class 'Alpha' label.
        df_train = pd.read_csv(os.path.join(save_dir, 'train.csv'))
        x_train = df_train.drop(['Class', 'Alpha'], axis=1)
        # Encode the 'Alpha' strings as integers.
        # NOTE(review): presumably code 0 corresponds to Class 0 — confirm
        # against the label values in greeks.csv.
        y_train = LabelEncoder().fit_transform(df_train.Alpha)
        # Impute + oversample (training rows only).
        x_train, y_train = prepair_input(x_train, y_train)

        # Validation data: scored against the binary 'Class' label.
        df_val = pd.read_csv(os.path.join(save_dir, 'val.csv'))
        x_val = df_val.drop(['Class'], axis=1)
        y_val = df_val.Class
        print(y_val.value_counts())

        model.fit(x_train, y_train)
        models.append(model)

        # Keep whole per-fold arrays; they are concatenated once after the loop.
        pred_sets.append(model.predict_proba(x_val))
        true_sets.append(y_val.to_numpy())

    y_pred = np.concatenate(pred_sets, axis=0)
    y_val = pd.Series(np.concatenate(true_sets))

    print('Models', models)

    return models, y_pred, y_val

In [None]:
models, y_pred, y_val = training()

## Evaluation

In [None]:
def evaluation(y_pred, y_val):
    """Grid-search the two calibration thresholds that minimise balanced log loss.

    For every pair ``(thres_1, thres_0)`` on a 0.01 grid with
    ``thres_1 <= thres_0 < 1``, the predictions are calibrated with
    ``calibrate_prob`` and scored with ``balanced_log_loss``.

    Returns
    -------
    list
        ``[thres_1, thres_0, loss]`` of the lowest-loss pair. The ten best
        pairs are also printed.
    """
    n_rows = len(y_val)
    results = []

    for lower in np.arange(0, 1, 0.01):
        for upper in np.arange(lower, 1, 0.01):
            calibrated = calibrate_prob(y_pred, n_rows, lower, upper)
            loss = balanced_log_loss(y_val, calibrated)
            print('>LOSS=%.5f' % loss)
            results.append([lower, upper, loss])

    # Stable sort by loss, so ties keep grid order.
    results.sort(key=lambda row: row[2])
    print('best:\n', results[:10])

    return results[0]

In [None]:
best_thres = evaluation(y_pred, y_val)

## Save Model

In [None]:
# joblib (a standalone package) is used to persist the trained models.
import joblib

# Create the model output directory.
# NOTE(review): `save_dir` is not referenced by the dump/load cells that follow,
# which use the bare relative path 'ensemble.joblib' — confirm the intended location.
save_dir = f'../model'
os.makedirs(save_dir, exist_ok = True)

In [None]:
# Persist the per-fold ensembles returned by training(). The original dumped `m`,
# which is only assigned by the load cell that follows and is undefined here.
joblib.dump(models, 'ensemble.joblib')

In [None]:
m = joblib.load('ensemble.joblib')
m

In [None]:
test

In [None]:
# Keep the identifiers for the submission, then drop the columns the models
# were not trained on.
Id = test['Id']
test = test.drop(['Id', 'EJ'], axis=1)

# NOTE: despite its name, this list collects the calibrated predictions of each
# per-fold ensemble, not losses.
loss_ensembles = list()
for ensemble in models:
    y_pred = ensemble.predict_proba(test)
    shape = test.shape[0]
    # calibrate_prob needs both thresholds; use the pair selected by
    # evaluation(), where best_thres = [thres_1, thres_0, loss].
    y_p = calibrate_prob(y_pred, shape, best_thres[0], best_thres[1])
    loss_ensembles.append(y_p)

print(loss_ensembles)

In [None]:
y_pred

In [None]:
# Post-processing: calibrate the current `y_pred` and write the submission file.
shape = test.shape[0]

# calibrate_prob requires both thresholds; use the pair selected by
# evaluation(), where best_thres = [thres_1, thres_0, loss].
y_p = calibrate_prob(y_pred, shape, best_thres[0], best_thres[1])
submission = pd.DataFrame(Id, columns=['Id'])
submission["class_0"] = y_p[:, 0]
submission["class_1"] = y_p[:, 1]
submission.to_csv('submission.csv', index=False)

In [None]:
df_submission = pd.read_csv('submission.csv')

In [None]:
a = pd.read_csv('../datasets/icr-identify-age-related-conditions/train.csv')

In [None]:
# Fixed seed so the same 100 rows are drawn every time this cell is run on a
# freshly loaded `a`.
a = a.sample(100, random_state=42)

In [None]:
# Build a sample frame from the validation splits of two folds.
# NOTE(review): the fold-export cell in this notebook uses NUM_FOLDS = 5 and
# therefore writes fold0..fold4 only — confirm that fold5 and fold6 exist.
c = pd.read_csv('../datasets/kfold/fold5/val.csv')
b = pd.read_csv('../datasets/kfold/fold6/val.csv')
a = pd.concat([c, b])

In [None]:
a.isnull().sum()

In [None]:
save_dir = f'../datasets/sample_test/'
os.makedirs(save_dir, exist_ok = True)
    
#saving
a.to_csv(os.path.join(save_dir, 'test.csv'), index = False)

In [None]:
# Feature-only version of `a`: drop the 'Class' label, then fill missing values
# with each column's median and restore the column labels.
test = a.drop(columns=['Class'])
columns = test.columns

imputer = SimpleImputer(missing_values=np.nan, strategy='median').fit(test)
test = pd.DataFrame(imputer.transform(test), columns=columns)

In [None]:
def prepair_test(df):
    """Split off the 'Id' column and median-impute the remaining features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Id' and 'EJ'; both are removed from the
        feature frame.

    Returns
    -------
    (Id, test)
        The 'Id' column of ``df`` and a new DataFrame holding every other
        column except 'EJ', with missing values replaced by the column median
        computed on ``df`` itself.
    """
    ids = df['Id']
    features = df.drop(columns=['Id', 'EJ'])

    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median').fit(features)
    imputed = pd.DataFrame(median_imputer.transform(features), columns=features.columns)
    return ids, imputed

In [None]:
a = pd.read_csv('../datasets/sample_test/test.csv')

In [None]:
Id , test = prepair_test(a)

In [None]:
best_thres[2]

In [None]:
y_val = test.Class
test = test.drop(['Class'], axis=1)

In [None]:
# Score only the first fold's ensemble on the sample test frame.
ensemble = models[0]

y_pred = ensemble.predict_proba(test)
print(y_pred)
shape = test.shape[0]
# best_thres is [thres_1, thres_0, loss] as returned by evaluation().
y_p = calibrate_prob(y_pred, shape, best_thres[0], best_thres[1])
loss = balanced_log_loss(y_val, y_p)



# Show calibrated probabilities, true labels and the resulting loss.
print(y_p)
print(y_val)
print(loss)

In [None]:
# Balanced log loss of each per-fold ensemble on the sample test frame.
loss_ensembles = list()
for ensemble in models:
    y_pred = ensemble.predict_proba(test)
    print(y_pred)
    shape = test.shape[0]
    # best_thres is [thres_1, thres_0, loss] as returned by evaluation().
    y_p = calibrate_prob(y_pred, shape, best_thres[0], best_thres[1])
    loss = balanced_log_loss(y_val, y_p)
    loss_ensembles.append(loss)

# Average once, after all ensembles are scored, so `final_loss` is always
# defined and is not recomputed on every iteration.
final_loss = np.mean(loss_ensembles)

print('lost_ensembles', loss_ensembles)
print('LOSS', final_loss)

In [None]:
y_pred.shape

In [None]:
shape = test.shape[0]

# NOTE(review): calibrate_prob as defined in this notebook takes two thresholds
# (thres_1, thres_0); this call passes only one and raises TypeError. Other
# cells pass best_thres[0], best_thres[1] — confirm the intended values here.
y_p = calibrate_prob(y_pred, shape, 0.90)
y_p.shape

In [None]:
y_val = a.Class
loss = balanced_log_loss(y_val, y_p)
loss

In [None]:


submission = pd.DataFrame(Id, columns=['Id'])
submission["class_0"] = y_p[:, 0]
submission["class_1"] = y_p[:, 1]
submission.to_csv('submission.csv', index=False)

In [None]:
submission