In [1]:
import pandas as pd
import numpy as np
import time
import random

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, RobustScaler, QuantileTransformer, MaxAbsScaler
from sklearn.compose import make_column_transformer
from sklearn.metrics import make_scorer, log_loss
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFECV, mutual_info_regression
from sklearn.base import BaseEstimator, ClassifierMixin

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from concurrent.futures import ThreadPoolExecutor

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

import warnings
from pprint import pprint
import os

# Notebook-wide behaviour: silence every warning and never truncate the
# column list when a DataFrame is displayed.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# Label of this run; it is passed to evaluate_models(), which uses it as the
# prefix of the results CSV it writes.
experiment_name = 'baseline'

# Seed each random number generator used by the notebook (Python, NumPy,
# TensorFlow) so that repeated runs start from the same state.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)



In [2]:
# Load the competition files from the notebook's working directory.
# Plain relative file names resolve on every operating system, whereas a
# backslash-prefixed path such as '.\train.csv' is only a valid path on Windows.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# original = pd.read_csv('validation_data.csv')
train.head(3)

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,3.0,D
1,1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D


In [3]:
# Name of the label column to predict.
TARGET = 'Status'

# Columns that never enter the feature matrix: the row identifier and the label.
drop_cols = ['id', TARGET]

# Two-valued flags (one-hot encoded with drop='if_binary' further down).
binary_cols = [
    'Ascites',
    'Hepatomegaly',
    'Spiders',
]

# Multi-level categorical features that get a full one-hot expansion.
categorical_cols = [
    'Drug',
    'Sex',
    'Stage',
    'Edema',
]

In [4]:
# Whatever numeric columns remain once the categorical, binary, id and target
# columns are removed are treated as the continuous features.
non_continuous_cols = categorical_cols + binary_cols + drop_cols
numerical_cols = (
    train
    .drop(columns=non_continuous_cols)
    .select_dtypes(include=np.number)
    .columns
)
numerical_cols

Index(['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
       'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin'],
      dtype='object')

# Preprocess Data

In [5]:
# One-hot encode the binary flags (collapsed to a single column each through
# drop='if_binary') and the multi-level categoricals; every other column is
# passed through unchanged.
# NOTE(review): the `sparse` keyword was renamed `sparse_output` in
# scikit-learn 1.2 -- switch once the environment is upgraded.
transformer = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist', drop='if_binary'), binary_cols),
    (OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist'), categorical_cols),
    remainder='passthrough')

# Feature frame for fitting: training data without the id and the target.
df_to_ohe = train.drop(columns=drop_cols)

# errors='ignore' keeps this cell re-runnable: on a second execution 'id' has
# already been removed from `test`, and a plain drop would raise KeyError.
test = test.drop(columns='id', errors='ignore')

# The encoder is fitted on the training features only.
transformed = transformer.fit_transform(df_to_ohe)

# Get the transformed feature names, stripping the '<transformer>__' prefix
# that the column transformer puts in front of each output name.
transformed_feat_names = [name.split('__')[-1] for name in transformer.get_feature_names_out()]

# Create DataFrame of the transformed features
df_to_ohe_transformed = pd.DataFrame(transformed, columns=transformed_feat_names)
df_to_ohe_transformed.head()

Unnamed: 0,Ascites_Y,Hepatomegaly_Y,Spiders_Y,Drug_D-penicillamine,Drug_Placebo,Sex_F,Sex_M,Stage_1.0,Stage_2.0,Stage_3.0,Stage_4.0,Edema_N,Edema_S,Edema_Y,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,999.0,21532.0,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7
1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2574.0,19237.0,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0
2,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3428.0,13727.0,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7
3,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2576.0,18460.0,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7
4,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,788.0,16658.0,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6


In [6]:
# Encode the test set with the transformer that was fitted on the training
# data (transform only, no re-fitting) and label the resulting columns with
# the feature names derived from that fitted transformer.
transformed_new_data = transformer.transform(test)
test_transformed = pd.DataFrame(
    data=transformed_new_data,
    columns=transformed_feat_names,
)
test_transformed.head(3)

Unnamed: 0,Ascites_Y,Hepatomegaly_Y,Spiders_Y,Drug_D-penicillamine,Drug_Placebo,Sex_F,Sex_M,Stage_1.0,Stage_2.0,Stage_3.0,Stage_4.0,Edema_N,Edema_S,Edema_Y,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3839.0,19724.0,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2468.0,14975.0,1.1,660.0,4.22,94.0,1257.0,151.9,155.0,227.0,10.0
2,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,51.0,13149.0,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0


In [7]:
# Map the string target labels to integer codes and attach them to the
# encoded training frame; the fitted encoder is kept so the codes can be
# translated back to the original labels later.
label_encoder = LabelEncoder().fit(train[TARGET])
encoded_target = label_encoder.transform(train[TARGET])
df_to_ohe_transformed[TARGET] = encoded_target

df_to_ohe_transformed.head()

Unnamed: 0,Ascites_Y,Hepatomegaly_Y,Spiders_Y,Drug_D-penicillamine,Drug_Placebo,Sex_F,Sex_M,Stage_1.0,Stage_2.0,Stage_3.0,Stage_4.0,Edema_N,Edema_S,Edema_Y,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Status
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,999.0,21532.0,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,2
1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2574.0,19237.0,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,0
2,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3428.0,13727.0,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,2
3,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2576.0,18460.0,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,0
4,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,788.0,16658.0,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,0


In [8]:
# Any column that is still stored with the generic 'object' dtype is parsed
# as numeric; values that cannot be parsed become NaN (errors='coerce').
object_cols = [
    name
    for name in df_to_ohe_transformed.columns
    if df_to_ohe_transformed[name].dtype == 'object'
]
for name in object_cols:
    df_to_ohe_transformed[name] = pd.to_numeric(df_to_ohe_transformed[name], errors='coerce')

# Preview the result
df_to_ohe_transformed.head()

Unnamed: 0,Ascites_Y,Hepatomegaly_Y,Spiders_Y,Drug_D-penicillamine,Drug_Placebo,Sex_F,Sex_M,Stage_1.0,Stage_2.0,Stage_3.0,Stage_4.0,Edema_N,Edema_S,Edema_Y,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Status
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,999.0,21532.0,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,2
1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2574.0,19237.0,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,0
2,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3428.0,13727.0,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,2
3,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2576.0,18460.0,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,0
4,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,788.0,16658.0,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,0


In [9]:
# Print dtypes and non-null counts of the encoded training frame.
df_to_ohe_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7905 entries, 0 to 7904
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Ascites_Y             7905 non-null   float64
 1   Hepatomegaly_Y        7905 non-null   float64
 2   Spiders_Y             7905 non-null   float64
 3   Drug_D-penicillamine  7905 non-null   float64
 4   Drug_Placebo          7905 non-null   float64
 5   Sex_F                 7905 non-null   float64
 6   Sex_M                 7905 non-null   float64
 7   Stage_1.0             7905 non-null   float64
 8   Stage_2.0             7905 non-null   float64
 9   Stage_3.0             7905 non-null   float64
 10  Stage_4.0             7905 non-null   float64
 11  Edema_N               7905 non-null   float64
 12  Edema_S               7905 non-null   float64
 13  Edema_Y               7905 non-null   float64
 14  N_Days                7905 non-null   float64
 15  Age                  

In [10]:
# Print dtypes and non-null counts of the encoded test frame.
test_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5271 entries, 0 to 5270
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Ascites_Y             5271 non-null   float64
 1   Hepatomegaly_Y        5271 non-null   float64
 2   Spiders_Y             5271 non-null   float64
 3   Drug_D-penicillamine  5271 non-null   float64
 4   Drug_Placebo          5271 non-null   float64
 5   Sex_F                 5271 non-null   float64
 6   Sex_M                 5271 non-null   float64
 7   Stage_1.0             5271 non-null   float64
 8   Stage_2.0             5271 non-null   float64
 9   Stage_3.0             5271 non-null   float64
 10  Stage_4.0             5271 non-null   float64
 11  Edema_N               5271 non-null   float64
 12  Edema_S               5271 non-null   float64
 13  Edema_Y               5271 non-null   float64
 14  N_Days                5271 non-null   float64
 15  Age                  

## Training Machine Learning Models

In [11]:
# Cross-validation scheme shared by every experiment: 10 shuffled,
# class-stratified folds with a fixed seed.
sk = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

# Separate the integer-encoded target from the feature matrix.
y = df_to_ohe_transformed[TARGET]
X = df_to_ohe_transformed.drop(columns=[TARGET])

# Independent copy of the encoded test features.
test_features = test_transformed.copy()

### Keras Model

In [12]:
class KerasClassifierWrapper(BaseEstimator, ClassifierMixin):
    """Scikit-learn compatible wrapper around a small Keras MLP classifier.

    The network (128 -> 64 -> 32 dense ReLU layers, each followed by batch
    normalisation, then a softmax output) is built inside ``fit`` from the
    data it receives, so the wrapper works with any number of input features
    and classes and every call to ``fit`` starts from freshly initialised
    weights.

    Parameters
    ----------
    epochs : int, default=100
        Maximum number of training epochs (early stopping may end sooner).
    batch_size : int, default=32
        Mini-batch size passed to ``keras.Model.fit``.

    Attributes
    ----------
    model : keras model or None
        The fitted network; ``None`` until ``fit`` has been called.
    classes_ : ndarray
        Sorted unique class labels seen in ``fit``; column ``j`` of
        ``predict_proba`` refers to ``classes_[j]``.
    """

    def __init__(self, epochs=100, batch_size=32):
        # Only store hyper-parameters here: scikit-learn clones estimators by
        # calling __init__ with get_params(), so no model is built yet.
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = None

    def create_model(self, input_dim=None, n_classes=3):
        """Build and compile the network.

        Parameters
        ----------
        input_dim : int or None, default=None
            Number of input features. When ``None`` the first layer is created
            without an explicit input size and Keras infers it on first use.
        n_classes : int, default=3
            Number of units in the softmax output layer.

        Returns
        -------
        A compiled ``Sequential`` model.
        """
        first_layer_kwargs = {} if input_dim is None else {'input_dim': input_dim}

        model = Sequential()
        model.add(Dense(128, activation='relu', **first_layer_kwargs))
        model.add(BatchNormalization())
        model.add(Dense(64, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(32, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(n_classes, activation='softmax'))  # one unit per class
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=[tf.keras.metrics.CategoricalCrossentropy(name='log_loss')])
        return model

    def _check_is_fitted(self):
        """Raise ``NotFittedError`` when ``fit`` has not been called yet."""
        if self.model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                f"This {type(self).__name__} instance is not fitted yet. "
                "Call 'fit' before using this estimator.")

    def fit(self, X, y):
        """Train a fresh network on ``X`` / ``y`` and return ``self``.

        ``validation_split=0.1`` holds part of the data out of training; the
        resulting ``val_loss`` drives early stopping (best weights restored)
        and the learning-rate reduction.
        """
        features = np.asarray(X, dtype='float32')
        # Map arbitrary labels to 0..n_classes-1 so the one-hot width always
        # matches the output layer, whatever labels happen to be present.
        self.classes_, class_idx = np.unique(np.asarray(y), return_inverse=True)
        n_classes = len(self.classes_)

        # Rebuild on every fit: sized to the incoming features, and a refit
        # never continues from previously trained weights.
        self.model = self.create_model(input_dim=features.shape[1], n_classes=n_classes)

        early_stopping = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True, mode='min')
        reduce_LR = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=5, min_lr=0.00001, factor=0.8)

        self.model.fit(features, to_categorical(class_idx, num_classes=n_classes),
                       epochs=self.epochs,
                       batch_size=self.batch_size,
                       verbose=0,
                       validation_split=0.1,
                       callbacks=[early_stopping, reduce_LR])
        return self

    def predict_proba(self, X):
        """Return the softmax class probabilities, one column per ``classes_`` entry."""
        self._check_is_fitted()
        # verbose=0 suppresses the per-call Keras progress bar.
        return self.model.predict(np.asarray(X, dtype='float32'), verbose=0)

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        probabilities = self.predict_proba(X)
        return self.classes_[np.argmax(probabilities, axis=1)]

    def score(self, X, y):
        """Return the negative log loss on ``X`` / ``y`` (higher is better)."""
        y_pred = self.predict_proba(X)
        # Passing labels keeps the columns aligned even when ``y`` does not
        # contain every class seen during fit.
        return -log_loss(y, y_pred, labels=self.classes_)

In [13]:
# def create_model(): 
#     model = Sequential()
#     model.add(Dense(128, input_dim=len(X.columns), activation='relu'))
#     model.add(BatchNormalization())
#     model.add(Dense(64, activation='relu'))
#     model.add(BatchNormalization())
#     model.add(Dense(32, activation='relu'))
#     model.add(BatchNormalization())
#     model.add(Dense(3, activation='softmax'))  # Output layer for 3 classes

#     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_crossentropy'])
#     return model

# def fit_predict(model, X_train, y_train, X_test):
#     early_stopping = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True, mode='min')
#     reduce_LR = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=5, min_lr=0.00001, factor=0.8)

#     model.fit(X_train, to_categorical(y_train), epochs=100, 
#               batch_size=32, verbose=0, validation_split=0.2, 
#               callbacks=[early_stopping, reduce_LR])
    
#     y_pred = model.predict(X_test)
    
#     return y_pred

# # Custom scorer function
# def custom_log_loss(y_true, y_pred):
#     return log_loss(to_categorical(y_true), y_pred)

# # Cross-validation
# log_loss_scores = []

# for train, test in sk.split(X, y):
#     model = create_model()
#     y_pred = fit_predict(model, X.iloc[train], y.iloc[train], X.iloc[test])
#     score = custom_log_loss(y.iloc[test], y_pred)
#     log_loss_scores.append(score)

# # Print the results
# print("Log Loss: %.2f (%.2f)" % (np.mean(log_loss_scores), np.std(log_loss_scores)))

### List of models

In [14]:
# Candidate classifiers compared in the experiments, all with library-default
# hyper-parameters. evaluate_models() looks up each model's feature list by
# its class name, so every class should appear at most once in this list.
models = [
    # Ensemble and Tree Methods
    AdaBoostClassifier(random_state=5),
    BaggingClassifier(random_state=5),
    ExtraTreesClassifier(random_state=5),
    GradientBoostingClassifier(random_state=5),
    HistGradientBoostingClassifier(random_state=5),
    RandomForestClassifier(random_state=5),

    XGBClassifier(random_state=5),
    LGBMClassifier(n_jobs=-1, random_state=5),
    CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100),

    # Linear Models
    LogisticRegression(multi_class='multinomial', solver='lbfgs'),

    # Support Vector Machine
    # probability=True fits an internal cross-validated calibration that
    # shuffles the data; random_state pins that shuffle like the seeds above.
    SVC(probability=True, random_state=5),

    # KNeighbors & Naive Models
    KNeighborsClassifier(),
    GaussianNB(),

    # NeuralNet
    KerasClassifierWrapper(),
]

# # create table to compare MLA metrics
# MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD', 'MLA Time']
# MLA_compare = pd.DataFrame(columns = MLA_columns)

### Cross-validation function

In [15]:
def evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    """Cross-validate several classifiers on log loss and tabulate the results.

    Parameters
    ----------
    models : iterable of estimators
        Unfitted scikit-learn compatible classifiers exposing ``predict_proba``.
    X : pandas.DataFrame
        Feature matrix; columns are selected per model by name.
    y : array-like
        Target labels aligned with ``X``.
    important_features : dict
        Maps an estimator's class name to the feature names to use for it.
        Models whose class name is missing, or mapped to an empty collection,
        are skipped.
    cv_split : cross-validation splitter
        Passed to ``cross_validate`` as ``cv``.
    experiment_name : str
        Prefix of the output file ``'<experiment_name>_results.csv'``.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated model, sorted by mean test log loss (ascending).
        The frame always has the five result columns, even when no model was
        evaluated. The same table is written to the CSV file.
    """
    result_columns = ['MLA Name',
                      'MLA Parameters',
                      'MLA Train Log Loss',
                      'MLA Test Log Loss',
                      'MLA Time']

    def evaluate_model(alg):
        """Cross-validate one estimator; return its result row or None if skipped."""
        MLA_name = alg.__class__.__name__

        # Length check instead of truthiness so that arrays / pandas Index
        # objects are accepted as feature collections too.
        features = important_features.get(MLA_name)
        if features is None or len(features) == 0:
            return None
        features = list(features)

        # Scorers follow "greater is better", so log loss is negated here and
        # flipped back to a positive loss when the row is assembled below.
        log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

        cv_results = cross_validate(alg, X[features], y,
                                    cv=cv_split,
                                    scoring={'Log Loss': log_loss_scorer},
                                    return_train_score=True,
                                    n_jobs=-1)

        # Format the mean per-fold fit time as minutes and seconds
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        print(f'Done with {MLA_name}.')

        return {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train Log Loss': -cv_results['train_Log Loss'].mean(),
            'MLA Test Log Loss': -cv_results['test_Log Loss'].mean(),
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

    # Run the per-model evaluations concurrently; results are collected in
    # submission order, so row labels match each model's position in `models`
    # only as long as no model is skipped.
    # NOTE(review): every worker also calls cross_validate with n_jobs=-1,
    # which may oversubscribe the CPU -- confirm this is intended.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in models]
        results_list = [row for row in (future.result() for future in futures) if row]

    # Passing the column list keeps the schema intact when nothing was
    # evaluated; otherwise sorting an empty, column-less frame raises KeyError.
    MLA_compare = pd.DataFrame(results_list, columns=result_columns)

    # Sort and save results
    MLA_compare = MLA_compare.sort_values(by=['MLA Test Log Loss'], ascending=True)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

# Usage:
# models = [list of your models]
# MLA_compare = evaluate_models(models, X, y, important_features, cv_split, 'experiment_name')

### Baseline Model

In [16]:
# Baseline: every model is evaluated on the complete feature set. The dict is
# keyed by estimator class name, and each entry gets its own list so that one
# model's features can later be edited without affecting the others.
baseline_features = {
    estimator.__class__.__name__: list(X.columns)
    for estimator in models
}

In [17]:
# Re-seed the Python, NumPy and TensorFlow generators immediately before the
# run so that executing this cell on its own starts from the same state.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Cross-validate every model on its baseline feature list; the result table
# is also written to '<experiment_name>_results.csv' by evaluate_models.
baseline_models = evaluate_models(
    models=models,
    X=X,
    y=y,
    important_features=baseline_features,
    cv_split=sk,
    experiment_name=f'{experiment_name}',
)
baseline_models

Done with LGBMClassifier.
Done with AdaBoostClassifier.
Done with CatBoostClassifier.
Done with BaggingClassifier.
Done with HistGradientBoostingClassifier.
Done with ExtraTreesClassifier.
Done with LogisticRegression.
Done with RandomForestClassifier.
Done with GradientBoostingClassifier.
Done with XGBClassifier.
Done with GaussianNB.
Done with KNeighborsClassifier.
Done with SVC.
Done with KerasClassifierWrapper.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Log Loss,MLA Test Log Loss,MLA Time
3,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",0.3685708,0.450019,0 min 12.18 sec
8,CatBoostClassifier,"{'verbose': False, 'random_state': 5, 'early_s...",0.1726042,0.461763,0 min 42.25 sec
7,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.1632736,0.463957,0 min 1.04 sec
4,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.1111534,0.487657,0 min 4.46 sec
6,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",0.07704598,0.501359,0 min 8.84 sec
13,KerasClassifierWrapper,"{'batch_size': 32, 'epochs': 100}",0.4757771,0.522538,4 min 21.02 sec
5,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.1132907,0.54266,0 min 2.84 sec
2,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",2.109424e-15,0.557769,0 min 2.48 sec
9,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.6245248,0.626359,0 min 0.40 sec
10,SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...",0.660219,0.662001,0 min 50.00 sec


# Ensembling

In [18]:
# def hill_climbing(x, y, x_test):
#     le = LabelEncoder()
#     y_encoded = le.fit_transform(y)
#     y_one_hot = pd.get_dummies(y_encoded)

#     # Evaluating out-of-fold predictions
#     scores = {}
#     for col in x.columns:
#         scores[col] = log_loss(y_one_hot, x[[col]])

#     # Sorting the model scores in ascending order
#     scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1])}
#     print(scores)

#     # Sort x and x_test based on scores
#     x = x[list(scores.keys())]
#     x_test = x_test[list(scores.keys())]

#     # Initialize weights
#     weights = {col: 1 if i == 0 else 0 for i, col in enumerate(x.columns)}

#     STOP = False
#     current_best_ensemble = x.iloc[:,0]
#     current_best_test_ensemble = x_test.iloc[:,0]
#     MODELS = x.iloc[:,1:]
#     weight_range = np.arange(-0.5, 0.51, 0.01) 
#     history = [log_loss(y_one_hot, current_best_ensemble)]

#     while not STOP:
#         potential_new_best_cv_score = log_loss(y_one_hot, current_best_ensemble)
#         k_best, wgt_best = None, None
#         for k in MODELS:
#             for wgt in weight_range:
#                 potential_ensemble = (1 - wgt) * current_best_ensemble + wgt * MODELS[k]
#                 cv_score = log_loss(y_one_hot, potential_ensemble)
#                 if cv_score < potential_new_best_cv_score:
#                     potential_new_best_cv_score = cv_score
#                     k_best, wgt_best = k, wgt

#         if k_best is not None:
#             # Update weights
#             weights = {col: (1 - wgt_best) * weights[col] if col != k_best else wgt_best for col in weights}
#             current_best_ensemble = (1 - wgt_best) * current_best_ensemble + wgt_best * MODELS[k_best]
#             current_best_test_ensemble = (1 - wgt_best) * current_best_test_ensemble + wgt_best * x_test[k_best]
#             MODELS.drop(k_best, axis=1, inplace=True)
            
#             if MODELS.shape[1] == 0:
#                 STOP = True
#             history.append(potential_new_best_cv_score)
#         else:
#             STOP = True
        
#     hill_ens_pred = current_best_ensemble
#     hill_ens_pred_test = current_best_test_ensemble
    
#     return hill_ens_pred, hill_ens_pred_test, weights

In [19]:
# model_scores = []
# model_fold_scores = []
# models_dict = {}

# for idx, model_name in enumerate(model_names):
    
#     pred_cols = [f"pred_{response_col}_{c}_{model_name}" for c in response_col_order]
    
#     print("="*25)
#     print(f"Starting training, validation and prediction for model {model_name} [MODEL {idx+1}/{len(model_names)}]")
#     print("="*25)
    
#     print("-"*25)
#     print(f"Training model:")
#     print("-"*25)
    
#     start_time = time.process_time()
    
#     trainer = KFoldTrainer(
#         seed = CFG.seed,
#         model_name = model_name,
#         model_params = model_params_dict[model_name],
#     )
#     # training
#     trainer.train_by_fold(df_combined, feature_names, verbose=True)
#     print("-"*25)
    
#     models_dict[model_name] = trainer.get_saved_models()
    
#     # validation
#     pred_train, metric, fold_metrics, mean_fold_metric, sd_fold_metric = \
#         trainer.get_oof_predictions_and_metric(df_combined, feature_names)
    
#     df_combined[pred_cols] = pred_train
    
#     for fold in range(CFG.n_folds):
#         print(f"LOSS for FOLD {fold}: {fold_metrics[fold]:6f}")
#     print()
#     print(f"OOF LOSS for {model_name}: {metric:.6f}")
#     print(f"Mean/SD LOSS for {model_name}: {mean_fold_metric:.6f} ± {sd_fold_metric:.6f}")
    
#     # for plotting later on
#     model_scores.append({
#         'model_name': model_name,
#         'score': metric,
#     })
#     model_fold_scores.extend([{
#         'model_name': model_name,
#         'fold': fold,
#         'score': fold_metrics[fold]
#     } for fold in range(CFG.n_folds)])
    
#     # prediction (test set)
#     print("-"*25)
#     print(f"Predicting test set with model:")
#     print("-"*25)
#     df_test[pred_cols] = trainer.predict(df_test, feature_names, verbose=True)
    
#     # cleanup
#     del trainer, pred_train, metric, fold_metrics, mean_fold_metric, sd_fold_metric
#     gc.collect()

In [20]:
# Reset the Python, NumPy and TensorFlow random generators to the fixed seed
# before the (currently commented-out) ensembling experiments below.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# hill_climb_scores = []
# hc_test_scores = []
# hill_climb_weights = []
# final_predictions = []
# final_hc_predictions = []
# avg_predictions_scores = []
# optuna_weights_scores = []
# stacked_scores = []
# optuna_weights_scores_stack = []

# labels = list(range(y.nunique()))

# for i, (train_index, test_index) in enumerate(sk.split(X, y)):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#     print(f'Fold {i+1}')
    
#     MLA_cv_train_preds = []
#     MLA_cv_preds = []
#     MLA_cv_preds_dict = {}
#     MLA_cv_preds_test_dict = {}
#     MLA_names = []
    
#     for alg in models:
#         MLA_name = alg.__class__.__name__
        
#         predictor = alg.fit(X_train, y_train)
#         pred_result = predictor.predict_proba(X_test)
#         test_result = predictor.predict_proba(test_features)

#         # MLA_cv_preds.append(pred_result)
#         # final_predictions.append(test_result)

#         # MLA_cv_preds_dict[MLA_name] = pred_result
#         # MLA_cv_preds_test_dict[MLA_name] = test_result

#         # Store each class's predictions separately
#         for class_index in range(pred_result.shape[1]):
#             class_name = f'{MLA_name}_class_{class_index}'
#             MLA_cv_preds_dict[class_name] = pred_result[:, class_index]
#             MLA_cv_preds_test_dict[class_name] = test_result[:, class_index]


#         MLA_names.append(MLA_name)

#         print(f'Done with {MLA_name}')

#     print(MLA_cv_preds_dict)
#     print(MLA_cv_preds_test_dict)
    
#     #################
#     ### Averaging ###
#     #################
#     avg_prediction = np.mean(MLA_cv_preds, axis=0)
#     avg_prediction_score = log_loss(y_test, avg_prediction)
#     avg_predictions_scores.append(avg_prediction_score)
#     print(f'The Fold {i+1} average prediction log loss is {avg_prediction_score}')

#     print(MLA_cv_preds_dict)
#     print(MLA_cv_preds_test_dict)

#     ##################
#     ### Hill Climb ###
#     ##################
#     hill_climb_pred, hill_climb_final_pred, hill_climb_weight = hill_climbing(pd.DataFrame(MLA_cv_preds_dict), y_test, pd.DataFrame(MLA_cv_preds_test_dict))
    
#     hill_climb_score = log_loss(y_test, hill_climb_pred)
#     # hc_test_score = mean_absolute_error(test_target, hill_climb_final_pred)
#     hill_climb_scores.append(hill_climb_score)
#     # hc_test_scores.append(hc_test_score)
#     final_hc_predictions.append(hill_climb_final_pred)
#     hill_climb_weights.append(hill_climb_weight)
    
#     print(f'The Fold {i+1} Hill Climb CV Log Loss Score is {hill_climb_score}')
#     # print(f'The Fold {i+1} Hill Climb Actual Data Score is {hc_test_score}')
#     print(f'The Fold {i+1} weight are {hill_climb_weight}')

#     print()

# print()
# print(f'The average prediction CV log loss is ==> {np.mean(avg_predictions_scores)}')
# print(f'The Hill Climbing CV log loss is ==> {np.mean(hill_climb_scores)}')
# # print(f'The Hill Climbing Test CV score is ==> {np.mean(hc_test_scores)}')
# print(f'The Hill Climbing weights are ==> {hill_climb_weights}')

In [21]:
# # Set seeds for reproducibility
# np.random.seed(42)
# tf.random.set_seed(42)
# random.seed(42)

# ensemble_log_losses = []
# fold_weights = []  # To store weights for each fold
# weight_range = np.arange(-0.5, 0.51, 0.01)

# for train_index, test_index in sk.split(X, y):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#     # Storing predictions
#     model_predictions_train = {}
#     model_predictions_test = {}

#     for model in models:
#         model_name = model.__class__.__name__
#         model.fit(X_train, y_train)
#         model_predictions_train[model_name] = model.predict_proba(X_test)
#         model_predictions_test[model_name] = model.predict_proba(X_test)
#         print(f'Done with {model_name}', end='\n')

#     # Initialize ensemble with the first model
#     first_model_name = next(iter(model_predictions_train.keys()))
#     current_best_ensemble = model_predictions_train[first_model_name]
#     current_best_test_ensemble = model_predictions_test[first_model_name]

#     # Initialize weights
#     model_weights = {model_name: 0 for model_name in model_predictions_train.keys()}
#     model_weights[first_model_name] = 1.0

#     # Remove the first model from the list
#     MODELS = list(model_predictions_train.keys())[1:]
#     STOP = False
#     history = [log_loss(y_test, current_best_ensemble)]
#     fold_weights_current = [model_weights.copy()]

#     while not STOP:
#         potential_new_best_cv_score = log_loss(y_test, current_best_ensemble)
#         k_best, wgt_best = None, None
#         for k in MODELS:
#             for wgt in weight_range:
#                 potential_ensemble = (1 - wgt) * current_best_ensemble + wgt * model_predictions_train[k]
#                 cv_score = log_loss(y_test, potential_ensemble)
#                 if cv_score < potential_new_best_cv_score:
#                     potential_new_best_cv_score = cv_score
#                     k_best, wgt_best = k, wgt

#         if k_best is not None:
#             # Update ensemble
#             current_best_ensemble = (1 - wgt_best) * current_best_ensemble + wgt_best * model_predictions_train[k_best]
#             current_best_test_ensemble = (1 - wgt_best) * current_best_test_ensemble + wgt_best * model_predictions_test[k_best]
            
#             # Update weights
#             model_weights = {model_name: (1 - wgt_best) * model_weights[model_name] if model_name != k_best else wgt_best for model_name in model_weights}
#             fold_weights_current.append(model_weights.copy())
#             MODELS.remove(k_best)

#             if len(MODELS) == 0:
#                 STOP = True
#             history.append(potential_new_best_cv_score)
#         else:
#             STOP = True

#     # Calculate log loss for the optimized ensemble on the test set and store weights
#     loss = log_loss(y_test, current_best_ensemble)
#     print(loss)
#     ensemble_log_losses.append(loss)
#     fold_weights.append(fold_weights_current[-1])  # Store final weights of this fold

# # Average log loss across all folds
# average_log_loss = np.mean(ensemble_log_losses)
# print(f"Ensemble Average Log Loss: {average_log_loss}")
# print("Weights per fold:", fold_weights)

In [22]:
# # Set seeds for reproducibility
# np.random.seed(42)
# tf.random.set_seed(42)
# random.seed(42)

# ensemble_log_losses = []
# weight_range = np.arange(-0.5, 0.6, 0.01)

# for train_index, test_index in sk.split(X, y):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#     # Storing predictions and calculating individual log losses
#     model_predictions = {}
#     model_log_losses = {}

#     for model in models:
#         model_name = model.__class__.__name__
#         model.fit(X_train, y_train)
#         probas = model.predict_proba(X_test)
#         model_predictions[model_name] = probas
#         model_log_losses[model_name] = log_loss(y_test, probas)
#         print(f'Done with {model_name}', end='\n')

#     # Sorting models by their log loss
#     sorted_models = sorted(model_log_losses, key=model_log_losses.get)

#     # Initialize model weights: best model gets weight 1, others get 0
#     model_weights = {model: 1 if model == sorted_models[0] else 0 for model in model_log_losses.keys()}

#     # Hill Climbing for optimizing weights
#     best_loss = model_log_losses[sorted_models[0]]
#     for model_name in model_weights.keys():
#         for wgt in weight_range:
#             temp_weights = model_weights.copy()
#             temp_weights[model_name] = wgt
#             remaining_weight = 1 - wgt
#             for other_model in temp_weights:
#                 if other_model != model_name:
#                     temp_weights[other_model] = remaining_weight / (len(models) - 1)

#             combined_preds = np.zeros_like(next(iter(model_predictions.values())))
#             for m_name, m_probas in model_predictions.items():
#                 combined_preds += temp_weights[m_name] * m_probas

#             current_loss = log_loss(y_test, combined_preds / combined_preds.sum(axis=1, keepdims=True))
#             if current_loss < best_loss:
#                 best_loss = current_loss
#                 model_weights = temp_weights.copy()

#     # Calculate log loss with optimized weights
#     optimized_preds = np.zeros_like(next(iter(model_predictions.values())))
#     for m_name, m_probas in model_predictions.items():
#         optimized_preds += model_weights[m_name] * m_probas

#     loss = log_loss(y_test, optimized_preds / optimized_preds.sum(axis=1, keepdims=True))
#     ensemble_log_losses.append(loss)

# # Average log loss across all folds
# average_log_loss = np.mean(ensemble_log_losses)
# print(f"Ensemble Average Log Loss: {average_log_loss}")
# print(sum(model_weights.values()))

In [23]:
# # Set seeds for reproducibility
# np.random.seed(42)
# tf.random.set_seed(42)
# random.seed(42)

# ensemble_log_losses = []
# weight_range = np.arange(-0.5, 0.6, 0.01)

# for train_index, test_index in sk.split(X, y):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#     # Storing predictions and calculating individual log losses
#     model_predictions = {}
#     model_log_losses = {}

#     for model in models:
#         model_name = model.__class__.__name__
#         model.fit(X_train, y_train)
#         probas = model.predict_proba(X_test)
#         model_predictions[model_name] = probas
#         model_log_losses[model_name] = log_loss(y_test, probas)
#         print(f'Done with {model_name}', end='\n')

#     # Sorting models by their log loss
#     sorted_models = sorted(model_log_losses, key=model_log_losses.get)

#     # Initialize model weights: best model gets weight 1, others get 0
#     model_weights = {model: 1 if model == sorted_models[0] else 0 for model in model_log_losses.keys()}

#     # Hill Climbing for optimizing weights
#     best_loss = model_log_losses[sorted_models[0]]
#     for model_name, probas in model_predictions.items():
#         for wgt in weight_range:
#             combined_preds = np.zeros_like(probas)
#             for m_name, m_probas in model_predictions.items():
#                 weight = wgt if m_name == model_name else model_weights[m_name]
#                 combined_preds += weight * m_probas

#             current_loss = log_loss(y_test, combined_preds / combined_preds.sum(axis=1, keepdims=True))
#             if current_loss < best_loss:
#                 best_loss = current_loss
#                 model_weights[model_name] = wgt

#     # Calculate log loss with optimized weights
#     optimized_preds = np.zeros_like(probas)
#     for m_name, m_probas in model_predictions.items():
#         optimized_preds += model_weights[m_name] * m_probas

#     loss = log_loss(y_test, optimized_preds / optimized_preds.sum(axis=1, keepdims=True))
#     ensemble_log_losses.append(loss)

# # Average log loss across all folds
# average_log_loss = np.mean(ensemble_log_losses)
# print(f"Ensemble Average Log Loss: {average_log_loss}")
# print(sum(model_weights.values()))