In [1]:
# # Import packages
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from EDA_script_HCR_ICIR import *

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
X_train, X_test, y_train, y_test = train_test_split(df_train, df_y, test_size = 0.2, random_state = 0)


space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 1000,
        'eta' : 0.01,
        'seed': 123
    }

def objective(space):
    clf=xgb.XGBClassifier(eta = space['eta'],
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="mlogloss",
            early_stopping_rounds=20,verbose=False)
    

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)


print("The best hyperparameters are : ","\n")
print(best_hyperparams)


def objective_final(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="mlogloss",
            early_stopping_rounds=20,verbose=True)
    
    y_train_probs = clf.predict_proba(df_train)
    pred = clf.predict_proba(df_test)
    return [y_train_probs, pred]


best_hyperparams['n_estimators']= 1000
best_hyperparams['seed'] = 123
best_hyperparams['eta'] = 0.01
pred_list=objective_final(best_hyperparams)


SCORE:                                                 
0.44677083333333334                                    
SCORE:                                                                             
0.4532291666666667                                                                 
SCORE:                                                                             
0.45625                                                                           
SCORE:                                                                            
0.443125                                                               
SCORE:                                                                 
0.4571875                                                              
SCORE:                                                                   
0.45                                                                     
SCORE:                                                                   
0.456875                        

In [2]:
from sklearn.metrics import log_loss
y_test_probs = pred_list[1]

class_order = [0, 1, 2, 3, 4]
class_mapping = {class_label: f"Class_{class_label}" for class_label in class_order}

y_train_probs = pred_list[0]
val_log_loss = log_loss(df_y, y_train_probs, labels=class_order)
print(f"Validation Multiclass Logarithmic Loss: {val_log_loss}")

Validation Multiclass Logarithmic Loss: 0.9286988746581737
