## In this notebook, we explore the XGBoost classifier's hyperparameter space, select the best hyperparameter configuration with the UCB algorithm, compare it against a randomly chosen hyperparameter configuration, and evaluate both configurations' validation error and test-set performance.

### Import necessary libraries

In [13]:
import numpy as np
import pandas as pd
import math
import random
import matplotlib.pyplot as plt
import seaborn as sns 
import xlrd
from sklearn.model_selection import train_test_split
import xgboost as xgb 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from utils import *

### Read the data

In [14]:
# Load the Titanic passenger table from the Excel workbook (relative path).
titanic_path = 'Titanic_dataset.xls'
data = pd.read_excel(titanic_path)

### Preprocess the data

In [15]:
# Boolean mask of missing cells, then the number of missing entries per column.
na_value = pd.isna(data)
na_counts = na_value.sum(axis=0)
print(na_counts)

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64


In [16]:
# Columns excluded from the feature set before modelling.
columns_to_drop = ['name', 'ticket', 'cabin', 'boat', 'body', 'home.dest']
data = data.drop(columns=columns_to_drop)

# Impute the remaining gaps: median age, mean fare, most frequent port of embarkation.
# A single DataFrame-level fillna assigned back to `data` replaces the former
# `data[col].fillna(..., inplace=True)` calls; that pattern mutates a column
# selection, is deprecated in pandas 2.x, and no longer updates `data` once
# Copy-on-Write is enabled.
# NOTE(review): these fill values are computed on the full table, before the
# train/test split below, so held-out rows influence them — consider computing
# them on the training split only.
fill_values = {
    'age': data['age'].median(),
    'fare': data['fare'].mean(),
    'embarked': data['embarked'].mode()[0],
}
data = data.fillna(fill_values)

# One-hot encode the categorical columns so every feature is numeric/boolean.
data = pd.get_dummies(data, columns=['sex', 'embarked'])

### Split the data into train and test sets

In [17]:
# Separate the target column from the feature matrix.
y = data['survived']
X = data.drop(columns=['survived'])

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Define the hyperparameter space for the XGBoost algorithm

In [18]:
# Candidate values for each tuned XGBoost hyperparameter.
learning_rates = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
n_estimators = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
max_depths = [3, 5, 7, 9]
reg_alpha = [0, 0.1, 0.5, 1, 2]
reg_lambda = [0, 0.1, 0.5, 1, 2]

# Full Cartesian grid, one dict per configuration. The first `for` clause
# varies slowest and the last fastest, so the ordering runs from learning
# rate (outermost) down to reg_lambda (innermost).
param_space_XGB = [
    {
        'learning_rate': rate,
        'n_estimators': trees,
        'max_depth': depth,
        'reg_alpha': alpha,
        'reg_lambda': lam,
    }
    for rate in learning_rates
    for trees in n_estimators
    for depth in max_depths
    for alpha in reg_alpha
    for lam in reg_lambda
]

len(param_space_XGB)

7000

### Run the UCB and random search strategies on XGBoost

In [19]:
# Run both search strategies over the grid via the helper imported from utils.
# The helper returns the best UCB configuration, the best random-search
# configuration, and the random strategy's predictions.
# NOTE(review): the "validation error" this call prints equals 1 - test
# accuracy reported further down, so configurations appear to be scored on the
# test split — confirm in utils and consider passing a separate validation split.
search_result = XGBoost_Random(
    X_train,
    y_train,
    X_test,
    y_test,
    param_space_XGB,
)
best_config_ucb, best_config_random_XGB, pred_rand_XGB = search_result

Dataset: Titanic Dataset

UCB Strategy on XGBoosting:
Best validation error: 0.1984732824427481
Best hyperparameter configuration: {'learning_rate': 0.01, 'n_estimators': 50, 'max_depth': 9, 'reg_alpha': 0, 'reg_lambda': 0.5}

Random Strategy on XGBoosting:
Best validation error: 0.23664122137404575
Best hyperparameter configuration: {'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 5, 'reg_alpha': 0.1, 'reg_lambda': 1}


### Train the final XGBoost model using the best hyperparameters from the UCB strategy

In [20]:
# Refit a classifier on the training split with the configuration chosen by the UCB search.
ucb_params = dict(best_config_ucb)
XGB_UCB_test = xgb.XGBClassifier(**ucb_params)
XGB_UCB_test.fit(X_train, y_train)

### Evaluate performance on the test set using the best hyperparameters from UCB strategy

In [21]:
# Predict on the held-out split with the refit UCB model, then score the
# predictions with each classification metric in turn.
test_predictions_ucb_XGB = XGB_UCB_test.predict(X_test)
test_accuracy_ucb, test_precision_ucb, test_recall_ucb, test_f1_ucb = (
    scorer(y_test, test_predictions_ucb_XGB)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

### Evaluate performance on the test set using the best hyperparameters from random strategy


In [22]:
# The random-strategy predictions are reused as returned by XGBoost_Random;
# no model is refit with best_config_random_XGB here.
# NOTE(review): presumably these are predictions for X_test, since they are
# scored against y_test — confirm against the helper in utils.
test_predictions_rand_XGB = pred_rand_XGB
test_accuracy_random, test_precision_random, test_recall_random, test_f1_random = (
    scorer(y_test, test_predictions_rand_XGB)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

### Compare the performance of UCB-selected and random-selected hyperparameters for XGBoost

In [23]:
# Assemble the metric lines for each strategy first, then print the two
# reports one after the other (chosen configuration, then its test metrics).
ucb_report_lines = [
    "UCB-selected performance:",
    f"  - Accuracy: {test_accuracy_ucb}",
    f"  - Precision: {test_precision_ucb}",
    f"  - Recall: {test_recall_ucb}",
    f"  - F1-score: {test_f1_ucb}",
]
random_report_lines = [
    "Random-selected performance:",
    f"  - Accuracy: {test_accuracy_random}",
    f"  - Precision: {test_precision_random}",
    f"  - Recall: {test_recall_random}",
    f"  - F1-score: {test_f1_random}",
]

print("UCB-selected hyperparameters for XGBoosting: ", best_config_ucb)
print("\n".join(ucb_report_lines))

print("\nRandom-selected hyperparameters for XGBoosting: ", best_config_random_XGB)
print("\n".join(random_report_lines))

UCB-selected hyperparameters for XGBoosting:  {'learning_rate': 0.01, 'n_estimators': 50, 'max_depth': 9, 'reg_alpha': 0, 'reg_lambda': 0.5}
UCB-selected performance:
  - Accuracy: 0.8015267175572519
  - Precision: 0.875
  - Recall: 0.652542372881356
  - F1-score: 0.7475728155339806

Random-selected hyperparameters for XGBoosting:  {'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 5, 'reg_alpha': 0.1, 'reg_lambda': 1}
Random-selected performance:
  - Accuracy: 0.7633587786259542
  - Precision: 0.85
  - Recall: 0.576271186440678
  - F1-score: 0.686868686868687
