## In this notebook, we are going to explore the Random Forest classifier's hyperparameter space, take the best configuration of hyperparameters based on UCB algorithmy, compare with randomly chosen hyperparameter configuration, compare their performance and evaluate the validation error.

### Import necessary libraries

In [13]:
import numpy as np
import pandas as pd
import math
import random
import matplotlib.pyplot as plt
import seaborn as sns 
import xlrd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from utils import *

### Read the data

In [14]:
data = pd.read_csv('Titanic_dataset.csv') 

### Preprocess the data

In [15]:
na_value = data.isna()
na_counts = na_value.sum()
print(na_counts)

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64


In [22]:
columns_to_drop = ['name', 'ticket', 'cabin', 'boat', 'body', 'home.dest']
data = data.drop(columns_to_drop, axis=1)

data['age'].fillna(data['age'].median(), inplace=True)
data['fare'].fillna(data['fare'].mean(), inplace=True)
data['embarked'].fillna(data['embarked'].mode()[0], inplace=True)

data = pd.get_dummies(data, columns=['sex', 'embarked'])

KeyError: "['name', 'ticket', 'cabin', 'boat', 'body', 'home.dest'] not found in axis"

### Split the data into train and test sets

In [27]:
data.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [28]:
X = data.drop('survived', axis=1)
y = data['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Define the hyperparameter space for Random Forest

In [29]:
n_estimators = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
max_depths = [3, 5, 7, 9]
min_samples_split = [2, 4, 6]
min_samples_leaf = [1, 2, 3]
max_features = ['sqrt', 'log2']
bootstrap = [True, False]
criterion = ['gini', 'entropy']

rf_param_space = []

for n in n_estimators:
    for m in max_depths:
        for split in min_samples_split:
            for leaf in min_samples_leaf:
                for feat in max_features:
                    for boot in bootstrap:
                        for crit in criterion:
                            config_rf = {
                                'n_estimators': n,
                                'max_depth': m,
                                'min_samples_split': split,
                                'min_samples_leaf': leaf,
                                'max_features': feat,
                                'bootstrap': boot,
                                'criterion': crit
                            }
                            rf_param_space.append(config_rf)

print(len(rf_param_space))

2880


In [30]:
best_config_rf, best_config_random_rf, pred_rand_rf = RandomForest_Random(X_train, y_train, X_test, y_test, rf_param_space)

Dataset: Titanic Dataset

UCB Strategy for RandomForest:
Best validation error: 0.20610687022900764
Best hyperparameter configuration: {'n_estimators': 50, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False, 'criterion': 'gini'}

Random Strategy for RandomForest:
Best validation error: 0.23282442748091603
Best hyperparameter configuration: {'n_estimators': 150, 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'bootstrap': True, 'criterion': 'gini'}


### Train the final Random Forest model using the best hyperparameters from UCB strategy

In [31]:
RF_UCB_test = RandomForestClassifier(**best_config_rf)
RF_UCB_test.fit(X_train, y_train) 

### Evaluate performance on the test set using the best hyperparameters from UCB strategy

In [32]:
test_predictions_rf = RF_UCB_test.predict(X_test)
test_accuracy_rf = accuracy_score(y_test, test_predictions_rf)
test_precision_rf = precision_score(y_test, test_predictions_rf)
test_recall_rf = recall_score(y_test, test_predictions_rf)
test_f1_rf = f1_score(y_test, test_predictions_rf) 

### Evaluate performance on the test set using the best hyperparameters from random strategy


In [33]:
test_precision_random_rf = pred_rand_rf
test_accuracy_random_rf = accuracy_score(y_test, test_precision_random_rf)
test_precision_random_rf = precision_score(y_test, test_precision_random_rf)
# test_recall_random_rf = recall_score(y_test, test_precision_random_rf)
# test_f1_random_rf = f1_score(y_test, test_precision_random_rf) 

### Compare the performances of UCB-selected and Random-Selected hyperparameters for Random Forest  

In [34]:
print("UCB-selected hyperparameters for XGBoosting: ", best_config_rf)
print("UCB-selected performance:")
print(f"  - Accuracy: {test_accuracy_rf}")
print(f"  - Precision: {test_precision_rf}")
print(f"  - Recall: {test_recall_rf}")
print(f"  - F1-score: {test_f1_rf}")

print("\nRandom-selected hyperparameters for XGBoosting: ", best_config_random_rf)
print("Random-selected performance:")
print(f"  - Accuracy: {test_accuracy_random_rf}")
print(f"  - Precision: {test_precision_random_rf}")
# print(f"  - Recall: {test_recall_random_svc}")
# print(f"  - F1-score: {test_f1_random_svc}")

UCB-selected hyperparameters for XGBoosting:  {'n_estimators': 50, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False, 'criterion': 'gini'}
UCB-selected performance:
  - Accuracy: 0.7709923664122137
  - Precision: 0.8625
  - Recall: 0.5847457627118644
  - F1-score: 0.6969696969696969

Random-selected hyperparameters for XGBoosting:  {'n_estimators': 150, 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'bootstrap': True, 'criterion': 'gini'}
Random-selected performance:
  - Accuracy: 0.767175572519084
  - Precision: 0.8352941176470589
