## Hyperparameter Optimisation

In [30]:
import pandas as pd
import numpy as np
from time import time
from scipy.stats import randint as sp_randint

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, f1_score, precision_score, \
                            recall_score

import optuna
from optuna.visualization import plot_optimization_history, \
                                plot_param_importances, plot_rank, plot_slice

In [20]:
import os

# Location of the locally cached copy of the processed Titanic data.
# Defined once so the read and the write can never point at different files.
data_directory = './data/'
data_path = os.path.join(data_directory, 'processed_data.csv')

try:
    data = pd.read_csv(data_path)

except FileNotFoundError:
    # No local copy yet: download the processed data
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '1804_python_healthcare/master/titanic/data/processed_data.csv'

    data = pd.read_csv(address)

    # Create the data subfolder if one does not already exist
    # (exist_ok avoids the separate check-then-create step)
    os.makedirs(data_directory, exist_ok=True)

    # Save data so later runs can read it from disk
    data.to_csv(data_path, index=False)

data = data.astype(float)

# Drop PassengerId as it is not original data.
# A new frame is assigned rather than mutating in place.
data = data.drop(columns='PassengerId')

X = data.drop(columns='Survived')  # X = all 'data' except the 'Survived' column
y = data['Survived']  # y = 'Survived' column from 'data'

feature_names = X.columns.tolist()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training Dataset Samples: {len(X_train)}")
print(f"Testing Dataset Samples: {len(X_test)}")

Training Dataset Samples: 712
Testing Dataset Samples: 179


### Exhaustive (grid search)

In [26]:
# Search space for the exhaustive grid search over decision tree settings.
# Every combination is cross-validated, so the number of candidates is the
# product of the option counts: 3 * 15 * 14 * 15 = 9,450, each fitted 5 times.
params = [
    {'criterion': ['gini', 'entropy', 'log_loss'],
     'max_depth': range(1, 16, 1),
     'min_samples_split': range(2, 16, 1),
     'min_samples_leaf': range(1, 16, 1)
    }]

start_time = time()

# n_jobs=-1 spreads the candidate fits over all available cores. The selected
# parameters are unaffected because the estimator uses a fixed random_state.
gs_dt = GridSearchCV(DecisionTreeClassifier(random_state=42),
                      param_grid=params,
                      scoring='accuracy',
                      cv=5,
                      n_jobs=-1)

gs_dt.fit(X_train, y_train)

print(f"Best parameters{gs_dt.best_params_}")

# After fitting, the search object scores with the best estimator it found
print(f"Training Set Score: {gs_dt.score(X_train, y_train)}")

print(f"Test Set Score: {gs_dt.score(X_test, y_test)}")

print(f"Time taken: {(time() - start_time):.3f}s")


Best parameters{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 9}
Training Set Score: 0.8904494382022472
Time taken: 208.973s


### Randomised (random search)

In [29]:
# Distributions to sample from for the randomised search. The integer
# ranges match the grid search cell (sp_randint excludes the upper bound).
params_dist = [
    {'criterion': ['gini', 'entropy', 'log_loss'],
     'max_depth': sp_randint(1, 16),
     'min_samples_split': sp_randint(2, 16),
     'min_samples_leaf': sp_randint(1, 16)
    }]

start_time = time()

# random_state fixes which 500 candidates are drawn, so the reported best
# parameters and scores are the same on every run of the notebook.
# n_jobs=-1 runs the candidate fits on all available cores.
rgs_dt = RandomizedSearchCV(
                      DecisionTreeClassifier(random_state=42),
                      n_iter=500,
                      param_distributions=params_dist,
                      scoring='accuracy',
                      cv=5,
                      random_state=42,
                      n_jobs=-1)

rgs_dt.fit(X_train, y_train)

print(f"Best parameters{rgs_dt.best_params_}")

# After fitting, the search object scores with the best estimator it found
print(f"Training Set Score: {rgs_dt.score(X_train, y_train)}")

print(f"Test Set Score: {rgs_dt.score(X_test, y_test)}")

print(f"Time taken: {(time() - start_time):.3f}s")


Best parameters{'criterion': 'log_loss', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 9}
Training Set Score: 0.8904494382022472
Test Set Score: 0.7988826815642458
Time taken: 8.176s


  _data = np.array(data, dtype=dtype, copy=copy,


### Optuna

In [37]:
def objective(trial):
    """Score one Optuna trial of decision tree hyperparameters.

    Asks the trial for a value of each hyperparameter, builds a
    DecisionTreeClassifier from them and returns its mean 3-fold
    cross-validation score on the notebook-level X_train / y_train.

    Args:
        trial: the Optuna trial used to suggest hyperparameter values.

    Returns:
        Mean of the cross-validation scores across the 3 folds.
    """
    # Suggested in a fixed order: the three integer settings (sampled on a
    # log scale), then the split criterion.
    tree_params = {
        'max_depth': trial.suggest_int('max_depth', 1, 32, log=True),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32, log=True),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32, log=True),
        'criterion': trial.suggest_categorical(
            'criterion', ['gini', 'entropy', 'log_loss']),
    }

    # Fixed random_state so the tree itself is deterministic for given settings
    candidate = DecisionTreeClassifier(random_state=42, **tree_params)

    # No scoring argument is passed, so the estimator's default scorer is used
    fold_scores = cross_val_score(candidate, X_train, y_train, n_jobs=-1, cv=3)

    return fold_scores.mean()

start_time = time()

# Set up Optuna study - we need to specify that we wish to maximise the objective.
# TPE is the sampler create_study uses by default; it is passed explicitly here
# only so that it can be seeded. With a seed, the same sequence of trials is
# suggested on each run, making the results below reproducible.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)

# Run optimisation
study.optimize(objective, n_trials=1000)

# Get best model run
trial = study.best_trial

print(f'Accuracy: {trial.value:0.3f}')
print(f'Best hyperparameters: {trial.params}')
print(f"Time taken: {(time() - start_time):.3f}s")


[I 2024-07-18 17:44:57,191] A new study created in memory with name: no-name-6397947d-3a00-406c-af71-f7db91b6fd03
[I 2024-07-18 17:44:57,224] Trial 0 finished with value: 0.8061731021522532 and parameters: {'max_depth': 6, 'min_samples_split': 17, 'min_samples_leaf': 3, 'criterion': 'log_loss'}. Best is trial 0 with value: 0.8061731021522532.
[I 2024-07-18 17:44:57,241] Trial 1 finished with value: 0.7879362715550355 and parameters: {'max_depth': 1, 'min_samples_split': 8, 'min_samples_leaf': 6, 'criterion': 'log_loss'}. Best is trial 0 with value: 0.8061731021522532.
[I 2024-07-18 17:44:57,258] Trial 2 finished with value: 0.7865002541100828 and parameters: {'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 12, 'criterion': 'gini'}. Best is trial 0 with value: 0.8061731021522532.
[I 2024-07-18 17:44:57,273] Trial 3 finished with value: 0.7879362715550355 and parameters: {'max_depth': 1, 'min_samples_split': 12, 'min_samples_leaf': 16, 'criterion': 'gini'}. Best is trial 0 wi

Accuracy: 0.813
Best hyperparameters: {'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 28, 'criterion': 'entropy'}
Time taken: 31.215s


In [39]:
# Show the optimisation history of the study run above
plot_optimization_history(study)

In [41]:
# Slice plot of the study: one panel per hyperparameter
plot_slice(study)

In [42]:
# Hyperparameter importances with respect to the objective value
plot_param_importances(study)

In [43]:
# Hyperparameter importances again, but with each trial's run time (in seconds)
# as the target instead of the objective value
optuna.visualization.plot_param_importances(
    study,
    target=lambda finished_trial: finished_trial.duration.total_seconds(),
    target_name="duration",
)

In [44]:
# Rank plot of the study's trials (Optuna flags plot_rank as experimental)
plot_rank(study)


plot_rank is experimental (supported from v3.2.0). The interface can change in the future.

