# Hyperparameter optimisation with Optuna

We import the libraries we are going to use

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
import optuna

We load the data

In [2]:
# Location of the CSV file; its first column is used as the row index.
dataset_path = 'Aleph_dataset.csv'
data = pd.read_csv(dataset_path, index_col = 0)

# Every column except 'isb' is an input feature; 'isb' is the target.
feature_columns = data.columns
input_variables = data.columns[feature_columns != 'isb']
input_data      = data[input_variables]
truth           = data['isb']

# Hold out 20% of the rows for validation. The split is seeded so that
# re-running the notebook trains and scores on the same rows every time.
input_train, input_validation, truth_train, truth_validation = train_test_split(
    input_data, truth, test_size=0.2, random_state=42
)

Here's a basic example of how to use Optuna to optimise a random forest. 
As you know, random forests have two critical hyper-parameters: the maximum depth of the trees, `max_depth`, and the number of independent estimators, `n_estimators`. 
We are going to use Optuna to find the combination of both parameters that results in the maximum accuracy on the validation set:

In [3]:
# First we define an objective function which returns the quantity we want to optimise —either maximise or minimise—
def objective(trial, random_state=42):
    """Train a random forest with trial-suggested hyper-parameters and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw `n_estimators` and `max_depth`, each as an
        integer between 1 and 20.
    random_state : int, optional
        Seed handed to the forest so that a given pair of hyper-parameters
        always produces the same score. Optuna calls the objective with the
        trial only, so the default value is the one used during a study.

    Returns
    -------
    float
        The value of `model.score` on `input_validation` / `truth_validation`
        (the accuracy being maximised).
    """
    n_estimators = trial.suggest_int('n_estimators', 1, 20)
    max_depth = trial.suggest_int('max_depth', 1, 20)

    model = RandomForestClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        criterion='entropy',
        random_state=random_state,  # without a seed the objective is noisy
    )
    # Uses the notebook-level train/validation split defined in the data cell.
    model.fit(input_train, truth_train)
    return model.score(input_validation, truth_validation)

# Then we create a "study" which will test different hyper-parameter values.
# The sampler's random number generator is seeded so that re-running the cell
# does not draw a different random set of hyper-parameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

# Select the best trial and print its accuracy
trial = study.best_trial
print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2022-07-28 09:25:34,708][0m A new study created in memory with name: no-name-00d0507d-33c2-4563-be5f-b61c10c980db[0m
[32m[I 2022-07-28 09:25:34,924][0m Trial 0 finished with value: 0.8812187224021195 and parameters: {'n_estimators': 17, 'max_depth': 2}. Best is trial 0 with value: 0.8812187224021195.[0m
[32m[I 2022-07-28 09:25:35,165][0m Trial 1 finished with value: 0.884604062408007 and parameters: {'n_estimators': 4, 'max_depth': 19}. Best is trial 1 with value: 0.884604062408007.[0m
[32m[I 2022-07-28 09:25:35,280][0m Trial 2 finished with value: 0.8874006476302619 and parameters: {'n_estimators': 10, 'max_depth': 2}. Best is trial 2 with value: 0.8874006476302619.[0m
[32m[I 2022-07-28 09:25:36,465][0m Trial 3 finished with value: 0.8972622902561084 and parameters: {'n_estimators': 20, 'max_depth': 18}. Best is trial 3 with value: 0.8972622902561084.[0m
[32m[I 2022-07-28 09:25:36,981][0m Trial 4 finished with value: 0.9009420076538122 and parameters: {'n_estim

Accuracy: 0.9009420076538122
Best hyperparameters: {'n_estimators': 11, 'max_depth': 11}


Let's visualise how the study went: first how the objective changes throughout the trials, and then the contour plot of the objective function:

In [4]:
# Show how the objective value evolved over the trials of `study`.
optuna.visualization.plot_optimization_history(study=study)

In [5]:

# Contour plot of the objective over the two tuned hyper-parameters.
optuna.visualization.plot_contour(
    study=study,
    params=['n_estimators', 'max_depth'],
)

We can also optimise categorical variables, e.g. for choosing the best estimator:

In [6]:
# First we define an objective function which returns the quantity we want to optimise —either maximise or minimise—
def objective(trial, random_state=42):
    """Pick a classifier type and its hyper-parameters from the trial, then score it.

    The trial first chooses between 'RandomForest' and 'Tree'. A random
    forest additionally gets `n_estimators`; both kinds get `max_depth`.
    Each integer parameter is drawn between 1 and 20.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the categorical and integer parameters.
    random_state : int, optional
        Seed handed to whichever classifier is built, so that a given
        configuration always produces the same score. Optuna calls the
        objective with the trial only, so the default is used in a study.

    Returns
    -------
    float
        The value of `model.score` on `input_validation` / `truth_validation`
        (the accuracy being maximised).
    """
    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'Tree'])

    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 1, 20)
        max_depth = trial.suggest_int('max_depth', 1, 20)
        model = RandomForestClassifier(
            max_depth=max_depth,
            n_estimators=n_estimators,
            criterion='entropy',
            random_state=random_state,
        )
    else:
        max_depth = trial.suggest_int('max_depth', 1, 20)
        model = DecisionTreeClassifier(
            max_depth=max_depth,
            criterion='entropy',
            random_state=random_state,
        )

    # Uses the notebook-level train/validation split defined in the data cell.
    model.fit(input_train, truth_train)
    return model.score(input_validation, truth_validation)

# Then we create a "study" which will test different hyper-parameter values.
# The sampler's random number generator is seeded so that re-running the cell
# does not draw a different random set of configurations.
# NOTE: this rebinds `study`; re-running the plotting cells above after this
# cell will plot this study instead of the random-forest-only one.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

# Select the best trial and print its accuracy
trial = study.best_trial
print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2022-07-28 09:28:27,205][0m A new study created in memory with name: no-name-3f8c273c-7582-4f22-9035-07a64e548efc[0m
[32m[I 2022-07-28 09:28:27,314][0m Trial 0 finished with value: 0.8918163085075066 and parameters: {'classifier': 'Tree', 'max_depth': 5}. Best is trial 0 with value: 0.8918163085075066.[0m
[32m[I 2022-07-28 09:28:27,487][0m Trial 1 finished with value: 0.8899028554607006 and parameters: {'classifier': 'Tree', 'max_depth': 9}. Best is trial 0 with value: 0.8918163085075066.[0m
[32m[I 2022-07-28 09:28:27,566][0m Trial 2 finished with value: 0.8907859876361496 and parameters: {'classifier': 'Tree', 'max_depth': 4}. Best is trial 0 with value: 0.8918163085075066.[0m
[32m[I 2022-07-28 09:28:27,620][0m Trial 3 finished with value: 0.8909331763320577 and parameters: {'classifier': 'RandomForest', 'n_estimators': 2, 'max_depth': 5}. Best is trial 0 with value: 0.8918163085075066.[0m
[32m[I 2022-07-28 09:28:27,738][0m Trial 4 finished with value: 0.886370

Accuracy: 0.898881365911098
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 20, 'max_depth': 17}
