In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#!pip install optuna

In [3]:
import sys
print(sys.executable)
print(sys.path)

/Users/hermansolem/anaconda3/envs/optuna_setup/bin/python
['/Users/hermansolem/Developer/optuna_setup/notebooks', '/Users/hermansolem/anaconda3/envs/optuna_setup/lib/python310.zip', '/Users/hermansolem/anaconda3/envs/optuna_setup/lib/python3.10', '/Users/hermansolem/anaconda3/envs/optuna_setup/lib/python3.10/lib-dynload', '', '/Users/hermansolem/anaconda3/envs/optuna_setup/lib/python3.10/site-packages', '/Users/hermansolem/Developer/optuna_setup']


In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
healthexp = sns.load_dataset('healthexp')

# Data Exploration and Engineering

- pd.get_dummies used for one-hot encoding (converting categorical data into dummy/indicator variables).


In [6]:
healthexp.head()

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [7]:
healthexp = pd.get_dummies(healthexp)

In [8]:
healthexp.head()

Unnamed: 0,Year,Spending_USD,Life_Expectancy,Country_Canada,Country_France,Country_Germany,Country_Great Britain,Country_Japan,Country_USA
0,1970,252.311,70.6,False,False,True,False,False,False
1,1970,192.143,72.2,False,True,False,False,False,False
2,1970,123.993,71.9,False,False,False,True,False,False
3,1970,150.437,72.0,False,False,False,False,True,False
4,1970,326.961,70.9,False,False,False,False,False,True


# Setting target


In [9]:
X = healthexp.drop(['Life_Expectancy'], axis=1)

In [10]:
Y = healthexp['Life_Expectancy']

# Dividing into test-train-split


In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Random Forest Regressor (RFR) for prediction


In [12]:
rfr = RandomForestRegressor(random_state=42)

In [13]:
rfr.fit(X_train, Y_train)

# Using RFR for prediction

- **MAE Interpretation**: Gives a linear measure of the average magnitude of errors in a set of predictions, without considering their direction (i.e., positive or negative). It is simple to understand and less sensitive to outliers compared to MSE.

- **MSE Interpretation**: Penalizes larger errors more than MAE because the errors are squared. This can be beneficial if you want to pay extra attention to large errors, but it can also be influenced heavily by outliers.

- Interpretation: R² score ranges from 0 to 1, where:
  - 1 indicates that the regression predictions perfectly fit the data.
  - 0 indicates that the model does not explain any of the variability in the response data around its mean.
  - Negative values indicate that the model performs worse than a horizontal line (mean of actual values).


In [14]:
y_pred = rfr.predict(X_test)

In [15]:
mean_absolute_error(Y_test, y_pred)

0.274527272727264

In [16]:
mean_squared_error(Y_test, y_pred)

0.12436518181817355

In [17]:
r2_score(Y_test, y_pred)

0.9898132982462418

# Optuna setup

- cv=5: Uses 5-fold cross-validation.
- scoring='neg_mean_squared_error': Uses the negative mean squared error (MSE) as the scoring metric.
- Setting n_jobs=-1 in cross_val_score allows the computation to use all available processors, parallelizing the cross-validation process.


In [18]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    model = RandomForestRegressor(n_estimators=n_estimators, 
                                  max_depth=max_depth, 
                                  min_samples_split=min_samples_split, 
                                  min_samples_leaf=min_samples_leaf)

    score = cross_val_score(model, X_train, Y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1).mean()

    return score

In [19]:
study = optuna.create_study(direction='maximize', sampler = optuna.samplers.RandomSampler(seed=42))

[I 2024-05-29 19:12:46,318] A new study created in memory with name: no-name-b4bd55e8-d53d-4e6d-804d-d9aa4da3021e


In [20]:
study.optimize(objective, n_trials=200)

[I 2024-05-29 19:12:50,004] Trial 0 finished with value: -0.8873459460691159 and parameters: {'n_estimators': 437, 'max_depth': 48, 'min_samples_split': 8, 'min_samples_leaf': 6}. Best is trial 0 with value: -0.8873459460691159.
[I 2024-05-29 19:12:51,687] Trial 1 finished with value: -1.4343641787715051 and parameters: {'n_estimators': 240, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 9}. Best is trial 0 with value: -0.8873459460691159.
[I 2024-05-29 19:12:53,465] Trial 2 finished with value: -1.623487943681614 and parameters: {'n_estimators': 641, 'max_depth': 39, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 0 with value: -0.8873459460691159.
[I 2024-05-29 19:12:55,682] Trial 3 finished with value: -0.27101580994676044 and parameters: {'n_estimators': 850, 'max_depth': 18, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 3 with value: -0.27101580994676044.
[I 2024-05-29 19:12:57,343] Trial 4 finished with value: -0.3863066422995664 and p

In [21]:
study.best_params

{'n_estimators': 940,
 'max_depth': 45,
 'min_samples_split': 2,
 'min_samples_leaf': 1}

In [22]:
best_params = study.best_params

# Optuna Vizualisations

- **Optimization History Plot**

  - **Function**: `optuna.visualization.plot_optimization_history(study)`
  - **Description**: Shows the objective values of all trials over time.
  - **Utility**: Helps understand the progress and convergence of the optimization process.

- **Parallel Coordinate Plot**

  - **Function**: `optuna.visualization.plot_parallel_coordinate(study)`
  - **Description**: Visualizes the relationship between multiple hyperparameters and the objective value.
  - **Utility**: Identifies hyperparameters' impact on the objective value and their interactions.

- **Contour Plot**

  - **Function**: `optuna.visualization.plot_contour(study)`
  - **Description**: Displays a contour map of the objective value with respect to two hyperparameters at a time.
  - **Utility**: Visualizes interactions between hyperparameters and highlights optimal regions.

- **Slice Plot**

  - **Function**: `optuna.visualization.plot_slice(study)`
  - **Description**: Shows the relationship between individual hyperparameters and the objective value.
  - **Utility**: Helps understand the distribution of objective values relative to each hyperparameter.

- **Importance Plot**

  - **Function**: `optuna.visualization.plot_param_importances(study)`
  - **Description**: Displays the relative importance of each hyperparameter in determining the objective value.
  - **Utility**: Identifies critical hyperparameters for optimizing the objective function.

- **EDF (Empirical Distribution Function) Plot**
  - **Function**: `optuna.visualization.plot_edf(study)`
  - **Description**: Shows the cumulative distribution of the objective values from the trials.
  - **Utility**: Understands the overall distribution and quality of solutions found during optimization.


In [23]:
optuna.visualization.plot_optimization_history(study)

In [24]:
optuna.visualization.plot_parallel_coordinate(study)

In [27]:
optuna.visualization.plot_contour(study)

In [25]:
optuna.visualization.plot_slice(study, params=['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf'])

In [26]:
optuna.visualization.plot_param_importances(study)

In [28]:
optuna.visualization.plot_edf(study)

# Create best model version


In [29]:
best_n_estimators = best_params['n_estimators']
best_max_depth = best_params['max_depth']
best_min_samples_split = best_params['min_samples_split']
best_min_samples_leaf = best_params['min_samples_leaf']

In [30]:
best_model = RandomForestRegressor(n_estimators=best_n_estimators,
                                      max_depth=best_max_depth,
                                      min_samples_split=best_min_samples_split,
                                      min_samples_leaf=best_min_samples_leaf)

In [31]:
best_model.fit(X_train, Y_train)

# Evaluating best model


In [32]:
y_pred = best_model.predict(X_test)

In [33]:
mean_absolute_error(Y_test, y_pred)

0.2691218568664621

In [35]:
mean_squared_error(Y_test, y_pred)

0.11640552944561978

In [36]:
r2_score(Y_test, y_pred)

0.9904652701534702