# Using Bayesian Optimization to Perform Cross Validation

## Import

### Modules

In [18]:
%load_ext autoreload
%autoreload 2
%load_ext watermark
%watermark -n -u -v -iv -w

import sys
from pathlib import Path

from hyperopt import hp
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.datasets import load_iris, make_classification
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Last updated: Wed Dec 07 2022

Python implementation: CPython
Python version       : 3.9.7
IPython version      : 7.27.0

hyperopt: 0.2.7
sys     : 3.9.7 (default, Oct 12 2021, 02:43:43) 
[GCC 10.2.1 20210110]
pandas  : 1.3.3

Watermark: 2.3.1



Set up paths so that the project package can be imported from the notebook.

In [2]:
# The project root is the parent of the directory the notebook runs from.
PROJECT_ROOT = Path.cwd().resolve().parent
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

### Scripts

In [3]:
from datatoolkit.model_selection import BayesianSearchCV

## Examples

### Random Forest Classifier

Set the parameter space, which is a dictionary of hyperparameters and their distributions.

In [4]:
# Search space: one hyperopt distribution per RandomForestClassifier hyperparameter.
parameter_space = {
    'n_estimators': hp.uniformint('n_estimators', 100, 1000),
    'max_depth': hp.uniformint('max_depth', 1, 5),
    'min_weight_fraction_leaf': hp.uniform('min_weight_fraction_leaf', 0, 0.5),
    # Options are given as a list, not a set: hp.choice samples an index into
    # the options, and a set of strings has no stable iteration order across
    # kernels, which would make the index -> option mapping irreproducible.
    'criterion': hp.choice('criterion', ['gini', 'entropy', 'log_loss']),
}

Set the estimator and the cross-validation generator.

In [5]:

# Seed the forest as well as the splitter, so the estimator itself is not a
# source of run-to-run variation in the search results.
estimator = RandomForestClassifier(random_state=42)
# Three stratified shuffle splits, each holding out 20% of the samples.
cv = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)

Load data

In [6]:
# Load iris, then keep only the first two feature columns and the samples
# whose label is below 2.
X, y = load_iris(return_X_y=True)
keep_rows = y < 2
X = X[keep_rows, :2]
y = y[keep_rows]

Cross validate with `BayesianSearchCV`

In [7]:
# Run 5 search iterations over `parameter_space`, scoring each candidate with
# both metrics and refitting on "f1_score".
bs = BayesianSearchCV(
    estimator=estimator,
    parameter_space=parameter_space,
    scoring=["f1_score", "roc_auc_score"],
    refit="f1_score",
    n_iter=5,
    cv=cv,
)
bs.fit(X, y)

100%|██████████| 5/5 [00:19<00:00,  3.90s/trial, best loss: 0.36143540669856444]


Analyzing results...

In [8]:
# Tabulate the search results and show the sampled parameters together with
# their rank and mean validation f1 score.
# NOTE(review): in the saved output the row with rank_score == 1 has the lowest
# average_val_f1_score -- confirm the ranking direction used by BayesianSearchCV.
summary_columns = ['parameters', 'rank_score', 'average_val_f1_score']
cv_results_ = pd.DataFrame.from_dict(bs.cv_results_)
cv_results_.loc[:, summary_columns]

Unnamed: 0,parameters,rank_score,average_val_f1_score
0,"{'criterion': 'log_loss', 'max_depth': 4, 'min...",4,0.165641
1,"{'criterion': 'entropy', 'max_depth': 4, 'min_...",1,0.100478
2,"{'criterion': 'log_loss', 'max_depth': 5, 'min...",2,0.151356
3,"{'criterion': 'gini', 'max_depth': 3, 'min_wei...",3,0.155793
4,"{'criterion': 'gini', 'max_depth': 3, 'min_wei...",5,0.178817


Check that the best parameters reported by the search match the parameters of the top-ranked trial.

In [9]:
# Parameters of the trial(s) ranked first; the first one must equal best_params_.
top_ranked_parameters = cv_results_.loc[cv_results_["rank_score"] == 1, "parameters"]
assert bs.best_params_ == top_ranked_parameters.values[0]

### Cross validating a pipeline

Load dataset

In [17]:
# Seed the generator so the synthetic dataset (and everything fitted on it)
# is the same on every Restart & Run All.
X, y = make_classification(random_state=42)
y

array([0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0])

Set up the pipeline

In [19]:
# Two-step pipeline: PCA followed by a random forest. The step names ('pca',
# 'rf') are the prefixes used for the `<step>__<param>` keys of the search space.
# The forest is seeded so the estimator itself does not vary between runs.
steps = [
    ('pca', PCA()),
    ('rf', RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(steps)

Define parameter_space

In [20]:
# Search space for the pipeline: keys follow the `<step>__<param>` convention.
parameter_space = {
    'rf__n_estimators': hp.uniformint('rf__n_estimators', 100, 1000),
    'rf__max_depth': hp.uniformint('rf__max_depth', 1, 5),
    'rf__min_weight_fraction_leaf': hp.uniform('rf__min_weight_fraction_leaf', 0, 0.5),
    # Options are given as a list, not a set: hp.choice samples an index into
    # the options, and a set of strings has no stable iteration order across
    # kernels, which would make the index -> option mapping irreproducible.
    'rf__criterion': hp.choice('rf__criterion', ['gini', 'entropy', 'log_loss']),
    # Number of components ranges from 1 up to the number of feature columns in X.
    'pca__n_components': hp.uniformint('pca__n_components', 1, X.shape[1]),
}

Cross validation with pipeline

In [22]:
# Build the splitter explicitly instead of passing in the global `cv`: this cell
# rebinds `cv` to the search object, so on a re-run `cv=cv` would hand the
# previous BayesianSearchCV to itself as the cross-validation generator.
# The settings mirror the splitter defined earlier in the notebook.
pipeline_splitter = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
# The name `cv` is kept for the search object because the following cells read
# `cv.cv_results_` and `cv.best_params_`.
cv = BayesianSearchCV(
    estimator=pipeline,
    parameter_space=parameter_space,
    scoring=["f1_score", "roc_auc_score"],
    refit="f1_score",
    n_iter=5,
    cv=pipeline_splitter,
)
cv.fit(X, y)

100%|██████████| 5/5 [00:12<00:00,  2.40s/trial, best loss: 1.6394132165061146]


Analyze the results

In [24]:
# Tabulate the pipeline search results and show the sampled parameters together
# with their rank and mean validation f1 score.
# NOTE(review): in the saved output the row with rank_score == 1 has the lowest
# average_val_f1_score -- confirm the ranking direction used by BayesianSearchCV.
summary_columns = ['parameters', 'rank_score', 'average_val_f1_score']
cv_results_ = pd.DataFrame.from_dict(cv.cv_results_)
cv_results_.loc[:, summary_columns]

Unnamed: 0,parameters,rank_score,average_val_f1_score
0,"{'pca__n_components': 1, 'rf__criterion': 'gin...",5,0.529683
1,"{'pca__n_components': 4, 'rf__criterion': 'gin...",4,0.331699
2,"{'pca__n_components': 5, 'rf__criterion': 'log...",3,0.285354
3,"{'pca__n_components': 5, 'rf__criterion': 'gin...",1,0.225397
4,"{'pca__n_components': 3, 'rf__criterion': 'gin...",2,0.281218


Check that the best parameters reported by the search match the parameters of the top-ranked trial.

In [25]:
# Parameters of the trial(s) ranked first; the first one must equal best_params_.
top_ranked_parameters = cv_results_.loc[cv_results_["rank_score"] == 1, "parameters"]
assert cv.best_params_ == top_ranked_parameters.values[0]