## Optuna function

In [1]:
import optuna
import pandas as pd
from sklearn import linear_model
from sklearn import ensemble
from sklearn import datasets
from sklearn import model_selection

# Load scikit-learn's breast-cancer classification dataset, split into
# features (X) and target (y) and requested as pandas objects (as_frame=True).
X, y = datasets.load_breast_cancer(as_frame=True, return_X_y=True)




### Step 1. Define an objective function
### Step 2. Define a set of hyperparameters to try
### Step 3. Define the variable/metrics you want to optimize
### Step 4. Run function



### Trial : A single call of the objective function
### Study : An optimization session, which is a set of trials

In [2]:
# Step 1. Define an objective function to be maximized.
def objective(trial):
    """Score one sampled classifier configuration for an Optuna trial.

    The trial first picks a model family ("LogReg" or "RandomForest"), then
    samples the hyperparameters belonging to that family. The resulting
    estimator is evaluated on the notebook-level ``X`` and ``y`` with 3-fold
    cross-validation.

    Args:
        trial: Optuna trial object used to sample hyperparameters.

    Returns:
        The mean of the cross-validation fold scores.
    """
    # Step 2. Sample the model family, then the hyperparameters that go with it.
    model_kind = trial.suggest_categorical("classifier", ["LogReg", "RandomForest"])

    if model_kind != "LogReg":
        # Random forest: number of trees, and tree depth sampled on a log scale.
        n_trees = trial.suggest_int("rf_n_estimators", 10, 1000)
        tree_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        model = ensemble.RandomForestClassifier(
            n_estimators=n_trees, max_depth=tree_depth
        )
    else:
        # Logistic regression: C sampled on a log scale over 20 orders of magnitude.
        c_value = trial.suggest_float("logreg_c", 1e-10, 1e10, log=True)
        model = linear_model.LogisticRegression(C=c_value)

    # Step 3. Score with 3-fold cross-validation (n_jobs=-1) and average the folds.
    fold_scores = model_selection.cross_val_score(model, X, y, cv=3, n_jobs=-1)
    return fold_scores.mean()

# Step 4: Create an in-memory study that maximizes the objective, then run 100 trials.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=100)

[32m[I 2021-04-20 11:36:43,892][0m A new study created in memory with name: no-name-87cddc6c-f100-4a65-a203-0820d595e677[0m
[32m[I 2021-04-20 11:36:45,805][0m Trial 0 finished with value: 0.945521210433491 and parameters: {'classifier': 'LogReg', 'logreg_c': 59588536.05877921}. Best is trial 0 with value: 0.945521210433491.[0m
[32m[I 2021-04-20 11:36:46,890][0m Trial 1 finished with value: 0.9472663139329806 and parameters: {'classifier': 'LogReg', 'logreg_c': 4142500007.851913}. Best is trial 1 with value: 0.9472663139329806.[0m
[32m[I 2021-04-20 11:36:49,197][0m Trial 2 finished with value: 0.9578204771187228 and parameters: {'classifier': 'RandomForest', 'rf_n_estimators': 661, 'rf_max_depth': 17}. Best is trial 2 with value: 0.9578204771187228.[0m
[32m[I 2021-04-20 11:36:50,239][0m Trial 3 finished with value: 0.9367399981435068 and parameters: {'classifier': 'LogReg', 'logreg_c': 2612.7720506581145}. Best is trial 2 with value: 0.9578204771187228.[0m
[32m[I 2021-04

[32m[I 2021-04-20 11:37:09,826][0m Trial 36 finished with value: 0.9543117051888982 and parameters: {'classifier': 'RandomForest', 'rf_n_estimators': 386, 'rf_max_depth': 14}. Best is trial 23 with value: 0.9648565859092174.[0m
[32m[I 2021-04-20 11:37:09,848][0m Trial 37 finished with value: 0.6274204028589994 and parameters: {'classifier': 'LogReg', 'logreg_c': 2.4088945821340217e-10}. Best is trial 23 with value: 0.9648565859092174.[0m
[32m[I 2021-04-20 11:37:10,736][0m Trial 38 finished with value: 0.9543117051888982 and parameters: {'classifier': 'RandomForest', 'rf_n_estimators': 500, 'rf_max_depth': 20}. Best is trial 23 with value: 0.9648565859092174.[0m
[32m[I 2021-04-20 11:37:10,762][0m Trial 39 finished with value: 0.6274204028589994 and parameters: {'classifier': 'LogReg', 'logreg_c': 6.773041562031867e-10}. Best is trial 23 with value: 0.9648565859092174.[0m
[32m[I 2021-04-20 11:37:11,265][0m Trial 40 finished with value: 0.9578297595841455 and parameters: {'c

[32m[I 2021-04-20 11:37:31,294][0m Trial 72 finished with value: 0.959574863083635 and parameters: {'classifier': 'RandomForest', 'rf_n_estimators': 456, 'rf_max_depth': 13}. Best is trial 47 with value: 0.9666202543395525.[0m
[32m[I 2021-04-20 11:37:31,671][0m Trial 73 finished with value: 0.9595934280144807 and parameters: {'classifier': 'RandomForest', 'rf_n_estimators': 199, 'rf_max_depth': 15}. Best is trial 47 with value: 0.9666202543395525.[0m
[32m[I 2021-04-20 11:37:31,789][0m Trial 74 finished with value: 0.9578297595841455 and parameters: {'classifier': 'RandomForest', 'rf_n_estimators': 54, 'rf_max_depth': 9}. Best is trial 47 with value: 0.9666202543395525.[0m
[32m[I 2021-04-20 11:37:32,437][0m Trial 75 finished with value: 0.9560753736192332 and parameters: {'classifier': 'RandomForest', 'rf_n_estimators': 345, 'rf_max_depth': 10}. Best is trial 47 with value: 0.9666202543395525.[0m
[32m[I 2021-04-20 11:37:33,436][0m Trial 76 finished with value: 0.9595934280

In [3]:
# Display the hyperparameters of the best trial recorded in `study`.
study.best_params

{'classifier': 'RandomForest', 'rf_n_estimators': 75, 'rf_max_depth': 13}

In [4]:
# Display the objective value reached by the best trial in `study`.
study.best_value

0.9666202543395525

#### n_estimators of 75 and max_depth of 13 work best

## Historical studies
### - The DB address (storage URL) needs to be changed to match your environment

In [8]:
import joblib

# Create a study name:
study_name = 'experiment-C'

# Store in DB so the trials persist across kernel sessions; with
# load_if_exists=True an existing study of the same name is reused.
# direction="maximize" is required because `objective` returns an accuracy:
# Optuna minimizes when no direction is given.
# NOTE(review): in SQLAlchemy-style SQLite URLs three slashes give a path
# relative to the working directory (tmp/experiments.db); an absolute /tmp
# path would need four slashes - confirm which location is intended.
study = optuna.create_study(
    study_name=study_name,
    storage='sqlite:///tmp/experiments.db',
    direction="maximize",
    load_if_exists=True,
)

# Optimize:
study.optimize(objective, n_trials=3)


### Overview of all the trials

In [10]:
# Tabulate every trial of `study` (value, timings, sampled params, state).
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_classifier,params_logreg_c,params_rf_max_depth,params_rf_n_estimators,state
0,0,0.942003,2021-04-20 11:16:57.379777,2021-04-20 11:16:59.340939,0 days 00:00:01.961162,LogReg,7.250940e+05,,,COMPLETE
1,1,0.927912,2021-04-20 11:16:59.346251,2021-04-20 11:17:00.310228,0 days 00:00:00.963977,LogReg,2.986606e-02,,,COMPLETE
2,2,0.956075,2021-04-20 11:17:00.311132,2021-04-20 11:17:02.588281,0 days 00:00:02.277149,RandomForest,,6.0,831.0,COMPLETE
3,3,0.627420,2021-04-20 11:17:02.589224,2021-04-20 11:17:03.437037,0 days 00:00:00.847813,LogReg,1.065467e-09,,,COMPLETE
4,4,0.920941,2021-04-20 11:17:03.437952,2021-04-20 11:17:03.477689,0 days 00:00:00.039737,LogReg,4.519648e-05,,,COMPLETE
...,...,...,...,...,...,...,...,...,...,...
95,95,0.959584,2021-04-20 11:18:04.697577,2021-04-20 11:18:05.154594,0 days 00:00:00.457017,RandomForest,,8.0,240.0,COMPLETE
96,96,0.959575,2021-04-20 11:18:05.155599,2021-04-20 11:18:06.364919,0 days 00:00:01.209320,RandomForest,,13.0,668.0,COMPLETE
97,97,0.957830,2021-04-20 11:18:06.366095,2021-04-20 11:18:06.420259,0 days 00:00:00.054164,RandomForest,,23.0,17.0,COMPLETE
98,98,0.954312,2021-04-20 11:18:06.421163,2021-04-20 11:18:07.026081,0 days 00:00:00.604918,RandomForest,,22.0,324.0,COMPLETE


## Visualization

In [11]:
### Optimization history plot

In [6]:
# Visualize the optimization history of `study`; .show() renders the
# figure object returned by the plotting helper.
from optuna.visualization import plot_optimization_history

plot_optimization_history(study).show()

In [7]:
# Visualize the parallel coordinate plot, restricted to the two
# random-forest hyperparameters sampled by the objective.
optuna.visualization.plot_parallel_coordinate(study, params = ['rf_n_estimators', 'rf_max_depth'])

In [8]:
# Visualize the slice plot, restricted to the two random-forest
# hyperparameters sampled by the objective.
optuna.visualization.plot_slice(study, params = ['rf_n_estimators', 'rf_max_depth'])

## Defining parameter spaces
### Optuna supports five ways to define parameters

In [None]:
def objective(trial):
    """Score a random forest whose hyperparameters are sampled from ``trial``.

    Demonstrates the different ways Optuna can define a parameter
    (categorical, integer, log-scaled integer, uniform float; the
    log-uniform and discrete-uniform variants are shown as commented-out
    examples because the random forest has no use for them).

    Args:
        trial: Optuna trial object used to sample hyperparameters.

    Returns:
        The mean score over 3-fold cross-validation on the notebook-level
        ``X`` and ``y``.
    """
    # Categorical parameter
    rf_criterion = trial.suggest_categorical('rf_criterion', ['gini', 'entropy'])

    # Int parameter
    rf_n_estimators = trial.suggest_int("rf_n_estimators", 10, 1000)

    # Int parameter sampled on a log scale
    rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)

    # Uniform parameter. The upper bound is 0.5, not 1.0: scikit-learn rejects
    # min_weight_fraction_leaf outside [0, 0.5], so wider sampling makes trials
    # fail. suggest_float replaces the deprecated suggest_uniform.
    rf_min_weight_fraction_leaf = trial.suggest_float('rf_min_weight_fraction_leaf', 0.0, 0.5)

    # Loguniform parameter (illustration only)
    # learning_rate = trial.suggest_float('rf_parameter_x', 1e-5, 1e-2, log=True)

    # Discrete-uniform parameter (illustration only)
    # drop_path_rate = trial.suggest_float('rf_parameter_y', 0.0, 1.0, step=0.1)

    classifier_obj = ensemble.RandomForestClassifier(
        n_estimators=rf_n_estimators,
        criterion=rf_criterion,
        max_depth=rf_max_depth,
        min_weight_fraction_leaf=rf_min_weight_fraction_leaf,
    )

    # 3-fold cross-validation (n_jobs=-1); the study optimizes the fold average.
    score = model_selection.cross_val_score(classifier_obj, X, y, n_jobs=-1, cv=3)
    accuracy = score.mean()
    return accuracy

In [None]:
%%timeit
study = optuna.create_study(direction="maximize")


study.optimize(objective, n_trials=1000)

In [None]:
# Display the hyperparameters of the best trial in the current `study` object.
study.best_params

In [None]:
# Display the best objective value in the current `study` object.
study.best_value