<a href="https://colab.research.google.com/github/stepthom/869_course/blob/main/optuna_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# My First Optuna Slides

[Optuna](https://optuna.org/) is great for advanced hyperparameter tuning.

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [1]:
import datetime
print(datetime.datetime.now())

2022-07-11 15:46:35.955518


In [None]:
pip install optuna

In [3]:
import pandas as pd
import optuna
import numpy as np

# Load and Prep Data

In [4]:
df = pd.read_csv("https://raw.githubusercontent.com/stepthom/869_course/main/data/GermanCredit.csv")
df.head()

Unnamed: 0,Duration,Amount,InstallmentRatePercentage,ResidenceDuration,Age,NumberExistingCredits,NumberPeopleMaintenance,Telephone,ForeignWorker,Class,...,OtherInstallmentPlans.Bank,OtherInstallmentPlans.Stores,OtherInstallmentPlans.None,Housing.Rent,Housing.Own,Housing.ForFree,Job.UnemployedUnskilled,Job.UnskilledResident,Job.SkilledEmployee,Job.Management.SelfEmp.HighlyQualified
0,6,1169,4,4,67,2,1,0,1,Good,...,0,0,1,0,1,0,0,0,1,0
1,48,5951,2,2,22,1,1,1,1,Bad,...,0,0,1,0,1,0,0,0,1,0
2,12,2096,2,3,49,1,2,1,1,Good,...,0,0,1,0,1,0,0,1,0,0
3,42,7882,2,4,45,1,2,1,1,Good,...,0,0,1,0,0,1,0,0,1,0
4,24,4870,3,4,53,2,2,1,1,Bad,...,0,0,1,0,0,1,0,0,1,0


In [5]:
target_name = 'Class'
X = df.drop([target_name], axis=1)
y = df[target_name].to_numpy()

# Create and Run an Optuna Study

First we must create an objective function. This is a function that builds a model (given a particular values for all hyperparameters) and returns a score.

Read more in the tutorials: https://optuna.readthedocs.io/en/stable/tutorial/index.html

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score


def objective_dt(trial, X, y):

  # Now, define all the hyperparams we want to vary, and what values they are allowed
  # to take.
  #
  # Each trial, optuna will automatically choose values for each hyperparam.
  hyper_params = {
        
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),

        "max_depth": trial.suggest_int("max_depth", 5, 100, step=5),

        "min_samples_split": trial.suggest_int("min_samples_split", 5, 100, step=5),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 5, 100, step=5),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 10, 1010, step=100),

        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),

        "random_state": 77,
  }
    

  # Use the hyperparams that optuna has chosen for this trial to create a DecisionTreeClassifier
  clf = DecisionTreeClassifier(**hyper_params)

  # Run CV to see how well these hyper_params do
  cv_scores = cross_val_score(clf, X, y, cv=10, scoring="f1_macro")
  score = np.mean(cv_scores)

  # Whatever we return here tells optuna how well these parameters did
  return score

In [7]:
# Create and run an Optuna study

# More options for creating the optuna study can be found at their webpage:
# https://optuna.readthedocs.io/en/stable/reference/generated/optuna.create_study.html
#
# The default sampler is called TPESampler and is very good, but there are others.

study = optuna.create_study(direction="maximize")


[32m[I 2022-07-11 15:46:43,826][0m A new study created in memory with name: no-name-27b9d015-0fbc-44ac-b273-3d957be51776[0m


In [8]:
# More options for optimizing the hyperparms can be found:
# https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study.optimize

study.optimize(lambda trial: objective_dt(trial, X, y), n_trials=100,  gc_after_trial=True)

[32m[I 2022-07-11 15:46:43,961][0m Trial 0 finished with value: 0.6320754457375056 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 80, 'min_samples_split': 95, 'min_samples_leaf': 40, 'max_leaf_nodes': 210, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6320754457375056.[0m
[32m[I 2022-07-11 15:46:44,251][0m Trial 1 finished with value: 0.6362491445911533 and parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 25, 'min_samples_split': 30, 'min_samples_leaf': 10, 'max_leaf_nodes': 610, 'class_weight': None}. Best is trial 1 with value: 0.6362491445911533.[0m
[32m[I 2022-07-11 15:46:44,518][0m Trial 2 finished with value: 0.6388172148542477 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 45, 'min_samples_split': 15, 'min_samples_leaf': 40, 'max_leaf_nodes': 510, 'class_weight': None}. Best is trial 2 with value: 0.6388172148542477.[0m
[32m[I 2022-07-11 15:46:44,727][0m Trial 3 finished with valu

# (Optional) Inspect the Results of the Study

In [9]:
# What were the best params?
study.best_params

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 35,
 'max_leaf_nodes': 810,
 'min_samples_leaf': 75,
 'min_samples_split': 95,
 'splitter': 'random'}

In [10]:
# What was the best value?
study.best_value

0.6540645427725201

In [11]:
# All the details of the best trial
study.best_trial

FrozenTrial(number=7, values=[0.6540645427725201], datetime_start=datetime.datetime(2022, 7, 11, 15, 46, 45, 651614), datetime_complete=datetime.datetime(2022, 7, 11, 15, 46, 45, 814459), params={'criterion': 'gini', 'splitter': 'random', 'max_depth': 35, 'min_samples_split': 95, 'min_samples_leaf': 75, 'max_leaf_nodes': 810, 'class_weight': 'balanced'}, distributions={'criterion': CategoricalDistribution(choices=('gini', 'entropy')), 'splitter': CategoricalDistribution(choices=('best', 'random')), 'max_depth': IntUniformDistribution(high=100, low=5, step=5), 'min_samples_split': IntUniformDistribution(high=100, low=5, step=5), 'min_samples_leaf': IntUniformDistribution(high=100, low=5, step=5), 'max_leaf_nodes': IntUniformDistribution(high=1010, low=10, step=100), 'class_weight': CategoricalDistribution(choices=('balanced', None))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=7, state=TrialState.COMPLETE, value=None)

In [12]:
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

In [13]:
fig = optuna.visualization.plot_param_importances(study)
fig.show()

# Build the Final Model (on all the data)

In [14]:
clf = DecisionTreeClassifier(**study.best_params)
clf.fit(X,y)

DecisionTreeClassifier(class_weight='balanced', max_depth=35,
                       max_leaf_nodes=810, min_samples_leaf=75,
                       min_samples_split=95, splitter='random')

In [15]:
 # ... and now you can make predictions on new (competition, or real) data, deploy, etc...