
Licensed under the MIT License.

Copyright (c) 2021-2025. All rights reserved.

# Optuna Specified Search Space with Integrated CV

* Using Optuna integrated CV

In [1]:
import pandas as pd
import timeit

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn import preprocessing

import optuna.integration.lightgbm as lgb
from lightgbm import LGBMClassifier
import optuna

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
df30 = pd.read_csv('../../crystal_ball/data_collector/structured_data/leaf.csv')
print(df30.shape)

df30.head()

(340, 16)


Unnamed: 0,species,specimen_number,eccentricity,aspect_ratio,elongation,solidity,stochastic_convexity,isoperimetric_factor,maximal_indentation_depth,lobedness,average_intensity,average_contrast,smoothness,third_moment,uniformity,entropy
0,1,1,0.72694,1.4742,0.32396,0.98535,1.0,0.83592,0.004657,0.003947,0.04779,0.12795,0.016108,0.005232,0.000275,1.1756
1,1,2,0.74173,1.5257,0.36116,0.98152,0.99825,0.79867,0.005242,0.005002,0.02416,0.090476,0.008119,0.002708,7.5e-05,0.69659
2,1,3,0.76722,1.5725,0.38998,0.97755,1.0,0.80812,0.007457,0.010121,0.011897,0.057445,0.003289,0.000921,3.8e-05,0.44348
3,1,4,0.73797,1.4597,0.35376,0.97566,1.0,0.81697,0.006877,0.008607,0.01595,0.065491,0.004271,0.001154,6.6e-05,0.58785
4,1,5,0.82301,1.7707,0.44462,0.97698,1.0,0.75493,0.007428,0.010042,0.007938,0.045339,0.002051,0.00056,2.4e-05,0.34214


In [3]:
# Using optuna, multiclass need to adjacent classes
le = preprocessing.LabelEncoder()
df30['species'] = le.fit_transform(df30['species'])


# train, test split for df30
y30 = df30['species']
X30 = df30.drop('species', axis=1)

X_train30, X_test30, y_train30, y_test30 = train_test_split(X30, y30, test_size=0.2,
                                               random_state=10, shuffle=True, stratify=y30)

X_train30.reset_index(inplace=True, drop=True)
X_test30.reset_index(inplace=True, drop=True)
y_train30.reset_index(inplace=True, drop=True)
y_test30.reset_index(inplace=True, drop=True)

print(X_train30.shape, X_test30.shape, y_train30.shape, y_test30.shape)
print(y_train30.nunique(), y_test30.nunique())

(272, 15) (68, 15) (272,) (68,)
30 30


## Without Pruner for Leaves30
* LGBM params: https://lightgbm.readthedocs.io/en/latest/Parameters.html
* Params used in Optuna's integrated LGBM CV: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.integration.lightgbm.LightGBMTunerCV.html
* Optuna trial methods: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html#optuna.trial.Trial
* Without pruner

In [20]:
def objective(trial):
    dtrain = lgb.Dataset(X_train30, y_train30)
    upper = min(32768, int(X_train30.shape[0]))
        
    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "num_class": 30, 
        "random_state": 10,
        "verbosity": -1,
        
        # specify the similar search space as FLAML, but optuna has no initial value to suggest
        "num_leaves": trial.suggest_int("num_leaves", 4, upper),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1/1024, 1.0),
        "subsample": trial.suggest_uniform("subsample", 0.1, 1.0),
        "max_bin": trial.suggest_int("max_bin", 3, 10),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.01, 1.0),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1/1024, 1024),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1/1024, 1024),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "extra_trees": True
    }

    tuner = lgb.LightGBMTunerCV(
        params, 
        dtrain,
        time_budget=100,
        verbose_eval=False,
        folds=StratifiedKFold(n_splits=5, shuffle=True, random_state=10),
        verbosity=-1
    )
    
    tuner.run()
    return tuner.best_score

In [21]:
start = timeit.default_timer()

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
    
stop = timeit.default_timer()
print('Time: ', stop - start)

[32m[I 2021-08-28 18:01:17,303][0m A new study created in memory with name: no-name-b558219f-fd64-46fe-8ff7-27fe794b97aa[0m
[32m[I 2021-08-28 18:01:17,311][0m A new study created in memory with name: no-name-49141714-556d-48d3-a8a7-f9b64ab2dc86[0m





  0%|                                                                                            | 0/7 [00:00<?, ?it/s][A[A[A[A[A




feature_fraction, val_score: inf:   0%|                                                          | 0/7 [00:00<?, ?it/s][A[A[A[A[A




feature_fraction, val_score: 3.426870:   0%|                                                     | 0/7 [00:04<?, ?it/s][A[A[A[A[A




feature_fraction, val_score: 3.426870:  14%|######4                                      | 1/7 [00:04<00:28,  4.71s/it][A[A[A[A[A




feature_fraction, val_score: 3.426870:  14%|######4                                      | 1/7 [00:04<00:28,  4.71s/it][A[A[A[A[A




feature_fraction, val_score: 3.426870:  14%|##

Number of finished trials: 10
Best trial:
  Value: 0.9585259218863685
  Params: 
    num_leaves: 57
    learning_rate: 0.07977886530426873
    subsample: 0.3449928354346119
    max_bin: 8
    colsample_bytree: 0.813610248421246
    reg_alpha: 0.004590131406993605
    reg_lambda: 16.423746018923037
    max_depth: 7
Time:  1104.655316800001





In [22]:
model = LGBMClassifier(objective='multiclass', metric='multi_logloss', num_class=30, 
                       random_state=10, 
                       num_leaves=57, learning_rate=0.07977886530426873, subsample=0.3449928354346119, max_bin=8,
                       colsample_bytree=0.813610248421246, reg_alpha=0.004590131406993605, reg_lambda=16.423746018923037, max_depth=7)
model.fit(X_train30, y_train30)
y_pred30 = model.predict(X_test30)

balanced_accuracy = balanced_accuracy_score(y_test30, y_pred30)
print(f'The balanced accuracy on testing data from optimized model is {balanced_accuracy}')

The balanced accuracy on testing data from optimized model is 0.711111111111111


## Add Pruner for Leaves30

* Optuna pruners andntips: https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/003_efficient_optimization_algorithms.html#pruning-algorithms

In [7]:
def objective(trial):
    dtrain = lgb.Dataset(X_train30, y_train30)
    upper = min(32768, int(X_train30.shape[0]))
        
    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "num_class": 30, 
        "random_state": 10,
        "verbosity": -1,
        
        # specify the similar search space as FLAML, but optuna has no initial value to suggest
        "num_leaves": trial.suggest_int("num_leaves", 4, upper),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1/1024, 1.0),
        "subsample": trial.suggest_uniform("subsample", 0.1, 1.0),
        "max_bin": trial.suggest_int("max_bin", 3, 10),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.01, 1.0),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1/1024, 1024),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1/1024, 1024),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "extra_trees": True
    }
    
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "multi_logloss")
    tuner = lgb.LightGBMTunerCV(
        params, 
        dtrain,
        time_budget=100,
        verbose_eval=False,
        folds=StratifiedKFold(n_splits=5, shuffle=True, random_state=10),
        verbosity=-1,
        callbacks=[pruning_callback]
    )
    
    tuner.run()
    return tuner.best_score

In [8]:
start = timeit.default_timer()

study = optuna.create_study(direction="minimize", pruner=optuna.pruners.HyperbandPruner())
study.optimize(objective, n_trials=100)

print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
    
stop = timeit.default_timer()
print('Time: ', stop - start)

feature_fraction, val_score: 1.255608: 100%|#############################################| 7/7 [01:40<00:00, 14.31s/it]
  0%|                                                                                           | 0/20 [00:00<?, ?it/s]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
  0%|                                                                                           | 0/20 [00:00<?, ?it/s]
  0%|                                                                                            | 0/5 [00:00<?, ?it/s]
feature_fraction, val_score: 3.426870: 100%|#############################################| 7/7 [00:59<00:00,  8.54s/it]
num_leaves, val_score: 3.426870:  30%|###############                                   | 6/20 [00:44<01:43,  7.39s/it]
  0%|                                   

Number of finished trials: 100
Best trial:
  Value: 0.9011210615816487
  Params: 
    num_leaves: 132
    learning_rate: 0.01450911890294548
    subsample: 0.6602135782504309
    max_bin: 9
    colsample_bytree: 0.7259319761026826
    reg_alpha: 0.0012697671364013724
    reg_lambda: 0.1981004944741745
    max_depth: 2
Time:  11033.8225253


In [9]:
model = LGBMClassifier(objective='multiclass', metric='multi_logloss', num_class=132, 
                       random_state=10, 
                       num_leaves=270, learning_rate=0.01450911890294548, subsample=0.6602135782504309, max_bin=9,
                       colsample_bytree=0.7259319761026826, reg_alpha=0.0012697671364013724, reg_lambda=0.1981004944741745, max_depth=2)
model.fit(X_train30, y_train30)
y_pred30 = model.predict(X_test30)

balanced_accuracy = balanced_accuracy_score(y_test30, y_pred30)
print(f'The balanced accuracy on testing data from optimized model is {balanced_accuracy}')

The balanced accuracy on testing data from optimized model is 0.6166666666666667
