hello world 👋


In [8]:
import time
from functools import partial

import numpy as np
import optuna
from catboost import CatBoostClassifier, Pool
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

In [9]:
N_TRIALS = 20

In [10]:
# fetch data set
ID = 1461
X, y = fetch_openml(data_id=ID, data_home=f"openml_download_{ID}", return_X_y=True)

# downsample and create a stratified train test split
X = X.sample(frac=0.2)
X = X.dropna(axis=0, how="any")
y = y.astype(int) - 1

y = y.loc[X.index]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [11]:
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=list(X.select_dtypes(include="category").columns),
)
train_pool.quantize()

test_pool = Pool(
    data=X_test,
    label=y_test,
    cat_features=list(X.select_dtypes(include="category").columns),
)
test_pool.quantize()

1. Tree parameters
   - Depth
   - min_data_in_leaf
   - grow_policy
2. Sampling parameters
   - Subsample
   - colsample_bylevel
   - sampling_frquency
3. Regularization parameters
   - penalties_coefficient
   - first_feature_use_penalties
   - leaf_estimation_backtracking
4. Learning rate
   - Iterations
   - learning_rate
   - model_shrink_rate
   - boost_from_average


- what are hyperparams, and why tune
- gridsearch, randomsearch, lhs search -> pro cons of each
- optuna in one go
- optuna in steps
- compare time should be less, for same or better performance


In [12]:
def obj_step(trial, step: int, best_params: dict = {}) -> float:
    if step == 1:
        params = {
            "depth": trial.suggest_int(name="depth", low=1, high=15),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        }
    elif step == 2:
        params = {
            "subsample": trial.suggest_float("subsample", 0.01, 1.0),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 1.0),
        }
    elif step == 3:
        params = {
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.01, 1),
            "bagging_temperature": trial.suggest_float(
                "bagging_temperature", 0.01, 1_000_000, log=True
            ),
        }
    elif step == 4:
        params = {
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
            "iterations": trial.suggest_int("iterations", 8, 128),
        }

    params.update(best_params)

    cbc = CatBoostClassifier(
        **params,
        eval_metric="F1",
        # cat_features=list(X.select_dtypes(include="category").columns),
    )
    cbc.fit(train_pool, eval_set=(test_pool), verbose=False)

    return cbc.get_best_score().get("validation").get("F1")

In [13]:
best_params, elapsed_time = {"iterations": 128}, []

for i in range(1, 5):
    start = time.time()
    obj_i = partial(obj_step, step=i, best_params=best_params)
    study = optuna.create_study(study_name="baby-steps", direction="maximize")
    study.optimize(obj_i, n_trials=N_TRIALS, timeout=120)

    end = time.time()
    elapsed_time.append(end - start)
    best_params.update(study.best_params)
    print("-" * 10)
    print(f"time step {i}: {elapsed_time[i-1]:.2f} sec")
    print(f"total time step : {np.sum(elapsed_time):.2f} sec")
    print(f"{best_params=}")
    print(f"{study.best_value=}")
    print("-" * 10)

[I 2024-03-22 17:28:52,521] A new study created in memory with name: baby-steps
[I 2024-03-22 17:28:52,714] Trial 0 finished with value: 0.4660194174757281 and parameters: {'depth': 4, 'min_data_in_leaf': 29}. Best is trial 0 with value: 0.4660194174757281.
[I 2024-03-22 17:28:52,957] Trial 1 finished with value: 0.45625 and parameters: {'depth': 6, 'min_data_in_leaf': 96}. Best is trial 0 with value: 0.4660194174757281.
[I 2024-03-22 17:28:53,131] Trial 2 finished with value: 0.4660194174757281 and parameters: {'depth': 4, 'min_data_in_leaf': 75}. Best is trial 0 with value: 0.4660194174757281.
[I 2024-03-22 17:28:55,009] Trial 3 finished with value: 0.396039603960396 and parameters: {'depth': 13, 'min_data_in_leaf': 52}. Best is trial 0 with value: 0.4660194174757281.
[I 2024-03-22 17:28:55,432] Trial 4 finished with value: 0.4299674267100977 and parameters: {'depth': 10, 'min_data_in_leaf': 93}. Best is trial 0 with value: 0.4660194174757281.
[I 2024-03-22 17:28:56,472] Trial 5 fini

----------
time step 1: 15.90 sec
total time step : 15.90 sec
best_params={'iterations': 128, 'depth': 4, 'min_data_in_leaf': 29}
study.best_value=0.4660194174757281
----------


[I 2024-03-22 17:29:08,713] Trial 1 finished with value: 0.4573170731707317 and parameters: {'subsample': 0.22419557625732472, 'colsample_bylevel': 0.5305767025019139}. Best is trial 0 with value: 0.46349206349206346.
[I 2024-03-22 17:29:08,870] Trial 2 finished with value: 0.44728434504792336 and parameters: {'subsample': 0.17872908071922383, 'colsample_bylevel': 0.6230192267675259}. Best is trial 0 with value: 0.46349206349206346.
[I 2024-03-22 17:29:09,005] Trial 3 finished with value: 0.4258064516129032 and parameters: {'subsample': 0.41137163981615493, 'colsample_bylevel': 0.1388071407379855}. Best is trial 0 with value: 0.46349206349206346.
[I 2024-03-22 17:29:09,181] Trial 4 finished with value: 0.44375 and parameters: {'subsample': 0.9904427446380495, 'colsample_bylevel': 0.5189454182141292}. Best is trial 0 with value: 0.46349206349206346.
[I 2024-03-22 17:29:09,331] Trial 5 finished with value: 0.43278688524590164 and parameters: {'subsample': 0.03438701026739289, 'colsample_

----------
time step 2: 3.15 sec
total time step : 19.05 sec
best_params={'iterations': 128, 'depth': 4, 'min_data_in_leaf': 29, 'subsample': 0.3152169831966806, 'colsample_bylevel': 0.32859226341433706}
study.best_value=0.4660194174757281
----------


[I 2024-03-22 17:29:11,855] Trial 1 finished with value: 0.29959514170040485 and parameters: {'l2_leaf_reg': 0.2835165516423327, 'bagging_temperature': 117.92580541598309}. Best is trial 0 with value: 0.29959514170040485.
[I 2024-03-22 17:29:11,997] Trial 2 finished with value: 0.29959514170040485 and parameters: {'l2_leaf_reg': 0.07823760999938806, 'bagging_temperature': 61.3739716182452}. Best is trial 0 with value: 0.29959514170040485.
[I 2024-03-22 17:29:12,147] Trial 3 finished with value: 0.29959514170040485 and parameters: {'l2_leaf_reg': 0.15971716429359725, 'bagging_temperature': 52689.55456068125}. Best is trial 0 with value: 0.29959514170040485.
[I 2024-03-22 17:29:12,286] Trial 4 finished with value: 0.29959514170040485 and parameters: {'l2_leaf_reg': 0.3109284664330199, 'bagging_temperature': 0.02528531429478604}. Best is trial 0 with value: 0.29959514170040485.
[I 2024-03-22 17:29:12,436] Trial 5 finished with value: 0.29959514170040485 and parameters: {'l2_leaf_reg': 0.6

----------
time step 3: 3.02 sec
total time step : 22.07 sec
best_params={'iterations': 128, 'depth': 4, 'min_data_in_leaf': 29, 'subsample': 0.3152169831966806, 'colsample_bylevel': 0.32859226341433706, 'l2_leaf_reg': 0.22645656496651975, 'bagging_temperature': 0.1479103713938864}
study.best_value=0.312
----------


[I 2024-03-22 17:29:14,884] Trial 1 finished with value: 0.11320754716981131 and parameters: {'learning_rate': 0.017117014143124828, 'iterations': 95}. Best is trial 1 with value: 0.11320754716981131.
[I 2024-03-22 17:29:15,036] Trial 2 finished with value: 0.4294871794871795 and parameters: {'learning_rate': 0.08156906571376626, 'iterations': 64}. Best is trial 2 with value: 0.4294871794871795.
[I 2024-03-22 17:29:15,180] Trial 3 finished with value: 0.0 and parameters: {'learning_rate': 0.00462371673795551, 'iterations': 124}. Best is trial 2 with value: 0.4294871794871795.
[I 2024-03-22 17:29:15,334] Trial 4 finished with value: 0.4415584415584416 and parameters: {'learning_rate': 0.07716440416578711, 'iterations': 72}. Best is trial 4 with value: 0.4415584415584416.
[I 2024-03-22 17:29:15,487] Trial 5 finished with value: 0.0 and parameters: {'learning_rate': 0.0010287219837477018, 'iterations': 102}. Best is trial 4 with value: 0.4415584415584416.
[I 2024-03-22 17:29:15,621] Trial

----------
time step 4: 3.02 sec
total time step : 25.08 sec
best_params={'iterations': 107, 'depth': 4, 'min_data_in_leaf': 29, 'subsample': 0.3152169831966806, 'colsample_bylevel': 0.32859226341433706, 'l2_leaf_reg': 0.22645656496651975, 'bagging_temperature': 0.1479103713938864, 'learning_rate': 0.07183770874707367}
study.best_value=0.44370860927152317
----------


In [14]:
def obj_all(trial) -> float:
    params = {
        "depth": trial.suggest_int(name="depth", low=1, high=15),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "subsample": trial.suggest_float("subsample", 0.01, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 1.0),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.01, 1),
        "bagging_temperature": trial.suggest_float(
            "bagging_temperature", 1e-3, 1_000_000, log=True
        ),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "iterations": trial.suggest_int("iterations", 1, 256),
    }

    cbc = CatBoostClassifier(
        **params,
        eval_metric="F1",
        # cat_features=list(X.select_dtypes(include="category").columns),
    )
    cbc.fit(train_pool, eval_set=(test_pool), verbose=False)

    return cbc.get_best_score().get("validation").get("F1")


print("-" * 10)

start = time.time()
study = optuna.create_study(study_name="all-in", direction="maximize")
study.optimize(obj_all, n_trials=4 * N_TRIALS, timeout=120)

end = time.time()
print("-" * 10)
print(f"time all-in: {end - start:.2f} sec")
print(f"{study.best_params=}")
print(f"{study.best_value=}")

[I 2024-03-22 17:29:17,609] A new study created in memory with name: all-in


----------


[I 2024-03-22 17:29:17,811] Trial 0 finished with value: 0.03940886699507389 and parameters: {'depth': 6, 'min_data_in_leaf': 100, 'subsample': 0.5510380902367414, 'colsample_bylevel': 0.17501771607858782, 'l2_leaf_reg': 0.12834473825438214, 'bagging_temperature': 0.016781291725149878, 'learning_rate': 0.008377392575886848, 'iterations': 188}. Best is trial 0 with value: 0.03940886699507389.
[I 2024-03-22 17:29:18,379] Trial 1 finished with value: 0.0861244019138756 and parameters: {'depth': 2, 'min_data_in_leaf': 58, 'subsample': 0.11973766131987405, 'colsample_bylevel': 0.5591365846810507, 'l2_leaf_reg': 0.12568741591024649, 'bagging_temperature': 188.89834390756937, 'learning_rate': 0.006405065787627228, 'iterations': 245}. Best is trial 1 with value: 0.0861244019138756.
[I 2024-03-22 17:29:18,503] Trial 2 finished with value: 0.13023255813953488 and parameters: {'depth': 8, 'min_data_in_leaf': 23, 'subsample': 0.8997806207121376, 'colsample_bylevel': 0.8937438687954574, 'l2_leaf_re

----------
time all-in: 57.29 sec
study.best_params={'depth': 7, 'min_data_in_leaf': 66, 'subsample': 0.7920239790464081, 'colsample_bylevel': 0.2782857183194383, 'l2_leaf_reg': 0.6812036496154472, 'bagging_temperature': 50343.84993588016, 'learning_rate': 0.08668340118273181, 'iterations': 195}
study.best_value=0.4764890282131662
