hello world 👋 

In [18]:
import time

import optuna
from catboost import CatBoostClassifier
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

In [2]:
# fetch data set
ID = 1240
X, y = fetch_openml(data_id=ID, data_home=f"openml_download_{ID}", return_X_y=True)

# downsample and create a stratified train test split
X = X.sample(frac=0.2)
X = X.dropna(axis=0, how="any")
y = y.loc[X.index]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [3]:
classifier = CatBoostClassifier(
    iterations=32,
    loss_function="Logloss",
    eval_metric="F1",
    metric_period=16,
    cat_features=list(X.select_dtypes(include="category").columns),
)

In [4]:
classifier.fit(
    X_train, y_train, eval_set=(X_test, y_test), metric_period=16, verbose=False
)

<catboost.core.CatBoostClassifier at 0x10b87b2b0>

In [5]:
classifier.get_best_score().get("validation").get("F1")

0.8007828136814482

1. Tree parameters
    * Depth
    * min_data_in_leaf
    * grow_policy
2. Sampling parameters
    * Subsample
    * colsample_bylevel
    * sampling_frquency
3. Regularization parameters
    * penalties_coefficient
    * first_feature_use_penalties
    * leaf_estimation_backtracking
4. Learning rate
    * Iterations
    * learning_rate
    * model_shrink_rate
    * boost_from_average

- what are hyperparams, and why tune
- gridsearch, randomsearch, lhs search -> pro cons of each
- optuna in one go
- optuna in steps 
- compare time should be less, for same or better performance 

In [6]:
def objective(trial):
    param = {
        "depth": trial.suggest_int("depth", 1, 15),
        "iterations": trial.suggest_int("iterations", 8, 128),
    }

    cbc = CatBoostClassifier(
        **param,
        eval_metric="F1",
        cat_features=list(X.select_dtypes(include="category").columns),
    )
    cbc.fit(
        X_train, y_train, eval_set=(X_test, y_test), metric_period=16, verbose=False
    )

    return cbc.get_best_score().get("validation").get("F1")

In [7]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10, timeout=120)

[I 2024-03-21 22:54:51,897] A new study created in memory with name: no-name-314011ed-0df5-4293-9531-d61007dad81e
[I 2024-03-21 22:55:04,378] Trial 0 finished with value: 0.8108783857879154 and parameters: {'depth': 12, 'iterations': 99}. Best is trial 0 with value: 0.8108783857879154.
[I 2024-03-21 22:55:06,886] Trial 1 finished with value: 0.768937097172629 and parameters: {'depth': 1, 'iterations': 113}. Best is trial 0 with value: 0.8108783857879154.
[I 2024-03-21 22:55:10,298] Trial 2 finished with value: 0.796464479063087 and parameters: {'depth': 2, 'iterations': 124}. Best is trial 0 with value: 0.8108783857879154.
[I 2024-03-21 22:55:11,856] Trial 3 finished with value: 0.7971442639094042 and parameters: {'depth': 11, 'iterations': 16}. Best is trial 0 with value: 0.8108783857879154.
[I 2024-03-21 22:55:21,657] Trial 4 finished with value: 0.8099571266077522 and parameters: {'depth': 12, 'iterations': 83}. Best is trial 0 with value: 0.8108783857879154.
[I 2024-03-21 22:55:25,

In [11]:
study.best_params

{'depth': 12, 'iterations': 99}

In [10]:
from functools import partial

In [59]:
def obj(trial, step: int) -> float:
    all_step_param = {
        1: {
            "depth": trial.suggest_int(name="depth", low=1, high=15),
            "iterations": trial.suggest_int("iterations", 8, 128),
        },
        2: {
            "subsample": trial.suggest_float("subsample", 0.01, 1.0),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 1.0),
        },
        # 3: {
        #     "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        #     "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 1),
        # },
        # 4: {
        #     "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        # },
    }
    current_params = all_step_param[step]

    print(current_params)
    cbc = CatBoostClassifier(
        **current_params,
        eval_metric="F1",
        cat_features=list(X.select_dtypes(include="category").columns),
    )
    cbc.fit(
        X_train, y_train, eval_set=(X_test, y_test), metric_period=16, verbose=False
    )

    return cbc.get_best_score().get("validation").get("F1")

In [60]:
start = time.time()

study = optuna.create_study(direction="maximize")
obj_1 = partial(obj, step=1)
study.optimize(obj_1, n_trials=10, timeout=120)

end = time.time()

elapsed_time = []
elapsed_time.append(end - start)

f"{end-start:.2f} sec"

[I 2024-03-22 09:05:05,853] A new study created in memory with name: no-name-b152768d-030c-4422-bcbe-aed5d4a71de4


{'depth': 8, 'iterations': 69}


[I 2024-03-22 09:05:09,896] Trial 0 finished with value: 0.8063223508459484 and parameters: {'depth': 8, 'iterations': 69, 'subsample': 0.9414831839310278, 'colsample_bylevel': 0.27536204961454747}. Best is trial 0 with value: 0.8063223508459484.


{'depth': 14, 'iterations': 105}


[I 2024-03-22 09:05:40,828] Trial 1 finished with value: 0.8092867546357688 and parameters: {'depth': 14, 'iterations': 105, 'subsample': 0.8648713805808682, 'colsample_bylevel': 0.15102405478691028}. Best is trial 1 with value: 0.8092867546357688.


{'depth': 3, 'iterations': 103}


[I 2024-03-22 09:05:44,118] Trial 2 finished with value: 0.7990014710471182 and parameters: {'depth': 3, 'iterations': 103, 'subsample': 0.4314496319319453, 'colsample_bylevel': 0.6431281686858351}. Best is trial 1 with value: 0.8092867546357688.


{'depth': 4, 'iterations': 64}


[I 2024-03-22 09:05:46,510] Trial 3 finished with value: 0.7993593736097518 and parameters: {'depth': 4, 'iterations': 64, 'subsample': 0.7413468355755314, 'colsample_bylevel': 0.100893458730708}. Best is trial 1 with value: 0.8092867546357688.


{'depth': 8, 'iterations': 110}


[I 2024-03-22 09:05:52,613] Trial 4 finished with value: 0.8073264296959886 and parameters: {'depth': 8, 'iterations': 110, 'subsample': 0.13932050964558507, 'colsample_bylevel': 0.2925627760243539}. Best is trial 1 with value: 0.8092867546357688.


{'depth': 6, 'iterations': 123}


[I 2024-03-22 09:05:58,186] Trial 5 finished with value: 0.8047563908435023 and parameters: {'depth': 6, 'iterations': 123, 'subsample': 0.228029402223525, 'colsample_bylevel': 0.5903726744226127}. Best is trial 1 with value: 0.8092867546357688.


{'depth': 6, 'iterations': 84}


[I 2024-03-22 09:06:01,992] Trial 6 finished with value: 0.8057958091841284 and parameters: {'depth': 6, 'iterations': 84, 'subsample': 0.42474748869421164, 'colsample_bylevel': 0.14416873825783233}. Best is trial 1 with value: 0.8092867546357688.


{'depth': 5, 'iterations': 100}


[I 2024-03-22 09:06:06,185] Trial 7 finished with value: 0.8037711515798368 and parameters: {'depth': 5, 'iterations': 100, 'subsample': 0.31998511746479846, 'colsample_bylevel': 0.34242612117337207}. Best is trial 1 with value: 0.8092867546357688.


{'depth': 7, 'iterations': 109}


[I 2024-03-22 09:06:11,623] Trial 8 finished with value: 0.8068270321992034 and parameters: {'depth': 7, 'iterations': 109, 'subsample': 0.3579087218435869, 'colsample_bylevel': 0.31993839733157176}. Best is trial 1 with value: 0.8092867546357688.


{'depth': 7, 'iterations': 93}


[I 2024-03-22 09:06:16,291] Trial 9 finished with value: 0.805559267673393 and parameters: {'depth': 7, 'iterations': 93, 'subsample': 0.41784346177256604, 'colsample_bylevel': 0.8383802938332553}. Best is trial 1 with value: 0.8092867546357688.


'70.44 sec'

In [17]:
%time
study.best_value

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.96 µs


0.8094567970728962

In [20]:
start = time.time()

In [21]:
end = time.time()

In [24]:
f"{end-start:.2f} sec"

'8.97 sec'