hello world 👋 

In [1]:
import optuna
from catboost import CatBoostClassifier
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

In [2]:
ID = 1240
SEED = 42  # fixed seed so the subsample and the split are identical on every run

# Fetch the OpenML dataset, using `openml_download_<ID>` as the data directory.
X, y = fetch_openml(data_id=ID, data_home=f"openml_download_{ID}", return_X_y=True)

# Keep a 20% random subsample, then drop every row with a missing value.
# `y` is re-aligned to the surviving rows through the shared index.
X = X.sample(frac=0.2, random_state=SEED)
X = X.dropna(axis=0, how="any")
y = y.loc[X.index]

# Stratified 80/20 split so both parts keep the class balance of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

In [3]:
# Columns with the pandas "category" dtype are passed to CatBoost as categorical features.
categorical_columns = X.select_dtypes(include="category").columns

# Baseline (untuned) model settings.
baseline_settings = dict(
    iterations=32,
    loss_function="Logloss",
    eval_metric="F1",
    metric_period=16,
    cat_features=list(categorical_columns),
)
classifier = CatBoostClassifier(**baseline_settings)

In [4]:
# Fit the baseline on the training split, evaluating against the held-out split.
fit_options = dict(eval_set=(X_test, y_test), metric_period=16, verbose=False)
classifier.fit(X_train, y_train, **fit_options)

<catboost.core.CatBoostClassifier at 0x10b87b2b0>

In [5]:
# Best F1 recorded on the evaluation set during the baseline fit.
baseline_scores = classifier.get_best_score()
baseline_scores.get("validation").get("F1")

0.8007828136814482

1. Tree parameters
    * Depth
    * min_data_in_leaf
    * grow_policy
2. Sampling parameters
    * Subsample
    * colsample_bylevel
    * sampling_frequency
3. Regularization parameters
    * penalties_coefficient
    * first_feature_use_penalties
    * leaf_estimation_backtracking
4. Learning rate
    * Iterations
    * learning_rate
    * model_shrink_rate
    * boost_from_average

- what hyperparameters are, and why tune them
- grid search, random search, Latin hypercube sampling (LHS) search -> pros and cons of each
- optuna in one go
- optuna in steps
- compare the two: tuning time should be less, for the same or better performance

In [6]:
def objective(trial):
    """Optuna objective: train one CatBoost model and return its best validation F1.

    The trial proposes `depth` (1-15) and `iterations` (8-128). The model is
    fitted on the notebook-level `X_train`/`y_train` with `X_test`/`y_test`
    as the evaluation set.
    """
    depth = trial.suggest_int("depth", 1, 15)
    iterations = trial.suggest_int("iterations", 8, 128)

    # Categorical columns are detected from the dtypes of the full frame `X`.
    categorical_columns = list(X.select_dtypes(include="category").columns)
    model = CatBoostClassifier(
        depth=depth,
        iterations=iterations,
        eval_metric="F1",
        cat_features=categorical_columns,
    )

    fit_options = dict(eval_set=(X_test, y_test), metric_period=16, verbose=False)
    model.fit(X_train, y_train, **fit_options)

    best_scores = model.get_best_score()
    return best_scores.get("validation").get("F1")

In [7]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across kernel restarts (TPE is already Optuna's default sampler).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Stop after 10 trials or 120 seconds, whichever comes first.
study.optimize(objective, n_trials=10, timeout=120)

[I 2024-03-21 22:54:51,897] A new study created in memory with name: no-name-314011ed-0df5-4293-9531-d61007dad81e
[I 2024-03-21 22:55:04,378] Trial 0 finished with value: 0.8108783857879154 and parameters: {'depth': 12, 'iterations': 99}. Best is trial 0 with value: 0.8108783857879154.
[I 2024-03-21 22:55:06,886] Trial 1 finished with value: 0.768937097172629 and parameters: {'depth': 1, 'iterations': 113}. Best is trial 0 with value: 0.8108783857879154.
[I 2024-03-21 22:55:10,298] Trial 2 finished with value: 0.796464479063087 and parameters: {'depth': 2, 'iterations': 124}. Best is trial 0 with value: 0.8108783857879154.
[I 2024-03-21 22:55:11,856] Trial 3 finished with value: 0.7971442639094042 and parameters: {'depth': 11, 'iterations': 16}. Best is trial 0 with value: 0.8108783857879154.
[I 2024-03-21 22:55:21,657] Trial 4 finished with value: 0.8099571266077522 and parameters: {'depth': 12, 'iterations': 83}. Best is trial 0 with value: 0.8108783857879154.
[I 2024-03-21 22:55:25,

In [10]:
from functools import partial

In [None]:
def suggest_step_params(trial, step):
    """Ask `trial` for the hyperparameters belonging to one tuning step only.

    Each step's suggestions are wrapped in a callable so that only the selected
    group is requested from the trial. Building all groups eagerly would make
    every trial sample (and record) parameters that never reach the model.

    Args:
        trial: object exposing Optuna's `suggest_int` / `suggest_float`.
        step: tuning step to sample, one of 1 (tree size), 2 (sampling),
            3 (leaf/bagging) or 4 (learning rate).

    Returns:
        dict mapping CatBoost parameter names to the suggested values.

    Raises:
        KeyError: if `step` is not one of the defined steps.
    """
    # NOTE(review): CatBoost's docs describe `min_data_in_leaf` as specific to the
    # Depthwise/Lossguide grow policies and `bagging_temperature` as specific to
    # Bayesian bootstrap — confirm steps 2 and 3 are valid with the defaults used here.
    step_builders = {
        1: lambda: {
            "depth": trial.suggest_int(name="depth", low=1, high=15),
            "iterations": trial.suggest_int("iterations", 8, 128),
        },
        2: lambda: {
            "subsample": trial.suggest_float("subsample", 0.01, 1.0),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 1.0),
        },
        3: lambda: {
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 1),
        },
        4: lambda: {
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        },
    }
    return step_builders[step]()


def obj(trial, step, fixed_params=None):
    """Optuna objective for stepwise tuning: tune one parameter group per study.

    Args:
        trial: the Optuna trial supplying the suggestions.
        step: which parameter group (1-4) this trial tunes.
        fixed_params: optional dict of CatBoost parameters held constant, e.g.
            the best values of an earlier step. Values suggested for the
            current step take precedence on a name clash. It must not contain
            `eval_metric` or `cat_features`, which are set here.

    Returns:
        The best F1 recorded on the evaluation set (`X_test`, `y_test`).

    Raises:
        KeyError: if `step` is not one of the defined steps.
    """
    params = {**(fixed_params or {}), **suggest_step_params(trial, step)}

    cbc = CatBoostClassifier(
        **params,
        eval_metric="F1",
        cat_features=list(X.select_dtypes(include="category").columns),
    )
    cbc.fit(
        X_train, y_train, eval_set=(X_test, y_test), metric_period=16, verbose=False
    )

    return cbc.get_best_score().get("validation").get("F1")


def get_obj_step(step: int):
    """Return `obj` with `step` pre-bound, so it can be passed to `study.optimize`."""
    bound_objective = partial(obj, step=step)
    return bound_objective


# TODO: feed the best params found in the previous step forward into the next step

In [None]:
# Parameter group to tune in this study (1-4, see `obj`). It was previously
# undefined, so this cell raised NameError on a fresh kernel.
step = 1

study = optuna.create_study(direction="maximize")
# Stop after 10 trials or 120 seconds, whichever comes first.
study.optimize(get_obj_step(step), n_trials=10, timeout=120)