In [1]:
import sys

sys.path.append("../src")

import optuna
import polars as pl
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

from utils.feature import load_feature
from utils.io import load_pickle


In [2]:
optuna.__version__

'4.0.0'

In [8]:
def load_data():
    feature = load_feature(
        "../data/feature",
        sorted(["agent_parsed_feature", "numeric_feature"]),
    )
    print("Feature shape:", feature.shape)
    target: pl.DataFrame = load_pickle("../data/feature/utility_agent1.pkl")
    fold: pl.DataFrame = load_pickle("../data/feature/fold.pkl")

    is_valid = fold["fold"].eq(0).alias("is_valid")

    X_train = feature.filter(~is_valid).to_pandas()
    y_train = target.filter(~is_valid).to_pandas()
    X_valid = feature.filter(is_valid).to_pandas()
    y_valid = target.filter(is_valid).to_pandas()

    return X_train, y_train, X_valid, y_valid


X_train, y_train, X_valid, y_valid = load_data()

Feature shape: (233234, 596)


In [15]:
def objective(trial: optuna.Trial):
    params = {
        # "task_type": "GPU",
        "loss_function": "RMSE",
        "iterations": trial.suggest_int("iterations", 100, 3000),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
        "bagging_temperature": trial.suggest_uniform(
            "bagging_temperature", 0, 1
        ),
        "max_depth": trial.suggest_int("max_depth", 4, 16),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 0.1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 50),
        "random_strength": trial.suggest_uniform("random_strength", 0, 10),
        "colsample_bylevel": trial.suggest_uniform(
            "colsample_bylevel", 0.1, 1.0
        ),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 10),
        "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
    }

    model = CatBoostRegressor(**params, silent=True)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    rmse = mean_squared_error(y_valid, predictions, squared=False)
    return rmse

In [16]:
# Optunaによる最適化実行
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)


[I 2024-10-01 00:52:39,728] A new study created in memory with name: no-name-da05ad8a-8008-4eb6-b205-88579acb2987
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
  "bagging_temperature": trial.suggest_uniform(
  "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 0.1, 10),
  "random_strength": trial.suggest_uniform("random_strength", 0, 10),
  "colsample_bylevel": trial.suggest_uniform(
[I 2024-10-01 00:53:57,751] Trial 0 finished with value: 0.45056881983333563 and parameters: {'iterations': 1984, 'learning_rate': 0.015628244566437886, 'bagging_temperature': 0.4665927966969097, 'max_depth': 7, 'l2_leaf_reg': 1.0686145654004036, 'min_data_in_leaf': 37, 'random_strength': 2.186114566582872, 'colsample_bylevel': 0.2278143453964046, 'one_hot_max_size': 9, 'od_type': 'IncToDec'}. Best is trial 0 with value: 0.45056881983333563.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
  "bagging_temperature": trial.suggest_uniform(
  "l2_leaf_reg"

KeyboardInterrupt: 

In [None]:
print("Best hyperparameters:", study.best_params)
print("Best RMSE:", study.best_value)

In [None]:
from utils.io import save_pickle

save_pickle("../data/catboost.pkl", study.best_params)