In [1]:
import copy
import pickle
from datetime import datetime

import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import root_mean_squared_log_error
from sklearn.model_selection import KFold
import random
from insurance.common import PREP_DATA_PATH
from insurance.data_pipeline import get_feat_columns, make_xgboost_pipeline
from insurance.logger import setup_logger

In [2]:
# Name the log file after the moment this tuning run started, then hand the
# file name to the project's logger factory.
_run_started_at = datetime.now()
log_file = _run_started_at.strftime("xgboost_tune_log_%Y-%m-%d_%H-%M-%S.log")
logger = setup_logger(log_file=log_file)


In [3]:
# Input file and regression target.
target_column = "Premium Amount"
prep_data_path = PREP_DATA_PATH / "prepared_data.feather"

df = pd.read_feather(prep_data_path)

# Column metadata comes from the project's data-pipeline module.
feat_cols = get_feat_columns()
feat_names = feat_cols.names

# Model inputs: everything except the target, restricted to the known
# feature columns (in `feat_names` order).
features = df.drop(columns=[target_column])[feat_names]
logger.info(f"features shape: {features.shape}")

labels = df[target_column]

# The model is trained on log1p(target).
y_train = np.log1p(labels)

# Fit the preprocessing pipeline on the feature frame and keep its output
# as the training matrix.
data_pipeline = make_xgboost_pipeline()
X_train = data_pipeline.fit_transform(features[feat_names])

# Mark categorical columns with the pandas "category" dtype; the DMatrix
# below is built with enable_categorical=True.
for col in feat_cols.categorical:
    X_train[col] = X_train[col].astype("category")

dtrain = xgb.DMatrix(
    X_train,
    label=y_train,
    enable_categorical=True,
    feature_names=list(X_train.columns),
)


2024-12-22 14:48:46,000 - logger - INFO - features shape: (1200000, 19)


In [None]:
# Booster settings shared by every Optuna trial. `objective()` deep-copies
# this dict and layers the trial-specific values on top of it.
base_param = dict(
    device="cuda",
    verbosity=0,
    objective="reg:squarederror",
    random_state=42,
    eval_metric="rmse",
    # use exact for small dataset.
    tree_method="auto",
)


def objective(trial):
    """Optuna objective: cross-validate one XGBoost configuration.

    Starts from a deep copy of ``base_param``, adds the values sampled for
    this trial, runs ``xgb.cv`` on ``dtrain`` and returns the mean test RMSE
    of the final boosting round. ``dtrain`` carries log1p-transformed labels,
    which is why the value is logged as RMSLE.

    Searched per trial: ``lambda``, ``max_depth``, ``min_child_weight``, ``eta``.

    Currently pinned, but explored as search dimensions in earlier
    experiments (kept here for reference):
      * booster: categorical ["gbtree", "dart"]
      * alpha: float in [1e-3, 0.2], log scale
      * subsample / colsample_bytree: float in [0.2, 1.0]
      * grow_policy: categorical ["depthwise", "lossguide"]
      * dart only: sample_type ["uniform", "weighted"],
        normalize_type ["tree", "forest"],
        rate_drop / skip_drop float in [1e-8, 1.0], log scale
      * num_boost_round: int in [10, 40]

    Args:
        trial: the ``optuna`` trial used to sample hyper-parameters and to
            report intermediate values to the pruning callback.

    Returns:
        The last entry of the ``test-rmse-mean`` column of the CV history.
    """
    param = copy.deepcopy(base_param)

    # Booster type (gblinear would be the linear alternative).
    param["booster"] = "gbtree"
    # L2 regularization weight.
    param["lambda"] = trial.suggest_float("lambda", 10, 200)
    # L1 regularization weight.
    param["alpha"] = 0.1

    if param["booster"] in ("gbtree", "dart"):
        # NOTE: the dict literal is evaluated top to bottom, so the order of
        # the suggest_* calls (and therefore the sampler's draws) is fixed.
        tree_settings = {
            # maximum depth of the tree, signifies complexity of the tree.
            "max_depth": trial.suggest_int("max_depth", 2, 10, step=1),
            # minimum child weight, larger the term more conservative the tree.
            "min_child_weight": trial.suggest_int("min_child_weight", 2, 10),
            "eta": trial.suggest_float("eta", 0.15, 0.25),
            # defines how selective algorithm is.
            "gamma": 3e-6,
            "grow_policy": "depthwise",
        }
        param.update(tree_settings)

    n_rounds = 20

    # Lets Optuna stop unpromising trials based on the per-round test RMSE.
    pruner_cb = optuna.integration.XGBoostPruningCallback(trial, "test-rmse")
    cv_history = xgb.cv(
        param,
        dtrain,
        num_boost_round=n_rounds,
        callbacks=[pruner_cb],
    )
    mean_rmse = cv_history["test-rmse-mean"].iloc[-1]

    logger.info(f"Out-of-fold RMSLE: {mean_rmse:.4f}")
    return mean_rmse


# Run the hyper-parameter search: minimise the value returned by
# `objective` over 50 trials, using a seeded TPE sampler so the sequence of
# sampled configurations is repeatable.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
    # No pruner is passed explicitly; see the Optuna docs for the default
    # that `create_study` applies in that case.
    # pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=50)


# Report the outcome and fold the winning values back into `base_param`
# so later cells can reuse the tuned configuration.
logger.info(f"Number of finished trials: {len(study.trials)}")
logger.info("Best trial:")
trial = study.best_trial

logger.info("  Value: {}".format(trial.value))
logger.info("  Params: ")
for key, value in trial.params.items():
    # Emit through the logger (not print) so the best parameters reach the
    # same destination as the "Params:" header and the rest of the run log.
    logger.info("    {}: {}".format(key, value))
    base_param[key] = value

[I 2024-12-22 14:48:46,620] A new study created in memory with name: no-name-7561ccb7-7248-4020-a3a4-96b3cae59024
2024-12-22 14:48:49,495 - logger - INFO - Out-of-fold RMSLE: 1.0471
[I 2024-12-22 14:48:49,496] Trial 0 finished with value: 1.0471109441663062 and parameters: {'lambda': 81.16262258099887, 'max_depth': 10, 'min_child_weight': 8, 'eta': 0.20986584841970365}. Best is trial 0 with value: 1.0471109441663062.
2024-12-22 14:48:50,876 - logger - INFO - Out-of-fold RMSLE: 1.0555
[I 2024-12-22 14:48:50,876] Trial 1 finished with value: 1.0554830607889316 and parameters: {'lambda': 39.64354168406294, 'max_depth': 3, 'min_child_weight': 2, 'eta': 0.23661761457749353}. Best is trial 0 with value: 1.0471109441663062.
2024-12-22 14:48:52,799 - logger - INFO - Out-of-fold RMSLE: 1.0464
[I 2024-12-22 14:48:52,800] Trial 2 finished with value: 1.0463841771115396 and parameters: {'lambda': 124.21185223120968, 'max_depth': 8, 'min_child_weight': 2, 'eta': 0.24699098521619944}. Best is trial 

    lambda: 152.3558802567305
    max_depth: 8
    min_child_weight: 2
    eta: 0.22403445158956123


In [13]:
# Show the best trial's parameters again, then merge them into `base_param`.
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
base_param.update(trial.params)

    lambda: 152.3558802567305
    max_depth: 8
    min_child_weight: 2
    eta: 0.22403445158956123


In [11]:
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_rank
from optuna.visualization import plot_slice
from optuna.visualization import plot_timeline

In [12]:
# Contour plot for the finished study; the figure is this cell's displayed
# output because the call is the last expression.
plot_contour(study)


In [8]:
# Hyper-parameter importance plot for the finished study; displayed as the
# cell output because the call is the last expression.
plot_param_importances(study)