In [6]:
"""
Optuna example that optimizes a classifier configuration for cancer dataset using XGBoost.

In this example, we optimize the accuracy of cancer detection using the XGBoost. The accuracy is
estimated by cross-validation. We optimize both the choice of booster model and its
hyperparameters.

"""

import os
import shutil

import optuna

import sklearn.datasets
import sklearn.metrics
import xgboost as xgb

In [7]:
#SEED = 7 #MWB
SEED = 108
N_FOLDS = 3
CV_RESULT_DIR = "./xgboost_cv_results"

In [8]:
def objective(trial):
    (data, target) = sklearn.datasets.load_breast_cancer(return_X_y=True)
    dtrain = xgb.DMatrix(data, label=target)

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    xgb_cv_results = xgb.cv(
        params=param,
        dtrain=dtrain,
        num_boost_round=10000,
        nfold=N_FOLDS,
        stratified=True,
        early_stopping_rounds=100,
        seed=SEED,
        verbose_eval=False,
    )
    
    trial.set_user_attr("n_estimators", len(xgb_cv_results))

    # Save cross-validation results.
    filepath = os.path.join(CV_RESULT_DIR, "{}.csv".format(trial.number))
    xgb_cv_results.to_csv(filepath, index=False)

    # Extract the best score.
    best_score = xgb_cv_results["test-auc-mean"].values[-1]
    return best_score


In [5]:
if __name__ == "__main__":
    if not os.path.exists(CV_RESULT_DIR):
        os.mkdir(CV_RESULT_DIR)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=20, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  Number of estimators: {}".format(trial.user_attrs["n_estimators"]))

    shutil.rmtree(CV_RESULT_DIR)


[32m[I 2021-11-30 11:36:46,067][0m A new study created in memory with name: no-name-080095a8-444f-4875-8076-01006d035257[0m
[32m[I 2021-11-30 11:36:46,573][0m Trial 0 finished with value: 0.986066 and parameters: {'booster': 'gbtree', 'lambda': 0.5326230708495907, 'alpha': 1.344193423305108e-06, 'subsample': 0.8686746191405466, 'colsample_bytree': 0.34855376256304305, 'max_depth': 9, 'min_child_weight': 5, 'eta': 3.8773829087567033e-07, 'gamma': 0.8256439518340901, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.986066.[0m
[32m[I 2021-11-30 11:36:46,951][0m Trial 1 finished with value: 0.9852573333333333 and parameters: {'booster': 'dart', 'lambda': 1.6301875803867005e-06, 'alpha': 6.966820754581761e-07, 'subsample': 0.37813788801288983, 'colsample_bytree': 0.6867844982859739, 'max_depth': 7, 'min_child_weight': 6, 'eta': 0.0010639785467287553, 'gamma': 1.317598575825695e-07, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop':

[32m[I 2021-11-30 11:37:16,740][0m Trial 17 finished with value: 0.9940793333333332 and parameters: {'booster': 'gblinear', 'lambda': 0.0013030633270566968, 'alpha': 0.0014266255828929283, 'subsample': 0.42248918156361265, 'colsample_bytree': 0.342602561641062}. Best is trial 16 with value: 0.9942373333333333.[0m
[32m[I 2021-11-30 11:37:17,710][0m Trial 18 finished with value: 0.9948316666666667 and parameters: {'booster': 'gblinear', 'lambda': 0.001085044224629079, 'alpha': 0.0004942991637657497, 'subsample': 0.24859463786253522, 'colsample_bytree': 0.33660414374234615}. Best is trial 18 with value: 0.9948316666666667.[0m
[32m[I 2021-11-30 11:37:19,232][0m Trial 19 finished with value: 0.9921783333333334 and parameters: {'booster': 'gblinear', 'lambda': 0.045800339307490015, 'alpha': 6.282193326851764e-05, 'subsample': 0.20081355333945108, 'colsample_bytree': 0.27933733651003667}. Best is trial 18 with value: 0.9948316666666667.[0m


Number of finished trials:  20
Best trial:
  Value: 0.9948316666666667
  Params: 
    booster: gblinear
    lambda: 0.001085044224629079
    alpha: 0.0004942991637657497
    subsample: 0.24859463786253522
    colsample_bytree: 0.33660414374234615
  Number of estimators: 557


In [10]:
if __name__ == "__main__":
    if not os.path.exists(CV_RESULT_DIR):
        os.mkdir(CV_RESULT_DIR)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=20, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  Number of estimators: {}".format(trial.user_attrs["n_estimators"]))

    shutil.rmtree(CV_RESULT_DIR)


[32m[I 2021-11-30 14:21:50,162][0m A new study created in memory with name: no-name-40e21e08-66ab-4e9e-8be5-4ca46440ce07[0m
[32m[I 2021-11-30 14:21:50,811][0m Trial 0 finished with value: 0.9874196666666667 and parameters: {'booster': 'gblinear', 'lambda': 0.00017019467061261584, 'alpha': 0.11809985089651562, 'subsample': 0.35503350046998644, 'colsample_bytree': 0.5976677076348662}. Best is trial 0 with value: 0.9874196666666667.[0m
[32m[I 2021-11-30 14:21:50,998][0m Trial 1 finished with value: 0.9835406666666667 and parameters: {'booster': 'gbtree', 'lambda': 1.8500036107574693e-05, 'alpha': 0.1728984759396956, 'subsample': 0.5889118111053057, 'colsample_bytree': 0.7437351633534912, 'max_depth': 7, 'min_child_weight': 6, 'eta': 4.733481548194257e-07, 'gamma': 3.6435414902649324e-08, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.9874196666666667.[0m
[32m[I 2021-11-30 14:21:51,116][0m Trial 2 finished with value: 0.9757910000000001 and parameters: {'booster': 'g

Number of finished trials:  20
Best trial:
  Value: 0.9940720000000001
  Params: 
    booster: gblinear
    lambda: 0.0011005849296659629
    alpha: 1.3046105707593157e-06
    subsample: 0.47971283015241317
    colsample_bytree: 0.8283458258848093
  Number of estimators: 331
