In [1]:
import optuna
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import catboost as cb
import lightgbm as lgbm
from data.process import *
from functools import partial

# use different seeds from tuning
SEED = 42
STUDY_NAME = 'ensemble_study_4_roc'
STORAGE = 'sqlite:///ensemble.db'


2024-07-30 21:12:39.139740: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-30 21:12:39.147793: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 21:12:39.158702: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 21:12:39.158724: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-30 21:12:39.166091: I tensorflow/core/platform/cpu_feature_gua

In [2]:
xgboost_model = xgb.Booster(model_file='xgboost_model_new.json')
lgbm_model = lgbm.Booster(model_file='lgbm_model_save')
catboost_model = cb.CatBoostClassifier()
catboost_model.load_model('catboost_model_categorical.cbm')

<catboost.core.CatBoostClassifier at 0x7f9f903b3e80>

In [3]:
_, val_xgb, _ = retrieve_train_dev_test_as_category_for_xgboost()
_, val_cb, _ = retrieve_train_dev_test_for_catboost()

X_val_xgb, Y_val = split_input_output(val_xgb)
X_val_cb, _  = split_input_output(val_cb)

In [4]:
# X_val_xgb, Y_val = retrieve_validation_dataset_for_xgboost()
# X_val_cb, _ = retrieve_validation_dataset_for_catboost()


In [5]:
%%time
preds1 = xgboost_model.predict(xgb.DMatrix(X_val_xgb, enable_categorical=True), iteration_range=(0, xgboost_model.best_iteration + 1))
preds2 = lgbm_model.predict(X_val_xgb, num_iteration=lgbm_model.best_iteration)
preds3 = catboost_model.predict_proba(X_val_cb, ntree_end=catboost_model.best_iteration_)[:, 1]

CPU times: user 50.9 s, sys: 141 ms, total: 51 s
Wall time: 2.55 s


In [13]:
def objective(trial: optuna.Trial, y_val):
    # Suggest weights using Dirichlet distribution to ensure they sum up to 1
    # https://optuna.readthedocs.io/en/stable/faq.html#how-do-i-suggest-variables-which-represent-the-proportion-that-is-are-in-accordance-with-dirichlet-distribution
    weight1 = -np.log(trial.suggest_float("xgboost", 0, 1))
    weight2 = -np.log(trial.suggest_float("lightgbm", 0, 1))
    weight3 = -np.log(trial.suggest_float("catboost", 0, 1))

    total_weight = weight1 + weight2 + weight3
    trial.set_user_attr(f'p_xgboost', weight1 / total_weight)
    trial.set_user_attr(f'p_lgbm', weight2 / total_weight)
    trial.set_user_attr(f'p_catboost', weight3/ total_weight)


    # Normalize weights
    weight1 /= total_weight
    weight2 /= total_weight
    weight3 /= total_weight


    weighted_preds = (weight1 * preds1 + weight2 * preds2 + weight3 * preds3)
    
    return roc_auc_score(y_val, weighted_preds)

    # # Stratified K-Fold Cross Validation
    # Suggest weights for each model
    # weight1 = trial.suggest_float("weight1", 0.0, 1.0)
    # weight2 = trial.suggest_float("weight2", 0.0, 1.0)
    # weight3 = trial.suggest_float("weight3", 0.0, 1.0)
    # accuracies = []
    # skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
    # for train_index, test_index in skf.split(xgb_x_val, y_val):
    #     xgb_x_test = xgb_x_val.iloc[test_index]
    #     lgbm_x_test = lgbm_x_val.iloc[test_index]
    #     catboost_x_test = catboost_x_val.iloc[test_index]
    #     y_test = y_val[test_index]

    #     # Predict probabilities with each model
    #     xgb_dmatrix = xgb.DMatrix(xgb_x_test, enable_categorical=True)
    #     preds1 = xgboost_model.predict(xgb_dmatrix, iteration_range=(0, xgboost_model.best_iteration + 1))
    #     preds2 = lgbm_model.predict(lgbm_x_test, num_iteration=lgbm_model.best_iteration)
    #     preds3 = catboost_model.predict_proba(catboost_x_test, ntree_end=catboost_model.best_iteration_)[:, 1]

    #     # Weighted average of predictions
    #     weighted_preds = (weight1 * preds1 + weight2 * preds2 + weight3 * preds3)
    #     binary_preds = (weighted_preds >= 0.5).astype(int)


    #     # Calculate accuracy
    #     accuracy = accuracy_score(y_test, binary_preds)
    #     accuracies.append(accuracy)


    # return np.mean(accuracies)

ensemble_objective = partial(
    objective, 
    y_val=Y_val
)

In [22]:
study = optuna.create_study(
    direction="maximize", 
    study_name=STUDY_NAME, 
    storage=STORAGE
)
study.optimize(ensemble_objective, n_trials=100)

[I 2024-07-30 21:21:36,022] A new study created in RDB with name: ensemble_study_4_roc
[I 2024-07-30 21:21:36,099] Trial 0 finished with value: 0.8939685441168703 and parameters: {'xgboost': 0.10677777142914457, 'lightgbm': 0.43630777740346327, 'catboost': 0.6211105268618939}. Best is trial 0 with value: 0.8939685441168703.
[I 2024-07-30 21:21:36,162] Trial 1 finished with value: 0.8932681107271783 and parameters: {'xgboost': 0.4823974232951559, 'lightgbm': 0.028494269339640277, 'catboost': 0.9077238331605784}. Best is trial 0 with value: 0.8939685441168703.
[I 2024-07-30 21:21:36,228] Trial 2 finished with value: 0.8937484598919943 and parameters: {'xgboost': 0.3297178496548091, 'lightgbm': 0.39977754817114286, 'catboost': 0.8458729381866804}. Best is trial 0 with value: 0.8939685441168703.
[I 2024-07-30 21:21:36,287] Trial 3 finished with value: 0.8952865584726781 and parameters: {'xgboost': 0.6073551537655203, 'lightgbm': 0.22108459562477545, 'catboost': 0.27208547956068885}. Best i

In [23]:
best_weights = study.best_params
print("Best weights:", best_weights)

Best weights: {'xgboost': 0.6445001462442643, 'lightgbm': 0.5574661318751899, 'catboost': 0.16980027254283867}


In [24]:
xgboost = -np.log(best_weights['xgboost'])
lightgbm = -np.log(best_weights['lightgbm'])
catboost = -np.log(best_weights['catboost'])

total = xgboost + lightgbm + catboost
xgboost /= total
lightgbm /= total
catboost /= total

print(xgboost, lightgbm, catboost)

0.15706720011588562 0.20893900120558095 0.6339937986785334


In [25]:
print("Best Score:", study.best_value)

Best Score: 0.8956252720145385


In [26]:
# study = optuna.load_study(study_name=STUDY_NAME, storage=STORAGE)

In [27]:

optuna.visualization.plot_param_importances(study)

In [28]:

optuna.visualization.plot_slice(study)