In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn import metrics
import optuna
from datetime import datetime, timedelta
import humanize

In [2]:
pd.set_option('display.float_format', '{:.2f}'.format)
random_state = 42

In [3]:
# Set default font
import matplotlib.font_manager as fm

font_path = '/usr/share/fonts/noto_sans_mono/NotoSansMono_SemiCondensed-SemiBold.ttf'
font_prop = fm.FontProperties(fname=font_path)

mpl.rcParams['font.family'] = font_prop.get_name()
plt.rcParams["font.weight"] = 'semibold'

bold = 'extra bold'

sns.set(font=font_prop.get_name(), style='darkgrid')

## Model

We are going to use the following train-test split:
- 60% training set to train the model
- 20% validation set to tune the hyperparameters
- 20% test set to evaluate the model after tuning

In [4]:
train_df = pd.read_csv('data/train.csv')

In [5]:
X = train_df.drop('default', axis=1)
y = train_df['default']

X_train_val, X_test, y_train_val, y_test = train_test_split(
        X,
        y,
        test_size=0.20,
        stratify=y,
        random_state=random_state
        )

X_train, X_val, y_train, y_val = train_test_split(
        X_train_val,
        y_train_val,
        test_size=0.25,
        stratify=y_train_val,
        random_state=random_state
        )

In [20]:
def run_study(objective, n_trials=50, direction='maximize'):
    started_at = datetime.now()

    study = optuna.create_study(direction=direction)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    ended_at = datetime.now()

    duration_in_seconds = (ended_at - started_at).total_seconds()

    print("Best parameters:", study.best_params)
    print("Best score:", study.best_value)
    print("Time taken:", humanize.naturaldelta(timedelta(seconds=duration_in_seconds)))

    return study, duration_in_seconds

In [21]:
def trial_evaluation_metric(y_true, y_pred):
    # return metrics.fbeta_score(y_true, y_pred, beta=10)
    return metrics.recall_score(y_true, y_pred)

In [34]:
def objective_dt(trial):
    params = {
            'max_depth': trial.suggest_int("max_depth", 1, 20),
            'min_samples_split': trial.suggest_int("min_samples_split", 2, 20),
            'min_samples_leaf': trial.suggest_int("min_samples_leaf", 1, 20),
            'criterion': trial.suggest_categorical("criterion", ["gini", "entropy"]),
            'random_state': random_state
            }

    model = DecisionTreeClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    return trial_evaluation_metric(y_val, y_pred)

In [39]:
def objective_rf(trial):
    params = {
            'n_estimators': trial.suggest_int("n_estimators", 10, 300),
            'max_depth': trial.suggest_int("max_depth", 1, 30),
            'min_samples_split': trial.suggest_int("min_samples_split", 2, 20),
            'min_samples_leaf': trial.suggest_int("min_samples_leaf", 1, 20),
            'max_features': trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
            'criterion': trial.suggest_categorical("criterion", ["gini", "entropy"]),
            'random_state': random_state
            }


    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    return trial_evaluation_metric(y_val, y_pred)

In [40]:
def objective_xgb(trial):
    params = {
            'n_estimators': trial.suggest_int("n_estimators", 50, 500),
            'max_depth': trial.suggest_int("max_depth", 3, 15),
            'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            'subsample': trial.suggest_float("subsample", 0.5, 1.0),
            'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0),
            'gamma': trial.suggest_float("gamma", 0, 5),
            'reg_alpha': trial.suggest_float("reg_alpha", 0, 10),
            'reg_lambda': trial.suggest_float("reg_lambda", 0, 10),
            'random_state': random_state,
            }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    return trial_evaluation_metric(y_val, y_pred)

In [None]:
dt_study, dt_execution_seconds = run_study(objective_dt, n_trials=5)

In [41]:
rf_study, rf_execution_seconds = run_study(objective_rf, n_trials=5)

[I 2025-03-01 23:31:42,852] A new study created in memory with name: no-name-42e6daa3-67cc-4394-a137-42ca45e48491


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-03-01 23:32:13,911] Trial 0 finished with value: 0.010091211191782301 and parameters: {'max_depth': 12, 'n_estimators': 52, 'min_samples_split': 13, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.010091211191782301.
[I 2025-03-01 23:33:19,295] Trial 1 finished with value: 0.0026244281990759474 and parameters: {'max_depth': 16, 'n_estimators': 131, 'min_samples_split': 13, 'min_samples_leaf': 6, 'max_features': 'log2'}. Best is trial 0 with value: 0.010091211191782301.
[I 2025-03-01 23:34:01,324] Trial 2 finished with value: 0.025404500643816743 and parameters: {'max_depth': 15, 'n_estimators': 59, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.025404500643816743.
[I 2025-03-01 23:34:06,051] Trial 3 finished with value: 0.0079690400891643 and parameters: {'max_depth': 15, 'n_estimators': 7, 'min_samples_split': 15, 'min_samples_leaf': 14, 'max_features': 'log2'}. Best is trial 2 with value: 0.

[I 2025-03-02 00:06:28,706] Trial 33 finished with value: 0.05415356295089953 and parameters: {'max_depth': 18, 'n_estimators': 6, 'min_samples_split': 13, 'min_samples_leaf': 6, 'max_features': 'sqrt'}. Best is trial 29 with value: 0.08412906833866705.
[I 2025-03-02 00:06:36,341] Trial 34 finished with value: 3.859600718573583e-05 and parameters: {'max_depth': 13, 'n_estimators': 18, 'min_samples_split': 14, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 29 with value: 0.08412906833866705.
[I 2025-03-02 00:06:48,000] Trial 35 finished with value: 0.03799188531365502 and parameters: {'max_depth': 17, 'n_estimators': 17, 'min_samples_split': 14, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 29 with value: 0.08412906833866705.
[I 2025-03-02 00:06:50,761] Trial 36 finished with value: 0.060209700829208665 and parameters: {'max_depth': 21, 'n_estimators': 2, 'min_samples_split': 15, 'min_samples_leaf': 7, 'max_features': 'log2'}. Best is trial 29 with value: 

<IPython.core.display.Javascript object>

In [42]:
xgb_study, xgb_execution_seconds = run_study(objective_xgb, n_trials=5)

[I 2025-03-02 00:09:56,896] A new study created in memory with name: no-name-a6103e47-399a-404c-b68b-f5079183779b


  0%|          | 0/50 [00:00<?, ?it/s]

  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:10:14,384] Trial 0 finished with value: 0.20597693786277993 and parameters: {'max_depth': 13, 'n_estimators': 103, 'learning_rate': 0.7487311166038166, 'subsample': 0.8, 'colsample_bytree': 0.6, 'gamma': 0.0016983268846495778, 'reg_alpha': 2.670448655640208, 'reg_lambda': 0.13121812402876015}. Best is trial 0 with value: 0.20597693786277993.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:10:45,915] Trial 1 finished with value: 0.014045656406353631 and parameters: {'max_depth': 29, 'n_estimators': 81, 'learning_rate': 0.014082397989394996, 'subsample': 1.0, 'colsample_bytree': 1.0, 'gamma': 3.1059371279356043, 'reg_alpha': 0.5400374999421421, 'reg_lambda': 0.3937881053574811}. Best is trial 0 with value: 0.20597693786277993.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:10:48,444] Trial 2 finished with value: 0.0 and parameters: {'max_depth': 2, 'n_estimators': 4, 'learning_rate': 0.002550485658296368, 'subsample': 0.9, 'colsample_bytree': 1.0, 'gamma': 0.8101174902724542, 'reg_alpha': 0.03386946727191204, 'reg_lambda': 0.0041877719882440516}. Best is trial 0 with value: 0.20597693786277993.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:11:01,882] Trial 3 finished with value: 0.13762432642926706 and parameters: {'max_depth': 14, 'n_estimators': 57, 'learning_rate': 0.3670845423585725, 'subsample': 0.6, 'colsample_bytree': 0.7, 'gamma': 0.003717000236712155, 'reg_alpha': 3.5332297759193962, 'reg_lambda': 0.004118520098258566}. Best is trial 0 with value: 0.20597693786277993.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:11:29,774] Trial 4 finished with value: 0.009628222867979735 and parameters: {'max_depth': 10, 'n_estimators': 160, 'learning_rate': 0.008806069648979242, 'subsample': 0.6, 'colsample_bytree': 0.8, 'gamma': 0.005143266272011087, 'reg_alpha': 0.2784152219180451, 'reg_lambda': 0.13672676259246658}. Best is trial 0 with value: 0.20597693786277993.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:11:38,132] Trial 5 finished with value: 0.005383656450156902 and parameters: {'max_depth': 2, 'n_estimators': 153, 'learning_rate': 0.02476418080366691, 'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.0711029483905617, 'reg_alpha': 1.9589810570887294, 'reg_lambda': 0.0016723010332360942}. Best is trial 0 with value: 0.20597693786277993.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:11:42,076] Trial 6 finished with value: 0.0 and parameters: {'max_depth': 10, 'n_estimators': 10, 'learning_rate': 0.05978502524131051, 'subsample': 0.9, 'colsample_bytree': 0.8, 'gamma': 0.008921943184137431, 'reg_alpha': 0.35461721540661817, 'reg_lambda': 0.22797192188714274}. Best is trial 0 with value: 0.20597693786277993.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:11:50,500] Trial 7 finished with value: 0.0 and parameters: {'max_depth': 27, 'n_estimators': 26, 'learning_rate': 0.008593527025076311, 'subsample': 0.7, 'colsample_bytree': 1.0, 'gamma': 1.7273539650813872, 'reg_alpha': 6.417838957529529, 'reg_lambda': 0.0913710969590753}. Best is trial 0 with value: 0.20597693786277993.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:12:26,273] Trial 8 finished with value: 0.12325083695909214 and parameters: {'max_depth': 19, 'n_estimators': 127, 'learning_rate': 0.13767422447960412, 'subsample': 0.5, 'colsample_bytree': 0.5, 'gamma': 0.005839348008507179, 'reg_alpha': 0.0010511508143653332, 'reg_lambda': 1.4741580103960081}. Best is trial 0 with value: 0.20597693786277993.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:12:54,968] Trial 9 finished with value: 0.05600299288331286 and parameters: {'max_depth': 22, 'n_estimators': 70, 'learning_rate': 0.02916422227221152, 'subsample': 0.7, 'colsample_bytree': 0.9, 'gamma': 2.9643253436289347, 'reg_alpha': 0.09354442539249969, 'reg_lambda': 0.05437670031278041}. Best is trial 0 with value: 0.20597693786277993.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:13:09,017] Trial 10 finished with value: 0.160215693346949 and parameters: {'max_depth': 11, 'n_estimators': 110, 'learning_rate': 0.7359095554925206, 'subsample': 0.8, 'colsample_bytree': 0.5, 'gamma': 0.06251805335731718, 'reg_alpha': 0.009335922100601803, 'reg_lambda': 9.977981767613482}. Best is trial 0 with value: 0.20597693786277993.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:13:22,088] Trial 11 finished with value: 0.18322709708826482 and parameters: {'max_depth': 10, 'n_estimators': 114, 'learning_rate': 0.908554192022061, 'subsample': 0.8, 'colsample_bytree': 0.5, 'gamma': 0.001032678888902333, 'reg_alpha': 0.005883764562848601, 'reg_lambda': 3.8448057420057937}. Best is trial 0 with value: 0.20597693786277993.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:13:33,057] Trial 12 finished with value: 0.0876665698210671 and parameters: {'max_depth': 7, 'n_estimators': 130, 'learning_rate': 0.25344138321764265, 'subsample': 0.8, 'colsample_bytree': 0.6, 'gamma': 0.0011098216953224253, 'reg_alpha': 0.005922330575585585, 'reg_lambda': 7.5840387784311725}. Best is trial 0 with value: 0.20597693786277993.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:14:08,822] Trial 13 finished with value: 0.22085872801226386 and parameters: {'max_depth': 18, 'n_estimators': 96, 'learning_rate': 0.8879277494709085, 'subsample': 0.8, 'colsample_bytree': 0.6, 'gamma': 0.0010992413762979565, 'reg_alpha': 0.0010196266829901135, 'reg_lambda': 1.239035488792763}. Best is trial 13 with value: 0.22085872801226386.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:14:30,791] Trial 14 finished with value: 0.10049628595956564 and parameters: {'max_depth': 18, 'n_estimators': 49, 'learning_rate': 0.1365083649054445, 'subsample': 1.0, 'colsample_bytree': 0.6, 'gamma': 0.018589612151469318, 'reg_alpha': 0.0013142456635454594, 'reg_lambda': 0.8380874546138559}. Best is trial 13 with value: 0.22085872801226386.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:15:16,673] Trial 15 finished with value: 0.18653866188502308 and parameters: {'max_depth': 23, 'n_estimators': 195, 'learning_rate': 0.3865806802044193, 'subsample': 0.7, 'colsample_bytree': 0.6, 'gamma': 0.3336357829479811, 'reg_alpha': 0.042152921289646074, 'reg_lambda': 0.02335724767225633}. Best is trial 13 with value: 0.22085872801226386.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:16:12,428] Trial 16 finished with value: 0.0 and parameters: {'max_depth': 16, 'n_estimators': 88, 'learning_rate': 0.0010151526625349768, 'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.02050195937183228, 'reg_alpha': 1.216374752223116, 'reg_lambda': 0.022573350532046052}. Best is trial 13 with value: 0.22085872801226386.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:17:07,682] Trial 17 finished with value: 0.10483608803162449 and parameters: {'max_depth': 22, 'n_estimators': 98, 'learning_rate': 0.08320848931990213, 'subsample': 0.6, 'colsample_bytree': 0.6, 'gamma': 0.0019565565000508258, 'reg_alpha': 0.019755659217521666, 'reg_lambda': 1.3686556233180178}. Best is trial 13 with value: 0.22085872801226386.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:17:49,898] Trial 18 finished with value: 0.2173805618838751 and parameters: {'max_depth': 14, 'n_estimators': 141, 'learning_rate': 0.994474026595285, 'subsample': 0.8, 'colsample_bytree': 0.6, 'gamma': 0.23240257038215123, 'reg_alpha': 9.87623237101564, 'reg_lambda': 0.48387645199987}. Best is trial 13 with value: 0.22085872801226386.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:18:00,567] Trial 19 finished with value: 0.0802780194453348 and parameters: {'max_depth': 32, 'n_estimators': 164, 'learning_rate': 0.2462641486653168, 'subsample': 0.7, 'colsample_bytree': 0.8, 'gamma': 8.982318514220456, 'reg_alpha': 0.1272900172327271, 'reg_lambda': 0.4919311775174698}. Best is trial 13 with value: 0.22085872801226386.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:18:30,391] Trial 20 finished with value: 0.11002910715004072 and parameters: {'max_depth': 7, 'n_estimators': 199, 'learning_rate': 0.519860594457249, 'subsample': 0.5, 'colsample_bytree': 0.5, 'gamma': 0.24565815237484043, 'reg_alpha': 9.525900451337362, 'reg_lambda': 3.8001852563509604}. Best is trial 13 with value: 0.22085872801226386.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[I 2025-03-02 00:19:18,383] Trial 21 finished with value: 0.2345464751028572 and parameters: {'max_depth': 16, 'n_estimators': 139, 'learning_rate': 0.9425184030676557, 'subsample': 0.8, 'colsample_bytree': 0.6, 'gamma': 0.2072523561720798, 'reg_alpha': 1.3128100713649633, 'reg_lambda': 0.20827020419544384}. Best is trial 21 with value: 0.2345464751028572.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
  'gamma': trial.suggest_loguniform('gamma', 0.001, 10),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 10),


[W 2025-03-02 00:19:31,401] Trial 22 failed with parameters: {'max_depth': 16, 'n_estimators': 139, 'learning_rate': 0.9720885328907906, 'subsample': 0.8, 'colsample_bytree': 0.7, 'gamma': 0.20830052921807787, 'reg_alpha': 1.1566095425246725, 'reg_lambda': 0.4176956016807468} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/gpoulis/anaconda3/envs/myenv/lib/python3.9/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_18404/2578691697.py", line 18, in objective_xgb
    model.fit(X_train, y_train)
  File "/home/gpoulis/anaconda3/envs/myenv/lib/python3.9/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/home/gpoulis/anaconda3/envs/myenv/lib/python3.9/site-packages/xgboost/sklearn.py", line 1599, in fit
    self._Booster = train(
  File "/home/gpoulis/anaconda3/envs/myenv/lib/python3.9/site-packages/xgboost/core.py", line 726, in 

KeyboardInterrupt: 

<IPython.core.display.Javascript object>

In [None]:
# create a df with the results
df_results = pd.DataFrame({
    'model': ['Decision Tree', 'Random Forest', 'XGBoost'],
    'evaluation_metric': [
        dt_study.best_value,
        rf_study.best_value,
        xgb_study.best_value
        ],
    'execution_seconds': [
        dt_execution_seconds,
        rf_execution_seconds,
        xgb_execution_seconds
        ]
    })

In [None]:
df_results

In [28]:
# calculate accuracy, precision, recall and f1-score
def calculate_metrics(model, X, y):
    y_pred = model.predict(X)

    accuracy = metrics.accuracy_score(y, y_pred)
    precision = metrics.precision_score(y, y_pred)
    recall = metrics.recall_score(y, y_pred)
    f1 = metrics.f1_score(y, y_pred)

    return accuracy, precision, recall, f1

In [29]:
best_dt = DecisionTreeClassifier(**dt_study.best_params)
best_rf = RandomForestClassifier(**rf_study.best_params)
best_xgb = XGBClassifier(**xgb_study.best_params)

best_dt.fit(X_train_val, y_train_val)
best_rf.fit(X_train_val, y_train_val)
best_xgb.fit(X_train_val, y_train_val)

''

''

<IPython.core.display.Javascript object>

In [30]:
# calculate metrics
dt_metrics = calculate_metrics(best_dt, X_test, y_test)
rf_metrics = calculate_metrics(best_rf, X_test, y_test)
xgb_metrics = calculate_metrics(best_xgb, X_test, y_test)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
model_metrics = np.array([dt_metrics, rf_metrics, xgb_metrics]).transpose()
model_metrics

array([[0.80099951, 0.7992659 , 0.80223451],
       [0.53648692, 0.        , 0.54807453],
       [0.06349237, 0.        , 0.08429983],
       [0.11354667, 0.        , 0.14612417]])

In [32]:
df_results['accuracy'] = model_metrics[0]
df_results['precision'] = model_metrics[1]
df_results['recall'] = model_metrics[2]
df_results['f1'] = model_metrics[3]

In [33]:
df_results

Unnamed: 0,model,evaluation_metric,execution_seconds,accuracy,precision,recall,f1
0,Decision Tree,0.05,22.15,0.8,0.54,0.06,0.11
1,Random Forest,0.0,73.84,0.8,0.0,0.0,0.0
2,XGBoost,0.08,29.23,0.8,0.55,0.08,0.15


In [None]:
def roc_curve_metrics(model, X, y):
    y_pred_proba = model.predict_proba(X)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y, y_pred_proba)
    auc = metrics.roc_auc_score(y, y_pred_proba)

    return fpr, tpr, thresholds, auc

In [None]:
# plot auc with hue as model
fig, ax = plt.subplots(figsize=(10, 6))

viz_df = pd.DataFrame(columns=['model', 'fpr', 'tpr', 'auc', 'thresholds'])

fpr, tpr, thresholds, auc = roc_curve_metrics(best_dt, X_test, y_test)
viz_df = viz_df.append({
    'model': 'Decision Tree',
    'fpr': fpr,
    'tpr': tpr,
    'auc': auc,
    'thresholds': thresholds
    }, ignore_index=True)

fpr, tpr, thresholds, auc = roc_curve_metrics(best_rf, X_test, y_test)
viz_df = viz_df.append({
    'model': 'Random Forest',
    'fpr': fpr,
    'tpr': tpr,
    'auc': auc,
    'thresholds': thresholds
    }, ignore_index=True)

fpr, tpr, thresholds, auc = roc_curve_metrics(best_xgb, X_test, y_test)
viz_df = viz_df.append({
    'model': 'XGBoost',
    'fpr': fpr,
    'tpr': tpr,
    'auc': auc,
    'thresholds': thresholds
    }, ignore_index=True)

sns.lineplot(data=viz_df, x='fpr', y='tpr', hue='model', ax=ax)

plt.title('ROC Curve')

plt.show()