# XGB Hyperparameter tuning

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score,StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report,cohen_kappa_score,f1_score,matthews_corrcoef
from sklearn.preprocessing import LabelEncoder 
import optuna
from optuna import Trial, visualization
import plotly
from optuna.samplers import TPESampler
from sklearn.utils.class_weight import compute_sample_weight

In [2]:
# import data
concat_df = pd.read_csv('/workspace/data/Akermark/concatenated_GENERALTX_cleaned.csv', sep=',', decimal='.')

In [3]:
le = LabelEncoder()
x = concat_df.loc[:,[c for c in concat_df.columns if c not in ['GENERAL_TX']]]
y = le.fit_transform(concat_df.loc[:,"GENERAL_TX"])

In [7]:
#define objective for bayesian optimization 
def objective(trial):

    # Define the hyperparameters to optimize
    params = {
        'objective': 'multi:softmax',
        'eval_metric': 'mlogloss',
        'num_class': 7,
        'tree_method': 'gpu_hist',
        'subsample': 1, 
         # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 0., 50.0),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 0., 10.0),
        # defines booster
        "booster": trial.suggest_categorical("booster", ["gbtree","dart"]),
        # maximum depth of the tree, signifies complexity of the tree.
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'eta': trial.suggest_float('eta', 0.001, 0.3),
        # defines how selective algorithm is.
        'gamma': trial.suggest_float('gamma', 0,10 ),
         # sampling according to each tree.
        'colsample_bytree':trial.suggest_float('colsample_bytree',0.4,0.9),
        # sampling ratio for training data.
        #'subsample':trial.suggest_float('subsample',0.4,1),
        # minimum child weight, larger the term more conservative the tree.
        "min_child_weight": trial.suggest_int("min_child_weight", 0, 10)}
    if params["booster"] == "dart":
        params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=False)
        params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=False)

    
    #Perform stratified k-fold cross-validation on the XGBoost model
    scv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
    score_mcc = []
    #score_f1 = []
    #score_cohenskappa = []
    for train_idx, val_idx in scv.split(x, y): # x, y defined in the previous step
        X_train, y_train = x.iloc[train_idx], y[train_idx]
        X_val, y_val = x.iloc[val_idx], y[val_idx]
        le = LabelEncoder()
        y_train = le.fit_transform(y_train)
        y_val = le.fit_transform(y_val)
        # Compute sample weight
        weights_y_train = compute_sample_weight('balanced', y_train)
        weights_y_val = compute_sample_weight('balanced', y_val)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
    # Train the XGBoost model
        model = xgb.train(params, dtrain, early_stopping_rounds=10, evals=[(dval, 'eval')], verbose_eval=True)

    # Make predictions on the testing set
        y_val_pred = model.predict(dval)

    # Calculate the evaluation metrics
        #cohen_kappa_score_test = cohen_kappa_score(y_val, y_val_pred)
        #f1_score_test = f1_score(y_val, y_val_pred, average='macro')
        #matthews_corrcoef_test = matthews_corrcoef(y_val, y_val_pred)
        score_mcc.append(matthews_corrcoef(y_val, y_val_pred, sample_weight=weights_y_val))
        #score_f1.append(f1_score(y_val, y_val_pred, average='macro'))
        #score_cohenskappa.append(cohen_kappa_score(y_val, y_val_pred))

    return np.mean(score_mcc)#, np.mean(score_f1), np.mean(score_cohenskappa)

In [None]:
# calling the optuna study
study = optuna.create_study(direction='maximize',sampler=TPESampler())# a specific algorithm within bayesian framework, tree structure
study.optimize(objective, n_trials= 500, show_progress_bar = True)

[I 2023-11-15 11:52:32,058] A new study created in memory with name: no-name-73be72e8-e824-46ab-a146-a7bc5f886e79


  0%|          | 0/500 [00:00<?, ?it/s]

[0]	eval-mlogloss:1.61175
[1]	eval-mlogloss:1.41043
[2]	eval-mlogloss:1.26079
[3]	eval-mlogloss:1.14925
[4]	eval-mlogloss:1.06319
[5]	eval-mlogloss:0.99994
[6]	eval-mlogloss:0.94707
[7]	eval-mlogloss:0.90331
[8]	eval-mlogloss:0.86952
[9]	eval-mlogloss:0.84048
[0]	eval-mlogloss:1.60769
[1]	eval-mlogloss:1.39970
[2]	eval-mlogloss:1.24764
[3]	eval-mlogloss:1.13351
[4]	eval-mlogloss:1.04688
[5]	eval-mlogloss:0.98267
[6]	eval-mlogloss:0.92897
[7]	eval-mlogloss:0.88540
[8]	eval-mlogloss:0.85211
[9]	eval-mlogloss:0.82092
[0]	eval-mlogloss:1.60655
[1]	eval-mlogloss:1.40043
[2]	eval-mlogloss:1.24990
[3]	eval-mlogloss:1.13642
[4]	eval-mlogloss:1.05056
[5]	eval-mlogloss:0.98705
[6]	eval-mlogloss:0.93370
[7]	eval-mlogloss:0.89046
[8]	eval-mlogloss:0.85753
[9]	eval-mlogloss:0.82859
[0]	eval-mlogloss:1.60995
[1]	eval-mlogloss:1.40445
[2]	eval-mlogloss:1.25604
[3]	eval-mlogloss:1.14317
[4]	eval-mlogloss:1.05648
[5]	eval-mlogloss:0.99252
[6]	eval-mlogloss:0.93936
[7]	eval-mlogloss:0.89647
[8]	eval-mlo