# XGB Hyperparameter tuning

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score,StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report,cohen_kappa_score,f1_score,matthews_corrcoef
from sklearn.preprocessing import LabelEncoder 
import optuna
from optuna import Trial, visualization
import plotly
from optuna.samplers import TPESampler
from sklearn.utils.class_weight import compute_sample_weight

In [14]:
# import data
concat_df = pd.read_csv('/workspace/data/SGU/SFSI/SFSI/MASTER_TRAIN.csv', sep=',', decimal='.')
concat_df = concat_df.loc[:, ~concat_df.columns.isin(['Unnamed: 0'])]
concat_df.columns

Index(['DEM', 'EAS1ha', 'EAS10ha', 'DI2m', 'CVA', 'SDFS', 'DFME', 'Rugged',
       'SoilMap', 'HKDepth', 'LandAge', 'MSRM', 'MED', 'TWI20', 'SLOPE20',
       'SLOPE50', 'RELTOPOPOS', 'MAXCURV20', 'MAXCURV50', 'MINICURV20',
       'MED20m', 'Directiona', 'MED50m', 'DI20_2m', 'CVA20', 'CVA50',
       'GENERAL_TX', 'NMD', 'SoilDepth', 'ANVAD20', 'NDVI', 'Geomorphon',
       'ProfileCur'],
      dtype='object')

In [23]:
# calculate ratio between test size and whole dataset size
test_df = pd.read_csv('/workspace/data/SGU/SFSI/SFSI/MASTER_TEST.csv', sep=',', decimal='.')
len(test_df)/(len(test_df)+len(concat_df))

0.11820452940857003

In [24]:
len(test_df)

4353

In [16]:
le = LabelEncoder()
x = concat_df.loc[:,[c for c in concat_df.columns if c not in ['GENERAL_TX']]]
y = le.fit_transform(concat_df.loc[:,"GENERAL_TX"])

In [18]:
#define objective for bayesian optimization 
def objective(trial):

    # Define the hyperparameters to optimize
    params = {
        'objective': 'multi:softmax',
        'eval_metric': 'mlogloss',
        'num_class': 7,
        'tree_method': 'gpu_hist',
        'subsample': 1, 
         # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 0., 50.0),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 0., 10.0),
        # defines booster
        "booster": trial.suggest_categorical("booster", ["gbtree","dart"]),
        # maximum depth of the tree, signifies complexity of the tree.
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'eta': trial.suggest_float('eta', 0.001, 0.3),
        # defines how selective algorithm is.
        'gamma': trial.suggest_float('gamma', 0,10 ),
         # sampling according to each tree.
        'colsample_bytree':trial.suggest_float('colsample_bytree',0.4,0.9),
        # sampling ratio for training data.
        #'subsample':trial.suggest_float('subsample',0.4,1),
        # minimum child weight, larger the term more conservative the tree.
        "min_child_weight": trial.suggest_int("min_child_weight", 0, 10)}
    if params["booster"] == "dart":
        params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=False)
        params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=False)

    
    #Perform stratified k-fold cross-validation on the XGBoost model
    scv = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)
    score_mcc = []
    #score_f1 = []
    #score_cohenskappa = []
    for train_idx, val_idx in scv.split(x, y): # x, y defined in the previous step
        X_train, y_train = x.iloc[train_idx], y[train_idx]
        X_val, y_val = x.iloc[val_idx], y[val_idx]
        le = LabelEncoder()
        y_train = le.fit_transform(y_train)
        y_val = le.fit_transform(y_val)
        # Compute sample weight
        #weights_y_train = compute_sample_weight('balanced', y_train)
        #weights_y_val = compute_sample_weight('balanced', y_val)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
    # Train the XGBoost model
        model = xgb.train(params, dtrain, early_stopping_rounds=10, evals=[(dval, 'eval')], verbose_eval=True)

    # Make predictions on the testing set
        y_val_pred = model.predict(dval)

    # Calculate the evaluation metrics
        #cohen_kappa_score_test = cohen_kappa_score(y_val, y_val_pred)
        #f1_score_test = f1_score(y_val, y_val_pred, average='macro')
        #matthews_corrcoef_test = matthews_corrcoef(y_val, y_val_pred)
        score_mcc.append(matthews_corrcoef(y_val, y_val_pred))
        #score_f1.append(f1_score(y_val, y_val_pred, average='macro'))
        #score_cohenskappa.append(cohen_kappa_score(y_val, y_val_pred))

    return np.mean(score_mcc)#, np.mean(score_f1), np.mean(score_cohenskappa)

In [19]:
# calling the optuna study
study = optuna.create_study(direction='maximize',sampler=TPESampler())# a specific algorithm within bayesian framework, tree structure
study.optimize(objective, n_trials= 250, show_progress_bar = True)

[I 2023-11-24 02:57:16,208] A new study created in memory with name: no-name-df9157ec-6a3b-4e29-8e98-c43055270da1


  0%|          | 0/250 [00:00<?, ?it/s]

[0]	eval-mlogloss:1.48072
[1]	eval-mlogloss:1.25081
[2]	eval-mlogloss:1.09513
[3]	eval-mlogloss:0.99669
[4]	eval-mlogloss:0.91819
[5]	eval-mlogloss:0.86468
[6]	eval-mlogloss:0.81243
[7]	eval-mlogloss:0.75294
[8]	eval-mlogloss:0.72475
[9]	eval-mlogloss:0.70514
[0]	eval-mlogloss:1.48323
[1]	eval-mlogloss:1.25542
[2]	eval-mlogloss:1.10106
[3]	eval-mlogloss:1.00487
[4]	eval-mlogloss:0.92830
[5]	eval-mlogloss:0.87607
[6]	eval-mlogloss:0.82507
[7]	eval-mlogloss:0.76838
[8]	eval-mlogloss:0.74039
[9]	eval-mlogloss:0.72132
[0]	eval-mlogloss:1.47737
[1]	eval-mlogloss:1.24811
[2]	eval-mlogloss:1.09341
[3]	eval-mlogloss:0.99528
[4]	eval-mlogloss:0.91761
[5]	eval-mlogloss:0.86394
[6]	eval-mlogloss:0.81217
[7]	eval-mlogloss:0.75402
[8]	eval-mlogloss:0.72568
[9]	eval-mlogloss:0.70652
[0]	eval-mlogloss:1.47977
[1]	eval-mlogloss:1.25134
[2]	eval-mlogloss:1.09828
[3]	eval-mlogloss:1.00169
[4]	eval-mlogloss:0.92539
[5]	eval-mlogloss:0.87362
[6]	eval-mlogloss:0.82227
[7]	eval-mlogloss:0.76630
[8]	eval-mlo

In [20]:
print('Best params: ', study.best_params)
print('Best score: ', study.best_value)

Best params:  {'lambda': 3.6797431147409454, 'alpha': 0.16377015271779502, 'booster': 'gbtree', 'max_depth': 11, 'eta': 0.24755202760689674, 'gamma': 1.1552990227831699, 'colsample_bytree': 0.8356557257338416, 'min_child_weight': 5}
Best score:  0.7169627599759515
