# XGBoost Hypertuning

*   Jacob Yousif

## Importing the libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
%%capture
!pip install imbalanced-learn xgboost optuna

In [3]:
%%capture

import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import optuna
import time

## Loading the data

In [4]:
data = np.load('Datasets/LearningSet.npz')
X_train = data['X_train']
y_train = data['y_train']
X_val = data['X_val']
y_val = data['y_val']
X_test = data['X_test']
y_test = data['y_test']

## Tuning the model

In [5]:
results = []

In [6]:
total = 0

In [7]:
def objective(trial):
    global total
    global results
    
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 20, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 25),
        'num_leaves': trial.suggest_int('num_leaves', 10, 150),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 200),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'lambda': trial.suggest_float('lambda', 0.0, 5.0),
        'alpha': trial.suggest_float('alpha', 0.0, 5.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 5.0),
    }
    
    start_time = time.time()
    clf = XGBClassifier(**params, use_label_encoder=False, eval_metric='mlogloss')
    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False, early_stopping_rounds=3)
    y_pred = clf.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    duration = time.time() - start_time

    total += duration
    
    results.append({
        'Trial': trial.number,
        'Accuracy': accuracy,
        'Duration': duration,
        'Params': params,
        'BestAccuracy': max([res['Accuracy'] for res in results], default=0)
    })
    
    return accuracy

In [8]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

In [9]:
seconds = int(total)
print('In seconds, the total duration of the hypertuning process is:', seconds, 'seconds.')

In seconds, the total duration of the hypertuning process is: 25212 seconds.


In [10]:
minutes = int(total / 60)
print('In minutes, the total duration of the hypertuning process is:', minutes, 'minutes.')

In minutes, the total duration of the hypertuning process is: 420 minutes.


In [11]:
hours = round(total / 3600, 1)
print('In hours, the total duration of the hypertuning process is:', hours, 'hours.')

In hours, the total duration of the hypertuning process is: 7.0 hours.


In [12]:
progress = pd.DataFrame(results)

In [13]:
progress

Unnamed: 0,Trial,Accuracy,Duration,Params,BestAccuracy
0,0,0.616323,40.520358,"{'n_estimators': 357, 'max_depth': 6, 'num_lea...",0.000000
1,1,0.588889,10.896805,"{'n_estimators': 49, 'max_depth': 21, 'num_lea...",0.616323
2,2,0.564954,30.501907,"{'n_estimators': 445, 'max_depth': 10, 'num_le...",0.616323
3,3,0.571788,32.141388,"{'n_estimators': 466, 'max_depth': 21, 'num_le...",0.616323
4,4,0.591015,13.711207,"{'n_estimators': 187, 'max_depth': 8, 'num_lea...",0.616323
...,...,...,...,...,...
495,495,0.704019,43.421188,"{'n_estimators': 496, 'max_depth': 15, 'num_le...",0.718994
496,496,0.705980,71.969356,"{'n_estimators': 482, 'max_depth': 15, 'num_le...",0.718994
497,497,0.693359,43.346955,"{'n_estimators': 491, 'max_depth': 16, 'num_le...",0.718994
498,498,0.709446,51.410878,"{'n_estimators': 499, 'max_depth': 16, 'num_le...",0.718994


In [14]:
csv_file_path = 'Datasets/XGBoostHypertuningProcess.csv'
progress.to_csv(csv_file_path, index=False)

In [15]:
best_params = study.best_params
print("Best Parameters:", best_params)

Best Parameters: {'n_estimators': 469, 'max_depth': 16, 'num_leaves': 121, 'learning_rate': 0.08125057989356922, 'min_child_samples': 109, 'subsample': 0.690717197948128, 'colsample_bytree': 0.8131062669085312, 'gamma': 0.005062207777113432, 'lambda': 3.5587918512172783, 'alpha': 0.26722259203238957, 'scale_pos_weight': 3.1178354797239987}


In [16]:
clf = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='mlogloss')

In [17]:
clf.fit(X_train, y_train)

In [18]:
joblib.dump(clf, 'Model/TrainedXGBoostModel.joblib')

['Model/TrainedXGBoostModel.joblib']