In [1]:
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import matplotlib as plt
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
file_path = './data./Data Set for Chapter - Sheet1.csv'
df = pd.read_csv(file_path)
df.dropna(inplace=True)

x = df.loc[:,'Age':'Testing']
y = df['STD Status']

scaler = StandardScaler()
x = scaler.fit_transform(x)

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=41) # 70% training and 30% test


In [3]:
param_grid = {
    'eta': [0.01, 0.1, 0.2,0.5],
    'max_depth': [3, 5, 7, 9, 11, 15, 20],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9]
}
# Create an XGBoost classifier
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', tree_method='hist', eval_metric=["logloss", "auc", "error"])

# Perform grid search
grid_search = GridSearchCV(xgb_clf, param_grid, cv=5, scoring='neg_log_loss', verbose=1)
grid_search.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_xgb_model = grid_search.best_estimator_

# Convert the data to XGBoost DMatrix
dtrain_clf = xgb.DMatrix(X_train, label=y_train)

# Number of boosting rounds
n_boosting_rounds = 1000

# Re-run cross-validation with tuned hyperparameters
results_tuned = xgb.cv(
    best_xgb_model.get_params(), dtrain_clf,
    num_boost_round=n_boosting_rounds,
    nfold=5,
    early_stopping_rounds=10,
    metrics=["logloss", "auc", "error"],
    verbose_eval=True
)

# Display tuned results
print("Tuned Results:")
print(results_tuned.iloc[-1])

Fitting 5 folds for each of 336 candidates, totalling 1680 fits
Best Hyperparameters: {'colsample_bytree': 0.9, 'eta': 0.1, 'max_depth': 7, 'min_child_weight': 5, 'subsample': 0.9}
[0]	train-logloss:0.68311+0.00194	train-auc:0.66468+0.01876	train-error:0.44276+0.02917	test-logloss:0.68938+0.00433	test-auc:0.61051+0.08479	test-error:0.46053+0.07012
[1]	train-logloss:0.67542+0.00284	train-auc:0.69199+0.01488	train-error:0.38882+0.02627	test-logloss:0.68760+0.00545	test-auc:0.58474+0.04309	test-error:0.45263+0.03956
[2]	train-logloss:0.66859+0.00329	train-auc:0.70038+0.00660	train-error:0.38224+0.02602	test-logloss:0.68510+0.00804	test-auc:0.58616+0.06813	test-error:0.45789+0.03939
[3]	train-logloss:0.61107+0.00286	train-auc:0.92454+0.00797	train-error:0.19013+0.03618	test-logloss:0.62700+0.00414	test-auc:0.90289+0.04243	test-error:0.22895+0.06637
[4]	train-logloss:0.56342+0.00350	train-auc:0.94350+0.00498	train-error:0.10987+0.00738	test-logloss:0.57843+0.00364	test-auc:0.92015+0.03014	t

Parameters: { "enable_categorical", "missing" } are not used.



In [5]:
# Create classification matrices
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [6]:
params = {"objective": "binary:logistic", "tree_method": "hist"}
n = 1000

results = xgb.cv(
   params, dtrain_clf,
   num_boost_round=n,
   nfold=5,
   metrics=["logloss", "auc", "error"],
)

In [7]:
results.keys()

Index(['train-logloss-mean', 'train-logloss-std', 'train-auc-mean',
       'train-auc-std', 'train-error-mean', 'train-error-std',
       'test-logloss-mean', 'test-logloss-std', 'test-auc-mean',
       'test-auc-std', 'test-error-mean', 'test-error-std'],
      dtype='object')

In [8]:
test_logloss_mean = results['test-logloss-mean'].min()  # Minimize logloss
test_auc_mean = results['test-auc-mean'].max()  # Maximize AUC
test_error_mean = results['test-error-mean'].min()  # Minimize classification error

# Print the results
print(f"Test Logloss: {test_logloss_mean}")
print(f"Test AUC: {test_auc_mean}")
print(f"Test Error: {test_error_mean}")

Test Logloss: 0.3299284270621444
Test AUC: 0.9242725993244194
Test Error: 0.1236842105263158
