In [1]:
import xgboost as xgb

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import matplotlib as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import GridSearchCV

Importing and processing the data

In [2]:
file_path = './data./Data Set for Chapter - Sheet1.csv'
df = pd.read_csv(file_path)
df.dropna(inplace=True)

Splitting the data into train, test and cross validation sets

In [3]:
x = df.loc[:,'Age':'Testing']
y = df['STD Status']

scaler = StandardScaler()
x = scaler.fit_transform(x)

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=41) # 70% training and 30% test
X_cv,y_cv = X_test[:int(len(X_test)/2)],y_test[:int(len(y_test)/2)]
X_test,y_test = X_test[int(len(X_test)/2):],y_test[int(len(y_test)/2):]

print("length of training set: ", len(X_train))
print("length of test set: ", len(X_test))
print("length of cv set: ", len(X_cv))

length of training set:  380
length of test set:  82
length of cv set:  82


Creating the XGBoost model

In [4]:
xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eta=0.1,
    max_depth=5,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.9
)

# Create classification matrices
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical=True)
param_grid = { 'eta': [0.01, 0.1, 0.2,0.5],
               'max_depth': [3, 5, 7, 9, 11, 15, 20],
               'min_child_weight': [1, 5, 10],
               'subsample': [0.8, 0.9],
               'colsample_bytree': [0.8, 0.9]
             }

Performing RandomSearch with initial parameters

In [5]:
random_search_xgboost_model = RandomizedSearchCV(
    xgb_clf, param_grid, cv=5, scoring='neg_log_loss', n_iter=100, verbose=1
)
# Fit the model on the training data
random_search_xgboost_model.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", random_search_xgboost_model.best_params_)

# Get the best model
best_random_search_xgb_model = random_search_xgboost_model.best_estimator_

# Evaluate the model with additional metrics
predictions = best_random_search_xgb_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Precision:", precision_score(y_test, predictions))
print("Recall:", recall_score(y_test, predictions))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Hyperparameters: {'subsample': 0.9, 'min_child_weight': 5, 'max_depth': 15, 'eta': 0.1, 'colsample_bytree': 0.8}
Accuracy: 0.8536585365853658
Precision: 0.9285714285714286
Recall: 0.8125


Using the parameters from Random Search

In [6]:
best_params = random_search_xgboost_model.best_params_

# Re-run XGBoost CV with early stopping and additional metrics
xgb_random_search_tuned_results = xgb.cv(
    best_params, dtrain_clf,
    num_boost_round=1000,
    nfold=3,
    early_stopping_rounds=10,
    metrics=["auc", "error", "logloss"],
    verbose_eval=True
)

# Display tuned results
print("Tuned Results:")
print(xgb_random_search_tuned_results.iloc[-1])

[0]	train-auc:0.76212+0.01928	train-error:0.40007+0.04662	train-logloss:0.67105+0.00371	test-auc:0.58830+0.04077	test-error:0.50000+0.00321	test-logloss:0.69273+0.00457
[1]	train-auc:0.78715+0.02002	train-error:0.31188+0.03146	train-logloss:0.65344+0.00493	test-auc:0.59133+0.02873	test-error:0.44205+0.03265	test-logloss:0.68837+0.00569
[2]	train-auc:0.79447+0.01780	train-error:0.29873+0.02466	train-logloss:0.63762+0.00516	test-auc:0.60873+0.00980	test-error:0.45257+0.02569	test-logloss:0.68234+0.00429
[3]	train-auc:0.94232+0.00647	train-error:0.16583+0.02040	train-logloss:0.57826+0.00634	test-auc:0.81835+0.01569	test-error:0.29465+0.02508	test-logloss:0.62571+0.00281
[4]	train-auc:0.96451+0.00377	train-error:0.09474+0.00337	train-logloss:0.52875+0.00656	test-auc:0.88246+0.01312	test-error:0.19993+0.01903	test-logloss:0.57801+0.00115
[5]	train-auc:0.97356+0.00391	train-error:0.08288+0.00631	train-logloss:0.49196+0.00518	test-auc:0.89656+0.00869	test-error:0.16315+0.00715	test-logloss:0.

Performing GridSearch with default parameters

In [7]:
# Create the grid search object
grid_search_xgboost_model = GridSearchCV(
    xgb_clf, param_grid, cv=5, scoring='neg_log_loss', verbose=1
)

# Fit the model on the training data
grid_search_xgboost_model.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", grid_search_xgboost_model.best_params_)

# Get the best model
best_grid_search_xgb_model = grid_search_xgboost_model.best_estimator_

# Evaluate the model with additional metrics
predictions = best_grid_search_xgb_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Precision:", precision_score(y_test, predictions))
print("Recall:", recall_score(y_test, predictions))


Fitting 5 folds for each of 336 candidates, totalling 1680 fits
Best Hyperparameters: {'colsample_bytree': 0.9, 'eta': 0.1, 'max_depth': 7, 'min_child_weight': 5, 'subsample': 0.9}
Accuracy: 0.8536585365853658
Precision: 0.9285714285714286
Recall: 0.8125


Using parameters from Grid Search

In [8]:
# Get the best parameters from  grid search
best_params = grid_search_xgboost_model.best_params_

# Re-run XGBoost CV with early stopping and additional metrics
xgb_grid_search_tuned_results = xgb.cv(
    best_params, dtrain_clf,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=10,
    metrics=["logloss", "auc", "error"],
    verbose_eval=True
)

# Display tuned results
print("Tuned Results:")
print(xgb_grid_search_tuned_results.iloc[-1])


[0]	train-logloss:0.67177+0.00306	train-auc:0.75574+0.02425	train-error:0.36645+0.03862	test-logloss:0.68953+0.00295	test-auc:0.56633+0.06034	test-error:0.48421+0.00985
[1]	train-logloss:0.65553+0.00397	train-auc:0.78045+0.01764	train-error:0.31711+0.01509	test-logloss:0.68727+0.00555	test-auc:0.57820+0.05632	test-error:0.50000+0.03627
[2]	train-logloss:0.64100+0.00583	train-auc:0.79210+0.01935	train-error:0.30132+0.01914	test-logloss:0.68347+0.00773	test-auc:0.58582+0.04640	test-error:0.47895+0.03777
[3]	train-logloss:0.58130+0.00582	train-auc:0.94444+0.00787	train-error:0.14408+0.01492	test-logloss:0.62658+0.00645	test-auc:0.81905+0.04534	test-error:0.29211+0.04588
[4]	train-logloss:0.53154+0.00595	train-auc:0.96508+0.00702	train-error:0.09671+0.01224	test-logloss:0.57795+0.00710	test-auc:0.88623+0.03453	test-error:0.17632+0.03287
[5]	train-logloss:0.49479+0.00482	train-auc:0.97278+0.00522	train-error:0.08355+0.00767	test-logloss:0.54399+0.00769	test-auc:0.90888+0.02793	test-error:0.