In [1]:
import csv

import xgboost as xgb

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import matplotlib as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import GridSearchCV

Importing and processing the data

In [2]:
file_path = './data./Data Set for Chapter - Sheet1.csv'
df = pd.read_csv(file_path)
df.dropna(inplace=True)

Splitting the data into train, test and cross validation sets

In [3]:
x = df.loc[:,'Age':'Testing']
y = df['STD Status']

scaler = StandardScaler()
x = scaler.fit_transform(x)

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=41) # 80% training and 20% test

Creating the XGBoost model

In [4]:
xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eta=0.1,
    max_depth=5,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.9
)

# Create classification matrices
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical=True)
param_grid = { 'eta': [0.01, 0.1, 0.2,0.5],
               'max_depth': [3, 5, 7, 9, 11, 15, 20],
               'min_child_weight': [1,3,5,7,10],
               'subsample': [0.7,0.8, 0.9],
               'colsample_bytree': [0.7,0.8, 0.9]
             }

Performing RandomSearch with initial parameters

In [5]:
random_search_xgboost_model = RandomizedSearchCV(
    xgb_clf, param_grid, cv=5, scoring='neg_log_loss', n_iter=100, verbose=1
)
# Fit the model on the training data
random_search_xgboost_model.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", random_search_xgboost_model.best_params_)

# Get the best model
best_random_search_xgb_model = random_search_xgboost_model.best_estimator_

# Evaluate the model with additional metrics
random_predictions = best_random_search_xgb_model.predict(X_test)
random_search_accuracy = accuracy_score(y_test, random_predictions)
print("Accuracy:", random_search_accuracy)
print("Precision:", precision_score(y_test, random_predictions))
print("Recall:", recall_score(y_test, random_predictions))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Hyperparameters: {'subsample': 0.9, 'min_child_weight': 7, 'max_depth': 7, 'eta': 0.2, 'colsample_bytree': 0.9}
Accuracy: 0.8807339449541285
Precision: 0.9615384615384616
Recall: 0.819672131147541


Using the parameters from Random Search

In [6]:
best_params = random_search_xgboost_model.best_params_

# Re-run XGBoost CV with early stopping and additional metrics
xgb_random_search_tuned_results = xgb.cv(
    best_params, dtrain_clf,
    num_boost_round=1000,
    nfold=3,
    early_stopping_rounds=10,
    metrics=["auc", "error", "logloss"],
    verbose_eval=True
)

# Display tuned results
print("Tuned Results:")
print(xgb_random_search_tuned_results.iloc[-1])

[0]	train-auc:0.72514+0.00645	train-error:0.35862+0.01490	train-logloss:0.66306+0.00287	test-auc:0.51326+0.05050	test-error:0.48276+0.03135	test-logloss:0.69259+0.00689
[1]	train-auc:0.76152+0.00781	train-error:0.31264+0.00586	train-logloss:0.63773+0.00614	test-auc:0.55288+0.01227	test-error:0.48966+0.01126	test-logloss:0.69009+0.00518
[2]	train-auc:0.77759+0.01206	train-error:0.29310+0.01971	train-logloss:0.61486+0.00935	test-auc:0.53939+0.00610	test-error:0.49195+0.02132	test-logloss:0.69816+0.00302
[3]	train-auc:0.94325+0.00692	train-error:0.13678+0.01697	train-logloss:0.50998+0.00696	test-auc:0.81219+0.01258	test-error:0.27586+0.03942	test-logloss:0.58982+0.00305
[4]	train-auc:0.96544+0.00668	train-error:0.10000+0.01490	train-logloss:0.43640+0.00868	test-auc:0.89330+0.00197	test-error:0.20230+0.01810	test-logloss:0.51617+0.00419
[5]	train-auc:0.96864+0.00460	train-error:0.09080+0.01138	train-logloss:0.38944+0.01136	test-auc:0.90844+0.00846	test-error:0.17471+0.01417	test-logloss:0.

Performing GridSearch with default parameters

In [7]:
# Create the grid search object
grid_search_xgboost_model = GridSearchCV(
    xgb_clf, param_grid, cv=5, scoring='neg_log_loss', verbose=1
)

# Fit the model on the training data
grid_search_xgboost_model.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", grid_search_xgboost_model.best_params_)

# Get the best model
best_grid_search_xgb_model = grid_search_xgboost_model.best_estimator_

# Evaluate the model with additional metrics
grid_predictions = best_grid_search_xgb_model.predict(X_test)
grid_search_accuracy =  accuracy_score(y_test, grid_predictions)
print("Accuracy:", grid_search_accuracy)
print("Precision:", precision_score(y_test, grid_predictions))
print("Recall:", recall_score(y_test, grid_predictions))


Fitting 5 folds for each of 1260 candidates, totalling 6300 fits
Best Hyperparameters: {'colsample_bytree': 0.7, 'eta': 0.1, 'max_depth': 7, 'min_child_weight': 5, 'subsample': 0.8}
Accuracy: 0.8807339449541285
Precision: 0.9444444444444444
Recall: 0.8360655737704918


Using parameters from Grid Search

In [8]:
# Get the best parameters from  grid search
best_params = grid_search_xgboost_model.best_params_

# Re-run XGBoost CV with early stopping and additional metrics
xgb_grid_search_tuned_results = xgb.cv(
    best_params, dtrain_clf,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=10,
    metrics=["logloss", "auc", "error"],
    verbose_eval=True
)

# Display tuned results
print("Tuned Results:")
print(xgb_grid_search_tuned_results.iloc[-1])


[0]	train-logloss:0.68058+0.00252	train-auc:0.67711+0.01532	train-error:0.40862+0.01111	test-logloss:0.68911+0.00693	test-auc:0.56456+0.04432	test-error:0.44138+0.03379
[1]	train-logloss:0.66612+0.00455	train-auc:0.74600+0.01832	train-error:0.35575+0.02680	test-logloss:0.69008+0.00672	test-auc:0.54258+0.03280	test-error:0.46437+0.04214
[2]	train-logloss:0.65274+0.00547	train-auc:0.76781+0.01322	train-error:0.32816+0.02704	test-logloss:0.69090+0.00894	test-auc:0.53665+0.02158	test-error:0.46897+0.05603
[3]	train-logloss:0.59449+0.00532	train-auc:0.94550+0.00444	train-error:0.16264+0.01114	test-logloss:0.63192+0.01011	test-auc:0.82091+0.03829	test-error:0.32184+0.05292
[4]	train-logloss:0.58737+0.00619	train-auc:0.93512+0.00553	train-error:0.17759+0.01303	test-logloss:0.63114+0.01188	test-auc:0.79596+0.04940	test-error:0.33563+0.05603
[5]	train-logloss:0.54530+0.00771	train-auc:0.96314+0.00326	train-error:0.11724+0.01514	test-logloss:0.59237+0.01525	test-auc:0.87449+0.03842	test-error:0.

In [9]:
'''
try:
    with open('data.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Write data to the CSV file
        writer.writerow(['xgboost_grid_search', accuracy_score(y_test, grid_predictions) ])
        writer.writerow(['xgboost_random_search', accuracy_score(y_test, random_predictions)])

except Exception as e:
    print(f"An error occurred: {e}")

# Close the CSV file
csvfile.close()'''

'\ntry:\n    with open(\'data.csv\', \'a\', newline=\'\') as csvfile:\n        writer = csv.writer(csvfile)\n        # Write data to the CSV file\n        writer.writerow([\'xgboost_grid_search\', accuracy_score(y_test, grid_predictions) ])\n        writer.writerow([\'xgboost_random_search\', accuracy_score(y_test, random_predictions)])\n\nexcept Exception as e:\n    print(f"An error occurred: {e}")\n\n# Close the CSV file\ncsvfile.close()'

In [10]:
if grid_search_accuracy > random_search_accuracy:
    print("GRID SEARCH")
    y_pred = best_grid_search_xgb_model.predict(X_test)
else:
    print("RANDOM SEARCH")
    y_pred = best_random_search_xgb_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(report)
print(conf_matrix)

RANDOM SEARCH
              precision    recall  f1-score   support

           0       0.81      0.96      0.88        48
           1       0.96      0.82      0.88        61

    accuracy                           0.88       109
   macro avg       0.88      0.89      0.88       109
weighted avg       0.89      0.88      0.88       109

[[46  2]
 [11 50]]


In [11]:
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: ", accuracy)

accuracy:  0.8807339449541285
