In [1]:
import csv

import xgboost as xgb

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import matplotlib as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import GridSearchCV

Importing and processing the data

In [2]:
file_path = './data./Data Set for Chapter - Sheet1.csv'
df = pd.read_csv(file_path)
df.dropna(inplace=True)

Splitting the data into train, test and cross validation sets

In [3]:
x = df.loc[:,'Age':'Testing']
y = df['STD Status']

scaler = StandardScaler()
x = scaler.fit_transform(x)

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=41) # 70% training and 30% test
X_cv,y_cv = X_test[:int(len(X_test)/2)],y_test[:int(len(y_test)/2)]
X_test,y_test = X_test[int(len(X_test)/2):],y_test[int(len(y_test)/2):]

print("length of training set: ", len(X_train))
print("length of test set: ", len(X_test))
print("length of cv set: ", len(X_cv))

length of training set:  380
length of test set:  82
length of cv set:  82


Creating the XGBoost model

In [4]:
xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eta=0.1,
    max_depth=5,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.9
)

# Create classification matrices
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical=True)
param_grid = { 'eta': [0.01, 0.1, 0.2,0.5],
               'max_depth': [3, 5, 7, 9, 11, 15, 20],
               'min_child_weight': [1,3,5,7,10],
               'subsample': [0.7,0.8, 0.9],
               'colsample_bytree': [0.7,0.8, 0.9]
             }

Performing RandomSearch with initial parameters

In [5]:
random_search_xgboost_model = RandomizedSearchCV(
    xgb_clf, param_grid, cv=5, scoring='neg_log_loss', n_iter=100, verbose=1
)
# Fit the model on the training data
random_search_xgboost_model.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", random_search_xgboost_model.best_params_)

# Get the best model
best_random_search_xgb_model = random_search_xgboost_model.best_estimator_

# Evaluate the model with additional metrics
random_predictions = best_random_search_xgb_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, random_predictions))
print("Precision:", precision_score(y_test, random_predictions))
print("Recall:", recall_score(y_test, random_predictions))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Hyperparameters: {'subsample': 0.8, 'min_child_weight': 5, 'max_depth': 20, 'eta': 0.1, 'colsample_bytree': 0.7}
Accuracy: 0.8536585365853658
Precision: 0.9285714285714286
Recall: 0.8125


Using the parameters from Random Search

In [6]:
best_params = random_search_xgboost_model.best_params_

# Re-run XGBoost CV with early stopping and additional metrics
xgb_random_search_tuned_results = xgb.cv(
    best_params, dtrain_clf,
    num_boost_round=1000,
    nfold=3,
    early_stopping_rounds=10,
    metrics=["auc", "error", "logloss"],
    verbose_eval=True
)

# Display tuned results
print("Tuned Results:")
print(xgb_random_search_tuned_results.iloc[-1])

[0]	train-auc:0.68788+0.01872	train-error:0.43296+0.03626	train-logloss:0.67859+0.00293	test-auc:0.54148+0.02090	test-error:0.52110+0.01463	test-logloss:0.69616+0.00234
[1]	train-auc:0.74322+0.02158	train-error:0.36058+0.03496	train-logloss:0.66317+0.00496	test-auc:0.56181+0.01138	test-error:0.49225+0.04684	test-logloss:0.69240+0.00370
[2]	train-auc:0.76769+0.01502	train-error:0.33294+0.03398	train-logloss:0.65056+0.00765	test-auc:0.59434+0.00880	test-error:0.46321+0.02693	test-logloss:0.68917+0.00394
[3]	train-auc:0.94650+0.00730	train-error:0.17638+0.03592	train-logloss:0.59123+0.00865	test-auc:0.84429+0.02593	test-error:0.31302+0.03671	test-logloss:0.63074+0.00191
[4]	train-auc:0.93705+0.01086	train-error:0.19347+0.02670	train-logloss:0.58336+0.00975	test-auc:0.82293+0.03256	test-error:0.32881+0.04833	test-logloss:0.62988+0.00488
[5]	train-auc:0.96390+0.00497	train-error:0.11186+0.01148	train-logloss:0.53994+0.00799	test-auc:0.87985+0.01322	test-error:0.20779+0.03179	test-logloss:0.

Performing GridSearch with default parameters

In [7]:
# Create the grid search object
grid_search_xgboost_model = GridSearchCV(
    xgb_clf, param_grid, cv=5, scoring='neg_log_loss', verbose=1
)

# Fit the model on the training data
grid_search_xgboost_model.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", grid_search_xgboost_model.best_params_)

# Get the best model
best_grid_search_xgb_model = grid_search_xgboost_model.best_estimator_

# Evaluate the model with additional metrics
grid_predictions = best_grid_search_xgb_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, grid_predictions))
print("Precision:", precision_score(y_test, grid_predictions))
print("Recall:", recall_score(y_test, grid_predictions))


Fitting 5 folds for each of 1260 candidates, totalling 6300 fits
Best Hyperparameters: {'colsample_bytree': 0.9, 'eta': 0.1, 'max_depth': 5, 'min_child_weight': 3, 'subsample': 0.7}
Accuracy: 0.8536585365853658
Precision: 0.9285714285714286
Recall: 0.8125


Using parameters from Grid Search

In [8]:
# Get the best parameters from  grid search
best_params = grid_search_xgboost_model.best_params_

# Re-run XGBoost CV with early stopping and additional metrics
xgb_grid_search_tuned_results = xgb.cv(
    best_params, dtrain_clf,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=10,
    metrics=["logloss", "auc", "error"],
    verbose_eval=True
)

# Display tuned results
print("Tuned Results:")
print(xgb_grid_search_tuned_results.iloc[-1])


[0]	train-logloss:0.67702+0.00239	train-auc:0.69203+0.02664	train-error:0.37237+0.02593	test-logloss:0.68940+0.00494	test-auc:0.56852+0.04705	test-error:0.45789+0.05966
[1]	train-logloss:0.66274+0.00202	train-auc:0.73850+0.01167	train-error:0.33684+0.01797	test-logloss:0.68602+0.00589	test-auc:0.57794+0.02177	test-error:0.46579+0.03868
[2]	train-logloss:0.65089+0.00452	train-auc:0.75530+0.01393	train-error:0.32829+0.01695	test-logloss:0.68440+0.00644	test-auc:0.58377+0.04153	test-error:0.45789+0.05419
[3]	train-logloss:0.59026+0.00502	train-auc:0.94319+0.00618	train-error:0.14737+0.01630	test-logloss:0.62827+0.00605	test-auc:0.83682+0.06344	test-error:0.28947+0.03329
[4]	train-logloss:0.53890+0.00351	train-auc:0.96303+0.00483	train-error:0.09013+0.01259	test-logloss:0.58215+0.00837	test-auc:0.87970+0.03970	test-error:0.20789+0.03570
[5]	train-logloss:0.50297+0.00341	train-auc:0.97151+0.00429	train-error:0.08026+0.00492	test-logloss:0.54827+0.00898	test-auc:0.90585+0.03013	test-error:0.

In [15]:

try:
    with open('data.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Write data to the CSV file
        writer.writerow(['xgboost_grid_search', accuracy_score(y_test, grid_predictions) ])
        writer.writerow(['xgboost_random_search', accuracy_score(y_test, random_predictions)])

except Exception as e:
    print(f"An error occurred: {e}")

# Close the CSV file
csvfile.close()