In [1]:
import csv

import xgboost as xgb

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import matplotlib as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import GridSearchCV

Importing and processing the data

In [2]:
file_path = './data./Data Set for Chapter - Sheet1.csv'
df = pd.read_csv(file_path)
df.dropna(inplace=True)

Splitting the data into train, test and cross validation sets

In [3]:
x = df.loc[:,'Age':'Testing']
y = df['STD Status']

scaler = StandardScaler()
x = scaler.fit_transform(x)

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=41) # 70% training and 30% test
X_cv,y_cv = X_test[:int(len(X_test)/2)],y_test[:int(len(y_test)/2)]
X_test,y_test = X_test[int(len(X_test)/2):],y_test[int(len(y_test)/2):]

print("length of training set: ", len(X_train))
print("length of test set: ", len(X_test))
print("length of cv set: ", len(X_cv))

length of training set:  380
length of test set:  82
length of cv set:  82


Creating the XGBoost model

In [4]:
xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eta=0.1,
    max_depth=5,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.9
)

# Create classification matrices
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical=True)
param_grid = { 'eta': [0.01, 0.1, 0.2,0.5],
               'max_depth': [3, 5, 7, 9, 11, 15, 20],
               'min_child_weight': [1,3,5,7,10],
               'subsample': [0.7,0.8, 0.9],
               'colsample_bytree': [0.7,0.8, 0.9]
             }

Performing RandomSearch with initial parameters

In [5]:
random_search_xgboost_model = RandomizedSearchCV(
    xgb_clf, param_grid, cv=5, scoring='neg_log_loss', n_iter=100, verbose=1
)
# Fit the model on the training data
random_search_xgboost_model.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", random_search_xgboost_model.best_params_)

# Get the best model
best_random_search_xgb_model = random_search_xgboost_model.best_estimator_

# Evaluate the model with additional metrics
random_predictions = best_random_search_xgb_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, random_predictions))
print("Precision:", precision_score(y_test, random_predictions))
print("Recall:", recall_score(y_test, random_predictions))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Hyperparameters: {'subsample': 0.7, 'min_child_weight': 5, 'max_depth': 11, 'eta': 0.2, 'colsample_bytree': 0.5}
Accuracy: 0.8536585365853658
Precision: 0.9285714285714286
Recall: 0.8125


Using the parameters from Random Search

In [6]:
best_params = random_search_xgboost_model.best_params_

# Re-run XGBoost CV with early stopping and additional metrics
xgb_random_search_tuned_results = xgb.cv(
    best_params, dtrain_clf,
    num_boost_round=1000,
    nfold=3,
    early_stopping_rounds=10,
    metrics=["auc", "error", "logloss"],
    verbose_eval=True
)

# Display tuned results
print("Tuned Results:")
print(xgb_random_search_tuned_results.iloc[-1])

[0]	train-auc:0.62154+0.02564	train-error:0.40395+0.00974	train-logloss:0.67960+0.00315	test-auc:0.49428+0.02716	test-error:0.51062+0.04398	test-logloss:0.69864+0.00583
[1]	train-auc:0.71655+0.02136	train-error:0.35132+0.02583	train-logloss:0.65296+0.00718	test-auc:0.58827+0.05645	test-error:0.46848+0.05407	test-logloss:0.68686+0.01356
[2]	train-auc:0.73042+0.02180	train-error:0.33029+0.01877	train-logloss:0.63982+0.00878	test-auc:0.58651+0.06727	test-error:0.45005+0.05302	test-logloss:0.68499+0.02014
[3]	train-auc:0.74077+0.02542	train-error:0.32632+0.00807	train-logloss:0.62904+0.01246	test-auc:0.59832+0.05918	test-error:0.45007+0.03989	test-logloss:0.68162+0.02252
[4]	train-auc:0.74044+0.01752	train-error:0.32761+0.01236	train-logloss:0.62204+0.01195	test-auc:0.58696+0.05265	test-error:0.45538+0.04504	test-logloss:0.68443+0.02434
[5]	train-auc:0.92209+0.00224	train-error:0.17504+0.02102	train-logloss:0.52599+0.00678	test-auc:0.80930+0.02479	test-error:0.31042+0.06459	test-logloss:0.

Performing GridSearch with default parameters

In [11]:
# Create the grid search object
grid_search_xgboost_model = GridSearchCV(
    xgb_clf, param_grid, cv=5, scoring='neg_log_loss', verbose=1
)

# Fit the model on the training data
grid_search_xgboost_model.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", grid_search_xgboost_model.best_params_)

# Get the best model
best_grid_search_xgb_model = grid_search_xgboost_model.best_estimator_

# Evaluate the model with additional metrics
grid_predictions = best_grid_search_xgb_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, grid_predictions))
print("Precision:", precision_score(y_test, grid_predictions))
print("Recall:", recall_score(y_test, grid_predictions))


Fitting 5 folds for each of 4000 candidates, totalling 20000 fits
Best Hyperparameters: {'colsample_bytree': 0.9, 'eta': 0.1, 'max_depth': 7, 'min_child_weight': 3, 'subsample': 0.6}
Accuracy: 0.8536585365853658
Precision: 0.9285714285714286
Recall: 0.8125


Using parameters from Grid Search

In [8]:
# Get the best parameters from  grid search
best_params = grid_search_xgboost_model.best_params_

# Re-run XGBoost CV with early stopping and additional metrics
xgb_grid_search_tuned_results = xgb.cv(
    best_params, dtrain_clf,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=10,
    metrics=["logloss", "auc", "error"],
    verbose_eval=True
)

# Display tuned results
print("Tuned Results:")
print(xgb_grid_search_tuned_results.iloc[-1])


[0]	train-logloss:0.67321+0.00256	train-auc:0.72824+0.01988	train-error:0.36842+0.04182	test-logloss:0.68850+0.00269	test-auc:0.58243+0.03898	test-error:0.46579+0.03180
[1]	train-logloss:0.65743+0.00313	train-auc:0.75919+0.02040	train-error:0.32697+0.02796	test-logloss:0.68612+0.00280	test-auc:0.58374+0.02004	test-error:0.47105+0.04194
[2]	train-logloss:0.64474+0.00465	train-auc:0.77049+0.01677	train-error:0.30987+0.02938	test-logloss:0.68485+0.00564	test-auc:0.58343+0.04374	test-error:0.49737+0.04512
[3]	train-logloss:0.58555+0.00635	train-auc:0.93520+0.00739	train-error:0.15789+0.01177	test-logloss:0.62997+0.01160	test-auc:0.80919+0.06607	test-error:0.30000+0.04588
[4]	train-logloss:0.53550+0.00518	train-auc:0.96291+0.00426	train-error:0.10395+0.01132	test-logloss:0.58347+0.01145	test-auc:0.87501+0.04856	test-error:0.21053+0.03329
[5]	train-logloss:0.50071+0.00428	train-auc:0.97105+0.00429	train-error:0.08684+0.01405	test-logloss:0.55224+0.01332	test-auc:0.89248+0.03871	test-error:0.

In [10]:

try:
    with open('data.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Write data to the CSV file
        writer.writerow(['xgboost_grid_search', accuracy_score(y_test, grid_predictions) ])
        writer.writerow(['xgboost_random_search', accuracy_score(y_test, random_predictions)])

except Exception as e:
    print(f"An error occurred: {e}")

# Close the CSV file
csvfile.close()

NameError: name 'stop' is not defined