In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from xgboost import XGBClassifier

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2, SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, brier_score_loss, roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import calibration_curve
from sklearn.linear_model import LogisticRegression


Grid Search

In [None]:
# Load the dataset and separate the target column from the features.
# The path below is an empty placeholder and must be filled in before running.
df = pd.read_csv('') # dataset path 

# 'Outcome' is the label; every remaining column is used as a feature.
y = df['Outcome']
X = df.drop(columns=['Outcome'])

In [None]:
# Hyper-parameter search for XGBoost.
#
# A single random 70/30 split is drawn; the 3-fold grid search runs on the
# training part only, and the refitted best estimator is then scored on the
# held-out 30%.

# Candidate values explored by the grid search (4 x 4 x 4 combinations).
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 500],
}

# random_state=None: the split differs on every execution of this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=None
)

# Candidates are ranked by cross-validated ROC AUC.
xgb = XGBClassifier()
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Held-out predictions: column 1 of predict_proba is used as the AUC score,
# hard labels are used for accuracy.
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Results are kept in one-element lists so the aggregation below has the same
# shape it would have with several runs.
best_params_runs = [(grid_search.best_params_, grid_search.best_score_)]
roc_auc_scores = [roc_auc_score(y_test, y_pred_proba)]
accuracy_scores = [accuracy_score(y_test, y_pred)]

# Parameters of the run with the highest cross-validated score.
best_overall_params = max(best_params_runs, key=lambda run: run[1])[0]

avg_roc_auc = np.mean(roc_auc_scores)
avg_accuracy = np.mean(accuracy_scores)


In [None]:
# Hyper-parameter search for logistic regression.
#
# A single random 70/30 split is drawn; the 3-fold grid search runs on the
# training part only, and the refitted best estimator is then scored on the
# held-out 30%.
roc_auc_scores = []
accuracy_scores = []
best_params_runs = []

# Values shared by every penalty/solver combination.
shared_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'max_iter': [100, 200, 300, 400, 500],
}

# The grid is split per penalty because not every solver implements every
# penalty: 'l1' is only available with 'liblinear' and 'saga'. Crossing 'l1'
# with 'lbfgs', 'newton-cg' or 'sag' makes those fits fail in every fold, which
# wastes time and floods the output with fit-failure warnings. The valid part
# of the search space is unchanged.
param_grid = [
    {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], **shared_grid},
    {
        'penalty': ['l2'],
        'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'],
        **shared_grid,
    },
]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=None)  # Random split

logreg = LogisticRegression()
grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, scoring='roc_auc', cv=3)
grid_search.fit(X_train, y_train)

best_params_runs.append((grid_search.best_params_, grid_search.best_score_))

best_model = grid_search.best_estimator_

# ROC AUC must be computed from scores, not hard labels: feeding predict()
# output collapses the ROC curve to a single operating point and makes the
# number incomparable with the XGBoost cell, which uses predict_proba.
# Column 1 is taken as the positive-class probability (binary 'Outcome'
# assumed, as in the XGBoost cell).
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

roc_auc_scores.append(roc_auc_score(y_test, y_pred_proba))
accuracy_scores.append(accuracy_score(y_test, y_pred))

# Parameters of the run with the highest cross-validated score.
best_overall_params = max(best_params_runs, key=lambda item: item[1])[0]

avg_roc_auc = np.mean(roc_auc_scores)
avg_accuracy = np.mean(accuracy_scores)



XGB Modeling + Gender Analysis

In [None]:
# Repeated hold-out evaluation of a fixed XGBoost configuration, reported
# overall and separately for the male (Female == 0) and female (Female == 1)
# rows of each test split.
X = df.drop('Outcome', axis=1)
y = df['Outcome']

n_runs = 30
overall_roc_auc_scores = []
overall_accuracy_scores = []
male_roc_auc_scores = []
male_accuracy_scores = []
female_roc_auc_scores = []
female_accuracy_scores = []

for _ in range(n_runs):
    # Split the data into training and testing sets.
    # random_state=None: every run draws a different 70/30 split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=None)

    # Train the model
    xgb = XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=100)
    xgb.fit(X_train, y_train)

    # Predict on test set
    y_pred_proba = xgb.predict_proba(X_test)[:, 1]
    y_pred = xgb.predict(X_test)

    # Overall scores
    overall_roc_auc_scores.append(roc_auc_score(y_test, y_pred_proba))
    overall_accuracy_scores.append(accuracy_score(y_test, y_pred))

    # Per-gender scores. The masks are converted to plain boolean arrays so
    # they select rows by position in both the pandas Series (y_test) and the
    # numpy prediction arrays.
    male_mask = (X_test['Female'] == 0).to_numpy()
    female_mask = (X_test['Female'] == 1).to_numpy()
    for group_mask, group_auc_scores, group_accuracy_scores in (
        (male_mask, male_roc_auc_scores, male_accuracy_scores),
        (female_mask, female_roc_auc_scores, female_accuracy_scores),
    ):
        # Skip a group that is absent from this test split.
        if not group_mask.any():
            continue
        y_true_group = y_test[group_mask]
        group_accuracy_scores.append(accuracy_score(y_true_group, y_pred[group_mask]))
        # ROC AUC is undefined when the group contains a single outcome class,
        # so it is recorded only when both classes are present; a non-empty
        # group alone is not a sufficient guard.
        if y_true_group.nunique() > 1:
            group_auc_scores.append(roc_auc_score(y_true_group, y_pred_proba[group_mask]))