In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve, fbeta_score
from imblearn.over_sampling import SMOTE, ADASYN

In [2]:
diabetes = pd.read_csv('../../CSVs/cleaned_diabetes.csv')
diabetes = diabetes.drop(columns={'Unnamed: 0'})
diabetes.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,metformin,repaglinide,...,change_yes,change_no,diabetesMed_yes,diabetesMed_no,age_encoding,glu_serum_encoding,A1C_encoded,total_visits,polypharmacy,num_meds_chanaged
0,1,41,0,1,0,0,0,1,No,No,...,0,1,0,1,0,0,0,0,0,0
1,3,59,0,18,0,0,0,9,No,No,...,1,0,1,0,1,0,0,0,1,1
2,2,11,5,13,2,0,1,6,No,No,...,0,1,1,0,2,0,0,3,1,0
3,2,44,1,16,0,0,0,7,No,No,...,1,0,1,0,3,0,0,0,1,1
4,1,51,0,8,0,0,0,5,No,No,...,1,0,1,0,4,0,0,0,1,0


In [3]:
X = diabetes.drop(columns=['readmitted'])
y = diabetes['readmitted'] # what we're predicting

In [4]:
# 80% train, 10% validation, 10% test
from sklearn.model_selection import train_test_split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [26]:
# We also need to encode the medication! 
meds = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
    'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone'
]

med_encoding = {'No':0,'Down':1,'Steady':2,'Up':3}
for col in meds:
    if col in X_train.columns:
        X_train[col] = X_train[col].map(med_encoding).fillna(0).astype(int)
        X_val[col] = X_val[col].map(med_encoding).fillna(0).astype(int)
        X_test[col] = X_test[col].map(med_encoding).fillna(0).astype(int)

combo_meds = ['glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone']

for df in [X_train, X_val, X_test]:
    df['combo_med'] = 0
    for med in combo_meds:
        if med in df.columns:
            df['combo_med'] += df[med]
    # Optionally binarize
    df['combo_med'] = (df['combo_med'] > 0).astype(int)
    # Drop original combo med columns
    df.drop(columns=[med for med in combo_meds if med in df.columns], inplace=True)


In [27]:
xgb_model = XGBClassifier(
    n_estimators = 100,
    max_depth = 5,
    learning_rate = 0.1,
    scale_pos_weight = 1,
    use_label_encoder = False,
    eval_metric = 'logloss',
    random_state = 42
)

xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [28]:
y_val_pred_xgb = xgb_model.predict(X_val)

print("XGBoost Validation Accuracy:", accuracy_score(y_val, y_val_pred_xgb))
print("Classification Report:\n", classification_report(y_val, y_val_pred_xgb))


XGBoost Validation Accuracy: 0.8886377039512483
Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94      9039
           1       0.58      0.01      0.01      1135

    accuracy                           0.89     10174
   macro avg       0.74      0.50      0.48     10174
weighted avg       0.85      0.89      0.84     10174



In [29]:
xgb_model = XGBClassifier(
    subsample= 0.7956447755377811,
    n_estimators = 527,
    min_child_weight = 2,
    max_depth = 3,
    learning_rate = 0.037281761570859454,
    colsample_bytree = 0.8093574700581173,
    scale_pos_weight = 5,
    use_label_encoder = False,
    eval_metric = 'logloss',
    random_state = 42
)
xgb_model.fit(X_train, y_train)
y_val_pred_xgb = xgb_model.predict(X_val)

print("XGBoost Validation Accuracy:", accuracy_score(y_val, y_val_pred_xgb))

y_proba = xgb_model.predict_proba(X_val)[:, 1]
thresh = 0.33
y_pred = (y_proba >= thresh).astype(int)
print("Classification Report:\n", classification_report(y_val, y_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Validation Accuracy: 0.8177707882838608
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.50      0.65      9039
           1       0.16      0.76      0.26      1135

    accuracy                           0.53     10174
   macro avg       0.55      0.63      0.46     10174
weighted avg       0.85      0.53      0.61     10174



In [30]:
fbeta_score(y_val, y_pred, beta=2)

0.432378079436903

In [31]:
from sklearn.metrics import fbeta_score

best_f2 = 0
best_thresh = 0
for thresh in np.arange(0.1, 0.9, 0.01):
    y_pred_thresh = (y_proba >= thresh).astype(int)
    f2 = fbeta_score(y_val, y_pred_thresh, beta=2)
    if f2 > best_f2:
        best_f2 = f2
        best_thresh = thresh

print(f"Best F2: {best_f2}, at threshold: {best_thresh}")


Best F2: 0.432378079436903, at threshold: 0.3299999999999999


In [32]:
# On X_test and y_test using the same threshold
y_test_proba = xgb_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_thresh).astype(int)
print("Test F2-Score:", fbeta_score(y_test, y_test_pred, beta=2))


Test F2-Score: 0.4162507486524256


In [11]:
# y_test_pred_xgb = xgb_model.predict(X_test)
# print("XGBoost Test Accuracy:", accuracy_score(y_test, y_test_pred_xgb))

# y_proba = xgb_model.predict_proba(X_test)[:, 1]
# thresh = 0.443
# y_pred = (y_proba >= thresh).astype(int)
# print("Classification Report:\n", classification_report(y_test, y_pred))

In [12]:
# y_proba = xgb_model.predict_proba(X_test)[:, 1]
# thresh = 0.443
# y_pred = (y_proba >= thresh).astype(int)
# print("Classification Report:\n", classification_report(y_test, y_pred))

In [13]:
# ## USING SMOTE


# # Initialize sampler — you can switch between SMOTE or ADASYN here
# sampler = SMOTE(random_state=42)  # or ADASYN(random_state=42)

# # Fit sampler to training data and resample
# X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)

# print("Original training set shape:", X_train.shape, y_train.shape)
# print("Resampled training set shape:", X_train_resampled.shape, y_train_resampled.shape)


In [14]:
# xgb_model = XGBClassifier(
#     subsample=0.7956,
#     n_estimators=527,
#     min_child_weight=2,
#     max_depth=3,
#     learning_rate=0.0373,
#     colsample_bytree=0.8094,
#     scale_pos_weight=1,  # Set to 1 because imbalance handled by SMOTE
#     use_label_encoder=False,
#     eval_metric='logloss',
#     random_state=42
# )

# xgb_model.fit(X_train_resampled, y_train_resampled)


In [15]:
# y_val_proba = xgb_model.predict_proba(X_val)[:, 1]

# # Use your chosen threshold
# thresh = 0.235
# y_val_pred = (y_val_proba >= thresh).astype(int)

# print("Classification Report:\n", classification_report(y_val, y_val_pred))


In [16]:
# y_val_proba = xgb_model.predict_proba(X_val)[:, 1]
# precision, recall, thresholds = precision_recall_curve(y_val, y_val_proba)

# f1_scores = 2 * precision * recall / (precision + recall + 1e-8)
# best_thresh = thresholds[f1_scores.argmax()]

# print(f"Best threshold by F1 score: {best_thresh:.3f}")

# y_val_pred = (y_val_proba >= best_thresh).astype(int)
# print(classification_report(y_val, y_val_pred))


In [17]:
# # TESTING WITH ADASYN
# # Initialize ADASYN sampler
# adasyn_sampler = ADASYN(random_state=42)

# # Resample training data
# X_train_resampled, y_train_resampled = adasyn_sampler.fit_resample(X_train, y_train)

# print("Original training set shape:", X_train.shape, y_train.shape)
# print("ADASYN resampled training set shape:", X_train_resampled.shape, y_train_resampled.shape)

In [18]:
# xgb_model = XGBClassifier(
#     subsample=0.7956,
#     n_estimators=527,
#     min_child_weight=2,
#     max_depth=3,
#     learning_rate=0.0373,
#     colsample_bytree=0.8094,
#     scale_pos_weight=1,  # set to 1 because ADASYN balances the data
#     use_label_encoder=False,
#     eval_metric='logloss',
#     random_state=42
# )

# xgb_model.fit(X_train_resampled, y_train_resampled)


In [19]:
# y_val_proba = xgb_model.predict_proba(X_val)[:, 1]

# # Use your chosen threshold (start with 0.5 or your previous best)
# threshold = 0.261
# y_val_pred = (y_val_proba >= threshold).astype(int)

# print("Classification Report with ADASYN:\n", classification_report(y_val, y_val_pred))


In [20]:
# from sklearn.metrics import precision_recall_curve, classification_report

# y_val_proba = xgb_model.predict_proba(X_val)[:, 1]
# precision, recall, thresholds = precision_recall_curve(y_val, y_val_proba)
# f1_scores = 2 * precision * recall / (precision + recall + 1e-8)
# best_idx = f1_scores.argmax()
# best_threshold = thresholds[best_idx]

# print(f"Best threshold by F1 score: {best_threshold:.3f}")

# y_val_pred_best = (y_val_proba >= best_threshold).astype(int)
# print(classification_report(y_val, y_val_pred_best))


In [21]:
# y_test_proba = xgb_model.predict_proba(X_test)[:, 1]
# precision, recall, thresholds = precision_recall_curve(y_test, y_test_proba)
# f1_scores = 2 * precision * recall / (precision + recall + 1e-8)
# best_idx = f1_scores.argmax()
# best_threshold = thresholds[best_idx]

# print(f"Best threshold by F1 score: {best_threshold:.3f}")

# y_test_pred_best = (y_test_proba >= best_threshold).astype(int)
# print(classification_report(y_test, y_test_pred_best))