In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve
from imblearn.over_sampling import SMOTE, ADASYN

In [2]:
diabetes = pd.read_csv('cleaned_diabetes.csv')
diabetes = diabetes.drop(columns={'Unnamed: 0'})
diabetes.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,metformin,repaglinide,...,change_yes,change_no,diabetesMed_yes,diabetesMed_no,age_encoding,glu_serum_encoding,A1C_encoded,total_visits,polypharmacy,num_meds_chanaged
0,1,41,0,1,0,0,0,1,No,No,...,0,1,0,1,0,0,0,0,0,0
1,3,59,0,18,0,0,0,9,No,No,...,1,0,1,0,1,0,0,0,1,1
2,2,11,5,13,2,0,1,6,No,No,...,0,1,1,0,2,0,0,3,1,0
3,2,44,1,16,0,0,0,7,No,No,...,1,0,1,0,3,0,0,0,1,1
4,1,51,0,8,0,0,0,5,No,No,...,1,0,1,0,4,0,0,0,1,0


In [3]:
X = diabetes.drop(columns=['readmitted'])
y = diabetes['readmitted'] # what we're predicting

In [4]:
# 80% train, 10% validation, 10% test
from sklearn.model_selection import train_test_split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [5]:
# We also need to encode the medication! 
meds = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
    'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone'
]

med_encoding = {'No':0,'Down':1,'Steady':2,'Up':3}
for col in meds:
    if col in X_train.columns:
        X_train[col] = X_train[col].map(med_encoding)
        X_val[col] = X_val[col].map(med_encoding)
        X_test[col] = X_test[col].map(med_encoding)


In [6]:
xgb_model = XGBClassifier(
    n_estimators = 100,
    max_depth = 5,
    learning_rate = 0.1,
    scale_pos_weight = 1,
    use_label_encoder = False,
    eval_metric = 'logloss',
    random_state = 42
)

xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [7]:
y_val_pred_xgb = xgb_model.predict(X_val)

print("XGBoost Validation Accuracy:", accuracy_score(y_val, y_val_pred_xgb))
print("Classification Report:\n", classification_report(y_val, y_val_pred_xgb))


XGBoost Validation Accuracy: 0.8884411244348339
Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94      9039
           1       0.50      0.00      0.01      1135

    accuracy                           0.89     10174
   macro avg       0.69      0.50      0.47     10174
weighted avg       0.85      0.89      0.84     10174



In [8]:
xgb_model = XGBClassifier(
    subsample= 0.7956447755377811,
    n_estimators = 527,
    min_child_weight = 2,
    max_depth = 3,
    learning_rate = 0.037281761570859454,
    colsample_bytree = 0.8093574700581173,
    scale_pos_weight = 5,
    use_label_encoder = False,
    eval_metric = 'logloss',
    random_state = 42
)
xgb_model.fit(X_train, y_train)
y_val_pred_xgb = xgb_model.predict(X_val)

print("XGBoost Validation Accuracy:", accuracy_score(y_val, y_val_pred_xgb))

y_proba = xgb_model.predict_proba(X_val)[:, 1]
thresh = 0.443
y_pred = (y_proba >= thresh).astype(int)
print("Classification Report:\n", classification_report(y_val, y_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Validation Accuracy: 0.8147238057794378
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.79      0.85      9039
           1       0.21      0.46      0.29      1135

    accuracy                           0.75     10174
   macro avg       0.57      0.62      0.57     10174
weighted avg       0.84      0.75      0.79     10174



In [9]:
y_test_pred_xgb = xgb_model.predict(X_test)
print("XGBoost Test Accuracy:", accuracy_score(y_test, y_test_pred_xgb))

y_proba = xgb_model.predict_proba(X_test)[:, 1]
thresh = 0.443
y_pred = (y_proba >= thresh).astype(int)
print("Classification Report:\n", classification_report(y_test, y_pred))

XGBoost Test Accuracy: 0.8098280098280098
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.78      0.85      9040
           1       0.20      0.44      0.28      1135

    accuracy                           0.75     10175
   macro avg       0.56      0.61      0.56     10175
weighted avg       0.84      0.75      0.78     10175



In [10]:
precision, recall, thresholds = precision_recall_curve(y_val, y_proba)

# Find threshold that maximizes F1
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
best_idx = f1_scores.argmax()
best_thresh = thresholds[best_idx]

print(f"Best threshold for F1: {best_thresh:.3f}")


ValueError: Found input variables with inconsistent numbers of samples: [10174, 10175]

In [None]:
y_proba = xgb_model.predict_proba(X_test)[:, 1]
thresh = 0.443
y_pred = (y_proba >= thresh).astype(int)
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.78      0.85      9040
           1       0.20      0.44      0.28      1135

    accuracy                           0.75     10175
   macro avg       0.56      0.61      0.56     10175
weighted avg       0.84      0.75      0.78     10175



In [None]:
## USING SMOTE


# Initialize sampler — you can switch between SMOTE or ADASYN here
sampler = SMOTE(random_state=42)  # or ADASYN(random_state=42)

# Fit sampler to training data and resample
X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)

print("Original training set shape:", X_train.shape, y_train.shape)
print("Resampled training set shape:", X_train_resampled.shape, y_train_resampled.shape)


Original training set shape: (81393, 72) (81393,)
Resampled training set shape: (144622, 72) (144622,)


In [None]:
xgb_model = XGBClassifier(
    subsample=0.7956,
    n_estimators=527,
    min_child_weight=2,
    max_depth=3,
    learning_rate=0.0373,
    colsample_bytree=0.8094,
    scale_pos_weight=1,  # Set to 1 because imbalance handled by SMOTE
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

xgb_model.fit(X_train_resampled, y_train_resampled)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
y_val_proba = xgb_model.predict_proba(X_val)[:, 1]

# Use your chosen threshold
thresh = 0.235
y_val_pred = (y_val_proba >= thresh).astype(int)

print("Classification Report:\n", classification_report(y_val, y_val_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.70      0.79      9039
           1       0.17      0.50      0.26      1135

    accuracy                           0.68     10174
   macro avg       0.55      0.60      0.53     10174
weighted avg       0.84      0.68      0.73     10174



In [None]:
y_val_proba = xgb_model.predict_proba(X_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_val, y_val_proba)

f1_scores = 2 * precision * recall / (precision + recall + 1e-8)
best_thresh = thresholds[f1_scores.argmax()]

print(f"Best threshold by F1 score: {best_thresh:.3f}")

y_val_pred = (y_val_proba >= best_thresh).astype(int)
print(classification_report(y_val, y_val_pred))


Best threshold by F1 score: 0.235
              precision    recall  f1-score   support

           0       0.92      0.70      0.80      9039
           1       0.17      0.50      0.26      1135

    accuracy                           0.68     10174
   macro avg       0.55      0.60      0.53     10174
weighted avg       0.84      0.68      0.74     10174



In [None]:
# TESTING WITH ADASYN
# Initialize ADASYN sampler
adasyn_sampler = ADASYN(random_state=42)

# Resample training data
X_train_resampled, y_train_resampled = adasyn_sampler.fit_resample(X_train, y_train)

print("Original training set shape:", X_train.shape, y_train.shape)
print("ADASYN resampled training set shape:", X_train_resampled.shape, y_train_resampled.shape)

Original training set shape: (81393, 72) (81393,)
ADASYN resampled training set shape: (143093, 72) (143093,)


In [None]:
xgb_model = XGBClassifier(
    subsample=0.7956,
    n_estimators=527,
    min_child_weight=2,
    max_depth=3,
    learning_rate=0.0373,
    colsample_bytree=0.8094,
    scale_pos_weight=1,  # set to 1 because ADASYN balances the data
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

xgb_model.fit(X_train_resampled, y_train_resampled)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
y_val_proba = xgb_model.predict_proba(X_val)[:, 1]

# Use your chosen threshold (start with 0.5 or your previous best)
threshold = 0.261
y_val_pred = (y_val_proba >= threshold).astype(int)

print("Classification Report with ADASYN:\n", classification_report(y_val, y_val_pred))


Classification Report with ADASYN:
               precision    recall  f1-score   support

           0       0.91      0.81      0.86      9039
           1       0.20      0.38      0.26      1135

    accuracy                           0.76     10174
   macro avg       0.56      0.59      0.56     10174
weighted avg       0.83      0.76      0.79     10174



In [None]:
from sklearn.metrics import precision_recall_curve, classification_report

y_val_proba = xgb_model.predict_proba(X_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_val, y_val_proba)
f1_scores = 2 * precision * recall / (precision + recall + 1e-8)
best_idx = f1_scores.argmax()
best_threshold = thresholds[best_idx]

print(f"Best threshold by F1 score: {best_threshold:.3f}")

y_val_pred_best = (y_val_proba >= best_threshold).astype(int)
print(classification_report(y_val, y_val_pred_best))


Best threshold by F1 score: 0.261
              precision    recall  f1-score   support

           0       0.91      0.81      0.86      9039
           1       0.20      0.38      0.26      1135

    accuracy                           0.76     10174
   macro avg       0.56      0.59      0.56     10174
weighted avg       0.83      0.76      0.79     10174



In [None]:
y_test_proba = xgb_model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_test_proba)
f1_scores = 2 * precision * recall / (precision + recall + 1e-8)
best_idx = f1_scores.argmax()
best_threshold = thresholds[best_idx]

print(f"Best threshold by F1 score: {best_threshold:.3f}")

y_test_pred_best = (y_test_proba >= best_threshold).astype(int)
print(classification_report(y_test, y_test_pred_best))

Best threshold by F1 score: 0.262
              precision    recall  f1-score   support

           0       0.91      0.82      0.86      9040
           1       0.20      0.37      0.26      1135

    accuracy                           0.77     10175
   macro avg       0.56      0.59      0.56     10175
weighted avg       0.83      0.77      0.79     10175

