In [11]:
# 1. Imports and Data Loading
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from sklearn.inspection import permutation_importance

# Load your cleaned data
df = pd.read_csv('diabetic_heart_cleaned.csv')

# Quick check
print(df.shape)
df.head()


(59313, 48)


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,readmitted_binary
0,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,44,...,Up,No,No,No,No,No,Ch,Yes,NO,0
1,35754,82637451,Caucasian,Male,[50-60),2,1,2,3,31,...,Steady,No,No,No,No,No,No,Yes,>30,0
2,55842,84259809,Caucasian,Male,[60-70),3,1,2,4,70,...,Steady,No,No,No,No,No,Ch,Yes,NO,0
3,63768,114882984,Caucasian,Male,[70-80),1,1,7,5,73,...,No,No,No,No,No,No,No,Yes,>30,0
4,12522,48330783,Caucasian,Female,[80-90),2,1,4,13,68,...,Steady,No,No,No,No,No,Ch,Yes,NO,0


In [5]:
# Filter to cardiac patients: ICD codes 410–414 in diag_1, diag_2, or diag_3

def is_cardiac_patient(row):
    icd_list = [str(row['diag_1']), str(row['diag_2']), str(row['diag_3'])]
    return any(code.startswith(('410', '411', '412', '413', '414')) for code in icd_list if pd.notnull(code) and code not in ['nan', '?'])

df_cardiac = df[df.apply(is_cardiac_patient, axis=1)].copy()

df_cardiac.shape, df_cardiac['readmitted'].value_counts(), df_cardiac['readmitted_binary'].value_counts()


((16517, 48),
 readmitted
 NO     9371
 >30    5579
 <30    1567
 Name: count, dtype: int64,
 readmitted_binary
 0    14950
 1     1567
 Name: count, dtype: int64)

In [34]:
# 3. Drop identifiers and original readmitted field
drop_cols = ['encounter_id', 'patient_nbr', 'readmitted']
df_cardiac_clean = df_cardiac.drop(columns=[col for col in drop_cols if col in df_cardiac.columns])

In [40]:
# 4. Replace '?' with NaN
df_cardiac_clean.replace('?', np.nan, inplace=True)
df_cardiac_clean

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted_binary
1,Caucasian,Male,[50-60),2,1,2,3,31,6,16,...,No,Steady,No,No,No,No,No,No,Yes,0
2,Caucasian,Male,[60-70),3,1,2,4,70,1,21,...,No,Steady,No,No,No,No,No,Ch,Yes,0
8,Caucasian,Male,[80-90),1,6,7,10,55,1,31,...,No,Steady,No,No,No,No,No,No,Yes,0
9,AfricanAmerican,Male,[50-60),1,1,7,4,45,4,17,...,No,Steady,No,No,No,No,No,Ch,Yes,1
12,Caucasian,Female,[50-60),2,1,4,2,66,1,19,...,No,Down,No,No,No,No,No,Ch,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59295,AfricanAmerican,Female,[70-80),1,1,7,3,57,4,11,...,No,No,No,No,No,No,No,No,No,0
59299,AfricanAmerican,Male,[70-80),1,1,7,1,32,6,14,...,No,Steady,No,No,No,No,No,No,Yes,0
59300,Caucasian,Male,[40-50),3,1,1,1,1,5,8,...,No,Steady,No,No,No,No,No,Ch,Yes,0
59306,Caucasian,Female,[70-80),1,22,7,8,51,6,19,...,No,Steady,No,No,No,No,No,No,Yes,0


In [67]:
# 5. One-hot encode categoricals except the target
cat_cols = df_cardiac_clean.select_dtypes(include=['object', 'category']).columns.tolist()
cat_cols = [c for c in cat_cols if c != 'readmitted_binary']
df_cardiac_encoded = pd.get_dummies(df_cardiac_clean, columns=cat_cols, dummy_na=True, drop_first=True)

In [69]:
# 6. Split features/target and train/test split
X = df_cardiac_encoded.drop(columns=['readmitted_binary'])
y = df_cardiac_encoded['readmitted_binary']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [72]:
# 7. --- SAFE COLUMN NAME SANITIZATION ---
illegal_chars = r'[\[\]\(\)\<\>]'
X_train.columns = X_train.columns.str.replace(illegal_chars, '_', regex=True)
X_test.columns = X_test.columns.str.replace(illegal_chars, '_', regex=True)


In [79]:
# 8. Train XGBoost model
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
)
xgb.fit(X_train, y_train)
y_pred_proba = xgb.predict_proba(X_test)[:, 1]
y_pred = xgb.predict(X_test)


In [81]:
# 10. Evaluation metrics
auc = roc_auc_score(y_test, y_pred_proba)
ap = average_precision_score(y_test, y_pred)
print(f"ROC AUC: {auc:.4f}")
print(f"Average Precision (PR-AUC): {ap:.4f}")
print(classification_report(y_test, y_pred))

ROC AUC: 0.6253
Average Precision (PR-AUC): 0.0947
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2991
           1       0.00      0.00      0.00       313

    accuracy                           0.90      3304
   macro avg       0.45      0.50      0.48      3304
weighted avg       0.82      0.90      0.86      3304

