In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the encounter-level dataset
df = pd.read_csv('../data/diabetes_readmission.csv')

# Drop rows with an invalid gender. The explicit .copy() makes `df` an
# independent frame, so the column assignments below write to it directly
# instead of to a filtered slice (which triggers SettingWithCopyWarning).
df = df[df['gender'] != 'Unknown/Invalid'].copy()

# Unknown race is encoded as '?' in the raw file; give it a readable label
df['race'] = df['race'].replace('?', 'Unknown')

# Binary target: 1 only when `readmitted` is '<30'; any other value
# (e.g. '>30', 'NO', or missing) maps to 0. Vectorised comparison replaces
# the row-wise apply/lambda and yields the same int64 column.
df['readmit_30'] = (df['readmitted'] == '<30').astype('int64')

# Show shape and preview
print("Dataset shape:", df.shape)
df.head()


Dataset shape: (101763, 51)


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,readmit_30
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,NO,0
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,Up,No,No,No,No,No,Ch,Yes,>30,0
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,Yes,NO,0
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,Up,No,No,No,No,No,Ch,Yes,NO,0
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,Steady,No,No,No,No,No,Ch,Yes,NO,0


In [2]:
# Columns excluded from modelling: not useful or too sparse, plus the raw
# `readmitted` label, which readmit_30 already replaces.
drop_cols = [
    'encounter_id',
    'patient_nbr',
    'weight',
    'payer_code',
    'medical_specialty',
    'readmitted',
]

df = df.drop(drop_cols, axis='columns')

# Report the reduced shape, then display the remaining column index
print("New dataset shape:", df.shape)
df.columns


New dataset shape: (101763, 45)


Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmit_30'],
      dtype='object')

In [3]:
# Categorical columns that get one-hot encoded
cat_cols = [
    'race',
    'gender',
    'age',
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
    'max_glu_serum',
    'A1Cresult',
    'change',
    'diabetesMed',
]

# Medication columns are encoded the same way
med_cols = [
    'metformin', 'repaglinide', 'nateglinide',
    'chlorpropamide', 'glimepiride', 'acetohexamide',
    'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide',
    'examide', 'citoglipton', 'insulin',
    'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone',
]

# Full list handed to the encoder (order determines output column order)
all_cat_cols = [*cat_cols, *med_cols]

# drop_first=True omits the first level of every encoded column
df_encoded = pd.get_dummies(df, columns=all_cat_cols, drop_first=True)

print("New shape after one-hot encoding:", df_encoded.shape)
df_encoded.head()


New shape after one-hot encoding: (101763, 129)


Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,...,insulin_No,insulin_Steady,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady
0,1,41,0,1,0,0,0,250.83,?,?,...,True,False,False,True,False,False,False,False,False,False
1,3,59,0,18,0,0,0,276.0,250.01,255,...,False,False,True,True,False,False,False,False,False,False
2,2,11,5,13,2,0,1,648.0,250,V27,...,True,False,False,True,False,False,False,False,False,False
3,2,44,1,16,0,0,0,8.0,250.43,403,...,False,False,True,True,False,False,False,False,False,False
4,1,51,0,8,0,0,0,197.0,157,250,...,False,True,False,True,False,False,False,False,False,False


In [4]:
# Target vector (y) is the readmit_30 flag; every other column is a feature (X)
y = df_encoded['readmit_30']
X = df_encoded.drop('readmit_30', axis='columns')

# Hold out 20% of rows for testing, stratified on the target, with a fixed seed
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Train set size:", X_train.shape)
print("Test set size:", X_test.shape)


Train set size: (81410, 128)
Test set size: (20353, 128)


In [5]:
# The diagnosis columns are dropped for now to avoid string issues.
# errors='ignore' lets this cell be re-run after they are already gone.
df_encoded = df_encoded.drop(
    ['diag_1', 'diag_2', 'diag_3'], axis='columns', errors='ignore'
)

# Rebuild target and features from the cleaned frame
y = df_encoded['readmit_30']
X = df_encoded.drop('readmit_30', axis='columns')

# Repeat the split with the same 80/20 ratio, seed and stratification
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Data re-split complete.")


Data re-split complete.


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Fit the baseline classifier; .fit() returns the estimator itself
model = LogisticRegression(solver='liblinear', max_iter=1000).fit(X_train, y_train)

# Hard class predictions for the held-out rows
y_pred = model.predict(X_test)

# Each heading is followed by its report on the next line
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Confusion Matrix:
[[18040    42]
 [ 2233    38]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18082
           1       0.47      0.02      0.03      2271

    accuracy                           0.89     20353
   macro avg       0.68      0.51      0.49     20353
weighted avg       0.84      0.89      0.84     20353



In [7]:
# Recover race and gender for the test rows.
# X_test keeps the row labels of the unencoded `df`: get_dummies, the column
# drops and train_test_split all preserve the index. Looking the rows up by
# label ties the demographics to the exact rows that were predicted, instead
# of re-running a second split and relying on it to reproduce the first one.
fairness_df = df.loc[X_test.index, ['race', 'gender']].copy()

# Fail loudly rather than audit misaligned rows
assert fairness_df.index.equals(y_test.index), "demographics are not aligned with y_test"
assert len(fairness_df) == len(y_pred), "demographics are not aligned with y_pred"

# y_pred is positional (same row order as X_test); y_test shares X_test's index
fairness_df['actual'] = y_test.to_numpy()
fairness_df['predicted'] = y_pred

# Check the result
fairness_df.head()


Unnamed: 0,race,gender,actual,predicted
32064,Caucasian,Male,1,0
45620,Caucasian,Male,0,0
64223,Caucasian,Female,0,0
13039,Caucasian,Male,1,0
51981,AfricanAmerican,Female,0,0


In [8]:
def subgroup_metrics(df, group_col):
    """Compute per-subgroup classification metrics for a fairness audit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `group_col` plus the columns 'actual' and 'predicted',
        where 1 marks the positive class and 0 the negative class.
    group_col : str
        Name of the column whose distinct values define the subgroups.

    Returns
    -------
    pandas.DataFrame
        One row per subgroup, in order of first appearance, with columns
        [group_col, 'accuracy', 'recall', 'false_negative_rate', 'support'].
        Rates are rounded to 3 decimals. Recall and false-negative rate are
        NaN for a subgroup with no actual positives, because both are
        undefined there (reporting 0 for both would be contradictory).
        Rows whose group value is missing form their own subgroup.
        An empty input yields an empty frame with the same columns.
    """
    columns = [group_col, 'accuracy', 'recall', 'false_negative_rate', 'support']
    undefined = float('nan')
    results = []

    # sort=False keeps first-appearance order; dropna=False keeps rows whose
    # group value is missing; observed=True skips unused categorical levels.
    grouped = df.groupby(group_col, sort=False, dropna=False, observed=True)
    for group, subset in grouped:
        actual_pos = subset['actual'] == 1
        actual_neg = subset['actual'] == 0
        pred_pos = subset['predicted'] == 1
        pred_neg = subset['predicted'] == 0

        tp = int((actual_pos & pred_pos).sum())
        tn = int((actual_neg & pred_neg).sum())
        fn = int((actual_pos & pred_neg).sum())

        support = len(subset)
        positives = tp + fn

        recall = tp / positives if positives else undefined
        fnr = fn / positives if positives else undefined
        acc = (tp + tn) / support if support else undefined

        results.append({
            group_col: group,
            'accuracy': round(acc, 3),
            'recall': round(recall, 3),
            'false_negative_rate': round(fnr, 3),
            'support': support
        })

    # Explicit columns keep the schema stable even when there are no groups
    return pd.DataFrame(results, columns=columns)

# Compute the audit table for each demographic column
race_metrics, gender_metrics = (
    subgroup_metrics(fairness_df, attribute) for attribute in ('race', 'gender')
)

# Render each table beneath its heading
print("📊 Fairness Metrics by Race:")
display(race_metrics)

print("\n📊 Fairness Metrics by Gender:")
display(gender_metrics)


📊 Fairness Metrics by Race:


Unnamed: 0,race,accuracy,recall,false_negative_rate,support
0,Caucasian,0.887,0.017,0.983,15203
1,AfricanAmerican,0.888,0.019,0.981,3882
2,Hispanic,0.878,0.02,0.98,403
3,Unknown,0.91,0.0,1.0,469
4,Other,0.929,0.0,1.0,283
5,Asian,0.912,0.0,1.0,113



📊 Fairness Metrics by Gender:


Unnamed: 0,gender,accuracy,recall,false_negative_rate,support
0,Male,0.89,0.014,0.986,9298
1,Female,0.886,0.019,0.981,11055


In [9]:
# Persist the unencoded frame (after the column drops, with the readmit_30
# label) to the current working directory.
# NOTE(review): index=False discards the row labels. Rows were filtered out
# earlier without resetting the index, so the labels are presumably no longer
# 0..n-1 and a reloaded copy will not line up with y_test.index by label.
# Confirm how downstream code re-aligns rows.
df.to_csv("df_unencoded.csv", index=False)


In [10]:
import joblib

# Persist the test-set predictions and the matching true labels for later use.
# y_pred is the output of model.predict(X_test); y_test is the test part of
# the readmit_30 target from the train/test split.
# The return value of the last dump is what the cell displays.
joblib.dump(y_pred, "y_pred.pkl")
joblib.dump(y_test, "y_test.pkl")


['y_test.pkl']