In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_auc_score,
    average_precision_score, precision_recall_curve
)

In [3]:
from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings("ignore")

In [4]:
df = pd.read_csv("../data/cleaned_data.csv")
df.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,...,glyburide,pioglitazone,rosiglitazone,acarbose,tolazamide,insulin,glyburide-metformin,change,diabetesMed,readmit_binary
0,Caucasian,Female,5.0,6,25,1,1,Other,41,0,...,0,0,0,0,0,0,0,0,0,0
1,Caucasian,Female,15.0,1,1,7,3,Unknown,59,0,...,0,0,0,0,0,2,0,1,1,0
2,AfricanAmerican,Female,25.0,1,1,7,2,Unknown,11,5,...,0,0,0,0,0,0,0,0,1,0
3,Caucasian,Male,35.0,1,1,7,2,Unknown,44,1,...,0,0,0,0,0,2,0,1,1,0
4,Caucasian,Male,45.0,1,1,7,1,Unknown,51,0,...,0,0,0,0,0,1,0,1,1,0


In [5]:
X = df.drop(columns=["readmit_binary"])
y = df["readmit_binary"]

In [6]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42)

print("TRAIN:", X_train.shape)
print("VAL:", X_val.shape)
print("TEST:", X_test.shape)


TRAIN: (71236, 35)
VAL: (15265, 35)
TEST: (15265, 35)


In [7]:
onehot_cols = ["race", "gender", "medical_specialty", "diag_1", "diag_2", "diag_3"]
numeric_cols = [c for c in X.columns if c not in onehot_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"), onehot_cols),
        ("scale", StandardScaler(), numeric_cols)
    ]
)

In [8]:

preprocessor.fit(X_train)


In [9]:

X_train_enc = preprocessor.transform(X_train)
X_val_enc   = preprocessor.transform(X_val)
X_test_enc  = preprocessor.transform(X_test)


smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train_enc, y_train)

print("Before SMOTE:", X_train_enc.shape, y_train.value_counts())
print("After SMOTE:", X_train_sm.shape, pd.Series(y_train_sm).value_counts())

  File "C:\Users\Admin\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\Admin\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\anaconda3\Lib\subprocess.

Before SMOTE: (71236, 86) readmit_binary
0    63286
1     7950
Name: count, dtype: int64
After SMOTE: (126572, 86) readmit_binary
0    63286
1    63286
Name: count, dtype: int64


In [10]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, class_weight='balanced'),
    "RandomForest": RandomForestClassifier(n_estimators=500, max_depth=12, class_weight='balanced'),
    "XGBoost": XGBClassifier(
        n_estimators=600, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        scale_pos_weight=3, eval_metric="logloss"
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=600, learning_rate=0.05, max_depth=-1, class_weight="balanced"
    ),
    "CatBoost": CatBoostClassifier(
        iterations=600, learning_rate=0.05, depth=6,
        verbose=0, loss_function="Logloss", class_weights=[1,3]
    )
}


In [11]:
def eval_model(model, Xv, yv):
    proba = model.predict_proba(Xv)[:,1]
    preds = (proba >= 0.5).astype(int)

    return {
        "Accuracy": accuracy_score(yv, preds),
        "Precision": precision_score(yv, preds, zero_division=0),
        "Recall": recall_score(yv, preds),
        "F1": f1_score(yv, preds),
        "ROC-AUC": roc_auc_score(yv, proba),
        "PR-AUC": average_precision_score(yv, proba)
    }


In [12]:
results = {}

for name, model in models.items():
    print(f"Training {name} ...")
    model.fit(X_train_sm, y_train_sm)

    metrics = eval_model(model, X_val_enc, y_val)
    results[name] = metrics
    print(name, metrics, "\n")


Training LogisticRegression ...
LogisticRegression {'Accuracy': 0.632361611529643, 'Precision': 0.1655255049640534, 'Recall': 0.5674882629107981, 'F1': 0.2562947256824808, 'ROC-AUC': np.float64(0.6433345173417419), 'PR-AUC': np.float64(0.19728604847050268)} 

Training RandomForest ...
RandomForest {'Accuracy': 0.8805109728136259, 'Precision': 0.2744360902255639, 'Recall': 0.04284037558685446, 'F1': 0.07411167512690356, 'ROC-AUC': np.float64(0.6436030397165581), 'PR-AUC': np.float64(0.1783904948113446)} 

Training XGBoost ...
XGBoost {'Accuracy': 0.8725188339338356, 'Precision': 0.3333333333333333, 'Recall': 0.142018779342723, 'F1': 0.1991769547325103, 'ROC-AUC': np.float64(0.6746099090425354), 'PR-AUC': np.float64(0.23026345975882295)} 

Training LightGBM ...
[LightGBM] [Info] Number of positive: 63286, number of negative: 63286
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012763 seconds.
You can set `force_row_wise=true` to remove the overhea

In [13]:
df_results = pd.DataFrame(results).T
df_results

Unnamed: 0,Accuracy,Precision,Recall,F1,ROC-AUC,PR-AUC
LogisticRegression,0.632362,0.165526,0.567488,0.256295,0.643335,0.197286
RandomForest,0.880511,0.274436,0.04284,0.074112,0.643603,0.17839
XGBoost,0.872519,0.333333,0.142019,0.199177,0.67461,0.230263
LightGBM,0.888765,0.5375,0.025235,0.048206,0.679596,0.232842
CatBoost,0.865706,0.311135,0.167254,0.217557,0.676025,0.222972


In [14]:
best_model_name = df_results["ROC-AUC"].idxmax()
best_model = models[best_model_name]

print("Best Model:", best_model_name)
df_results.loc[best_model_name]

Best Model: LightGBM


Accuracy     0.888765
Precision    0.537500
Recall       0.025235
F1           0.048206
ROC-AUC      0.679596
PR-AUC       0.232842
Name: LightGBM, dtype: float64

In [15]:
test_metrics = eval_model(best_model, X_test_enc, y_test)
test_metrics

{'Accuracy': 0.8886996396986571,
 'Precision': 0.5303030303030303,
 'Recall': 0.020551967116852612,
 'F1': 0.0395703787450537,
 'ROC-AUC': np.float64(0.670060676081653),
 'PR-AUC': np.float64(0.2247791871870587)}

In [16]:
joblib.dump(best_model, "../models/best_ml_model.pkl")
joblib.dump(preprocessor, "../models/preprocessor.pkl")

print("Model saved!")

Model saved!
