In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    learning_curve,
    GridSearchCV,
    RandomizedSearchCV
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score,
    recall_score, f1_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import PartialDependenceDisplay
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import shap

BASE_DIR = Path().resolve()

In [6]:
files_path = BASE_DIR / 'data_main' / 'natality_aligned_10pct_sample.csv'

In [7]:
#Review Dataframe Columns
first_cols = pd.read_csv(files_path, nrows=0).columns
first_cols

Index(['dob_yy', 'dob_mm', 'dob_tt', 'dob_wk', 'bfacil', 'mager', 'mrace6',
       'mracehisp', 'mar_p', 'dmar', 'meduc', 'fagecomb', 'frace6',
       'fracehisp', 'feduc', 'priorlive', 'priordead', 'priorterm', 'illb_r',
       'ilop_r', 'ilp_r', 'ilp_r11', 'precare', 'previs', 'cig_rec', 'bmi',
       'pwgt_r', 'dwgt_r', 'wtgain', 'rf_pdiab', 'rf_gdiab', 'rf_phype',
       'rf_ghype', 'rf_ehype', 'rf_ppterm', 'rf_inftr', 'rf_fedrg', 'rf_artec',
       'rf_cesar', 'rf_cesarn', 'ip_gon', 'ip_syph', 'ip_chlam', 'ip_hepb',
       'ip_hepc', 'ob_ecvs', 'ob_ecvf', 'ld_indl', 'ld_augm', 'ld_ster',
       'ld_antb', 'ld_chor', 'ld_anes', 'me_pres', 'me_rout', 'me_trial',
       'mm_mtr', 'mm_plac', 'mm_rupt', 'mm_uhyst', 'mm_aicu',
       'morbidity_reported', 'attend', 'pay', 'dplural', 'sex', 'combgest',
       'dbwt', 'ab_aven1', 'ab_aven6', 'ab_nicu', 'ab_surf', 'ab_anti',
       'ab_seiz', 'ca_anen', 'ca_mnsb', 'ca_cchd', 'ca_cdh', 'ca_omph',
       'ca_gast', 'ca_limb', 'ca_cleft', 'ca

In [None]:
def load_and_preprocess(
    files_path: Path,
    chunk_size: int = 100_000,
    sample_frac: float = 1.0
):
    """
    Loads natality data in chunks, applies optional sampling, converts rf_ and ip_
    columns to numeric, creates aggregate RF/IP indicators, renames columns, and
    returns a numeric DataFrame with Maternal Morbidity as int.
    """
    first_cols = pd.read_csv(files_path, nrows=0).columns

    ld_cols = [c for c in first_cols if c.lower().startswith("ld_")]
    rf_cols = [c for c in first_cols if c.lower().startswith("rf_")]
    ip_cols = [c for c in first_cols if c.lower().startswith("ip_")]

    usecols = [
        "bmi", "meduc", "feduc", "precare", "previs", "cig_rec",
        "morbidity_reported", "bfacil", "pay", "attend", "me_pres", "me_rout",
        'date'] + ld_cols + rf_cols + ip_cols

    rename_map = {
        "bmi": "Body Mass Index",
        "meduc": "Mother Education",
        "feduc": "Father Education",
        "precare": "Pre-natal Care Begins",
        "previs": "Pre-natal Visits",
        "cig_rec": "Cigarette Smoking",
        "morbidity_reported": "Maternal Morbidity",
        "bfacil": "Facility",
        "pay": "Payment Method",
        "attend": "Medical Provider",
        "ld_indl": "Induction of Labor",
        "ld_augm": "Augmentation of Labor",
        "ld_ster": "Steriods Used",
        "ld_antb": "Antibiotics Used",
        "ld_chor": "Chorioamnionitis Present",
        "ld_anes": "Anesthesia Used",
        "me_pres": "Fetal Presentation at Delivery",
        "me_rout": "Final Route of Delivery",
        "date": "Date of Birth",
    }

    dfs = []
    reader = pd.read_csv(files_path, chunksize=chunk_size, usecols=usecols, low_memory=False)

    for chunk in reader:
        sampled = chunk.sample(frac=sample_frac, random_state=123)

        rf_present = [c for c in rf_cols if c in sampled.columns]
        ip_present = [c for c in ip_cols if c in sampled.columns]

        if rf_present:
            sampled[rf_present] = sampled[rf_present].apply(pd.to_numeric, errors="coerce").fillna(0)
        if ip_present:
            sampled[ip_present] = sampled[ip_present].apply(pd.to_numeric, errors="coerce").fillna(0)

        sampled["Risk Factor Present"] = (sampled[rf_present].sum(axis=1) > 0).astype(int)
        sampled["Infection Present"] = (sampled[ip_present].sum(axis=1) > 0).astype(int)

        sampled.drop(columns=rf_present + ip_present, inplace=True)

        sampled.rename(columns=rename_map, inplace=True)
        dfs.append(sampled)

    df = pd.concat(dfs, ignore_index=True)

    df = df.select_dtypes(include=[np.number]).dropna()
    df["Maternal Morbidity"] = df["Maternal Morbidity"].astype(int)

    return df

In [9]:
df = load_and_preprocess(files_path)
summary = df.isna().mean().sort_values(ascending=False)
summary

Facility                          0.0
Mother Education                  0.0
Risk Factor Present               0.0
Payment Method                    0.0
Medical Provider                  0.0
Maternal Morbidity                0.0
Final Route of Delivery           0.0
Fetal Presentation at Delivery    0.0
Anesthesia Used                   0.0
Chorioamnionitis Present          0.0
Antibiotics Used                  0.0
Steriods Used                     0.0
Augmentation of Labor             0.0
Induction of Labor                0.0
Body Mass Index                   0.0
Cigarette Smoking                 0.0
Pre-natal Visits                  0.0
Pre-natal Care Begins             0.0
Father Education                  0.0
Infection Present                 0.0
dtype: float64

In [10]:
def prepare_data(df):
    """
    Splits dataset, scales features, applies SMOTE (for final training).
    Returns:
        X_train_scaled_df, X_test_scaled_df, X_train_res_df,
        y_train, y_train_res, y_test, scaler
    """
    X = df.drop("Maternal Morbidity", axis=1)
    y = df["Maternal Morbidity"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    sm = SMOTE(random_state=42)
    X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)

    X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
    X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns)
    X_train_res_df = pd.DataFrame(X_train_res, columns=X.columns)

    return (
        X_train_scaled_df,
        X_test_scaled_df,
        X_train_res_df,
        y_train,
        y_train_res,
        y_test,
        scaler
    )


In [11]:
print("Preparing data (split, scale, SMOTE)...")
(
    X_train_scaled_df,
    X_test_scaled_df,
    X_train_res_df,
    y_train,
    y_train_res,
    y_test,
    scaler
) = prepare_data(df)

feature_names = X_train_scaled_df.columns.tolist()

Preparing data (split, scale, SMOTE)...


In [None]:
def evaluate_model(model, X_test, y_test, name):
    prob = model.predict_proba(X_test)[:, 1]
    preds = (prob > 0.5).astype(int)

    return {
        "Model": name,
        "AUC": roc_auc_score(y_test, prob),
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds, zero_division=0),
        "Recall": recall_score(y_test, preds, zero_division=0),
        "F1": f1_score(y_test, preds)
    }

def get_tuning_subset(X, y, frac=0.10, random_state=42):
    """
    Returns a random subset (fraction) of the dataset for faster hyperparameter tuning.
    """
    subset = X.sample(frac=frac, random_state=random_state)
    y_subset = y.loc[subset.index]
    return subset, y_subset

In [13]:
#Select a subset of the total dataset for hyperparameter tuning
tuning_frac=0.05
print(f"\nCreating {int(tuning_frac * 100)}% subset for tuning...")
X_tune, y_tune = get_tuning_subset(X_train_scaled_df, y_train.reset_index(drop=True), frac=tuning_frac)
print(f"Tuning subset size: {X_tune.shape[0]:,} rows")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


Creating 5% subset for tuning...
Tuning subset size: 124,687 rows


In [18]:
def tune_logistic_regression(X, y, cv=None):
    if cv is None:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    base = LogisticRegression(
        max_iter=300,
        class_weight="balanced",
        solver="liblinear"
    )

    param_grid = {
        "C": np.logspace(-2, 2, 5),
        "penalty": ["l1", "l2"],
    }

    grid = GridSearchCV(
        base,
        param_grid,
        scoring="recall",
        cv=cv,
        n_jobs=-1,
        verbose=1
    )
    grid.fit(X, y)

    print("Best Logistic Regression params:", grid.best_params_)
    print("Best Logistic Regression 5-fold CV Recall:", grid.best_score_)

    return grid.best_estimator_, grid.best_score_


def tune_random_forest(X, y, cv=None):
    if cv is None:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    base = RandomForestClassifier(
        class_weight="balanced_subsample",
        n_jobs=-1,
        random_state=42
    )

    param_dist = {
        "n_estimators": [100, 200, 400],
        "max_depth": [5, 10],
        "max_features": ["sqrt", "log2"]
    }

    rand = RandomizedSearchCV(
        base,
        param_distributions=param_dist,
        n_iter=10,
        scoring="recall",
        cv=cv,
        n_jobs=1,
        random_state=42,
        verbose=1
    )
    rand.fit(X, y)

    print("Best Random Forest params:", rand.best_params_)
    print("Best Random Forest 5-fold CV Recall:", rand.best_score_)

    return rand.best_estimator_, rand.best_score_


def tune_xgboost(X, y, cv=None):
    """
    Hyperparameter tuning for XGBoost using a smaller subset and early stopping
    on an internal train/validation split.
    """
    if cv is None:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Base XGB model
    base_model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric="aucpr",
        tree_method="hist",
        n_jobs=-1,
        random_state=42
    )

    param_dist = {
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.05, 0.1],
        "n_estimators": [500],
        "subsample": [0.7, 0.8],
        "colsample_bytree": [0.7, 0.9],
    }

    rand = RandomizedSearchCV(
        base_model,
        param_distributions=param_dist,
        n_iter=20,
        scoring="recall",
        cv=StratifiedKFold(5, shuffle=True, random_state=42),
        n_jobs=-1,
        verbose=1,
        random_state=42,
    )

    rand.fit(X, y)

    print("Best params:", rand.best_params_)
    print("Best recall:", rand.best_score_)

    return rand.best_estimator_, rand.best_score_

In [19]:
print("\nTuning Logistic Regression...")
lr_best, lr_cv_recall = tune_logistic_regression(X_tune, y_tune, cv=cv)

print("\nTuning Random Forest...")
rf_best, rf_cv_recall = tune_random_forest(X_tune, y_tune, cv=cv)

print("\nTuning XGBoost (with early stopping)...")
xgb_best, xgb_cv_recall = tune_xgboost(X_tune, y_tune, cv=cv)


Tuning Logistic Regression...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Best Logistic Regression params: {'C': np.float64(0.01), 'penalty': 'l1'}
Best Logistic Regression 5-fold CV Recall: 0.5361721580127652

Tuning Random Forest...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Random Forest params: {'n_estimators': 400, 'max_features': 'log2', 'max_depth': 5}
Best Random Forest 5-fold CV Recall: 0.5126582715197516

Tuning XGBoost (with early stopping)...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best params: {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
Best recall: 0.002352941176470588


In [None]:
# Logistic Regression & RF refit normally
lr_best.fit(X_train_res_df, y_train_res)
rf_best.fit(X_train_res_df, y_train_res)

# XGBoost refit with early stopping on an internal validation split
# X_es_train, X_es_val, y_es_train, y_es_val = make_early_stopping_split(
    # X_train_res_df, y_train_res
# )

xgb_best.fit(
    X_train,
    y_es_train,
    eval_set=[(X_es_val, y_es_val)],
    eval_metric="aucpr",
    verbose=True,
)

# Evaluate on hold-out test set
print("\nEvaluating tuned models on test set...")
results = []
results.append(evaluate_model(lr_best, X_test_scaled_df, y_test, "Logistic Regression"))
results.append(evaluate_model(rf_best, X_test_scaled_df, y_test, "Random Forest"))
results.append(evaluate_model(xgb_best, X_test_scaled_df, y_test, "XGBoost"))

results_df = pd.DataFrame(results)

# Add CV Recall to the table
results_df["CV Recall (5-fold)"] = [
    lr_cv_recall,
    rf_cv_recall,
    xgb_cv_recall
]

# Pretty printed report
print("\n=== Model Performance Comparison ===")
print(results_df.to_string(index=False, float_format="%.3f"))

In [None]:
feat_imp = pd.DataFrame({"feature": X_train_res_df.columns})

# Logistic Regression (absolute coefficients)
feat_imp["LR"] = np.abs(lr_best.coef_[0])
feat_imp["RF"] = rf_best.feature_importances_
feat_imp["XGB"] = xgb_best.feature_importances_

# Normalize each column to make scales comparable
for col in ["LR", "RF", "XGB"]:
    feat_imp[col] = feat_imp[col] / feat_imp[col].max()

In [None]:
# Determine global top features across all methods
feat_imp["mean_importance"] = feat_imp[["LR", "RF", "XGB"]].mean(axis=1)
top = feat_imp.sort_values("mean_importance", ascending=False).head(20)

plt.figure(figsize=(10, 12))

bar_width = 0.25
x = np.arange(len(top))

plt.barh(x - bar_width, top["LR"], height=bar_width, label="Logistic Regression")
plt.barh(x,             top["RF"], height=bar_width, label="Random Forest")
plt.barh(x + bar_width, top["XGB"], height=bar_width, label="Gradient Boosting")

plt.yticks(x, top["feature"])
plt.gca().invert_yaxis()
plt.xlabel("Normalized Feature Importance")
plt.title("Feature Importance Comparison (LR vs RF vs XGB)", fontsize=14)
plt.legend()
plt.grid(axis="x", alpha=0.3)

plt.tight_layout()
plt.show()



In [None]:
# SHAP analysis
explainer = shap.TreeExplainer(xgb_best)
shap_values = explainer.shap_values(X_test_scaled_df)

# Global importance
shap.summary_plot(shap_values, X_test_df)

In [None]:
pdp_features = [
    "Pre-natal Visits",
    "Mother Education",
    "Father Education",
    "Final Route of Delivery",
    # "Payment Method",
    "Body Mass Index",
    "Medical Provider",
    # "Risk Factor Present",
]
pdp_features = [f for f in pdp_features if f in X_test_scaled_df.columns]

if pdp_features:
    print("\nPlotting partial dependence plots (XGBoost)...")
    PartialDependenceDisplay
    plot_partial_dependence_grid(
        xgb_best,
        X_test_scaled_df,
        pdp_features,
        n_rows=3,
        n_cols=2
    )

In [None]:
import joblib

def save_models(output_dir, lr_model, rf_model, xgb_model, scaler):
    """
    Saves trained models and scaler to disk using joblib.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    joblib.dump(lr_model, output_dir / "logistic_regression.pkl")
    joblib.dump(rf_model, output_dir / "random_forest.pkl")
    joblib.dump(xgb_model, output_dir / "xgboost_model.pkl")
    joblib.dump(scaler, output_dir / "scaler.pkl")

    print(f"\nModels saved to: {output_dir}")


In [None]:
save_dir = BASE / "trained_models"
save_models(save_dir, lr_best, rf_best, xgb_best, scaler)

In [None]:
def load_models(output_dir):
    """
    Loads previously saved models and scaler.
    """
    output_dir = Path(output_dir)

    lr_model = joblib.load(output_dir / "logistic_regression.pkl")
    rf_model = joblib.load(output_dir / "random_forest.pkl")
    xgb_model = joblib.load(output_dir / "xgboost_model.pkl")
    scaler = joblib.load(output_dir / "scaler.pkl")

    print(f"Loaded models from: {output_dir}")
    return lr_model, rf_model, xgb_model, scaler

In [None]:
lr_loaded, rf_loaded, xgb_loaded, scaler_loaded = load_models(models_path)

In [None]:
## Predictive Modeling using SHAP

model = xgb.XGBClassifier(
    max_depth=7,
    learning_rate=0.1,
    subsample=0.8,
    n_estimators=200,
    eval_metric="auc",
    tree_method="hist",
    early_stopping_rounds=50,
    random_state=42
)

model.fit(X_train_res, y_train_res)
pred = model.predict_proba(X_test_scaled)[:,1]
print("AUC:", roc_auc_score(y_test, pred))

# SHAP analysis
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test_scaled)

# Global importance
shap.summary_plot(shap_values, X_test)