<h2>Recall Optimization</h2>

In this notebook, we once again use Optuna; however, instead of using NSGA-II for variable selection, we tune model hyperparameters to achieve the best possible Recall score.

We use the TPESampler as our primary optimization algorithm, together with median pruning - that is, if a trial's intermediate Recall result is worse than the median of the earlier trials at the same step, the trial is scrapped. We compare versions of XGBoost and LogisticRegression with tuned hyperparameters, different decision boundaries, and the use of a "balanced" mode that seeks to increase the influence of the rare positive class.

More detailed results can be found in our final report!

<h2>Note</h2>

These optimization loops can also take a while, so please be patient! You will see (and can read about why in the report) that we decided to scrap one of the trials because we felt we already had enough information and a sufficiently performant model.

In [1]:
import os
import sys
from pathlib import Path
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    log_loss,
    roc_auc_score

)
from sklearn.dummy import DummyClassifier
import shap

# Make the directory two levels above the current working directory importable,
# so the project-local `capstone` package can be imported below.
# NOTE(review): this depends on the kernel's working directory - confirm the
# notebook is always launched from its own folder.
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

from capstone.expanding_scaler import global_expanding_standard_scaler_by_date

# Parent of the current working directory; data paths below are built from it.
BASE_DIR = Path().resolve().parent

In [8]:
class OptimizerClassifier():
    """Tune a binary classifier with Optuna on time-ordered cross-validation folds.

    Supported models are XGBoost (``'xgb_clf'``), logistic regression (``'lr'``)
    and random forest (``'rf'``). For each trial the candidate is trained on
    SMOTE-resampled, expanding-scaled training folds and scored on the matching
    held-out fold; the best configuration is then refitted on all rows.

    Parameters
    ----------
    search_iter : int
        Number of Optuna trials to run.
    decision_threshold : float
        Probability cut-off used to turn predicted probabilities into labels
        for the label-based metrics.
    scoring_metric : str
        One of ``'logloss'``, ``'mlogloss'``, ``'error'``, ``'merror'``,
        ``'recall'``, ``'precision'`` or ``'auc'``.
    xgb_objective : str
        Objective passed to ``XGBClassifier``; unused by the other models.
    random_state : int
        Seed shared by the sampler, SMOTE and the models.
    n_jobs : int
        Parallelism setting passed to the models.
    """

    def __init__(
            self,
            search_iter=5000,
            decision_threshold=0.5,
            scoring_metric='recall',
            xgb_objective='binary:logistic',
            random_state=42,
            n_jobs=-1
    ):
        self.search_iter = search_iter
        self.decision_threshold = decision_threshold
        self.scoring_metric = scoring_metric
        self.xgb_objective = xgb_objective
        self.random_state = random_state
        self.n_jobs = n_jobs
        # Keyword arguments shared by every model class.
        self.base_params = {
            "random_state": self.random_state,
            "n_jobs": self.n_jobs,
        }
        self.scorer = self._make_scorer_cust(self.scoring_metric)
        self.ModelClass = None
        self.binary_vars = None
        self.date_col = None
        self.cv = None
        self.scaled_data = {}
        # SMOTE-resampled training folds, keyed by split number.
        self._resampled = {}
        self.best_estimator = None
        self.best_score = None
        self.best_params = None
        self.best_use_balance = None

    def _make_scorer_cust(self, scoring_metric: str):
        """Return the ``(y_true, y_pred)`` callable that implements ``scoring_metric``.

        Raises
        ------
        ValueError
            If ``scoring_metric`` is not one of the supported names.
        """
        if scoring_metric in ['logloss', 'mlogloss']:
            return log_loss
        # Uses 1 - accuracy to align with XGBoost error.
        elif scoring_metric in ['error', 'merror']:
            return lambda y_true, y_pred: 1 - (accuracy_score(y_true, y_pred))
        elif scoring_metric == "recall":
            return recall_score
        elif scoring_metric == "precision":
            return precision_score
        elif scoring_metric == "auc":
            return roc_auc_score
        else:
            raise ValueError(f"Unsupported scoring metric: {scoring_metric}")

    def _date_order(self, X):
        """Return the index labels of ``X`` ordered by ``self.date_col``.

        A stable sort is used so that rows sharing a date keep their original
        relative order, which keeps the fold membership deterministic.
        """
        return X[self.date_col].sort_values(kind="mergesort").index

    def _scale_with_dates(self, frame, cont_cols, stats=None, return_stats=False):
        """Expanding-scale the continuous columns of ``frame`` and drop the date column.

        Binary columns are carried through untouched. When ``stats`` is given it
        is forwarded to the scaler (used to scale a test fold with the state
        learned on its training fold). Returns the scaled frame, plus the scaler
        state when ``return_stats`` is true.
        """
        scaled = frame.copy()
        scaled[cont_cols] = scaled[cont_cols].astype(float)

        scaler_kwargs = {
            "date_col": self.date_col,
            "merge_cols": [self.date_col],
            "min_periods": 0,
        }
        if stats is not None:
            scaler_kwargs["stats"] = stats
            scaler_kwargs["return_stats"] = False
        elif return_stats:
            scaler_kwargs["return_stats"] = True

        result = global_expanding_standard_scaler_by_date(
            scaled[cont_cols + [self.date_col]],
            **scaler_kwargs,
        )
        state = None
        if stats is None and return_stats:
            result, state = result

        scaled.loc[result.index, cont_cols] = result[cont_cols]
        scaled = scaled.drop(columns=[self.date_col])
        return (scaled, state) if return_stats else scaled

    def _get_scaled_train_test_groups(self, X, y):
        """Build the time-series splits and the scaled frames used by every trial.

        Fills ``self.cv`` with three ``TimeSeriesSplit`` folds (each test fold is
        10% of the rows) and ``self.scaled_data`` with ``train_<i>`` / ``test_<i>``
        frames plus an ``'all'`` frame for the final refit. Does nothing when the
        splits already exist. ``y`` is accepted for signature symmetry and is
        not used here.
        """
        if self.cv is not None:
            return

        X_sorted = X.loc[self._date_order(X)]
        cont_cols = [c for c in X.columns if c not in self.binary_vars and c != self.date_col]

        tscv = TimeSeriesSplit(
            n_splits=3,
            test_size=int(round(X_sorted.shape[0] * 0.10, 0)),
            gap=0,
        )
        splits = list(tscv.split(X_sorted))

        scaled_data = {}
        for split, (train_index, test_index) in enumerate(splits):
            train_scaled, train_state = self._scale_with_dates(
                X_sorted.iloc[train_index], cont_cols, return_stats=True
            )
            # The test fold is scaled with the state learned on its training fold.
            test_scaled = self._scale_with_dates(
                X_sorted.iloc[test_index], cont_cols, stats=train_state
            )
            scaled_data[f'train_{split}'] = train_scaled
            scaled_data[f'test_{split}'] = test_scaled

        # The refit frame keeps the binary columns too, so the final model sees
        # the same features as the models scored during cross-validation.
        scaled_data['all'] = self._scale_with_dates(X_sorted, cont_cols)

        # Publish only once everything succeeded, so a failure above cannot
        # leave a half-filled cache behind the ``self.cv`` guard.
        self.scaled_data = scaled_data
        self._resampled = {}
        self.cv = splits

    def _resampled_fold(self, split, y_train):
        """Return the SMOTE-resampled ``(X, y)`` training data for ``split``.

        SMOTE is seeded and its input never changes between trials, so the
        result is computed once and reused. This trades memory for not
        re-running SMOTE in every trial.
        """
        if split not in self._resampled:
            smote = SMOTE(random_state=self.random_state)
            self._resampled[split] = smote.fit_resample(
                self.scaled_data[f'train_{split}'], y_train
            )
        return self._resampled[split]

    def _eval_classifier(self, X, y, model_params, trial):
        """Score one hyperparameter set across the folds and return the mean score.

        For XGBoost, each fold score is reported to ``trial`` so the pruner can
        stop the trial, and the per-fold ``best_iteration`` values are stored in
        the trial's user attributes for the final refit.

        NOTE(review): XGBoost early-stops on the same fold that is then scored,
        so its fold scores are optimistic - consider a separate validation slice.

        Raises
        ------
        ValueError
            If the splits have not been built yet.
        optuna.TrialPruned
            If the pruner decides to stop the trial.
        """
        if self.cv is None:
            raise ValueError('self.cv is not set.')

        y_sorted = y.loc[self._date_order(X)]
        use_proba = self.scoring_metric in ("logloss", "mlogloss", "auc")
        is_xgb = self.ModelClass is XGBClassifier

        fold_scores = []
        best_iterations = []
        for split, (train_index, test_index) in enumerate(self.cv):
            X_train, y_train = self._resampled_fold(split, y_sorted.iloc[train_index])
            X_test = self.scaled_data[f'test_{split}']
            y_test = y_sorted.iloc[test_index]

            # We aren't using the pruning callback because it would interrupt the
            # k-fold cross-validation. Instead, we use early stopping as a parameter
            # of the model, and allow Optuna to then decide where to search next.
            model = self.ModelClass(**model_params)
            if is_xgb:
                model.fit(
                    X_train, y_train,
                    eval_set=[(X_test, y_test)],
                    verbose=False
                )
            else:
                model.fit(X_train, y_train)

            # During training with an eval_set and early_stopping_rounds,
            # XGBoost tracks the validation score at each boosting round.
            # When validation stops improving for early_stopping_rounds
            # consecutive rounds, training halts and best_iteration is then
            # set to the boosting round (0-based index) with the best validation score.
            best_iter = getattr(model, "best_iteration", None)
            if best_iter is not None:
                best_iterations.append(int(best_iter))
                y_proba_test = model.predict_proba(X_test, iteration_range=(0, best_iter + 1))[:, 1]
            else:
                y_proba_test = model.predict_proba(X_test)[:, 1]

            if use_proba:
                fold_score = self.scorer(y_test, y_proba_test)
            else:
                yhat_test = (y_proba_test >= self.decision_threshold).astype(int)
                fold_score = self.scorer(y_test, yhat_test)

            if is_xgb:
                trial.report(fold_score, step=split)
                if trial.should_prune():
                    raise optuna.TrialPruned()

            fold_scores.append(fold_score)

        if best_iterations:
            trial.set_user_attr("best_iterations", best_iterations)

        return float(np.mean(fold_scores))

    def _suggest_params(self, trial, y):
        """Build the full constructor kwargs for ``self.ModelClass`` from ``trial``.

        ``trial`` only needs the ``suggest_*`` methods, so the same code serves
        both the live search and the replay of the best trial.
        """
        if self.ModelClass is XGBClassifier:
            neg = (y == 0).sum()
            pos = (y == 1).sum()
            balance_eq = neg / pos

            use_balance = trial.suggest_categorical("use_balance_weight", [True, False])
            model_params = {
                **self.base_params,
                "n_estimators": 3000,
                "early_stopping_rounds": 50,
                "objective": self.xgb_objective,
                "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "min_child_weight": trial.suggest_float("min_child_weight", 0.5, 20.0, log=True),
                "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
                "gamma": trial.suggest_float("gamma", 1e-9, 10.0, log=True),
            }
            if use_balance:
                model_params["scale_pos_weight"] = balance_eq
        elif self.ModelClass is RandomForestClassifier:
            model_params = {
                **self.base_params,
                "bootstrap": True,
                "n_estimators": trial.suggest_int("n_estimators", 10, 300),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                "class_weight": trial.suggest_categorical("class_weight", ["balanced", None]),
            }
        else:
            model_params = {
                **self.base_params,
                "max_iter": 1000,
                "class_weight": trial.suggest_categorical("class_weight", ["balanced", None]),
                "C": trial.suggest_float("C", 1e-9, 10.0, log=True),
            }
        return model_params

    def _final_model_params(self, model_params, best_iterations=None):
        """Adapt trial-time kwargs for the final refit, which has no eval set.

        ``early_stopping_rounds`` is removed because the refit has no validation
        data to stop on. When per-fold ``best_iterations`` are available,
        ``n_estimators`` is set to their rounded mean plus one (``best_iteration``
        is a 0-based round index). The input dict is not modified.
        """
        final_params = dict(model_params)
        final_params.pop("early_stopping_rounds", None)
        if best_iterations:
            mean_best = sum(best_iterations) / len(best_iterations)
            final_params["n_estimators"] = int(round(mean_best)) + 1
        return final_params

    def _run_optimization(self, X, y):
        """Run the Optuna study and refit the best configuration on all rows.

        Sets ``best_estimator``, ``best_score``, ``best_params`` and
        ``best_use_balance`` on the instance.

        Raises
        ------
        ValueError
            If ``X``, ``y`` or ``self.ModelClass`` is missing, or the splits
            could not be built.
        """
        if X is None:
            raise ValueError('X is not set.')
        if y is None:
            raise ValueError('y is not set.')
        if self.ModelClass is None:
            raise ValueError('ModelClass is not set.')

        self._get_scaled_train_test_groups(X, y)
        if self.cv is None or not isinstance(self.cv, list):
            raise ValueError('cv_splits is not set.')

        def _objective(trial: optuna.Trial) -> float:
            model_params = self._suggest_params(trial, y)
            return self._eval_classifier(X, y, model_params, trial)

        if self.scoring_metric in ("logloss", "mlogloss", "error", "merror"):
            direction = "minimize"
        else:
            direction = "maximize"

        study = optuna.create_study(
            direction=direction,
            pruner=MedianPruner(n_min_trials=self.search_iter // 2),
            sampler=TPESampler(seed=self.random_state)
        )
        study.optimize(_objective, n_trials=self.search_iter)

        best_trial = study.best_trial
        # Replay the winning trial through the same parameter builder so the
        # refit keeps the fixed settings (objective, max_iter, scale_pos_weight, ...)
        # that ``study.best_params`` alone does not carry.
        replayed_params = self._suggest_params(optuna.trial.FixedTrial(best_trial.params), y)
        best_model_params = self._final_model_params(
            replayed_params, best_trial.user_attrs.get("best_iterations")
        )

        X_all_scaled = self.scaled_data['all']
        y_sorted = y.loc[X_all_scaled.index]

        best_estimator = self.ModelClass(**best_model_params)
        best_estimator.fit(X_all_scaled, y_sorted)

        self.best_estimator = best_estimator
        self.best_score = study.best_value
        self.best_params = best_model_params
        self.best_use_balance = best_trial.params.get("use_balance_weight")

    def fit_transform(
        self,
        X,
        y,
        date_col,
        binary_vars=None,
        model_type='xgb_clf'
    ):
        """Tune and fit the requested model type.

        Parameters
        ----------
        X : pandas.DataFrame
            Features, including ``date_col``. The caller's frame is not modified.
        y : pandas.Series
            Binary target sharing ``X``'s index.
        date_col : str
            Column used to order the rows in time.
        binary_vars : list of str, optional
            Columns that are left unscaled.
        model_type : {'xgb_clf', 'lr', 'rf'}
            Which model family to tune.

        Returns
        -------
        tuple
            ``(best_estimator, best_score, best_params)``.

        Raises
        ------
        ValueError
            If ``model_type`` is not supported.
        """
        self.date_col = date_col
        self.binary_vars = binary_vars or []

        if model_type == 'xgb_clf':
            self.ModelClass = XGBClassifier
        elif model_type == 'lr':
            self.ModelClass = LogisticRegression
        elif model_type == 'rf':
            self.ModelClass = RandomForestClassifier
        else:
            raise ValueError(f"Model type {model_type} is not supported.")

        # Work on a copy so the caller's date column is not overwritten.
        X = X.copy()
        X[self.date_col] = pd.to_datetime(X[self.date_col], errors="coerce")
        X = X.dropna(subset=[self.date_col])
        # Keep the target aligned with the rows that survived the date filter.
        y = y.loc[X.index]

        # Splits and scaled frames belong to one dataset; rebuild them per call.
        self.cv = None
        self.scaled_data = {}
        self._resampled = {}

        self._run_optimization(X, y)

        return (self.best_estimator, self.best_score, self.best_params)

In [3]:
def load_and_preprocess(
    files_path: Path,
    chunk_size: int = 100_000,
    sample_frac: float = 1.0
):
    """
    Read the natality CSV in chunks and return one cleaned DataFrame.

    Only a fixed set of columns plus every ``ld_``, ``rf_`` and ``ip_`` column
    found in the header is read. Each chunk is sampled with ``sample_frac``
    (fixed seed 42), its ``rf_``/``ip_`` columns are coerced to numbers with
    unparseable values set to 0, and they are collapsed into the
    "Risk Factor Present" and "Infection Present" 0/1 indicators. Columns are
    then renamed to readable labels. After concatenation, rows with any missing
    value are dropped and "Maternal Morbidity" is cast to int.
    """
    header = pd.read_csv(files_path, nrows=0).columns

    def _with_prefix(prefix):
        # Header columns whose lower-cased name starts with the given prefix.
        return [name for name in header if name.lower().startswith(prefix)]

    labor_cols = _with_prefix("ld_")
    risk_cols = _with_prefix("rf_")
    infection_cols = _with_prefix("ip_")

    wanted_cols = [
        "bmi", "meduc", "feduc", "precare", "previs", "cig_rec",
        "morbidity_reported", "bfacil", "pay", "attend", "me_pres", "me_rout",
        'date'] + labor_cols + risk_cols + infection_cols

    readable_names = {
        "bmi": "Body Mass Index",
        "meduc": "Mother Education",
        "feduc": "Father Education",
        "precare": "Pre-natal Care Begins",
        "previs": "Pre-natal Visits",
        "cig_rec": "Cigarette Smoking",
        "morbidity_reported": "Maternal Morbidity",
        "bfacil": "Facility",
        "pay": "Payment Method",
        "attend": "Medical Provider",
        "ld_indl": "Induction of Labor",
        "ld_augm": "Augmentation of Labor",
        "ld_ster": "Steriods Used",
        "ld_antb": "Antibiotics Used",
        "ld_chor": "Chorioamnionitis Present",
        "ld_anes": "Anesthesia Used",
        "me_pres": "Fetal Presentation at Delivery",
        "me_rout": "Final Route of Delivery",
        "date": "Date of Birth",
    }

    def _prepare(chunk):
        # Sample, build the two aggregate indicators, then drop and rename.
        part = chunk.sample(frac=sample_frac, random_state=42)

        risk_here = [c for c in risk_cols if c in part.columns]
        infection_here = [c for c in infection_cols if c in part.columns]

        for group in (risk_here, infection_here):
            if group:
                part[group] = part[group].apply(pd.to_numeric, errors="coerce").fillna(0)

        part["Risk Factor Present"] = (part[risk_here].sum(axis=1) > 0).astype(int)
        part["Infection Present"] = (part[infection_here].sum(axis=1) > 0).astype(int)

        return (
            part
            .drop(columns=risk_here + infection_here)
            .rename(columns=readable_names)
        )

    chunks = pd.read_csv(files_path, chunksize=chunk_size, usecols=wanted_cols, low_memory=False)
    df = pd.concat([_prepare(chunk) for chunk in chunks], ignore_index=True)

    df = df.dropna()
    df["Maternal Morbidity"] = df["Maternal Morbidity"].astype(int)

    return df

In [4]:
# Location of the natality sample CSV, relative to BASE_DIR.
path = BASE_DIR / 'data_main' / 'natality_aligned_10pct_sample.csv'

# Load with the default chunk size and no down-sampling (sample_frac=1.0).
clean_df = load_and_preprocess(path)

# Rows with missing values were dropped after concatenation, so the index
# shown below has gaps.
clean_df

Unnamed: 0,Facility,Mother Education,Father Education,Pre-natal Care Begins,Pre-natal Visits,Cigarette Smoking,Body Mass Index,Induction of Labor,Augmentation of Labor,Steriods Used,...,Chorioamnionitis Present,Anesthesia Used,Fetal Presentation at Delivery,Final Route of Delivery,Maternal Morbidity,Medical Provider,Payment Method,Date of Birth,Risk Factor Present,Infection Present
0,1.0,7.0,3.0,6.0,5.0,0.0,99.900002,1.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0,2.0,2.0,2020-02-03,0,0
3,1.0,3.0,3.0,3.0,7.0,0.0,19.000000,1.0,0.0,0.0,...,0.0,1.0,1.0,3.0,0,1.0,8.0,2020-01-27,0,0
4,1.0,5.0,5.0,2.0,12.0,0.0,21.000000,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0,1.0,6.0,2019-12-30,0,0
6,1.0,7.0,7.0,6.0,5.0,0.0,22.200001,1.0,0.0,0.0,...,0.0,1.0,1.0,4.0,0,1.0,1.0,2020-01-27,0,0
10,1.0,6.0,6.0,2.0,10.0,0.0,30.400000,1.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0,1.0,2.0,2020-01-13,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4157614,1.0,6.0,6.0,2.0,10.0,0.0,24.900000,0.0,0.0,0.0,...,0.0,1.0,1.0,4.0,0,1.0,2.0,2019-01-28,0,0
4157615,1.0,5.0,5.0,3.0,15.0,0.0,30.100000,1.0,0.0,0.0,...,0.0,0.0,1.0,4.0,1,1.0,2.0,2019-01-28,1,0
4157616,1.0,5.0,3.0,1.0,12.0,0.0,27.799999,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0,1.0,2.0,2019-02-04,0,0
4157617,1.0,3.0,3.0,5.0,6.0,0.0,35.400002,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0,1.0,3.0,2019-02-04,0,0


In [5]:
# NOTE(review): `nunique() < 2` selects columns with at most one distinct value,
# not multi-level categorical columns - confirm the intended selection rule
# before relying on the one-hot features.
cat_cols = [c for c in clean_df.columns if clean_df[c].nunique() < 2]

oh_df = clean_df.copy()

oh_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# clean_df's index has gaps (rows were dropped in load_and_preprocess), so the
# encoded frame must reuse that index; a fresh RangeIndex would make the
# index-based merge below drop rows and pair encodings with the wrong records.
X_one_hot = pd.DataFrame(
    oh_encoder.fit_transform(oh_df[cat_cols]),
    columns=oh_encoder.get_feature_names_out(cat_cols),
    index=oh_df.index,
)
X_one_hot = oh_df.drop(columns=cat_cols).merge(X_one_hot, left_index=True, right_index=True)

# Feature frames (one-hot and raw-coded) and the shared target.
X_one_hot = X_one_hot.drop(columns=["Maternal Morbidity"])
X_cat = clean_df.drop(columns=["Maternal Morbidity"])
y = clean_df["Maternal Morbidity"]

# Two-valued columns are passed to the optimizer so they are left unscaled.
binary_vars_oh = [var for var in X_one_hot.columns if X_one_hot[var].nunique() == 2]
binary_vars_cat = [var for var in X_cat.columns if X_cat[var].nunique() == 2]

In [6]:
# Chance-level baselines: the tuned models below should clearly beat these.
# Both are seeded so the printed baseline is reproducible on re-run.
dummy = DummyClassifier(strategy='uniform', random_state=42)
dummy2 = DummyClassifier(strategy='stratified', random_state=42)

# Fit and predict on the same feature frame (date column removed).
dummy_features = X_cat.drop('Date of Birth', axis=1)

dummy.fit(dummy_features, y)
dummy2.fit(dummy_features, y)

y_pred = dummy.predict(dummy_features)
y_pred2 = dummy2.predict(dummy_features)

# AUC here is computed from hard 0/1 predictions, not probabilities.
print(f"Dummy AUC:  {roc_auc_score(y, y_pred,  average='macro')}")
print(f"Dummy2 AUC: {roc_auc_score(y, y_pred2, average='macro')}")

Dummy AUC:  0.4988814936117456
Dummy2 AUC: 0.4996878507881219


In [None]:
# Random-forest search: 100 Optuna trials scored by AUC.
# `xgb_objective` is only read when the model type is XGBoost.
optimizer_rf = OptimizerClassifier(
    search_iter=100,
    random_state=42,
    decision_threshold=0.5,
    scoring_metric='auc',
    xgb_objective='binary:logistic',
    n_jobs=-1
)

# Uses the raw-coded features; 'Date of Birth' orders the rows in time.
best_rf_model, best_rf_score, best_rf_params = optimizer_rf.fit_transform(
    X_cat,
    y,
    date_col='Date of Birth',
    binary_vars=binary_vars_cat,
    model_type='rf'
)

In [None]:
# XGBoost search: 100 Optuna trials scored by AUC.
optimizer_xgb = OptimizerClassifier(
    search_iter=100,
    random_state=42,
    decision_threshold=0.5,
    scoring_metric='auc',
    xgb_objective='binary:logistic',
    n_jobs=-1
)

# The date column was renamed to 'Date of Birth' in load_and_preprocess, so
# that is the name to pass here ('date' no longer exists in X_cat).
best_xgb_model, best_xgb_score, best_xgb_params = optimizer_xgb.fit_transform(
    X_cat,
    y,
    date_col='Date of Birth',
    binary_vars=binary_vars_cat,
    model_type='xgb_clf'
)

In [None]:
# Logistic-regression search: 100 Optuna trials scored by AUC.
# `xgb_objective` is only read when the model type is XGBoost.
optimizer_lr = OptimizerClassifier(
    search_iter=100,
    random_state=42,
    decision_threshold=0.5,
    scoring_metric='auc',
    xgb_objective='binary:logistic',
    n_jobs=-1
)

# The date column was renamed to 'Date of Birth' in load_and_preprocess, so
# that is the name to pass here ('date' no longer exists in X_one_hot).
best_lr_model, best_lr_score, best_lr_params = optimizer_lr.fit_transform(
    X_one_hot,
    y,
    date_col='Date of Birth',
    binary_vars=binary_vars_oh,
    model_type='lr'
)