# 04. Imputation x Oversampling Method Comparison

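Compares 4 imputation methods (simple, missforest, hybrid, mice) x 5 models x 5 target time points (30, 60, 90, 180 and 365 days).
Each combination is run twice, once with ROS (RandomOverSampler) and once with SMOTE, so the two oversampling methods can be compared.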

In [None]:
import sys
sys.path.insert(0, "..")

import warnings
warnings.filterwarnings("ignore")

import os
import logging
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler

from src.config import PROJECT_ROOT, MODEL_SEED
from src.variables import CATEGORY_COLS, CODE_COLS
from src.preprocessing import make_preprocessor
from src.models import get_models_search_weighted, get_models_search_unweighted

## 1. Dataset definitions (4 imputation methods x 5 time points)

In [None]:
# ── Paths ──
IMP_BASE = PROJECT_ROOT / "data/processed_imp/260106_split_corr_LLM_ADER/imputation"

# One entry per (imputation method, target label) pair. Each entry stores the
# train/test CSV locations as strings plus the name of the target column.
datasets = {}
for imp_method in ["simple", "missforest", "hybrid", "mice"]:
    for label in ["label_30d", "label_60d", "label_90d", "label_180d", "label_365d"]:
        key = f"{imp_method}_{label}"
        train_csv = IMP_BASE / f"{imp_method}_imput/{imp_method}_{label}_train.csv"
        test_csv = IMP_BASE / f"{imp_method}_imput/{imp_method}_{label}_test.csv"
        datasets[key] = dict(
            train_path=str(train_csv),
            test_path=str(test_csv),
            target=label,
        )

print(f"Total dataset configs: {len(datasets)}")

In [None]:
# Output directories
def setup_output(method_name):
    """Create (if needed) and return the CSV output directory for a method.

    The directory is
    ``PROJECT_ROOT/results/new_analysis/imput_sampling_test/<method_name>/final_output``.
    Missing parent directories are created; an existing directory is left
    untouched.
    """
    csv_out = (
        PROJECT_ROOT
        / f"results/new_analysis/imput_sampling_test/{method_name}"
        / "final_output"
    )
    csv_out.mkdir(parents=True, exist_ok=True)
    return csv_out

def load_data(train_path, test_path, target):
    """Read the train and test CSVs and split each into features and target.

    Parameters
    ----------
    train_path, test_path
        Locations of the CSV files, passed straight to ``pd.read_csv``.
    target
        Name of the column to use as the label; it is removed from the
        feature frames.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    # Read both files first, then split, so I/O errors surface before any
    # column lookup.
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]

    parts = []
    for frame in frames:
        parts.append(frame.drop(columns=[target]))
        parts.append(frame[target])
    return tuple(parts)

## 2. Pipeline runner

In [None]:
# Shared hyper-parameter search settings consumed by run_pipeline:
# a seeded, shuffled 5-fold stratified splitter and the scoring metric name.
cv = StratifiedKFold(
    random_state=MODEL_SEED,
    shuffle=True,
    n_splits=5,
)
auc_scorer = "roc_auc"

def run_pipeline(X_train, y_train, X_test, y_test, ds_name,
                 numeric_cols, categorical_cols, code_cols,
                 models_search, oversampler=None):
    """Tune each candidate model on the training split and score it on the test split.

    For every entry in ``models_search`` an imblearn pipeline of
    preprocessor -> (optional) oversampler -> classifier is tuned with
    ``BayesSearchCV`` (20 iterations, the module-level ``cv`` splitter and
    ``auc_scorer`` metric). The fitted search object is then used to predict
    on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used for the hyper-parameter search.
    X_test, y_test
        Held-out features and labels used only for the reported metrics.
    ds_name
        Label stored in each result row and shown in the progress bar.
    numeric_cols, categorical_cols, code_cols
        Column groups forwarded to ``make_preprocessor``.
    models_search
        Mapping of model name -> ``(estimator, search_space)``.
    oversampler
        Optional sampler inserted between the preprocessor and the
        classifier; ``None`` disables oversampling.

    Returns
    -------
    list[dict]
        One row per model with keys ``dataset``, ``model``, ``accuracy``,
        ``precision``, ``recall``, ``f1`` and ``roc_auc``. Precision, recall
        and F1 are macro-averaged.
    """
    pre = make_preprocessor(X_train, numeric_cols, categorical_cols, code_cols)
    results = []

    for name, (model, space) in tqdm(models_search.items(), desc=f"[{ds_name}]", leave=False):
        # The sampler lives inside the pipeline so it is part of the estimator
        # that BayesSearchCV fits on each CV training fold.
        steps = [("pre", pre)]
        if oversampler is not None:
            steps.append(("oversample", oversampler))
        steps.append(("clf", model))
        pipe = ImbPipeline(steps)

        bayes = BayesSearchCV(
            pipe, search_spaces=space, n_iter=20, cv=cv,
            scoring=auc_scorer, n_jobs=1, random_state=MODEL_SEED,
        )
        bayes.fit(X_train, y_train)

        preds = bayes.predict(X_test)
        # Column 1 of predict_proba is used as the score for ROC-AUC.
        proba = bayes.predict_proba(X_test)[:, 1]

        results.append({
            "dataset": ds_name, "model": name,
            "accuracy": accuracy_score(y_test, preds),
            "precision": precision_score(y_test, preds, average="macro", zero_division=0),
            "recall": recall_score(y_test, preds, average="macro", zero_division=0),
            # zero_division=0 matches precision/recall above, so all three
            # metrics handle a never-predicted class the same explicit way.
            "f1": f1_score(y_test, preds, average="macro", zero_division=0),
            "roc_auc": roc_auc_score(y_test, proba),
        })
    return results

## 3. Run: ROS (RandomOverSampler)

In [None]:
# Run every dataset config through run_pipeline with random oversampling,
# then write all result rows to a single CSV.
csv_out_ros = setup_output("ROS")
all_results_ros = {}

for ds_name, cfg in tqdm(datasets.items(), desc="ROS Datasets"):
    X_tr, y_tr, X_te, y_te = load_data(cfg["train_path"], cfg["test_path"], cfg["target"])

    # Every column that is neither categorical nor a code column is numeric;
    # the categorical/code groups are limited to columns present in this data.
    non_numeric = CATEGORY_COLS + CODE_COLS
    numeric_cols = [c for c in X_tr.columns if c not in non_numeric]
    categorical_cols = [c for c in CATEGORY_COLS if c in X_tr.columns]
    code_cols = [c for c in CODE_COLS if c in X_tr.columns]

    key = f"{ds_name}_oversample"
    all_results_ros[key] = run_pipeline(
        X_tr, y_tr, X_te, y_te, key,
        numeric_cols=numeric_cols,
        categorical_cols=categorical_cols,
        code_cols=code_cols,
        models_search=get_models_search_unweighted(),
        oversampler=RandomOverSampler(random_state=MODEL_SEED),
    )

ros_frames = [pd.DataFrame(rows) for rows in all_results_ros.values()]
df_ros = pd.concat(ros_frames, ignore_index=True)
df_ros.to_csv(csv_out_ros / "all_results.csv", index=False)
print(f"ROS results saved: {len(df_ros)} rows")

## 4. Run: SMOTE

In [None]:
# Run every dataset config through run_pipeline with SMOTE oversampling,
# then write all result rows to a single CSV.
csv_out_smote = setup_output("SMOTE")
all_results_smote = {}

for ds_name, cfg in tqdm(datasets.items(), desc="SMOTE Datasets"):
    X_tr, y_tr, X_te, y_te = load_data(cfg["train_path"], cfg["test_path"], cfg["target"])

    # Every column that is neither categorical nor a code column is numeric;
    # the categorical/code groups are limited to columns present in this data.
    non_numeric = CATEGORY_COLS + CODE_COLS
    numeric_cols = [c for c in X_tr.columns if c not in non_numeric]
    categorical_cols = [c for c in CATEGORY_COLS if c in X_tr.columns]
    code_cols = [c for c in CODE_COLS if c in X_tr.columns]

    key = f"{ds_name}_oversample"
    all_results_smote[key] = run_pipeline(
        X_tr, y_tr, X_te, y_te, key,
        numeric_cols=numeric_cols,
        categorical_cols=categorical_cols,
        code_cols=code_cols,
        models_search=get_models_search_unweighted(),
        oversampler=SMOTE(random_state=MODEL_SEED),
    )

smote_frames = [pd.DataFrame(rows) for rows in all_results_smote.values()]
df_smote = pd.concat(smote_frames, ignore_index=True)
df_smote.to_csv(csv_out_smote / "all_results.csv", index=False)
print(f"SMOTE results saved: {len(df_smote)} rows")