# 05. Stage 1 Modeling — Nested CV & Ablation Study

- Nested CV (5-fold outer, 3-fold inner) with hyperparameter tuning via BayesSearchCV.
- Ablation over four feature sets: Base / Base+LLM / Base+Lab / All_Features.
- DeLong's test compares the AUC of each augmented feature set against Base.

In [None]:
import sys
sys.path.insert(0, "..")

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
from skopt import BayesSearchCV

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

from src.config import PROJECT_ROOT, MODEL_SEED, LABELS, MODELS_TO_RUN
from src.variables import LLM_COLS, LAB_COLS, CODE_COLS, CATEGORY_COLS
from src.preprocessing import create_preprocessor
from src.models import get_models_and_search_space
from src.evaluation import delong_test, auc_diff_with_ci


## 1. Path configuration

In [None]:
# ⚠️ Experiment-specific locations: review all three before every run ⚠️
# Input: per-label training CSVs (simple_<label>_train.csv).
DATA_DIR = PROJECT_ROOT.joinpath(
    "data/processed_imp/260114_split_corr_LLM_ADER/imputation/simple_imput"
)
# Input: per-label selected-feature lists (final_features_<label>.csv).
FS_DIR = PROJECT_ROOT.joinpath(
    "results/new_analysis/260114_qwen/Feature_Selection/simple_20/step2_FS"
)
# Output: result tables and figures; created on demand, existing folder is reused.
OUT_DIR = PROJECT_ROOT.joinpath(
    "results/new_analysis/260114_qwen/modeling/step1_modeling/simple_20"
)
OUT_DIR.mkdir(parents=True, exist_ok=True)

## 2. Nested CV & Ablation Study

In [None]:
all_cv_results = []
predictions_for_delong = {}

# One pair of seeded splitters is shared by every label / feature set / model.
# NOTE(review): the DeLong cell pairs predictions of two feature sets row by
# row, which relies on these splits being reproducible for a given label.
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=MODEL_SEED)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=MODEL_SEED)

# Metric names, in the order they appear as columns of the results table.
_METRICS = ("auc", "f1", "precision", "recall", "accuracy")


def _split_feature_types(columns):
    """Partition *columns* into (numeric, categorical, code) feature lists.

    A column is numeric when it is in neither CODE_COLS nor CATEGORY_COLS;
    a column listed in both is treated as a code column only.
    """
    present = set(columns)
    code_set = set(CODE_COLS)
    non_numeric = code_set | set(CATEGORY_COLS)
    numeric = [c for c in columns if c not in non_numeric]
    categorical = [c for c in CATEGORY_COLS if c in present and c not in code_set]
    code = [c for c in CODE_COLS if c in present]
    return numeric, categorical, code


def _score_fold(y_true, y_pred, y_pred_proba):
    """Return the five outer-fold metrics as a dict keyed like _METRICS."""
    # zero_division=0 everywhere: a fold without predicted (or true) positives
    # scores 0 for precision, recall and F1 alike instead of only precision.
    return {
        "auc": roc_auc_score(y_true, y_pred_proba),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "accuracy": accuracy_score(y_true, y_pred),
    }


def _nested_cv(model, search_space, X, y, feature_types):
    """Run the outer CV loop for one model on one feature set.

    For each outer fold a preprocessing + SMOTE + classifier pipeline is tuned
    with BayesSearchCV on the inner folds, refit, and scored on the held-out
    outer fold.

    Returns:
        (outer_scores, df_pred): ``outer_scores`` maps each metric name to the
        list of per-fold values; ``df_pred`` holds the pooled out-of-fold
        predictions (columns ``idx``, ``y_true``, ``y_pred_proba``) sorted by
        the original row index.
    """
    numeric_features, categorical_features, code_features = feature_types
    outer_scores = {metric: [] for metric in _METRICS}
    fold_predictions = {"idx": [], "y_true": [], "y_pred_proba": []}

    for train_idx, val_idx in outer_cv.split(X, y):
        X_train_outer, y_train_outer = X.iloc[train_idx], y.iloc[train_idx]
        X_val_outer, y_val_outer = X.iloc[val_idx], y.iloc[val_idx]

        # Fresh list copies per fold, as in the per-fold lists built originally.
        preprocessor = create_preprocessor(
            list(numeric_features), list(categorical_features), list(code_features)
        )
        pipeline = ImbPipeline([
            ("preprocessor", preprocessor),
            ("smote", SMOTE(sampling_strategy="minority", random_state=MODEL_SEED)),
            ("clf", model),
        ])

        bayes_search = BayesSearchCV(
            estimator=pipeline, search_spaces=search_space,
            n_iter=10, cv=inner_cv, scoring="roc_auc",
            n_jobs=-1, random_state=MODEL_SEED, refit=True,
        )
        bayes_search.fit(X_train_outer, y_train_outer)

        y_pred_proba = bayes_search.predict_proba(X_val_outer)[:, 1]
        y_pred = bayes_search.predict(X_val_outer)

        for metric, value in _score_fold(y_val_outer, y_pred, y_pred_proba).items():
            outer_scores[metric].append(value)

        fold_predictions["idx"].append(X_val_outer.index.values)
        fold_predictions["y_true"].append(y_val_outer.values)
        fold_predictions["y_pred_proba"].append(y_pred_proba)

    pooled = {key: np.concatenate(parts) for key, parts in fold_predictions.items()}
    df_pred = pd.DataFrame(pooled).sort_values("idx").reset_index(drop=True)
    return outer_scores, df_pred


for label in LABELS:
    print(f"\n{'='*20} {label} {'='*20}")
    train_file = DATA_DIR / f"simple_{label}_train.csv"
    feature_file = FS_DIR / f"final_features_{label}.csv"

    # Both inputs are required; a label missing either one is skipped rather
    # than aborting the labels that can still be modeled.
    missing_files = [p for p in (train_file, feature_file) if not p.exists()]
    if missing_files:
        for missing in missing_files:
            print(f"File not found, skipping: {missing.name}")
        continue

    df_train = pd.read_csv(train_file)
    final_features = pd.read_csv(feature_file)["feature"].tolist()

    selected_lab = [f for f in final_features if f in LAB_COLS]
    selected_base = [f for f in final_features if f not in LAB_COLS and f not in LLM_COLS]

    # Ablation: LLM columns are always added in full, lab columns only when
    # they survived feature selection.
    variable_sets = {
        "Base": selected_base,
        "Base_LLM": selected_base + LLM_COLS,
        "Base_Lab": selected_base + selected_lab,
        "All_Features": selected_base + selected_lab + LLM_COLS,
    }

    predictions_for_delong[label] = {}

    for set_name, features in variable_sets.items():
        print(f"\n--- {set_name} ({len(features)} features) ---")
        X_train_full = df_train[features]
        y_train_full = df_train[label]
        models = get_models_and_search_space(MODELS_TO_RUN)
        predictions_for_delong[label][set_name] = {}

        # Column typing depends only on the feature set, not on the fold.
        feature_types = _split_feature_types(list(X_train_full.columns))

        for model_name, (model, search_space) in models.items():
            outer_scores, df_pred = _nested_cv(
                model, search_space, X_train_full, y_train_full, feature_types
            )
            predictions_for_delong[label][set_name][model_name] = df_pred

            result = {"label": label, "variable_set": set_name, "model": model_name}
            for metric in _METRICS:
                result[f"mean_{metric}"] = np.mean(outer_scores[metric])
                result[f"std_{metric}"] = np.std(outer_scores[metric])
            result["cv_auc_scores"] = outer_scores["auc"]
            all_cv_results.append(result)
            print(f"  {model_name}: AUC={result['mean_auc']:.4f}±{result['std_auc']:.4f}, F1={result['mean_f1']:.4f}")

results_df = pd.DataFrame(all_cv_results)
results_df.to_csv(OUT_DIR / "modeling_ablation_results_full.csv", index=False, encoding="utf-8-sig")
print("\nNested CV & Ablation complete!")

## 3. DeLong's test — AUC improvement with 95% CI

In [None]:
# Feature sets compared against "Base", in the order they are reported.
COMPARISON_SETS = ["Base_LLM", "Base_Lab", "All_Features"]

# Declared up front so the table keeps its schema even when no comparison
# could be computed (otherwise the column accesses below raise KeyError).
IMPROVEMENT_COLUMNS = [
    "label", "model", "comparison",
    "auc_new", "auc_base", "delta_auc", "delta_auc_%",
    "ci_low", "ci_high", "z", "p", "significant (p<0.05)",
]

improvement_rows = []

for label in LABELS:
    # Labels skipped during modeling have no entry here.
    label_preds = predictions_for_delong.get(label, {})

    for model_name in MODELS_TO_RUN:
        base = label_preds.get("Base", {}).get(model_name)
        if base is None:
            print(f"Skipped: {label}, {model_name} -> no Base predictions")
            continue
        y_true = base["y_true"].values

        for comp_name in COMPARISON_SETS:
            comp = label_preds.get(comp_name, {}).get(model_name)
            if comp is None:
                continue

            # One try per comparison: a failure here must not discard the
            # remaining comparisons for this label/model.
            try:
                # The test is paired, so both prediction tables must describe
                # the same rows in the same order.
                if not np.array_equal(comp["idx"].values, base["idx"].values):
                    raise ValueError("out-of-fold rows are not aligned with Base")
                stat = auc_diff_with_ci(y_true, comp["y_pred_proba"].values, base["y_pred_proba"].values)
                row = {
                    "label": label, "model": model_name,
                    "comparison": f"{comp_name} vs Base",
                    "auc_new": stat["auc_new"], "auc_base": stat["auc_old"],
                    "delta_auc": stat["delta_auc"],
                    # Relative change in percent; undefined for a zero baseline.
                    "delta_auc_%": 100.0 * stat["delta_auc"] / stat["auc_old"] if stat["auc_old"] > 0 else np.nan,
                    "ci_low": stat["ci_low"], "ci_high": stat["ci_high"],
                    "z": stat["z"], "p": stat["p"],
                    "significant (p<0.05)": "Yes" if stat["p"] < 0.05 else "No",
                }
            except (KeyError, ValueError) as e:
                print(f"Skipped: {label}, {model_name}, {comp_name} -> {e}")
            else:
                improvement_rows.append(row)

improvement_df = pd.DataFrame(improvement_rows, columns=IMPROVEMENT_COLUMNS)
# Ordered categorical so sorting follows the ablation order, not the alphabet.
improvement_df["comparison"] = pd.Categorical(
    improvement_df["comparison"],
    categories=[f"{name} vs Base" for name in COMPARISON_SETS],
    ordered=True,
)
improvement_df = improvement_df.sort_values(["label", "model", "comparison"]).reset_index(drop=True)
improvement_df.to_csv(OUT_DIR / "auc_improvement_with_ci.csv", index=False, encoding="utf-8-sig")
print("DeLong AUC improvement results saved.")

## 4. Visualization

In [None]:
# One grouped bar chart per label: mean outer-fold AUC by model, coloured by
# feature set, written to OUT_DIR as performance_comparison_<label>.png.
for label in LABELS:
    # results_df has no columns at all when every label was skipped upstream,
    # so guard the column access as well as the per-label subset.
    if "label" not in results_df.columns:
        subset = results_df.iloc[0:0]
    else:
        subset = results_df[results_df["label"] == label]
    if subset.empty:
        print(f"No results for {label}; figure skipped.")
        continue

    plt.figure(figsize=(12, 7))
    sns.barplot(data=subset, x="model", y="mean_auc", hue="variable_set")
    plt.title(f"Model Performance - {label}")
    plt.ylabel("Mean ROC AUC (5-fold)")
    # Zoom the y-axis to just below the weakest bar, but never below 0.5.
    plt.ylim(bottom=max(0.5, subset["mean_auc"].min() - 0.05))
    plt.legend(title="Variable Set")
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.savefig(OUT_DIR / f"performance_comparison_{label}.png")
    plt.close()
print("Figures saved.")