In [7]:
import os, sys, json, warnings
warnings.filterwarnings("ignore")
from pathlib import Path
import numpy as np
import pandas as pd

# --- Repo root (notebook/script-safe)
# A script exposes __file__, and the root is then the parent of the script's
# directory. A notebook kernel has no __file__, so the working directory is the
# anchor (stepping up once when it is the "notebooks" folder).
if "__file__" not in globals():
    CWD = Path.cwd()
    if CWD.name == "notebooks":
        ROOT = CWD.parent
    else:
        ROOT = CWD
else:
    ROOT = Path(__file__).resolve().parents[1]
# Put the root first on the import path so that `src.*` can be imported below.
sys.path.insert(0, str(ROOT))

from src.data_loader import load_and_save_data

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    roc_auc_score, average_precision_score, precision_recall_curve
)

import matplotlib
matplotlib.use("Agg")  
import matplotlib.pyplot as plt

# --- Defaults for notebooks
# Under IPython/Jupyter, pre-fill the tunables; setdefault leaves any value that is
# already present in the environment untouched.
if 'get_ipython' in globals():
    for _name, _value in (
        ("CV", "3"),
        ("RND", "42"),
        ("TE_CAT", "1"),
        ("GBM_CHECK", "1"),
        ("TRAIN_SAMPLE_N", "250000"),
        ("TE_ALPHA", "10"),
    ):
        os.environ.setdefault(_name, _value)

# --- Runtime params
# Every parameter is read from the environment, with a script-mode fallback.
RND = int(os.environ.get("RND", "42"))
CV = int(os.environ.get("CV", "3"))
C_LOGREG = float(os.environ.get("C", "1.0"))
N_SAMPLE = int(os.environ.get("TRAIN_SAMPLE_N", "250000"))  # 0 = all
# The two switches are on only when the variable parses to exactly 1.
TE_CAT = int(os.environ.get("TE_CAT", "0")) == 1
GBM_CHECK = int(os.environ.get("GBM_CHECK", "0")) == 1
TE_ALPHA = float(os.environ.get("TE_ALPHA", "10"))

# --- Helpers
def ohe_fallback():
    """Return a OneHotEncoder built from the first keyword set the constructor accepts.

    Candidates, in order of preference:
      1. infrequent-category handling (``min_frequency=0.01``) with ``sparse_output=True``
      2. ``handle_unknown="ignore"`` with ``sparse=True``
      3. ``handle_unknown="ignore"`` with ``sparse=False``

    A ``TypeError`` raised by the constructor moves on to the next candidate; a
    ``TypeError`` from the last candidate is not caught.
    """
    preferred = dict(handle_unknown="infrequent_if_exist", min_frequency=0.01, sparse_output=True)
    legacy_sparse = dict(handle_unknown="ignore", sparse=True)
    legacy_dense = dict(handle_unknown="ignore", sparse=False)
    for kwargs in (preferred, legacy_sparse):
        try:
            return OneHotEncoder(**kwargs)
        except TypeError:
            continue
    return OneHotEncoder(**legacy_dense)

def split_cols(cols):
    """Partition column names into categorical, binary and numeric groups.

    Args:
        cols: Iterable of column-name strings. Any iterable is accepted,
            including a one-shot generator; it is materialised once.

    Returns:
        Tuple ``(cat, bin_, num)`` of lists in input order: names ending in
        ``"_cat"``, names ending in ``"_bin"``, and every remaining name
        except ``"target"``.
    """
    # Materialise first: the names are scanned several times, which would silently
    # produce empty bin/num groups if `cols` were an iterator.
    cols = list(cols)
    cat = [c for c in cols if c.endswith("_cat")]
    bin_ = [c for c in cols if c.endswith("_bin")]
    # A set keeps the exclusion test constant-time per column.
    grouped = set(cat) | set(bin_)
    num = [c for c in cols if c not in grouped and c != "target"]
    return cat, bin_, num

def fe_simple(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with simple row-wise engineered columns appended.

    Adds ``missing_count`` (number of NaN cells per row) and, when at least one
    column name ends in ``"_bin"``, ``sum_all_bin`` (row sum over those columns).
    The input frame is left unchanged.
    """
    out = X.copy()
    out["missing_count"] = out.isna().sum(axis=1)
    binary_cols = list(filter(lambda name: name.endswith("_bin"), out.columns))
    if not binary_cols:
        return out
    out["sum_all_bin"] = out[binary_cols].sum(axis=1)
    return out

def build_pre(cat, bin_, num):
    """Build the ColumnTransformer used ahead of the one-hot logistic regression.

    Args:
        cat: Column names that are most-frequent imputed, then one-hot encoded.
        bin_: Column names that are most-frequent imputed only.
        num: Column names that are median imputed, then standard-scaled.

    Returns:
        An unfitted ``ColumnTransformer``; columns not listed are dropped.
    """
    categorical = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", ohe_fallback()),
    ])
    binary = Pipeline(steps=[("imp", SimpleImputer(strategy="most_frequent"))])
    numeric = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
    ])
    transformers = [("cat", categorical, cat), ("bin", binary, bin_), ("num", numeric, num)]
    return ColumnTransformer(transformers, remainder="drop")

def make_feature_set(df, drop_calc=True, extra_drop=None, add_extras=True, drop_groups=None):
    """Build the feature frame for one candidate configuration.

    Args:
        df: Input frame; a ``target`` column, if present, is removed.
        drop_calc: Drop every column whose name starts with ``ps_calc_``.
        extra_drop: Optional iterable of further column names to drop; names
            that are not present are ignored.
        add_extras: Append the engineered columns produced by ``fe_simple``.
        drop_groups: Optional mapping with truthy flags under ``"cat"``,
            ``"bin"``, ``"num"`` and/or ``"extras"``; each flag removes that
            whole column group. The groups are disjoint: ``"num"`` covers only
            the non-engineered numeric columns, ``"extras"`` covers the
            engineered ones.

    Returns:
        A new DataFrame with ``-1`` replaced by NaN; ``df`` is not modified.
    """
    X = df.drop(columns=["target"], errors="ignore").replace(-1, np.nan)
    if drop_calc:
        X = X.drop(columns=[c for c in X.columns if c.startswith("ps_calc_")], errors="ignore")
    if extra_drop:
        X = X.drop(columns=list(extra_drop), errors="ignore")

    extras_cols = []
    if add_extras:
        X = fe_simple(X)
        extras_cols = [c for c in ("missing_count", "sum_all_bin") if c in X.columns]

    if drop_groups:
        cat, bin_, num = split_cols(X.columns)
        # The engineered columns carry no _cat/_bin suffix, so split_cols reports them
        # as numeric. Take them out of "num" so that only the "extras" flag removes them.
        num = [c for c in num if c not in extras_cols]
        to_drop = []
        if drop_groups.get("cat"):
            to_drop += cat
        if drop_groups.get("bin"):
            to_drop += bin_
        if drop_groups.get("num"):
            to_drop += num
        if drop_groups.get("extras"):
            to_drop += extras_cols
        X = X.drop(columns=to_drop, errors="ignore")
    return X

def cv_scores_ohe(X, y, C=1.0, CV=3, seed=RND):
    """Cross-validated scores for the one-hot + logistic-regression pipeline.

    Args:
        X: Feature frame; column groups are derived from the column names.
        y: Target series positionally aligned with ``X``.
        C: Inverse regularisation strength passed to ``LogisticRegression``.
        CV: Number of stratified folds.
        seed: Random state for both the fold shuffle and the classifier.

    Returns:
        Tuple ``(roc_auc, average_precision)`` computed on the out-of-fold
        class-1 probabilities of all rows.
    """
    cat_cols, bin_cols, num_cols = split_cols(X.columns)
    classifier = LogisticRegression(
        penalty="l2", solver="saga", C=C, class_weight="balanced",
        max_iter=4000, random_state=seed,
    )
    model = Pipeline([("pre", build_pre(cat_cols, bin_cols, num_cols)), ("clf", classifier)])
    folds = StratifiedKFold(n_splits=CV, shuffle=True, random_state=seed)
    oof = np.zeros(len(y), dtype=float)
    for train_pos, test_pos in folds.split(X, y):
        # The same pipeline object is refitted on each fold's training rows.
        fitted = model.fit(X.iloc[train_pos], y.iloc[train_pos])
        oof[test_pos] = fitted.predict_proba(X.iloc[test_pos])[:, 1]
    return roc_auc_score(y, oof), average_precision_score(y, oof)

# --- Target Encoding
def _kfold_target_encode(train_cat: pd.DataFrame, y_tr: pd.Series, valid_cat: pd.DataFrame, n_splits=3, alpha=10, seed=RND):
    """Smoothed target encoding: out-of-fold for training rows, full-fit for validation rows.

    Every category value is encoded as
    ``(mean * count + global_mean * alpha) / (count + alpha)``, where ``mean`` and
    ``count`` are the target statistics of that category and ``global_mean`` is
    ``y_tr.mean()``. Values without statistics are filled with ``global_mean``.

    Args:
        train_cat: Categorical training columns.
        y_tr: Training target, positionally aligned with ``train_cat``.
        valid_cat: The same columns for the rows to encode with full-train statistics.
        n_splits: Number of stratified folds for the out-of-fold training encoding.
        alpha: Smoothing weight of the global mean.
        seed: Random state of the fold shuffle.

    Returns:
        Tuple ``(tr_enc, va_enc)`` of DataFrames indexed like the inputs, with one
        ``te_<column>`` column per input column.
    """
    # Nothing to encode: same early exit as the other copies of this helper in the notebook.
    if train_cat.shape[1] == 0:
        return pd.DataFrame(index=train_cat.index), pd.DataFrame(index=valid_cat.index)

    # The fold assignment does not depend on the column being encoded, so the
    # stratified split is computed once instead of once per column.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    folds = list(skf.split(train_cat, y_tr))
    global_mean = y_tr.mean()

    def _smoothed_means(keys, target):
        # Per-category target mean, shrunk towards the global mean by `alpha`.
        stats = target.groupby(keys).agg(['mean', 'count'])
        return (stats['mean'] * stats['count'] + global_mean * alpha) / (stats['count'] + alpha)

    tr_enc = pd.DataFrame(index=train_cat.index)
    va_enc = pd.DataFrame(index=valid_cat.index)
    for col in train_cat.columns:
        oof = pd.Series(index=train_cat.index, dtype=float)
        for tr_idx, va_idx in folds:
            fold_map = _smoothed_means(train_cat.iloc[tr_idx][col], y_tr.iloc[tr_idx])
            oof.iloc[va_idx] = train_cat.iloc[va_idx][col].map(fold_map)
        tr_enc[col] = oof.fillna(global_mean)
        # Validation rows use statistics from the complete training block.
        full_map = _smoothed_means(train_cat[col], y_tr)
        va_enc[col] = valid_cat[col].map(full_map).fillna(global_mean)
    tr_enc.columns = [f"te_{c}" for c in tr_enc.columns]
    va_enc.columns = [f"te_{c}" for c in va_enc.columns]
    return tr_enc, va_enc

def _prep_te_blocks(X_tr, X_va):
    """Impute each column group (and scale the numerics), fitting on ``X_tr`` only.

    Categorical and binary columns are most-frequent imputed, numeric columns are
    median imputed and then standard-scaled. Every fitted transformer is applied
    unchanged to ``X_va``.

    Returns:
        ``(Xtr_cat, Xva_cat, Xtr_bin, Xva_bin, Xtr_num, Xva_num)`` as DataFrames
        that keep the index of their source frame; an empty group yields a
        zero-column frame.
    """
    cat, bin_, num = split_cols(X_tr.columns)

    def _impute_pair(strategy, cols):
        # Returns the (train, valid) frames for one column group.
        if not cols:
            return (
                pd.DataFrame(np.empty((len(X_tr), 0)), columns=cols, index=X_tr.index),
                pd.DataFrame(np.empty((len(X_va), 0)), columns=cols, index=X_va.index),
            )
        imputer = SimpleImputer(strategy=strategy)
        fitted = pd.DataFrame(imputer.fit_transform(X_tr[cols]), columns=cols, index=X_tr.index)
        applied = pd.DataFrame(imputer.transform(X_va[cols]), columns=cols, index=X_va.index)
        return fitted, applied

    Xtr_cat, Xva_cat = _impute_pair("most_frequent", cat)
    Xtr_bin, Xva_bin = _impute_pair("most_frequent", bin_)
    Xtr_num, Xva_num = _impute_pair("median", num)

    if Xtr_num.shape[1]:
        scaler = StandardScaler(with_mean=True, with_std=True)
        Xtr_num = pd.DataFrame(scaler.fit_transform(Xtr_num), columns=Xtr_num.columns, index=Xtr_num.index)
        Xva_num = pd.DataFrame(scaler.transform(Xva_num), columns=Xva_num.columns, index=Xva_num.index)
    return (Xtr_cat, Xva_cat, Xtr_bin, Xva_bin, Xtr_num, Xva_num)

def cv_scores_te(X, y, C=1.0, CV=3, seed=RND, alpha=TE_ALPHA):
    """Cross-validated scores for the target-encoding + logistic-regression variant.

    For every stratified fold the preprocessing and the target encoding are fitted
    on the fold's training rows only; the classifier sees the numeric, binary and
    target-encoded blocks concatenated in that order.

    Returns:
        Tuple ``(roc_auc, average_precision)`` computed on the out-of-fold
        class-1 probabilities of all rows.
    """
    splitter = StratifiedKFold(n_splits=CV, shuffle=True, random_state=seed)
    oof = np.zeros(len(y), dtype=float)
    for fit_pos, val_pos in splitter.split(X, y):
        X_fit, X_val = X.iloc[fit_pos], X.iloc[val_pos]
        y_fit = y.iloc[fit_pos]
        fit_cat, val_cat, fit_bin, val_bin, fit_num, val_num = _prep_te_blocks(X_fit, X_val)
        if not fit_cat.shape[1]:
            # No categorical columns: use empty placeholders so the concat below still works.
            fit_te = pd.DataFrame(index=X_fit.index)
            val_te = pd.DataFrame(index=X_val.index)
        else:
            fit_te, val_te = _kfold_target_encode(fit_cat, y_fit, val_cat, n_splits=CV, alpha=alpha, seed=seed)
        design_fit = pd.concat([fit_num, fit_bin, fit_te], axis=1)
        design_val = pd.concat([val_num, val_bin, val_te], axis=1)
        model = LogisticRegression(
            penalty="l2", solver="lbfgs", C=C, class_weight="balanced",
            max_iter=4000, random_state=seed,
        )
        model.fit(design_fit, y_fit)
        oof[val_pos] = model.predict_proba(design_val)[:, 1]
    return roc_auc_score(y, oof), average_precision_score(y, oof)

# --- Optional: LightGBM check 
def holdout_gbm_check(X_tr, y_tr, X_te, y_te, seed=RND):
    """Fit a LightGBM classifier on the training split and score it on the holdout.

    Returns:
        Tuple ``(roc_auc, average_precision)`` of floats, or ``None`` (after
        printing a notice) when ``lightgbm`` cannot be imported.
    """
    try:
        from lightgbm import LGBMClassifier
    except Exception:
        # Deliberately best-effort: the check is optional, so any import failure skips it.
        print("[GBM] LightGBM not available – skip.")
        return None

    params = dict(
        n_estimators=300, learning_rate=0.1, num_leaves=31,
        subsample=0.8, colsample_bytree=0.8, reg_lambda=0.0,
        random_state=seed, n_jobs=-1,
    )
    booster = LGBMClassifier(**params)
    booster.fit(X_tr, y_tr)
    scores = booster.predict_proba(X_te)[:, 1]
    auc = float(roc_auc_score(y_te, scores))
    ap = float(average_precision_score(y_te, scores))
    return auc, ap

# --- Main
def _fit_predict_holdout(X_tr, y_tr, X_te):
    """Fit the logistic-regression variant selected by TE_CAT and score ``X_te``.

    Args:
        X_tr: Training feature frame (output of ``make_feature_set``).
        y_tr: Training target; it is re-aligned to ``X_tr.index`` before fitting.
        X_te: Holdout feature frame built with the same recipe as ``X_tr``.

    Returns:
        1-D array holding the second ``predict_proba`` column for each row of ``X_te``.
    """
    y_fit = y_tr.loc[X_tr.index]
    if TE_CAT:
        # Target-encoding path: explicit preprocessing blocks, then lbfgs.
        Xtr_cat, Xva_cat, Xtr_bin, Xva_bin, Xtr_num, Xva_num = _prep_te_blocks(X_tr, X_te)
        tr_te, va_te = _kfold_target_encode(Xtr_cat, y_fit, Xva_cat, n_splits=CV, alpha=TE_ALPHA, seed=RND)
        Xtr_fin = pd.concat([Xtr_num, Xtr_bin, tr_te], axis=1)
        Xva_fin = pd.concat([Xva_num, Xva_bin, va_te], axis=1)
        clf = LogisticRegression(penalty="l2", solver="lbfgs", C=C_LOGREG, class_weight="balanced",
                                 max_iter=4000, random_state=RND)
        clf.fit(Xtr_fin, y_fit)
        return clf.predict_proba(Xva_fin)[:, 1]

    # One-hot path: the ColumnTransformer and the saga classifier live in one pipeline.
    cat, bin_, num = split_cols(X_tr.columns)
    pipe = Pipeline([("pre", build_pre(cat, bin_, num)),
                     ("clf", LogisticRegression(penalty="l2", solver="saga", C=C_LOGREG, class_weight="balanced",
                                                max_iter=4000, random_state=RND))])
    pipe.fit(X_tr, y_fit)
    return pipe.predict_proba(X_te)[:, 1]


def _gbm_holdout_on_ohe(X_tr, y_tr, X_te, y_te):
    """Run ``holdout_gbm_check`` on the one-hot preprocessed train/holdout matrices."""
    cat, bin_, num = split_cols(X_tr.columns)
    pre = build_pre(cat, bin_, num)
    Xtr_pre = pre.fit_transform(X_tr)
    Xte_pre = pre.transform(X_te)
    return holdout_gbm_check(Xtr_pre, y_tr, Xte_pre, y_te)


def _save_pr_curve(path, curves):
    """Plot one precision-recall curve per ``(label, y_true, proba)`` tuple and save to ``path``."""
    fig, ax = plt.subplots(figsize=(7, 5))
    for label, y_true, proba in curves:
        prec, rec, _ = precision_recall_curve(y_true, proba)
        ax.plot(rec, prec, label=label)
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_title("Holdout Precision-Recall Curve")
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(path, dpi=150)
    # Close explicitly so the figure does not stay alive in the kernel.
    plt.close(fig)


def main():
    """Run the feature gate and write its artifacts to ``ROOT/reports``.

    Steps: load (and optionally subsample) the data, make a stratified 80/20
    train/holdout split, cross-validate each candidate feature configuration on
    the training part, pick the configuration with the highest CV AUC (PR-AUC as
    tie-breaker), evaluate it and the all-features baseline on the holdout, and
    write scores, selected features, split indices, the PR-curve plot and a
    metadata JSON file.
    """
    reports = ROOT/"reports"; reports.mkdir(parents=True, exist_ok=True)
    res_path = reports/"feature_gate_scores.csv"
    features_path = reports/"features_selected.csv"
    split_path = reports/"split_indices.json"
    pr_curve_path = reports/"holdout_pr_curve.png"
    meta_path = reports/"feature_gate_meta.json"

    # Load data
    df_all = load_and_save_data().replace(-1, np.nan)
    n_rows_total = len(df_all)
    df = df_all
    if N_SAMPLE and N_SAMPLE < len(df_all):
        df = df_all.sample(N_SAMPLE, random_state=RND).sort_index()
    y = df["target"].astype(int)

    # Consistent holdout split (export indices)
    X_tr_all, X_te_all, y_tr, y_te = train_test_split(
        df.drop(columns=["target"]), y, test_size=0.2, stratify=y, random_state=RND
    )
    df_tr = pd.concat([X_tr_all, y_tr], axis=1)
    df_te = pd.concat([X_te_all, y_te], axis=1)

    split_indices = {"train": df_tr.index.tolist(), "test": df_te.index.tolist()}
    split_path.write_text(json.dumps(split_indices, indent=2))

    # Candidate configs
    configs = [
        {"name":"all_features",           "drop_calc":False, "extra_drop":[],                           "add_extras":False},
        {"name":"drop_calc+opt+extras",  "drop_calc":True,  "extra_drop":["ps_ind_14","ps_car_10_cat"], "add_extras":True},
        {"name":"drop_calc_only",        "drop_calc":True,  "extra_drop":[],                           "add_extras":False},
        {"name":"drop_calc+extras",      "drop_calc":True,  "extra_drop":[],                           "add_extras":True},
    ]

    # CV scores per config (same scorer for every config, chosen by TE_CAT)
    cv_scorer = cv_scores_te if TE_CAT else cv_scores_ohe
    rows = []
    for cfg in configs:
        X_tr_cfg = make_feature_set(df_tr, drop_calc=cfg["drop_calc"], extra_drop=cfg["extra_drop"], add_extras=cfg["add_extras"])
        auc_cv, pr_cv = cv_scorer(X_tr_cfg, y_tr.loc[X_tr_cfg.index], C=C_LOGREG, CV=CV)
        rows.append({
            "name":cfg["name"], "n_features":int(X_tr_cfg.shape[1]),
            "cv_auc":float(auc_cv), "cv_pr_auc":float(pr_cv),
            "drop_calc":cfg["drop_calc"], "extra_drop":cfg["extra_drop"],
            "add_extras":cfg["add_extras"], "te_cat": TE_CAT
        })

    res = pd.DataFrame(rows).sort_values(["cv_auc","cv_pr_auc"], ascending=False)
    res.to_csv(res_path, index=False)

    # --- Select BEST BY CV AUC (tie-breaker: PR-AUC)
    best_cv = res.iloc[0].to_dict()
    best_recipe = dict(drop_calc=best_cv["drop_calc"], extra_drop=best_cv["extra_drop"], add_extras=best_cv["add_extras"])
    X_tr_best = make_feature_set(df_tr, **best_recipe)
    X_te_best = make_feature_set(df_te, **best_recipe)

    # Export features (exactly those used by best CV AUC)
    features_path.write_text(
        pd.Series(pd.Index(X_tr_best.columns), name="raw_feature").to_csv(index=False)
    )

    # --- Holdout eval (best vs. all_features) for plotting
    proba_best = _fit_predict_holdout(X_tr_best, y_tr, X_te_best)
    y_true_best = y_te.loc[X_te_best.index]

    X_tr_allF = make_feature_set(df_tr, drop_calc=False, extra_drop=[], add_extras=False)
    X_te_allF = make_feature_set(df_te, drop_calc=False, extra_drop=[], add_extras=False)
    proba_all = _fit_predict_holdout(X_tr_allF, y_tr, X_te_allF)
    y_true_all = y_te.loc[X_te_allF.index]

    # Scores
    hold_auc_best = roc_auc_score(y_true_best, proba_best)
    hold_pr_best  = average_precision_score(y_true_best, proba_best)
    hold_auc_all  = roc_auc_score(y_true_all, proba_all)
    hold_pr_all   = average_precision_score(y_true_all, proba_all)

    # --- Plot: Precision-Recall (best vs all_features)
    _save_pr_curve(pr_curve_path, [
        (f"Best (AP={hold_pr_best:.3f}, AUC={hold_auc_best:.3f})", y_true_best, proba_best),
        (f"All-features (AP={hold_pr_all:.3f}, AUC={hold_auc_all:.3f})", y_true_all, proba_all),
    ])

    # Optional GBM smoke-test on best/all (using OHE pre)
    gbm_out = None
    if GBM_CHECK:
        gbm_best = _gbm_holdout_on_ohe(X_tr_best, y_tr.loc[X_tr_best.index], X_te_best, y_true_best)
        gbm_all  = _gbm_holdout_on_ohe(X_tr_allF, y_tr.loc[X_tr_allF.index], X_te_allF, y_true_all)
        gbm_out = {"best": gbm_best, "all": gbm_all}

    # Meta
    meta = {
        "random_state": RND, "cv_splits": CV, "C": C_LOGREG,
        "n_rows_total": int(n_rows_total), "sample_n": int(len(df)),
        "te_cat": TE_CAT, "te_alpha": TE_ALPHA, "gbm_check": bool(GBM_CHECK),
        "scores_path": str(res_path),
        "features_path": str(features_path),
        "split_indices_path": str(split_path),
        "pr_curve_path": str(pr_curve_path),
        "best_by_cv": best_cv,
        "holdout_scores": {
            "best_auc": float(hold_auc_best), "best_pr_auc": float(hold_pr_best),
            "all_auc": float(hold_auc_all), "all_pr_auc": float(hold_pr_all)
        },
        "gbm_holdout": gbm_out
    }
    meta_path.write_text(json.dumps(meta, indent=2))

    # Console
    print("\nFEATURE-GATE done.")
    print(f"Train n={len(df_tr):,}, Holdout n={len(df_te):,}, CV={CV}, C={C_LOGREG}, TE_CAT={int(TE_CAT)}")
    print("Scores (CV):\n" + res.head(10).to_string(index=False))
    print(f"\nHoldout (Best by CV): AUC={hold_auc_best:.4f}  PR-AUC={hold_pr_best:.4f}")
    print(f"Holdout (All-features): AUC={hold_auc_all:.4f}  PR-AUC={hold_pr_all:.4f}")
    print("\nArtifacts:")
    print("→ features_selected.csv")
    print("→ split_indices.json")
    print("→ holdout_pr_curve.png")
    print("(+ feature_gate_scores.csv, feature_gate_meta.json)")

if __name__ == "__main__":
    main()


Lade Datensatz aus dem Cache.
[LightGBM] [Info] Number of positive: 7316, number of negative: 192684
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015975 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1190
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 110
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036580 -> initscore=-3.270988
[LightGBM] [Info] Start training from score -3.270988
[LightGBM] [Info] Number of positive: 7316, number of negative: 192684
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018131 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1381
[LightGBM] [Info] Number of data points in the train set: 200000, number of use

In [5]:
import os, sys, json, warnings
warnings.filterwarnings("ignore")
from pathlib import Path
import numpy as np
import pandas as pd

# ---- Root resolution
# Script mode: the root is the parent of this file's directory. Notebook mode (no
# __file__): the working directory, stepping up once from "tests" or "notebooks".
if "__file__" not in globals():
    CWD = Path.cwd()
    ROOT = CWD if CWD.name not in ("tests", "notebooks") else CWD.parent
else:
    ROOT = Path(__file__).resolve().parents[1]

# ---- Reports dir (ENV override supported)
# An unset or empty REPORTS_DIR falls back to <ROOT>/reports.
_reports_override = os.environ.get("REPORTS_DIR")
RPT = Path(_reports_override) if _reports_override else Path(ROOT / "reports")
print(f"[TEST] Using reports dir: {RPT}")

# ---- Project loader
sys.path.insert(0, str(ROOT))
from src.data_loader import load_and_save_data

# ---- Sklearn bits
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedKFold

# ---- Helpers
def ohe_fallback():
    """Build a OneHotEncoder, stepping down through keyword sets on ``TypeError``.

    The first attempt asks for infrequent-category handling (``min_frequency=0.01``)
    with ``sparse_output=True``. If the constructor rejects it, ``handle_unknown="ignore"``
    is tried with ``sparse=True`` and finally with ``sparse=False``; a ``TypeError``
    from that last attempt propagates.
    """
    attempts = (
        {"handle_unknown": "infrequent_if_exist", "min_frequency": 0.01, "sparse_output": True},
        {"handle_unknown": "ignore", "sparse": True},
    )
    for options in attempts:
        try:
            return OneHotEncoder(**options)
        except TypeError:
            pass
    return OneHotEncoder(handle_unknown="ignore", sparse=False)

def split_cols(cols):
    """Split column names into ``(cat, bin_, num)`` lists, preserving input order.

    Names ending in ``"_cat"`` are categorical, names ending in ``"_bin"`` are
    binary, and everything else except ``"target"`` is numeric.
    """
    cat, bin_, num = [], [], []
    for name in list(cols):
        if name.endswith("_cat"):
            cat.append(name)
        elif name.endswith("_bin"):
            bin_.append(name)
        elif name != "target":
            num.append(name)
    return cat, bin_, num

def build_pre(cat, bin_, num):
    """Return the unfitted ColumnTransformer for the one-hot pipeline.

    ``cat`` columns: most-frequent imputation, then one-hot encoding.
    ``bin_`` columns: most-frequent imputation only.
    ``num`` columns: median imputation, then standard scaling.
    Columns outside the three lists are dropped.
    """
    def _most_frequent():
        return SimpleImputer(strategy="most_frequent")

    branches = [
        ("cat", Pipeline([("imp", _most_frequent()), ("ohe", ohe_fallback())]), cat),
        ("bin", Pipeline([("imp", _most_frequent())]), bin_),
        ("num", Pipeline([("imp", SimpleImputer(strategy="median")), ("sc", StandardScaler())]), num),
    ]
    return ColumnTransformer(branches, remainder="drop")

# ---- Minimal TE utils (self-contained)
def _prep_te_blocks(X_tr, X_te):
    """Impute each column group (and scale the numerics), fitting on ``X_tr`` only.

    Returns:
        ``(Xtr_cat, Xte_cat, Xtr_bin, Xte_bin, Xtr_num, Xte_num)`` as DataFrames
        that keep the index of their source frame. An empty group yields a frame
        with zero columns.
    """
    cat, bin_, num = split_cols(X_tr.columns)

    def _empty_like(frame):
        return pd.DataFrame(np.empty((len(frame), 0)), index=frame.index)

    def _impute_pair(strategy, cols):
        # Fit the imputer on the training rows, then reuse it for the test rows.
        if not cols:
            return _empty_like(X_tr), _empty_like(X_te)
        imputer = SimpleImputer(strategy=strategy)
        fitted = pd.DataFrame(imputer.fit_transform(X_tr[cols]), columns=cols, index=X_tr.index)
        applied = pd.DataFrame(imputer.transform(X_te[cols]), columns=cols, index=X_te.index)
        return fitted, applied

    Xtr_cat, Xte_cat = _impute_pair("most_frequent", cat)
    Xtr_bin, Xte_bin = _impute_pair("most_frequent", bin_)
    Xtr_num, Xte_num = _impute_pair("median", num)

    if Xtr_num.shape[1]:
        scaler = StandardScaler(with_mean=True, with_std=True)
        Xtr_num = pd.DataFrame(scaler.fit_transform(Xtr_num), columns=Xtr_num.columns, index=Xtr_num.index)
        Xte_num = pd.DataFrame(scaler.transform(Xte_num), columns=Xte_num.columns, index=Xte_num.index)

    return (Xtr_cat, Xte_cat, Xtr_bin, Xte_bin, Xtr_num, Xte_num)

def _kfold_target_encode(train_cat, y_tr, valid_cat, n_splits=3, alpha=10, seed=42):
    if train_cat.shape[1] == 0:
        return pd.DataFrame(index=train_cat.index), pd.DataFrame(index=valid_cat.index)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    global_mean = y_tr.mean()
    tr_enc = pd.DataFrame(index=train_cat.index)
    va_enc = pd.DataFrame(index=valid_cat.index)
    for col in train_cat.columns:
        oof = pd.Series(index=train_cat.index, dtype=float)
        for tr_idx, va_idx in skf.split(train_cat, y_tr):
            col_tr = train_cat.iloc[tr_idx][col]
            y_sub  = y_tr.iloc[tr_idx]
            stats = y_sub.groupby(col_tr).agg(['mean','count'])
            m = (stats['mean']*stats['count'] + global_mean*alpha) / (stats['count'] + alpha)
            oof.iloc[va_idx] = train_cat.iloc[va_idx][col].map(m)
        tr_enc[col] = oof.fillna(global_mean)
        stats_full = y_tr.groupby(train_cat[col]).agg(['mean','count'])
        m_full = (stats_full['mean']*stats_full['count'] + global_mean*alpha) / (stats_full['count'] + alpha)
        va_enc[col] = valid_cat[col].map(m_full).fillna(global_mean)
    tr_enc.columns = [f"te_{c}" for c in tr_enc.columns]
    va_enc.columns = [f"te_{c}" for c in va_enc.columns]
    return tr_enc, va_enc

# ---- Feature reconstruction (same as gate)
def fe_simple(X: pd.DataFrame) -> pd.DataFrame:
    """Append row-wise engineered columns to a copy of ``X``.

    ``missing_count`` is the number of NaN cells in the row. ``sum_all_bin`` is the
    row sum over all columns whose name ends in ``"_bin"`` and is only added when
    such columns exist.
    """
    enriched = X.copy()
    nan_per_row = enriched.isna().sum(axis=1)
    enriched["missing_count"] = nan_per_row
    flag_cols = [name for name in enriched.columns if name.endswith("_bin")]
    if len(flag_cols) > 0:
        enriched["sum_all_bin"] = enriched[flag_cols].sum(axis=1)
    return enriched

def make_feature_set(df, drop_calc=True, extra_drop=None, add_extras=True):
    """Rebuild the feature frame for one configuration.

    Removes ``target`` (if present), replaces ``-1`` with NaN, optionally drops the
    ``ps_calc_*`` columns and the names in ``extra_drop`` (missing names are
    ignored), and optionally appends the ``fe_simple`` columns. ``df`` itself is
    not modified.
    """
    features = df.drop(columns=["target"], errors="ignore").copy()
    features = features.replace(-1, np.nan)
    if drop_calc:
        calc_cols = [name for name in features.columns if name.startswith("ps_calc_")]
        features = features.drop(columns=calc_cols, errors="ignore")
    if extra_drop:
        present = [name for name in extra_drop if name in features.columns]
        features = features.drop(columns=present, errors="ignore")
    return fe_simple(features) if add_extras else features

def main() -> None:
    """Validate the feature-gate artifacts found in ``RPT`` and re-run a minimal model.

    Checks, in order: all five artifact files exist; the feature list and split
    indices are non-empty and the split is disjoint; the PR-curve image is larger
    than 1000 bytes; the saved indices exist in the freshly loaded data; the saved
    features can be rebuilt from the recipe stored in the metadata; the class rate
    differs by less than 0.01 between the splits; and a model trained with the
    stored settings scores inside the sanity ranges on the holdout.

    Raises:
        AssertionError: On the first check that fails.
    """
    # --- Artifacts existence
    paths = {
        "features": RPT / "features_selected.csv",
        "split":    RPT / "split_indices.json",
        "meta":     RPT / "feature_gate_meta.json",
        "pr_png":   RPT / "holdout_pr_curve.png",
        "scores":   RPT / "feature_gate_scores.csv",
    }
    for k,p in paths.items():
        assert p.exists(), f"{k} artifact missing: {p}"

    # --- Load artifacts
    feats = pd.read_csv(paths["features"])["raw_feature"].tolist()
    assert len(feats) > 0, "features_selected.csv is empty"

    with open(paths["split"], "r") as f: split = json.load(f)
    with open(paths["meta"], "r") as f: meta = json.load(f)

    train_idx = split["train"]; test_idx = split["test"]
    assert len(train_idx) > 0 and len(test_idx) > 0, "split indices are empty"
    assert set(train_idx).isdisjoint(set(test_idx)), "train/test indices overlap!"
    # Size threshold only; the image content itself is not inspected.
    assert paths["pr_png"].stat().st_size > 1000, "holdout_pr_curve.png too small or empty"

    # --- Load data, align indices
    df_all = load_and_save_data().replace(-1, np.nan)
    # Index labels saved by the gate must all exist in the data loaded now.
    miss_tr = [i for i in train_idx if i not in df_all.index]
    miss_te = [i for i in test_idx  if i not in df_all.index]
    assert not miss_tr and not miss_te, "Saved indices not in current dataframe index"

    df_tr = df_all.loc[train_idx]
    df_te = df_all.loc[test_idx]
    assert "target" in df_tr.columns, "target column missing"

    y_tr = df_tr["target"].astype(int)
    y_te = df_te["target"].astype(int)

    # --- Rebuild features using SAME recipe as gate
    # Keys missing from the metadata fall back to the defaults given here.
    best = meta.get("best_by_cv", {})
    drop_calc  = bool(best.get("drop_calc", True))
    extra_drop = best.get("extra_drop", []) or []
    add_extras = bool(best.get("add_extras", False))

    X_tr_full = make_feature_set(df_tr, drop_calc=drop_calc, extra_drop=extra_drop, add_extras=add_extras)
    X_te_full = make_feature_set(df_te, drop_calc=drop_calc, extra_drop=extra_drop, add_extras=add_extras)

    missing_cols = [c for c in feats if c not in X_tr_full.columns]
    assert not missing_cols, f"Some saved features not in reconstructed feature set: {missing_cols[:5]}"

    # Restrict to the saved feature list, in the saved order.
    X_tr = X_tr_full[feats].copy()
    X_te = X_te_full[feats].copy()

    # --- Sanity on stratification
    diff_rate = abs(y_tr.mean() - y_te.mean())
    assert diff_rate < 0.01, f"Class rate drift too high between splits: {diff_rate:.4f}"

    # --- Minimal train run (mirrors meta)
    te_cat   = bool(meta.get("te_cat", False))
    rnd      = int(meta.get("random_state", 42))
    C        = float(meta.get("C", 1.0))
    cv       = int(meta.get("cv_splits", 3))
    te_alpha = float(meta.get("te_alpha", 10))

    if te_cat:
        # TE path
        Xtr_cat, Xte_cat, Xtr_bin, Xte_bin, Xtr_num, Xte_num = _prep_te_blocks(X_tr, X_te)
        tr_te, te_te = _kfold_target_encode(Xtr_cat, y_tr, Xte_cat, n_splits=cv, alpha=te_alpha, seed=rnd)
        Xtr_fin = pd.concat([Xtr_num, Xtr_bin, tr_te], axis=1)
        Xte_fin = pd.concat([Xte_num, Xte_bin, te_te], axis=1)
        clf = LogisticRegression(penalty="l2", solver="lbfgs", C=C,
                                 class_weight="balanced", max_iter=4000, random_state=rnd)
        clf.fit(Xtr_fin, y_tr)
        proba = clf.predict_proba(Xte_fin)[:, 1]
    else:
        # OHE path
        cat, bin_, num = split_cols(X_tr.columns)
        pre  = build_pre(cat, bin_, num)
        clf  = LogisticRegression(penalty="l2", solver="saga", C=C,
                                  class_weight="balanced", max_iter=4000, random_state=rnd)
        pipe = Pipeline([("pre", pre), ("clf", clf)])
        pipe.fit(X_tr, y_tr)
        proba = pipe.predict_proba(X_te)[:, 1]

    auc = roc_auc_score(y_te, proba)
    ap  = average_precision_score(y_te, proba)

    # --- Soft sanity ranges (dataset-specific)
    assert 0.5 <= auc <= 0.9, f"AUC out of expected sanity range: {auc:.3f}"
    assert 0.02 <= ap <= 0.5, f"PR-AUC out of expected sanity range: {ap:.3f}"

    # --- OK prints
    print("OK: artifacts present")
    print(f"OK: split disjoint, |train|={len(train_idx):,}, |test|={len(test_idx):,}, class drift={diff_rate:.4f}")
    print(f"OK: {len(feats)} features loaded and reconstructed")
    print(f"OK: minimal {'TE' if te_cat else 'OHE'} pipeline — Holdout AUC={auc:.3f}, PR-AUC={ap:.3f}")
    print("OK: PR-curve file present and non-empty")
    print("ALL TESTS PASSED ✅")

# Entry point: exit code 1 signals a failed check, exit code 2 any other exception.
if __name__ == "__main__":
    try:
        main()
    except AssertionError as e:
        print(f"TEST FAILED ❌  {e}")
        sys.exit(1)
    except Exception as ex:
        print(f"TEST ERROR ❌  {type(ex).__name__}: {ex}")
        sys.exit(2)


[TEST] Using reports dir: /Users/lucasbeseler/ada_portoSeguro/reports
Lade Datensatz aus dem Cache.
OK: artifacts present
OK: split disjoint, |train|=200,000, |test|=50,000, class drift=0.0000
OK: 37 features loaded and reconstructed
OK: minimal TE pipeline — Holdout AUC=0.631, PR-AUC=0.063
OK: PR-curve file present and non-empty
ALL TESTS PASSED ✅


In [8]:
import os, sys, json, warnings
warnings.filterwarnings("ignore")
from pathlib import Path
import numpy as np
import pandas as pd

# ---------- Root + reports dir ----------
# Script mode: the root is the parent of this file's directory. Notebook mode (no
# __file__): the working directory, stepping up once from a known sub-folder.
if "__file__" not in globals():
    CWD = Path.cwd()
    if CWD.name in ("tests", "notebooks", "tools"):
        ROOT = CWD.parent
    else:
        ROOT = CWD
else:
    ROOT = Path(__file__).resolve().parents[1]

# REPORTS_DIR wins when set and non-empty; otherwise <ROOT>/reports.
RPT = Path(os.environ.get("REPORTS_DIR") or (ROOT / "reports"))
if not RPT.exists():
    # Second chance: a reports folder next to ROOT, used only if it exists.
    alt = ROOT.parent / "reports"
    RPT = alt if alt.exists() else RPT
print(f"[ABLATION] Using reports dir: {RPT}")

# ---------- Project loader ----------
sys.path.insert(0, str(ROOT))
from src.data_loader import load_and_save_data

# ---------- Sklearn ----------
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# ---------- Plot (matplotlib only) ----------
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# ---------- Helpers ----------
def ohe_fallback():
    """Create a OneHotEncoder using the first keyword combination that is accepted.

    Tried in order; a ``TypeError`` from the constructor selects the next one:
      * ``handle_unknown="infrequent_if_exist"``, ``min_frequency=0.01``, ``sparse_output=True``
      * ``handle_unknown="ignore"``, ``sparse=True``
      * ``handle_unknown="ignore"``, ``sparse=False`` (errors from this one propagate)
    """
    def _try(**options):
        # Returns None when the constructor rejects the keyword set.
        try:
            return OneHotEncoder(**options)
        except TypeError:
            return None

    encoder = _try(handle_unknown="infrequent_if_exist", min_frequency=0.01, sparse_output=True)
    if encoder is None:
        encoder = _try(handle_unknown="ignore", sparse=True)
    if encoder is None:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
    return encoder

def split_cols(cols):
    """Group column names by suffix and return ``(cat, bin_, num)``.

    ``cat`` holds names ending in ``"_cat"``, ``bin_`` names ending in ``"_bin"``,
    ``num`` all other names except ``"target"``. Input order is kept within each list.
    """
    names = list(cols)

    def _ending_with(suffix):
        return [name for name in names if name.endswith(suffix)]

    cat = _ending_with("_cat")
    bin_ = _ending_with("_bin")
    num = [
        name for name in names
        if not (name in cat or name in bin_ or name == "target")
    ]
    return cat, bin_, num

def build_pre(cat, bin_, num):
    """Compose the per-group preprocessing into one unfitted ColumnTransformer.

    Args:
        cat: Columns to impute with the most frequent value and one-hot encode.
        bin_: Columns to impute with the most frequent value.
        num: Columns to impute with the median and standard-scale.

    Returns:
        ``ColumnTransformer`` with ``remainder="drop"``.
    """
    pipelines = {
        "cat": Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("ohe", ohe_fallback())]),
        "bin": Pipeline([("imp", SimpleImputer(strategy="most_frequent"))]),
        "num": Pipeline([("imp", SimpleImputer(strategy="median")), ("sc", StandardScaler())]),
    }
    columns = {"cat": cat, "bin": bin_, "num": num}
    return ColumnTransformer(
        [(key, pipelines[key], columns[key]) for key in ("cat", "bin", "num")],
        remainder="drop",
    )

def fe_simple(X: pd.DataFrame) -> pd.DataFrame:
    """Return ``X`` copied and extended with ``missing_count`` and, if applicable, ``sum_all_bin``.

    ``missing_count`` counts the NaN cells of each row; ``sum_all_bin`` sums each
    row over the columns whose name ends in ``"_bin"`` and is omitted when there
    are no such columns.
    """
    result = X.copy()
    result["missing_count"] = result.isna().sum(axis=1)
    bin_names = []
    for column in result.columns:
        if column.endswith("_bin"):
            bin_names.append(column)
    if bin_names:
        result["sum_all_bin"] = result[bin_names].sum(axis=1)
    return result

def make_feature_set(df, drop_calc=True, extra_drop=None, add_extras=True):
    """Derive the feature frame for one configuration without touching ``df``.

    Args:
        df: Source frame; ``target`` is removed when present and ``-1`` becomes NaN.
        drop_calc: Remove all columns whose name starts with ``ps_calc_``.
        extra_drop: Optional names of further columns to remove (absent names are skipped).
        add_extras: Append the columns produced by ``fe_simple``.
    """
    frame = df.drop(columns=["target"], errors="ignore").copy().replace(-1, np.nan)
    unwanted = []
    if drop_calc:
        unwanted = [col for col in frame.columns if col.startswith("ps_calc_")]
        frame = frame.drop(columns=unwanted, errors="ignore")
    if extra_drop:
        unwanted = [col for col in extra_drop if col in frame.columns]
        frame = frame.drop(columns=unwanted, errors="ignore")
    if not add_extras:
        return frame
    return fe_simple(frame)

# ---------- TE utilities ----------
def _prep_te_blocks(X_tr, X_va):
    """Impute (and, for numerics, scale) train/valid features per column group.

    Columns are grouped with ``split_cols`` using the training frame's column
    names. Every imputer and the scaler are fitted on ``X_tr`` only and then
    applied to ``X_va``. Categorical and binary groups use most-frequent
    imputation; the numeric group uses median imputation followed by
    standard scaling.

    NOTE(review): the results are re-wrapped with the original column names,
    which presumably requires the imputer to return one output column per
    input column - confirm how all-NaN training columns are handled.

    Args:
        X_tr: Training feature frame.
        X_va: Validation feature frame containing the same columns.

    Returns:
        Tuple ``(Xtr_cat, Xva_cat, Xtr_bin, Xva_bin, Xtr_num, Xva_num)`` of
        DataFrames indexed like their source frame. A group with no columns
        yields a zero-column frame.
    """
    cat, bin_, num = split_cols(X_tr.columns)

    def _empty_like(frame):
        # Zero-column placeholder that keeps the row index of `frame`.
        return pd.DataFrame(np.empty((len(frame), 0)), index=frame.index)

    def _impute_pair(strategy, cols):
        # Fit an imputer on the train columns, then apply it to the valid columns.
        if not cols:
            return _empty_like(X_tr), _empty_like(X_va)
        imputer = SimpleImputer(strategy=strategy)
        train_part = pd.DataFrame(imputer.fit_transform(X_tr[cols]), columns=cols, index=X_tr.index)
        valid_part = pd.DataFrame(imputer.transform(X_va[cols]), columns=cols, index=X_va.index)
        return train_part, valid_part

    Xtr_cat, Xva_cat = _impute_pair("most_frequent", cat)
    Xtr_bin, Xva_bin = _impute_pair("most_frequent", bin_)
    Xtr_num, Xva_num = _impute_pair("median", num)

    if Xtr_num.shape[1]:
        # Standardise numerics with statistics taken from the training block only.
        scaler = StandardScaler(with_mean=True, with_std=True)
        Xtr_num = pd.DataFrame(scaler.fit_transform(Xtr_num), columns=Xtr_num.columns, index=Xtr_num.index)
        Xva_num = pd.DataFrame(scaler.transform(Xva_num), columns=Xva_num.columns, index=Xva_num.index)

    return (Xtr_cat, Xva_cat, Xtr_bin, Xva_bin, Xtr_num, Xva_num)

def _kfold_target_encode(train_cat, y_tr, valid_cat, n_splits=3, alpha=10, seed=42):
    """Smoothed target (mean) encoding with out-of-fold values for the train block.

    For every column the encoding of a category level is

        (sum of targets in level + alpha * global_mean) / (count in level + alpha)

    where ``global_mean`` is the mean of ``y_tr``.

    * Train rows receive out-of-fold encodings: each row is encoded with
      statistics computed on the other inner folds only.
    * Valid rows are encoded with statistics computed on the full train block.
    * Levels that cannot be mapped fall back to ``global_mean``.

    Args:
        train_cat: DataFrame of (imputed) categorical columns for training rows.
        y_tr: Target Series aligned with ``train_cat``.
        valid_cat: DataFrame with the same columns for validation rows.
        n_splits: Number of inner stratified folds.
        alpha: Smoothing strength (pseudo-count given to the global mean).
        seed: ``random_state`` of the inner ``StratifiedKFold``.

    Returns:
        Tuple ``(tr_enc, va_enc)`` of DataFrames whose columns are named
        ``te_<original column>`` and whose indexes match the inputs. With no
        input columns, two zero-column frames are returned.
    """
    if train_cat.shape[1] == 0:
        return pd.DataFrame(index=train_cat.index), pd.DataFrame(index=valid_cat.index)

    global_mean = y_tr.mean()

    def _smoothed_means(keys, target):
        # One implementation for both the per-fold and the full-train statistics,
        # so numerator and denominator always come from the same groupby.
        stats = target.groupby(keys).agg(['mean', 'count'])
        return (stats['mean'] * stats['count'] + global_mean * alpha) / (stats['count'] + alpha)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # The split does not depend on the column being encoded, so compute it once.
    folds = list(skf.split(train_cat, y_tr))

    tr_enc = pd.DataFrame(index=train_cat.index)
    va_enc = pd.DataFrame(index=valid_cat.index)
    for col in train_cat.columns:
        oof = pd.Series(index=train_cat.index, dtype=float)
        for tr_idx, va_idx in folds:
            fold_means = _smoothed_means(train_cat.iloc[tr_idx][col], y_tr.iloc[tr_idx])
            oof.iloc[va_idx] = train_cat.iloc[va_idx][col].map(fold_means)
        tr_enc[col] = oof.fillna(global_mean)

        # Validation rows use full-train statistics. The counts must be the
        # full-train counts too, not those left over from the last inner fold.
        full_means = _smoothed_means(train_cat[col], y_tr)
        va_enc[col] = valid_cat[col].map(full_means).fillna(global_mean)

    tr_enc.columns = [f"te_{c}" for c in tr_enc.columns]
    va_enc.columns = [f"te_{c}" for c in va_enc.columns]
    return tr_enc, va_enc

# ---------- CV runners ----------
def cv_mean_auc_OHE(X, y, seed=42, cv=3, C=1.0):
    """Mean stratified-CV ROC AUC of the one-hot + logistic-regression pipeline.

    Args:
        X: Feature DataFrame; column groups are inferred with ``split_cols``.
        y: Target Series aligned with ``X``.
        seed: Random state for both the fold shuffle and the classifier.
        cv: Number of stratified folds.
        C: Inverse regularisation strength of the L2 logistic regression.

    Returns:
        The mean of the per-fold validation AUCs as a Python float.
    """
    cat, bin_, num = split_cols(X.columns)
    model = Pipeline([
        ("pre", build_pre(cat, bin_, num)),
        ("clf", LogisticRegression(penalty="l2", solver="saga", C=C,
                                   class_weight="balanced", max_iter=4000,
                                   random_state=seed)),
    ])
    folds = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)

    def _fold_auc(fit_pos, hold_pos):
        # Refit the whole pipeline on the training part, score the held-out part.
        fitted = model.fit(X.iloc[fit_pos], y.iloc[fit_pos])
        scores = fitted.predict_proba(X.iloc[hold_pos])[:, 1]
        return roc_auc_score(y.iloc[hold_pos], scores)

    fold_aucs = [_fold_auc(fit_pos, hold_pos) for fit_pos, hold_pos in folds.split(X, y)]
    return float(np.mean(fold_aucs))

def cv_mean_auc_TE(X, y, seed=42, cv=3, C=1.0, alpha=10):
    """Mean stratified-CV ROC AUC of the target-encoding + logistic-regression model.

    In every outer fold the features are imputed/scaled with
    ``_prep_te_blocks``, the categorical block is replaced by its target
    encoding from ``_kfold_target_encode`` (inner folds use the same ``cv``
    and ``seed``), and an L2 logistic regression is fitted on
    numeric + binary + encoded columns.

    Args:
        X: Feature DataFrame.
        y: Target Series aligned with ``X``.
        seed: Random state for outer/inner fold shuffles and the classifier.
        cv: Number of stratified folds (outer and inner).
        C: Inverse regularisation strength.
        alpha: Smoothing strength passed to the target encoder.

    Returns:
        The mean of the per-fold validation AUCs as a Python float.
    """
    outer_folds = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
    fold_aucs = []
    for fit_pos, hold_pos in outer_folds.split(X, y):
        y_fit, y_hold = y.iloc[fit_pos], y.iloc[hold_pos]
        (cat_fit, cat_hold,
         bin_fit, bin_hold,
         num_fit, num_hold) = _prep_te_blocks(X.iloc[fit_pos], X.iloc[hold_pos])

        te_fit, te_hold = _kfold_target_encode(cat_fit, y_fit, cat_hold,
                                               n_splits=cv, alpha=alpha, seed=seed)

        design_fit = pd.concat([num_fit, bin_fit, te_fit], axis=1)
        design_hold = pd.concat([num_hold, bin_hold, te_hold], axis=1)

        model = LogisticRegression(penalty="l2", solver="lbfgs", C=C,
                                   class_weight="balanced", max_iter=4000,
                                   random_state=seed)
        model.fit(design_fit, y_fit)
        hold_scores = model.predict_proba(design_hold)[:, 1]
        fold_aucs.append(roc_auc_score(y_hold, hold_scores))
    return float(np.mean(fold_aucs))

def bootstrap_ci(values, n_boot=2000, alpha=0.05, seed=42):
    """Percentile-bootstrap confidence interval for the mean of ``values``.

    Draws ``n_boot`` resamples (with replacement, same size as the input)
    from a ``RandomState`` seeded with ``seed``, takes the mean of each, and
    returns the ``alpha/2`` and ``1 - alpha/2`` percentiles of those means.

    Args:
        values: One-dimensional sequence of numbers.
        n_boot: Number of bootstrap resamples.
        alpha: Two-sided significance level (0.05 -> 95% interval).
        seed: Seed for the random generator.

    Returns:
        Tuple ``(low, high)`` of floats; ``(nan, nan)`` for empty input.
    """
    rng = np.random.RandomState(seed)
    data = np.array(values, dtype=float)
    n_obs = len(data)
    if n_obs == 0:
        nan = float("nan")
        return nan, nan

    resampled_means = [
        np.mean(rng.choice(data, size=n_obs, replace=True))
        for _ in range(n_boot)
    ]
    lower = np.percentile(resampled_means, 100*alpha/2)
    upper = np.percentile(resampled_means, 100*(1 - alpha/2))
    return float(lower), float(upper)

def _ablation_row(feature, action, X_variant, y, score_fn, base_aucs, cv):
    """Score one feature-set variant against the per-seed baselines.

    Args:
        feature: Name of the feature that was dropped or added.
        action: Label stored in the row (``"DROP"`` or ``"ADD"``).
        X_variant: Feature frame of the variant.
        y: Target aligned with ``X_variant``.
        score_fn: Callable ``(X, y, seed=...) -> float`` returning a mean CV AUC.
        base_aucs: Mapping ``seed -> baseline AUC``; its key order is the seed order.
        cv: Number of CV folds (recorded in the row only).

    Returns:
        A list with one value per result column: feature, action, number of
        variant features, mean baseline AUC, mean variant AUC, mean delta,
        bootstrap CI low/high of the delta, comma-joined seeds, cv.
    """
    seeds = list(base_aucs)
    aucs_var = [score_fn(X_variant, y, seed=sd) for sd in seeds]
    # Paired comparison: each seed's variant score minus the same seed's baseline.
    auc_deltas = [auc_v - base_aucs[sd] for sd, auc_v in zip(seeds, aucs_var)]
    lo, hi = bootstrap_ci(auc_deltas, n_boot=2000, alpha=0.05, seed=seeds[0])
    return [feature, action, int(X_variant.shape[1]),
            float(np.mean(list(base_aucs.values()))), float(np.mean(aucs_var)),
            float(np.mean(auc_deltas)), lo, hi,
            ",".join(map(str, seeds)), cv]


def _plot_top_deltas(res_sorted, out_png, n_seeds, cv, topk=25):
    """Save an error-bar plot of the ``topk`` rows with the largest |delta AUC|.

    Args:
        res_sorted: Non-empty results frame containing ``abs_delta``,
            ``action``, ``feature``, ``delta_auc_mean`` and the two CI columns.
        out_png: Destination path of the PNG.
        n_seeds: Number of seeds, shown in the title.
        cv: Number of folds, shown in the title.
        topk: Maximum number of rows to plot.
    """
    sub = res_sorted.sort_values("abs_delta", ascending=False).head(min(topk, len(res_sorted)))

    xlbl = sub.apply(lambda r: f'{r["action"]}:{r["feature"]}', axis=1)
    ymu  = sub["delta_auc_mean"].values
    # Asymmetric error bars: distance from the mean to each CI bound.
    yerr = np.vstack([
        ymu - sub["delta_auc_ci_low"].values,
        sub["delta_auc_ci_high"].values - ymu
    ])

    plt.figure(figsize=(10, max(4, 0.35*len(sub))))
    plt.errorbar(ymu, np.arange(len(sub)), xerr=yerr, fmt='o', capsize=3)
    plt.yticks(np.arange(len(sub)), xlbl)
    plt.axvline(0.0, linestyle="--")
    plt.xlabel("ΔAUC (variant - base) with 95% CI")
    plt.title(f"Feature ablation (seeds={n_seeds}, cv={cv}) — Top {len(sub)} by |ΔAUC|")
    plt.tight_layout()
    plt.savefig(out_png, dpi=150)
    plt.close()


def main():
    """Run the single-feature ablation study and write its CSV and plot.

    Reads ``feature_gate_meta.json`` and ``split_indices.json`` from the
    module-level reports directory ``RPT``, rebuilds the baseline feature set
    described by ``meta["best_by_cv"]`` on the training split, and then, for
    several seeds:

    * DROP pass: removes each baseline column in turn,
    * ADD pass: re-adds each column listed in ``extra_drop`` in turn,

    scoring every variant with the CV runner selected by ``meta["te_cat"]``
    and comparing it with the same seed's baseline AUC.

    Outputs (in ``RPT``): ``feature_ablation_results.csv`` (header only when
    there are no candidates) and ``feature_ablation_auc.png``.

    Raises:
        AssertionError: If either artifact file is missing.
    """
    # ----- artifacts -----
    meta_p  = RPT / "feature_gate_meta.json"
    split_p = RPT / "split_indices.json"
    if not (meta_p.exists() and split_p.exists()):
        # Explicit raise instead of `assert` so the check is not stripped under
        # `python -O`; the exception type and message are unchanged.
        raise AssertionError("Missing meta/split artifacts")
    meta  = json.loads(meta_p.read_text())
    split = json.loads(split_p.read_text())

    # ----- data + split -----
    df = load_and_save_data().replace(-1, np.nan)
    df_tr = df.loc[split["train"]].copy()
    y = df_tr["target"].astype(int)

    # ----- base recipe -----
    best = meta["best_by_cv"]
    drop_calc  = bool(best.get("drop_calc", True))
    extra_drop = best.get("extra_drop", [])
    if isinstance(extra_drop, str):
        # NOTE(review): eval() executes whatever expression the meta file holds.
        # If the stored value is always a plain list literal, ast.literal_eval
        # (or json.loads) would be the safe replacement - confirm the format first.
        try: extra_drop = list(eval(extra_drop))
        except Exception: extra_drop = []
    add_extras = bool(best.get("add_extras", False))

    X_base_full = make_feature_set(df_tr, drop_calc=drop_calc, extra_drop=extra_drop, add_extras=add_extras)
    base_cols = list(X_base_full.columns)

    # candidates
    add_candidates  = [f for f in extra_drop if f in df_tr.columns]  # previously dropped by config
    drop_candidates = base_cols[:]                                   # currently kept

    te_cat   = bool(meta.get("te_cat", False))
    cv       = int(meta.get("cv_splits", 3))
    C        = float(meta.get("C", 1.0))
    te_alpha = float(meta.get("te_alpha", 10))
    seeds    = [42,43,44,45,46]  # multi-seed for stability

    def cv_mean_auc(X, y, seed):
        # Dispatch to the target-encoding or the one-hot CV runner.
        return cv_mean_auc_TE(X, y, seed=seed, cv=cv, C=C, alpha=te_alpha) if te_cat else \
               cv_mean_auc_OHE(X, y, seed=seed, cv=cv, C=C)

    # baseline (once per seed)
    base_aucs = {sd: cv_mean_auc(X_base_full, y, seed=sd) for sd in seeds}

    # predefine columns so empty frames don't KeyError
    cols = ["feature","action","n_features_variant",
            "auc_base_mean","auc_variant_mean",
            "delta_auc_mean","delta_auc_ci_low","delta_auc_ci_high",
            "seeds","cv"]
    rows = []

    # DROP pass
    for f in drop_candidates:
        X_var = X_base_full.drop(columns=[f], errors="ignore")
        rows.append(_ablation_row(f, "DROP", X_var, y, cv_mean_auc, base_aucs, cv))

    # ADD pass
    for f in add_candidates:
        extra_drop_new = [x for x in extra_drop if x != f]
        X_var_full = make_feature_set(df_tr, drop_calc=drop_calc, extra_drop=extra_drop_new, add_extras=add_extras)
        if f not in X_var_full.columns:
            # Still removed by another rule (e.g. the calc-column drop); nothing to test.
            continue
        rows.append(_ablation_row(f, "ADD", X_var_full, y, cv_mean_auc, base_aucs, cv))

    # results frame (safe even if empty)
    res = pd.DataFrame(rows, columns=cols)
    out_csv = RPT / "feature_ablation_results.csv"

    if res.empty:
        # write header-only CSV and exit gracefully
        res.to_csv(out_csv, index=False)
        print(f"[ABLATION] No candidates found. Wrote header to: {out_csv}")
        return

    # sort + save
    res["abs_delta"] = res["delta_auc_mean"].abs()
    res_sorted = res.sort_values(["action","abs_delta"], ascending=[True, False])
    res_sorted.to_csv(out_csv, index=False)
    print(f"[ABLATION] Saved: {out_csv}")

    # plot top-|ΔAUC| with 95% CI
    out_png = RPT / "feature_ablation_auc.png"
    _plot_top_deltas(res_sorted, out_png, n_seeds=len(seeds), cv=cv, topk=25)
    print(f"[ABLATION] Plot saved: {out_png}")

if __name__ == "__main__":
    # Entry point: run the ablation and translate failures into exit codes
    # (1 = failed precondition/assertion, 2 = any other error).
    try:
        main()
    except AssertionError as err:
        print(f"ABLATION FAILED ❌  {err}")
        raise SystemExit(1)
    except Exception as err:
        print(f"ABLATION ERROR ❌  {type(err).__name__}: {err}")
        raise SystemExit(2)


[ABLATION] Using reports dir: /Users/lucasbeseler/ada_portoSeguro/reports
Lade Datensatz aus dem Cache.
[ABLATION] Saved: /Users/lucasbeseler/ada_portoSeguro/reports/feature_ablation_results.csv
[ABLATION] Plot saved: /Users/lucasbeseler/ada_portoSeguro/reports/feature_ablation_auc.png


In [9]:
import os
from pathlib import Path
import pandas as pd

# Use REPORTS_DIR from the environment when set; otherwise fall back to the
# fixed reports path below.
# NOTE(review): the fallback is a machine-specific absolute path - consider
# deriving it from the repository root so the cell runs on other machines.
RPT = Path(os.getenv("REPORTS_DIR") or "/Users/lucasbeseler/ada_portoSeguro/reports")
df = pd.read_csv(RPT / "feature_ablation_results.csv")

def decide(row):
    """Turn one ablation result row into a keep/remove/add recommendation.

    The delta-AUC confidence interval is read from ``delta_auc_ci_low`` and
    ``delta_auc_ci_high``:

    * ``action == "DROP"``: interval entirely below 0 -> ``"KEEP"`` (dropping
      hurts); entirely above 0 -> ``"REMOVE"`` (dropping helps).
    * ``action == "ADD"``: interval entirely above 0 -> ``"ADD"`` (adding
      helps); entirely below 0 -> ``"DONT_ADD"`` (adding hurts).
    * Anything else, including intervals that contain 0 -> ``"NEUTRAL"``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys above.

    Returns:
        One of ``"KEEP"``, ``"REMOVE"``, ``"ADD"``, ``"DONT_ADD"``, ``"NEUTRAL"``.
    """
    action = row["action"]
    if action not in ("DROP", "ADD"):
        return "NEUTRAL"

    def interval_above_zero():
        return row["delta_auc_ci_low"] > 0

    def interval_below_zero():
        return row["delta_auc_ci_high"] < 0

    # Checks are evaluated lazily and in the listed order; the first hit wins.
    if action == "DROP":
        checks = ((interval_below_zero, "KEEP"), (interval_above_zero, "REMOVE"))
    else:
        checks = ((interval_above_zero, "ADD"), (interval_below_zero, "DONT_ADD"))

    for is_true, verdict in checks:
        if is_true():
            return verdict
    return "NEUTRAL"

# Label every ablation row with a decision and persist the result.
out = df.copy()
# Build the labels from plain per-row records rather than
# `DataFrame.apply(axis=1)`: the results CSV may be header-only (no
# candidates), and `apply` on a zero-row frame is not guaranteed to return a
# Series (it may hand back a DataFrame), which breaks the column assignment.
out["decision"] = [decide(record) for record in out.to_dict(orient="records")]
out.to_csv(RPT / "feature_ablation_decisions.csv", index=False)
print("Wrote:", RPT / "feature_ablation_decisions.csv")


Wrote: /Users/lucasbeseler/ada_portoSeguro/reports/feature_ablation_decisions.csv
