In [3]:
import os, sys, json, warnings, time
warnings.filterwarnings("ignore")

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss, precision_recall_curve, accuracy_score
from sklearn.calibration import calibration_curve

# --- Repo root (works both when run as a script and from a notebook/REPL)
if "__file__" not in globals():
    # Interactive session: start from the working directory and step up one
    # level when it is one of the known sub-folders of the repository.
    ROOT = Path.cwd().parent if Path.cwd().name in ("notebooks", "tools", "tests") else Path.cwd()
else:
    # Script run: the root is the parent of the directory holding this file.
    ROOT = Path(__file__).resolve().parents[1]

# Report folders: a non-empty environment variable wins, otherwise the
# default <ROOT>/reports_Hany is used. The output folder is created eagerly.
REPORTS_IN = Path(os.environ.get("REPORTS_IN") or (ROOT / "reports_Hany"))
REPORTS_OUT = Path(os.environ.get("REPORTS_OUT") or (ROOT / "reports_Hany"))
REPORTS_OUT.mkdir(exist_ok=True, parents=True)

sys.path.insert(0, str(ROOT))
from src.data_loader import load_and_save_data
from src.models import get_models

# ADDITIONALLY: import the required preprocessing modules
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# ... (runtime params and helpers unchanged) ...

SPEED = os.getenv("SPEED", "MEDIUM").upper().strip()


def speed_cfg():
    """Return the training settings for the active ``SPEED`` profile.

    A baseline configuration is overlaid with the overrides of the profile
    named by the module-level ``SPEED`` ("FAST", "MEDIUM" or "FULL").  Any
    other value leaves the baseline untouched.

    Returns:
        A fresh dict with the keys ``CV``, ``N_EST``, ``EARLY_STOP``,
        ``MODELS`` and ``LR``.
    """
    baseline = {
        "CV": 5,
        "N_EST": 6000,
        "EARLY_STOP": 200,
        "MODELS": ["lgbm", "xgb"],
        "LR": 0.03,
    }
    profiles = {
        "FAST": {"CV": 3, "N_EST": 2000, "EARLY_STOP": 50, "MODELS": ["lgbm"], "LR": 0.05},
        "MEDIUM": {"CV": 5, "N_EST": 4000, "EARLY_STOP": 100},
        "FULL": {"CV": 5, "N_EST": 8000, "EARLY_STOP": 300},
    }
    # Unknown profile names fall back to the baseline (empty overlay).
    return {**baseline, **profiles.get(SPEED, {})}

# Effective runtime settings. CFG holds the defaults of the active SPEED
# profile; most values below can be overridden through an environment variable.
CFG        = speed_cfg()
RND        = int(os.getenv("RND", "42"))  # seed used for CV splits, hold-out splits and model seeds
CV         = int(os.getenv("CV", str(CFG["CV"])))  # number of StratifiedKFold splits
N_EST      = int(os.getenv("N_EST", str(CFG["N_EST"])))  # upper bound on boosting rounds / estimators
ESR        = int(os.getenv("EARLY_STOP", str(CFG["EARLY_STOP"])))  # early-stopping patience (env var is EARLY_STOP)
# Comma-separated model names; blank entries are dropped after stripping.
MODELS     = [m.strip() for m in os.getenv("MODELS", ",".join(CFG["MODELS"])).split(",") if m.strip()]
IMB        = os.getenv("IMB", "spw").lower()  # imbalance mode: "spw" is checked by the XGB code, "iso" by the LGBM code
LR         = float(os.getenv("LR", str(CFG["LR"])))  # default learning rate
MEMBER     = os.getenv("MEMBER", "Hany")  # not referenced elsewhere in the visible code

def split_cols(cols):
    """Partition column labels into categorical, binary and numeric groups.

    A label whose string form ends in ``_cat`` is categorical, one ending in
    ``_bin`` is binary, and every other label except the literal ``"target"``
    is numeric.  Input order is preserved inside each group.

    The labels are classified in a single pass, so any iterable is accepted
    (including one-shot iterators) and no quadratic list-membership tests
    are needed.

    Args:
        cols: Iterable of column labels (e.g. ``DataFrame.columns``).

    Returns:
        Tuple ``(cat, bin_, num)`` of lists of labels.
    """
    cat, bin_, num = [], [], []
    for col in cols:
        label = str(col)
        if label.endswith("_cat"):
            cat.append(col)
        elif label.endswith("_bin"):
            bin_.append(col)
        elif col != "target":
            num.append(col)
    return cat, bin_, num

def fe_extras(X, selected):
    """Return a copy of ``X`` with the requested engineered columns added.

    Args:
        X: Input DataFrame; it is never modified.
        selected: Container of feature names.  ``"missing_count"`` adds the
            per-row count of missing values, ``"sum_all_bin"`` adds the
            per-row sum over all columns whose name ends in ``_bin`` (or a
            constant 0 when there is no such column).

    Returns:
        A new DataFrame containing the original columns plus the extras.
    """
    frame = X.copy()

    if "missing_count" in selected:
        frame["missing_count"] = frame.isna().sum(axis=1)

    if "sum_all_bin" in selected:
        bin_cols = [name for name in frame.columns if str(name).endswith("_bin")]
        if bin_cols:
            frame["sum_all_bin"] = frame[bin_cols].sum(axis=1)
        else:
            frame["sum_all_bin"] = 0

    return frame

def prep_for_trees(X: pd.DataFrame, selected_cols):
    """Build the feature frame handed to the tree models.

    Adds the engineered extras requested in ``selected_cols`` (via
    ``fe_extras``), keeps only the selected columns that actually exist and
    casts every ``*_cat`` column to pandas ``category`` dtype.

    Args:
        X: Raw feature DataFrame; it is not modified.
        selected_cols: Ordered collection of feature names to keep.

    Returns:
        Tuple ``(X_prepared, cat_cols)`` where ``cat_cols`` lists the kept
        columns whose name ends in ``_cat``.
    """
    X = fe_extras(X, selected_cols)
    keep = [c for c in selected_cols if c in X.columns]
    missing = [c for c in selected_cols if c not in X.columns]
    if missing:
        print(f"[WARN] ignoring {len(missing)} missing selected feature(s): {missing}")
    X = X[keep].copy()
    cat, _, _ = split_cols(X.columns)
    for c in cat:
        try:
            X[c] = X[c].astype("category")
        except Exception as exc:
            # Best effort: keep the column in its original dtype, but report
            # the failure instead of hiding it.  Unlike a bare ``except`` this
            # lets KeyboardInterrupt/SystemExit propagate.
            print(f"[WARN] could not cast {c!r} to category: {exc}")
    return X, cat

def scale_pos_weight(y):
    """Return the negative-to-positive class ratio of a binary label vector.

    Args:
        y: Array-like supporting element-wise ``==`` (NumPy array or pandas
            Series) holding 0/1 labels.

    Returns:
        ``n_negative / n_positive`` as a float; the divisor is clamped to 1
        so a vector without positives does not divide by zero.
    """
    n_pos = int((y == 1).sum())
    n_neg = int((y == 0).sum())
    divisor = n_pos or 1  # counts are non-negative, so this equals max(n_pos, 1)
    return float(n_neg / divisor)

def xgb_train_predict(Xtr, ytr, Xva, yva, Xte=None, params=None, seed=RND, lr=LR):
    """Train an XGBoost booster with early stopping and predict at its best iteration.

    Args:
        Xtr, ytr: Training features (DataFrame) and labels.
        Xva, yva: Validation features and labels; they drive early stopping
            and are scored with the trained booster.
        Xte: Optional test features to score as well.
        params: Optional overrides using sklearn-style names
            (``learning_rate``, ``max_depth``, ``reg_lambda``, ``reg_alpha``,
            ...).  The mapping is copied, the caller's dict is not mutated.
        seed: Seed used when ``params`` carries no ``seed``.
        lr: Learning rate used when ``params`` carries no ``learning_rate``.

    Returns:
        Tuple ``(booster, pred_va, pred_te, fi, meta)``: the trained booster,
        validation predictions, test predictions (``None`` when ``Xte`` is
        ``None``), a gain-importance Series sorted descending, and a dict with
        ``use_ohe`` / ``ohe_cols`` describing whether the one-hot fallback
        was used and with which columns.
    """
    import xgboost as xgb

    params = dict(params or {})  # private copy: the pops below must not touch the caller's dict
    p = {
        "objective": "binary:logistic",
        "eval_metric": "aucpr",
        "tree_method": params.pop("tree_method", "hist"),
        "eta": params.pop("learning_rate", lr),
        "max_depth": int(params.pop("max_depth", 6)),
        "min_child_weight": float(params.pop("min_child_weight", 2.0)),
        "subsample": float(params.pop("subsample", 0.9)),
        "colsample_bytree": float(params.pop("colsample_bytree", 0.9)),
        "lambda": float(params.pop("reg_lambda", 1.0)),
        "alpha": float(params.pop("reg_alpha", 0.0)),
        "gamma": float(params.pop("gamma", 0.0)),
        "seed": int(params.pop("seed", seed)),
        "nthread": -1,
    }
    if IMB == "spw":
        p["scale_pos_weight"] = float(params.pop("scale_pos_weight", 1.0))

    meta = {"use_ohe": False, "ohe_cols": None}
    cats = [c for c in Xtr.columns if str(c).endswith("_cat")]
    dte = None
    try:
        # Preferred path: hand the categorical dtypes to XGBoost directly.
        dtr = xgb.DMatrix(Xtr, label=ytr, enable_categorical=True)
        dva = xgb.DMatrix(Xva, label=yva, enable_categorical=True)
        dte = xgb.DMatrix(Xte, enable_categorical=True) if Xte is not None else None
        p["enable_categorical"] = True
    except Exception:
        # Deliberate fallback when the native categorical path fails for any
        # reason: one-hot encode, aligning validation/test to the training columns.
        meta["use_ohe"] = True
        dXtr = pd.get_dummies(Xtr, columns=cats, dummy_na=True)
        dXva = pd.get_dummies(Xva, columns=cats, dummy_na=True).reindex(columns=dXtr.columns, fill_value=0)
        dtr = xgb.DMatrix(dXtr, label=ytr)
        dva = xgb.DMatrix(dXva, label=yva)
        if Xte is not None:
            dXte = pd.get_dummies(Xte, columns=cats, dummy_na=True).reindex(columns=dXtr.columns, fill_value=0)
            dte = xgb.DMatrix(dXte)
        meta["ohe_cols"] = list(dXtr.columns)

    booster = xgb.train(
        params=p,
        dtrain=dtr,
        num_boost_round=N_EST,
        evals=[(dva, "valid")],
        early_stopping_rounds=ESR,
        verbose_eval=False,
    )

    # ``iteration_range`` is half-open, so the best (0-based) iteration needs
    # ``+ 1`` to be included; this also handles ``best_iteration == 0``.
    # ``(0, 0)`` means "use every tree" when no best iteration is recorded.
    best_it = getattr(booster, "best_iteration", None)
    it_range = (0, int(best_it) + 1) if best_it is not None else (0, 0)
    pred_va = booster.predict(dva, iteration_range=it_range)
    pred_te = booster.predict(dte, iteration_range=it_range) if Xte is not None else None

    imp = booster.get_score(importance_type="gain")
    if meta["use_ohe"]:
        fi = pd.Series(imp, name="gain").sort_values(ascending=False)
        fi.index.name = "feature"
    else:
        # Map positional keys of the form "f<idx>" back to the column names.
        names = list(Xtr.columns)
        pairs = []
        for k, v in imp.items():
            if k.startswith("f") and k[1:].isdigit():
                idx = int(k[1:])
                name = names[idx] if 0 <= idx < len(names) else k
            else:
                name = k
            pairs.append((name, v))
        fi = pd.Series(dict(pairs), name="gain").sort_values(ascending=False)
    return booster, pred_va, pred_te, fi, meta
def xgb_predict_time_ms_per_1k(booster, X, meta):
    """Measure XGBoost prediction latency in milliseconds per 1,000 rows.

    The timed section covers building the ``DMatrix`` (including the one-hot
    encoding when the fallback encoder was used in training) and the
    prediction itself.

    Args:
        booster: Trained ``xgboost.Booster``.
        X: Feature DataFrame to score.
        meta: Dict with ``use_ohe`` and ``ohe_cols`` as returned by
            ``xgb_train_predict``.

    Returns:
        Elapsed time scaled to milliseconds per 1,000 rows of ``X``.
    """
    import xgboost as xgb

    t0 = time.perf_counter()
    if meta["use_ohe"]:
        cats = [c for c in X.columns if str(c).endswith("_cat")]
        dX = pd.get_dummies(X, columns=cats, dummy_na=True).reindex(columns=meta["ohe_cols"], fill_value=0)
        d = xgb.DMatrix(dX)
    else:
        d = xgb.DMatrix(X, enable_categorical=True)
    # ``iteration_range`` is half-open: ``best_iteration + 1`` includes the
    # best tree; ``(0, 0)`` uses all trees when no best iteration is recorded.
    best_it = getattr(booster, "best_iteration", None)
    it_range = (0, int(best_it) + 1) if best_it is not None else (0, 0)
    _ = booster.predict(d, iteration_range=it_range)
    dt = time.perf_counter() - t0
    return 1000 * dt / (len(X) / 1000)

def oof_cv(model_name, base_params, X, y, cat_cols):
    """Compute out-of-fold predictions and metrics with stratified K-fold CV.

    Args:
        model_name: ``"lgbm"`` or ``"xgb"``.
        base_params: Optional dict of hyper-parameter overrides.
        X: Feature DataFrame (indexed positionally through ``.iloc``).
        y: Label Series aligned with ``X``.
        cat_cols: Names of categorical columns passed to LightGBM.

    Returns:
        Dict with ``pr_auc``, ``roc_auc``, ``brier`` (floats computed on the
        out-of-fold predictions) and ``oof`` (the prediction array).

    Raises:
        ValueError: If ``model_name`` is neither ``"lgbm"`` nor ``"xgb"``.
    """
    folds = StratifiedKFold(n_splits=CV, shuffle=True, random_state=RND)
    oof = np.zeros(len(y), dtype=float)

    for fit_idx, val_idx in folds.split(X, y):
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        if model_name == "lgbm":
            from lightgbm import LGBMClassifier, early_stopping, log_evaluation

            lgbm_kwargs = dict(
                n_estimators=N_EST, random_state=RND, n_jobs=-1,
                learning_rate=LR, num_leaves=128, max_depth=-1,
                min_child_samples=20, subsample=0.9, colsample_bytree=0.9,
                reg_lambda=1.0, reg_alpha=0.0, max_bin=511,
                feature_pre_filter=False,
                is_unbalance=(IMB == "iso"),
            )
            # Caller overrides win over the defaults above.
            if base_params:
                lgbm_kwargs.update(base_params)

            model = LGBMClassifier(**lgbm_kwargs)
            present_cats = [c for c in cat_cols if c in X_fit.columns]
            model.fit(
                X_fit, y_fit,
                eval_set=[(X_val, y_val)],
                eval_metric="auc",
                categorical_feature=present_cats,
                callbacks=[early_stopping(ESR), log_evaluation(0)],
            )
            oof[val_idx] = model.predict_proba(X_val)[:, 1]
        elif model_name == "xgb":
            # The class weight is derived from the fold's training labels only.
            xgb_params = dict(base_params or {})
            xgb_params["scale_pos_weight"] = scale_pos_weight(y_fit) if IMB == "spw" else 1.0
            _booster, fold_pred, _te, _fi, _meta = xgb_train_predict(
                X_fit, y_fit, X_val, y_val, params=xgb_params
            )
            oof[val_idx] = fold_pred
        else:
            raise ValueError("Unknown model")

    pr_auc = average_precision_score(y, oof)
    roc_auc = roc_auc_score(y, oof)
    brier = brier_score_loss(y, oof)
    return {"pr_auc": float(pr_auc), "roc_auc": float(roc_auc), "brier": float(brier), "oof": oof}

def fit_final(model_name, params, Xtr, ytr, Xte, yte, cat_cols):
    """Fit the final model with early stopping and evaluate it on the hold-out set.

    10% of the training data is split off (stratified) as the early-stopping
    validation set; the model is trained on the remaining 90% and scored on
    ``Xte`` / ``yte``.

    Args:
        model_name: ``"lgbm"`` or ``"xgb"``.
        params: Optional dict of hyper-parameter overrides.
        Xtr, ytr: Training features and labels.
        Xte, yte: Hold-out features and labels.
        cat_cols: Names of categorical columns passed to LightGBM.

    Returns:
        Tuple ``(proba, hold, fi, meta)``: hold-out probabilities, a dict with
        ``pr_auc`` / ``roc_auc`` / ``brier``, a feature-importance Series and
        a metadata dict (encoder, best iteration, tree count, timings and the
        fitted model under ``model_obj``).

    Raises:
        ValueError: If ``model_name`` is neither ``"lgbm"`` nor ``"xgb"``.
    """
    # Fail fast on unsupported names, mirroring oof_cv, instead of silently
    # training XGBoost for anything that is not "lgbm".
    if model_name not in ("lgbm", "xgb"):
        raise ValueError("Unknown model")

    X_tr, X_val, y_tr, y_val = train_test_split(Xtr, ytr, test_size=0.1, stratify=ytr, random_state=RND)

    if model_name == "lgbm":
        from lightgbm import LGBMClassifier, early_stopping, log_evaluation

        imb_kwargs = {"is_unbalance": True} if IMB == "iso" else {"is_unbalance": False}
        defaults = {
            "n_estimators": N_EST, "random_state": RND, "n_jobs": -1,
            "learning_rate": LR, "num_leaves": 128, "max_depth": -1,
            "min_child_samples": 20, "subsample": 0.9, "colsample_bytree": 0.9,
            "reg_lambda": 1.0, "reg_alpha": 0.0, "max_bin": 511,
            "feature_pre_filter": False, **imb_kwargs
        }
        if params:
            defaults.update(params)
        clf = LGBMClassifier(**defaults)

        t0 = time.perf_counter()
        clf.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric="auc",
            categorical_feature=[c for c in cat_cols if c in X_tr.columns],
            callbacks=[early_stopping(ESR), log_evaluation(0)],
        )
        fit_time_s = time.perf_counter() - t0

        t1 = time.perf_counter()
        proba = clf.predict_proba(Xte)[:, 1]
        pred_ms_per_1k = 1000 * (time.perf_counter() - t1) / (len(Xte) / 1000)

        try:
            fi = pd.Series(
                clf.booster_.feature_importance(importance_type="gain"),
                index=clf.booster_.feature_name(),
                name="gain",
            ).sort_values(ascending=False)
        except Exception:
            # Fall back to the sklearn-level importances (named "split")
            # when the gain importances cannot be read from the booster.
            fi = pd.Series(clf.feature_importances_, index=Xtr.columns, name="split").sort_values(ascending=False)
        meta = {
            "encoder": "native(LGBM)", "best_iteration": getattr(clf, "best_iteration_", None),
            "n_trees": getattr(clf, "n_estimators_", None), "fit_time_s": float(fit_time_s),
            "predict_time_ms_per_1k": float(pred_ms_per_1k), "model_obj": clf
        }
    else:
        # model_name == "xgb" (validated above); weight from the fit split only.
        spw = scale_pos_weight(y_tr) if IMB == "spw" else 1.0
        t0 = time.perf_counter()
        booster, _, proba, fi, xmeta = xgb_train_predict(
            X_tr, y_tr, X_val, y_val, Xte, params={**(params or {}), "scale_pos_weight": spw}
        )
        fit_time_s = time.perf_counter() - t0
        pred_ms_per_1k = xgb_predict_time_ms_per_1k(booster, Xte, xmeta)
        meta = {
            "encoder": "native(XGB)" if not xmeta["use_ohe"] else "OHE(XGB-fallback)",
            "best_iteration": getattr(booster, "best_iteration", None), "n_trees": getattr(booster, "best_ntree_limit", None),
            "fit_time_s": float(fit_time_s), "predict_time_ms_per_1k": float(pred_ms_per_1k), "model_obj": booster
        }

    hold = dict(
        pr_auc=float(average_precision_score(yte, proba)),
        roc_auc=float(roc_auc_score(yte, proba)),
        brier=float(brier_score_loss(yte, proba)),
    )
    return proba, hold, fi, meta

def save_pr_curve(y_true, proba, out_path):
    """Plot the precision-recall curve of ``proba`` and save it to ``out_path``.

    Args:
        y_true: True binary labels.
        proba: Predicted scores/probabilities for the positive class.
        out_path: Destination image path (saved at 150 dpi).
    """
    import matplotlib.pyplot as plt

    precision, recall, _thresholds = precision_recall_curve(y_true, proba)
    avg_precision = average_precision_score(y_true, proba)

    fig, ax = plt.subplots(figsize=(7, 5))
    ax.plot(recall, precision, label=f'AP={avg_precision:.4f}')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall')
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.grid(True, alpha=0.3)
    ax.legend()
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

def save_calibration(y_true, proba, out_path):
    """Plot a reliability (calibration) diagram and save it to ``out_path``.

    The curve uses 20 quantile bins and is drawn against the diagonal of a
    perfectly calibrated model.

    Args:
        y_true: True binary labels.
        proba: Predicted probabilities for the positive class.
        out_path: Destination image path (saved at 150 dpi).
    """
    import matplotlib.pyplot as plt

    observed, predicted = calibration_curve(y_true, proba, n_bins=20, strategy="quantile")

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.plot([0, 1], [0, 1], '--', label='Perfect')
    ax.plot(predicted, observed, marker='o', label='Model')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Observed')
    ax.set_title('Calibration')
    ax.grid(True, alpha=0.3)
    ax.legend()
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

def save_top20_importance(fi: pd.Series, out_path):
    """Save a horizontal bar chart of the first 20 entries of ``fi``.

    Nothing is written when ``fi`` is ``None`` or empty.

    Args:
        fi: Feature-importance Series (feature name -> importance); the
            callers in this file pass it sorted in descending order.
        out_path: Destination image path (saved at 150 dpi).
    """
    import matplotlib.pyplot as plt

    if fi is None or fi.empty:
        return

    # Reverse the head so the first entry ends up at the top of the chart.
    leaders = fi.head(20).iloc[::-1]

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.barh(leaders.index, leaders.values)
    ax.set_xlabel('Gain')
    ax.set_title('Top-20 Feature Importance')
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

def sample_params(model_name, n):
    """Draw ``n`` random hyper-parameter sets for the given model.

    A generator seeded with ``RND`` is created on every call, so the same
    arguments always yield the same candidates.

    Args:
        model_name: ``"lgbm"`` or ``"xgb"``; any other value yields ``[]``.
        n: Number of parameter dicts to draw.

    Returns:
        List of ``n`` parameter dicts (empty for an unknown model name).
    """
    rng = np.random.default_rng(RND)

    # NOTE: the order of the rng calls inside each dict literal determines
    # the sampled values, so it must not be rearranged.
    def _draw_lgbm():
        return {
            "learning_rate": LR,
            "num_leaves": int(rng.integers(64, 256)),
            "max_depth": int(rng.integers(-1, 12)),
            "min_child_samples": int(rng.integers(10, 80)),
            "subsample": float(rng.uniform(0.7, 1.0)),
            "colsample_bytree": float(rng.uniform(0.7, 1.0)),
            "reg_lambda": float(rng.uniform(0.0, 2.0)),
            "reg_alpha": float(rng.uniform(0.0, 1.0)),
            "is_unbalance": (IMB == "iso"),
            "max_bin": 511,
            "feature_pre_filter": False,
        }

    def _draw_xgb():
        return {
            "learning_rate": LR,
            "tree_method": "hist",
            "max_depth": int(rng.integers(3, 10)),
            "min_child_weight": float(rng.uniform(1.0, 8.0)),
            "subsample": float(rng.uniform(0.7, 1.0)),
            "colsample_bytree": float(rng.uniform(0.7, 1.0)),
            "reg_lambda": float(rng.uniform(0.0, 2.0)),
            "reg_alpha": float(rng.uniform(0.0, 1.0)),
            "gamma": float(rng.uniform(0.0, 2.0)),
            "scale_pos_weight": 1.0,
        }

    if model_name == "lgbm":
        draw = _draw_lgbm
    elif model_name == "xgb":
        draw = _draw_xgb
    else:
        return []
    return [draw() for _ in range(n)]

def main():
    """Run the feature-gate experiment and write its artifacts.

    Steps visible in the code: load the data, create a stratified 80/20
    train/hold-out split, cross-validate every model from ``get_models`` on
    four feature configurations, refit the best (model, configuration) pair,
    compare it on the hold-out set with a LogisticRegression trained on all
    features, and write the score table, selected features, split indices,
    PR-curve plot and a metadata JSON into ``ROOT/"reports_Hany"``.

    NOTE(review): this function relies on names that are not defined in the
    visible part of the file: ``C_LOGREG``, ``N_SAMPLE``, ``TE_CAT``,
    ``TE_ALPHA``, ``GBM_CHECK``, ``make_feature_set``, ``cv_scores_ohe``,
    ``cv_scores_te``, ``_prep_te_blocks``, ``_kfold_target_encode``,
    ``build_pre``, ``holdout_gbm_check``, ``RandomForestClassifier``, ``SVC``,
    ``LGBMClassifier`` and ``plt``.  Unless they are provided elsewhere, the
    call fails with a ``NameError`` (the first one hit is ``C_LOGREG``).
    """
    # NOTE(review): the output folder is hard-coded here; the module-level
    # REPORTS_OUT setting is not used by this function.
    reports = ROOT/"reports_Hany"; reports.mkdir(parents=True, exist_ok=True)
    models_to_test = get_models(C_LOGREG, RND)

    # -1 is replaced by NaN — presumably the dataset's missing-value marker; confirm.
    df_all = load_and_save_data().replace(-1, np.nan)
    n_rows_total = len(df_all)
    df = df_all
    # Optional down-sampling; the sample is re-sorted by index.
    if N_SAMPLE and N_SAMPLE < len(df_all):
        df = df_all.sample(N_SAMPLE, random_state=RND).sort_index()
    y = df["target"].astype(int)

    # Stratified 80/20 split into training and hold-out parts.
    X_tr_all, X_te_all, y_tr, y_te = train_test_split(
        df.drop(columns=["target"]), y, test_size=0.2, stratify=y, random_state=RND
    )
    df_tr = pd.concat([X_tr_all, y_tr], axis=1)
    df_te = pd.concat([X_te_all, y_te], axis=1)

    # Persist the row indices of both parts.
    split_indices = {"train": df_tr.index.tolist(), "test": df_te.index.tolist()}
    (reports/"split_indices.json").write_text(json.dumps(split_indices, indent=2))

    # Feature-set variants to compare; the flags are forwarded to make_feature_set.
    configs = [
        {"name":"all_features", "drop_calc":False, "extra_drop":[], "add_extras":False},
        {"name":"drop_calc+opt+extras", "drop_calc":True, "extra_drop":["ps_ind_14","ps_car_10_cat"], "add_extras":True},
        {"name":"drop_calc_only", "drop_calc":True, "extra_drop":[], "add_extras":False},
        {"name":"drop_calc+extras", "drop_calc":True, "extra_drop":[], "add_extras":True},
    ]

    rows = []

    # Cross-validate every (model, configuration) pair on the training part.
    for model_name, model_clf in models_to_test.items():
        print(f"Starte Kreuzvalidierung für Modell: {model_name}...")
        for cfg in configs:
            print(f"  Konfiguration: {cfg['name']}...")
            X_tr_cfg = make_feature_set(df_tr, drop_calc=cfg["drop_calc"], extra_drop=cfg["extra_drop"], add_extras=cfg["add_extras"])

            # Listed estimator types always use the one-hot scorer; other
            # models use the target-encoding scorer when TE_CAT is set.
            if isinstance(model_clf, (RandomForestClassifier, SVC, LGBMClassifier)):
                auc_cv, pr_cv = cv_scores_ohe(X_tr_cfg, y_tr.loc[X_tr_cfg.index], model_clf, C=C_LOGREG, CV=CV)
            elif TE_CAT:
                auc_cv, pr_cv = cv_scores_te(X_tr_cfg, y_tr.loc[X_tr_cfg.index], C=C_LOGREG, CV=CV, clf=model_clf)
            else:
                # NOTE(review): model_clf is not passed in this branch — confirm
                # that cv_scores_ohe's default estimator is intended here.
                auc_cv, pr_cv = cv_scores_ohe(X_tr_cfg, y_tr.loc[X_tr_cfg.index], C=C_LOGREG, CV=CV)

            rows.append({
                "model_name": model_name, "config_name": cfg["name"], "n_features": int(X_tr_cfg.shape[1]),
                "cv_auc": float(auc_cv), "cv_pr_auc": float(pr_cv),
                "drop_calc": cfg["drop_calc"], "extra_drop": cfg["extra_drop"],
                "add_extras": cfg["add_extras"], "te_cat": TE_CAT
            })

    # Rank by CV AUC, then CV PR-AUC (both descending) and save the table.
    res = pd.DataFrame(rows).sort_values(["cv_auc","cv_pr_auc"], ascending=False)
    res_path = reports/"feature_gate_scores.csv"; res.to_csv(res_path, index=False)

    # The top row defines the winning model and feature configuration.
    best_cv = res.iloc[0].to_dict()
    X_tr_best = make_feature_set(df_tr, drop_calc=best_cv["drop_calc"], extra_drop=best_cv["extra_drop"], add_extras=best_cv["add_extras"])
    X_te_best = make_feature_set(df_te, drop_calc=best_cv["drop_calc"], extra_drop=best_cv["extra_drop"], add_extras=best_cv["add_extras"])

    (reports/"features_selected.csv").write_text(
        pd.Series(pd.Index(X_tr_best.columns), name="raw_feature").to_csv(index=False)
    )

    best_model_name = best_cv["model_name"]
    best_model_clf = models_to_test[best_model_name]

    # Refit the winner on the full training part and score the hold-out part.
    if TE_CAT:
        # Target-encoding path: encode the categorical block, then fit on
        # the concatenated numeric + binary + encoded columns.
        Xtr_cat, Xva_cat, Xtr_bin, Xva_bin, Xtr_num, Xva_num = _prep_te_blocks(X_tr_best, X_te_best)
        tr_te, va_te = _kfold_target_encode(Xtr_cat, y_tr.loc[X_tr_best.index], Xva_cat, n_splits=CV, alpha=TE_ALPHA, seed=RND)
        Xtr_fin = pd.concat([Xtr_num, Xtr_bin, tr_te], axis=1)
        Xva_fin = pd.concat([Xva_num, Xva_bin, va_te], axis=1)
        clf_best = models_to_test[best_model_name]
        if hasattr(clf_best, 'C'): clf_best.C = C_LOGREG
        clf_best.fit(Xtr_fin, y_tr.loc[X_tr_best.index])
        proba_best = clf_best.predict_proba(Xva_fin)[:,1]
    else:
        # Pipeline path: preprocessing from build_pre followed by the estimator.
        cat_b, bin_b, num_b = split_cols(X_tr_best.columns)
        pre_b = build_pre(cat_b, bin_b, num_b)
        pipe_b = Pipeline([("pre", pre_b), ("clf", best_model_clf)])
        m_b = pipe_b.fit(X_tr_best, y_tr.loc[X_tr_best.index])
        proba_best = m_b.predict_proba(X_te_best)[:,1]

    y_true_best = y_te.loc[X_te_best.index]

    # Reference model: LogisticRegression on the unreduced feature set.
    X_tr_allF = make_feature_set(df_tr, drop_calc=False, extra_drop=[], add_extras=False)
    X_te_allF = make_feature_set(df_te, drop_calc=False, extra_drop=[], add_extras=False)
    catA, binA, numA = split_cols(X_tr_allF.columns)
    pipe_all = Pipeline([("pre", build_pre(catA, binA, numA)),
                             ("clf", models_to_test["LogisticRegression"])])
    m_all = pipe_all.fit(X_tr_allF, y_tr.loc[X_tr_allF.index])
    proba_all = m_all.predict_proba(X_te_allF)[:,1]

    # Hold-out metrics for the winner and the reference model.
    hold_auc_best = roc_auc_score(y_true_best, proba_best)
    hold_pr_best = average_precision_score(y_true_best, proba_best)
    hold_auc_all = roc_auc_score(y_te.loc[X_te_allF.index], proba_all)
    hold_pr_all = average_precision_score(y_te.loc[X_te_allF.index], proba_all)

    prec_b, rec_b, _ = precision_recall_curve(y_true_best, proba_best)
    prec_a, rec_a, _ = precision_recall_curve(y_te.loc[X_te_allF.index], proba_all)

    # Both precision-recall curves in one figure.
    # NOTE(review): ``plt`` is not imported in this function or at module
    # level in the visible source — confirm where it is supposed to come from.
    plt.figure(figsize=(7,5))
    plt.plot(rec_b, prec_b, label=f"Best ({best_model_name}, AP={hold_pr_best:.3f}, AUC={hold_auc_best:.3f})")
    plt.plot(rec_a, prec_a, label=f"All-features (LR, AP={hold_pr_all:.3f}, AUC={hold_auc_all:.3f})")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Holdout Precision-Recall Curve")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(reports/"holdout_pr_curve.png", dpi=150)
    plt.close()

    # Optional check via holdout_gbm_check on both preprocessed feature sets.
    gbm_out = None
    if GBM_CHECK:
        catB, binB, numB = split_cols(X_tr_best.columns); preB = build_pre(catB, binB, numB)
        XtrB = preB.fit_transform(X_tr_best); XvaB = preB.transform(X_te_best)
        gbm_best = holdout_gbm_check(XtrB, y_tr.loc[X_tr_best.index], XvaB, y_true_best)
        catC, binC, numC = split_cols(X_tr_allF.columns); preC = build_pre(catC, binC, numC)
        XtrC = preC.fit_transform(X_tr_allF); XvaC = preC.transform(X_te_allF)
        gbm_all = holdout_gbm_check(XtrC, y_tr.loc[X_tr_allF.index], XvaC, y_te.loc[X_te_allF.index])
        gbm_out = {"best": gbm_best, "all": gbm_all}

    # Run metadata: settings, artifact paths and scores, saved as JSON.
    meta = {
        "random_state": RND, "cv_splits": CV, "C": C_LOGREG,
        "n_rows_total": int(n_rows_total), "sample_n": int(len(df)),
        "te_cat": TE_CAT, "te_alpha": TE_ALPHA, "gbm_check": bool(GBM_CHECK),
        "scores_path": str(res_path),
        "features_path": str(reports/"features_selected.csv"),
        "split_indices_path": str(reports/"split_indices.json"),
        "pr_curve_path": str(reports/"holdout_pr_curve.png"),
        "best_by_cv": best_cv,
        "holdout_scores": {
            "best_auc": float(hold_auc_best), "best_pr_auc": float(hold_pr_best),
            "all_auc": float(hold_auc_all), "all_pr_auc": float(hold_pr_all)
        },
        "gbm_holdout": gbm_out
    }
    (reports/"feature_gate_meta.json").write_text(json.dumps(meta, indent=2))

    # Console summary.
    print("\nFEATURE-GATE done.")
    print(f"Train n={len(df_tr):,}, Holdout n={len(df_te):,}, CV={CV}, C={C_LOGREG}, TE_CAT={int(TE_CAT)}")
    print("Scores (CV):\n" + res.head(10).to_string(index=False))
    print(f"\nHoldout (Best by CV): AUC={hold_auc_best:.4f}  PR-AUC={hold_pr_best:.4f}")
    print(f"Holdout (All-features): AUC={hold_auc_all:.4f}  PR-AUC={hold_pr_all:.4f}")
    print("\nArtifacts:")
    print("→ features_selected.csv")
    print("→ split_indices.json")
    print("→ holdout_pr_curve.png")
    print("(+ feature_gate_scores.csv, feature_gate_meta.json)")

# Run the experiment only when executed as a script, not on import.
if __name__ == "__main__":
    main()

NameError: name 'C_LOGREG' is not defined