In [None]:
import os, sys, json, warnings
warnings.filterwarnings("ignore")
from pathlib import Path
import numpy as np
import pandas as pd

# What: resolve the project root in a way that works both as a script and in a notebook.
# As a script, ROOT is the directory one level above the one containing this file;
# without __file__ (e.g. in a notebook), ROOT is the parent of the working directory
# when that directory is named "notebooks", else the working directory itself.
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[1]
else:
    CWD = Path.cwd()
    ROOT = CWD if CWD.name != "notebooks" else CWD.parent
# Put ROOT first on the import path so the `src` package imports below resolve.
sys.path.insert(0, str(ROOT))

from src.data_loader import load_and_save_data
from src.models import get_models

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    roc_auc_score, average_precision_score, precision_recall_curve
)

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# What: run settings, each read from an environment variable with a default.
_env = os.environ.get
RND = int(_env("RND", "42"))                     # random seed
CV = int(_env("CV", "3"))                        # number of CV folds
C_LOGREG = float(_env("C", "1.0"))               # value passed to get_models / set as `C` on the model
N_SAMPLE = int(_env("TRAIN_SAMPLE_N", "250000")) # row sample size (0 disables sampling)
TE_CAT = int(_env("TE_CAT", "0")) == 1           # True -> target-encode categoricals instead of one-hot
GBM_CHECK = int(_env("GBM_CHECK", "0")) == 1     # True -> run the optional LightGBM holdout check
TE_ALPHA = float(_env("TE_ALPHA", "10"))         # smoothing strength for target encoding


# What: helper functions for preprocessing and feature engineering
def ohe_fallback():
    """Build a OneHotEncoder with the first constructor signature that is accepted.

    Variants are tried in order; a ``TypeError`` from the constructor moves on
    to the next one (presumably to cope with differing scikit-learn versions —
    confirm against the versions actually supported):

    1. infrequent-category handling (``min_frequency=0.01``) with ``sparse_output=True``
    2. ``handle_unknown="ignore"`` with ``sparse=True``
    3. ``handle_unknown="ignore"`` with ``sparse=False``

    A ``TypeError`` raised by the last variant propagates to the caller.
    """
    preferred_variants = (
        {"handle_unknown": "infrequent_if_exist", "min_frequency": 0.01, "sparse_output": True},
        {"handle_unknown": "ignore", "sparse": True},
    )
    for kwargs in preferred_variants:
        try:
            return OneHotEncoder(**kwargs)
        except TypeError:
            continue
    # Last resort: not guarded, so an unsupported signature surfaces as TypeError.
    return OneHotEncoder(handle_unknown="ignore", sparse=False)

def split_cols(cols):
    """Partition column names into (categorical, binary, numeric) lists.

    Names ending in ``_cat`` are categorical, names ending in ``_bin`` are
    binary, and everything else except the literal ``"target"`` is numeric.
    Input order is preserved within each list.
    """
    cat, bin_, num = [], [], []
    for name in cols:
        if name.endswith("_cat"):
            cat.append(name)
        elif name.endswith("_bin"):
            bin_.append(name)
        elif name != "target":
            num.append(name)
    return cat, bin_, num

def fe_simple(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with simple row-wise engineered features appended.

    Adds ``missing_count`` (number of NaN cells per row) and, when at least one
    column name ends in ``_bin``, ``sum_all_bin`` (row-wise sum over those
    columns). The input frame is not modified.
    """
    out = X.copy()
    # Counted before any other feature is appended, so only input columns contribute.
    out["missing_count"] = out.isna().sum(axis=1)
    bin_cols = [name for name in out.columns if name.endswith("_bin")]
    if not bin_cols:
        return out
    out["sum_all_bin"] = out[bin_cols].sum(axis=1)
    return out

def build_pre(cat, bin_, num):
    """Assemble the ColumnTransformer used ahead of the classifier.

    - ``cat`` columns: most-frequent imputation, then one-hot encoding via ``ohe_fallback()``
    - ``bin_`` columns: most-frequent imputation only
    - ``num`` columns: median imputation, then standard scaling

    Columns not listed in any group are dropped (``remainder="drop"``).
    """
    transformers = [
        (
            "cat",
            Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("ohe", ohe_fallback())]),
            cat,
        ),
        (
            "bin",
            Pipeline([("imp", SimpleImputer(strategy="most_frequent"))]),
            bin_,
        ),
        (
            "num",
            Pipeline([("imp", SimpleImputer(strategy="median")), ("sc", StandardScaler())]),
            num,
        ),
    ]
    return ColumnTransformer(transformers, remainder="drop")

def make_feature_set(df, drop_calc=True, extra_drop=None, add_extras=True, drop_groups=None):
    """Derive a feature frame from ``df`` according to a feature-gate configuration.

    Steps, in order:
      1. drop ``target`` (if present) and replace ``-1`` with NaN;
      2. if ``drop_calc``, drop every column whose name starts with ``ps_calc_``;
      3. drop the columns named in ``extra_drop`` that are present;
      4. if ``add_extras``, append the ``fe_simple`` features;
      5. if ``drop_groups`` is given, drop whole groups for each truthy key among
         ``"cat"``, ``"bin"``, ``"num"``, ``"extras"``.

    Note: the engineered extras carry no ``_cat``/``_bin`` suffix, so
    ``split_cols`` classifies them as numeric and ``drop_groups["num"]``
    removes them as well.

    The input frame is not modified; a new frame is returned.
    """
    features = df.drop(columns=["target"], errors="ignore").copy()
    features = features.replace(-1, np.nan)

    if drop_calc:
        calc_cols = [name for name in features.columns if name.startswith("ps_calc_")]
        features = features.drop(columns=calc_cols, errors="ignore")

    if extra_drop:
        present = [name for name in extra_drop if name in features.columns]
        features = features.drop(columns=present, errors="ignore")

    extras_cols = []
    if add_extras:
        features = fe_simple(features)
        extras_cols = ["missing_count", "sum_all_bin"]

    if not drop_groups:
        return features

    # Groups are determined once, before any group is removed.
    cat, bin_, num = split_cols(features.columns)
    groups = {"cat": cat, "bin": bin_, "num": num, "extras": extras_cols}
    for key in ("cat", "bin", "num", "extras"):
        if drop_groups.get(key):
            still_present = [name for name in groups[key] if name in features.columns]
            features = features.drop(columns=still_present, errors="ignore")
    return features

def cv_scores_ohe(X, y, clf, C=1.0, CV=3, seed=RND):
    """Cross-validate ``clf`` behind the one-hot preprocessing pipeline.

    Out-of-fold positive-class probabilities are collected over a shuffled
    StratifiedKFold with ``CV`` splits, then scored once over all rows.

    Args:
        X: feature frame (indexed positionally via ``.iloc``).
        y: target series aligned with ``X``.
        clf: classifier exposing ``fit`` / ``predict_proba``; refitted in place per fold.
        C: accepted for interface compatibility; not used in this function.
        CV: number of folds.
        seed: random state for the fold shuffling.

    Returns:
        ``(roc_auc, average_precision)`` computed on the out-of-fold probabilities.
    """
    cat, bin_, num = split_cols(X.columns)
    pipe = Pipeline([("pre", build_pre(cat, bin_, num)), ("clf", clf)])
    splitter = StratifiedKFold(n_splits=CV, shuffle=True, random_state=seed)
    oof = np.zeros(len(y), dtype=float)
    for fold_no, (fit_pos, hold_pos) in enumerate(splitter.split(X, y), start=1):
        # What: progress indicator
        print(f"    - Fold {fold_no}/{CV} wird trainiert ({int(fold_no/CV*100)}%)")
        pipe.fit(X.iloc[fit_pos], y.iloc[fit_pos])
        oof[hold_pos] = pipe.predict_proba(X.iloc[hold_pos])[:, 1]
    return roc_auc_score(y, oof), average_precision_score(y, oof)

def _kfold_target_encode(train_cat: pd.DataFrame, y_tr: pd.Series, valid_cat: pd.DataFrame, n_splits=3, alpha=10, seed=RND):
    """Smoothed target encoding: out-of-fold for train, full-train fit for valid.

    For each column of ``train_cat`` a category is mapped to
    ``(mean * count + global_mean * alpha) / (count + alpha)``.
    Training rows are encoded with statistics from the other folds of a
    shuffled StratifiedKFold; validation rows use statistics from all
    training rows. Categories without a mapping fall back to the global mean.

    Returns:
        ``(train_encoded, valid_encoded)`` frames whose columns are the
        original names prefixed with ``te_`` and whose indexes match the inputs.
    """
    prior = y_tr.mean()

    def _smoothed_means(keys, target):
        # Shrink each category mean towards the global mean, weighted by `alpha`.
        grouped = target.groupby(keys).agg(['mean', 'count'])
        return (grouped['mean'] * grouped['count'] + prior * alpha) / (grouped['count'] + alpha)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    train_encoded = pd.DataFrame(index=train_cat.index)
    valid_encoded = pd.DataFrame(index=valid_cat.index)

    for col in train_cat.columns:
        oof = pd.Series(index=train_cat.index, dtype=float)
        for fit_pos, hold_pos in splitter.split(train_cat, y_tr):
            fold_map = _smoothed_means(train_cat.iloc[fit_pos][col], y_tr.iloc[fit_pos])
            oof.iloc[hold_pos] = train_cat.iloc[hold_pos][col].map(fold_map)
        train_encoded[col] = oof.fillna(prior)

        full_map = _smoothed_means(train_cat[col], y_tr)
        valid_encoded[col] = valid_cat[col].map(full_map).fillna(prior)

    return train_encoded.add_prefix("te_"), valid_encoded.add_prefix("te_")

def _prep_te_blocks(X_tr, X_va):
    """Impute (and scale) train/valid features per column group for the TE path.

    Column groups come from ``split_cols(X_tr.columns)``. Categorical and binary
    columns get most-frequent imputation, numeric columns get median imputation
    followed by standard scaling. Every transformer is fitted on ``X_tr`` only
    and then applied to ``X_va``. An empty group yields zero-column frames.

    Returns:
        ``(Xtr_cat, Xva_cat, Xtr_bin, Xva_bin, Xtr_num, Xva_num)`` as DataFrames
        that keep the original column names and row indexes.
    """
    def _as_frame(values, cols, like):
        return pd.DataFrame(values, columns=cols, index=like.index)

    def _impute_pair(cols, strategy):
        # Fit on train, apply to valid; an empty group skips the imputer entirely.
        if not cols:
            return (
                _as_frame(np.empty((len(X_tr), 0)), cols, X_tr),
                _as_frame(np.empty((len(X_va), 0)), cols, X_va),
            )
        imputer = SimpleImputer(strategy=strategy)
        train_part = _as_frame(imputer.fit_transform(X_tr[cols]), cols, X_tr)
        valid_part = _as_frame(imputer.transform(X_va[cols]), cols, X_va)
        return train_part, valid_part

    cat, bin_, num = split_cols(X_tr.columns)
    cat_tr, cat_va = _impute_pair(cat, "most_frequent")
    bin_tr, bin_va = _impute_pair(bin_, "most_frequent")
    num_tr, num_va = _impute_pair(num, "median")

    if num_tr.shape[1]:
        scaler = StandardScaler(with_mean=True, with_std=True)
        num_tr = pd.DataFrame(scaler.fit_transform(num_tr), columns=num_tr.columns, index=num_tr.index)
        num_va = pd.DataFrame(scaler.transform(num_va), columns=num_va.columns, index=num_va.index)

    return (cat_tr, cat_va, bin_tr, bin_va, num_tr, num_va)

def cv_scores_te(X, y, clf, C=1.0, CV=3, seed=RND, alpha=TE_ALPHA):
    """Cross-validate ``clf`` on target-encoded categoricals.

    Per outer fold: impute/scale via ``_prep_te_blocks``, target-encode the
    categorical block via ``_kfold_target_encode`` (fitted on the fold's
    training part only), concatenate numeric + binary + encoded columns, fit
    ``clf`` and store the positive-class probabilities for the held-out rows.

    Args:
        X: feature frame (indexed positionally via ``.iloc``).
        y: target series aligned with ``X``.
        clf: classifier exposing ``fit`` / ``predict_proba``; refitted in place per fold.
        C: accepted for interface compatibility; not used in this function.
        CV: number of outer folds, also used as the inner target-encoding fold count.
        seed: random state for both the outer and inner splits.
        alpha: smoothing strength passed to the target encoder.

    Returns:
        ``(roc_auc, average_precision)`` computed on the out-of-fold probabilities.
    """
    splitter = StratifiedKFold(n_splits=CV, shuffle=True, random_state=seed)
    oof = np.zeros(len(y), dtype=float)
    for fold_no, (fit_pos, hold_pos) in enumerate(splitter.split(X, y), start=1):
        print(f"    - Fold {fold_no}/{CV} wird trainiert ({int(fold_no/CV*100)}%)")
        X_fit, X_hold = X.iloc[fit_pos], X.iloc[hold_pos]
        y_fit = y.iloc[fit_pos]

        cat_fit, cat_hold, bin_fit, bin_hold, num_fit, num_hold = _prep_te_blocks(X_fit, X_hold)
        if cat_fit.shape[1]:
            te_fit, te_hold = _kfold_target_encode(
                cat_fit, y_fit, cat_hold, n_splits=CV, alpha=alpha, seed=seed
            )
        else:
            # No categorical columns: contribute empty frames so the concat below still works.
            te_fit = pd.DataFrame(index=X_fit.index)
            te_hold = pd.DataFrame(index=X_hold.index)

        design_fit = pd.concat([num_fit, bin_fit, te_fit], axis=1)
        design_hold = pd.concat([num_hold, bin_hold, te_hold], axis=1)
        clf.fit(design_fit, y_fit)
        oof[hold_pos] = clf.predict_proba(design_hold)[:, 1]
    return roc_auc_score(y, oof), average_precision_score(y, oof)

def holdout_gbm_check(X_tr, y_tr, X_te, y_te, seed=RND):
    """Fit a LightGBM classifier on the training split and score it on the holdout.

    LightGBM is imported lazily; if the import fails for any reason a notice is
    printed and ``None`` is returned instead of raising.

    Returns:
        ``(roc_auc, average_precision)`` as plain floats, or ``None`` when
        LightGBM could not be imported.
    """
    try:
        from lightgbm import LGBMClassifier
    except Exception:
        print("[GBM] LightGBM not available – skip.")
        return None

    params = {
        "n_estimators": 300,
        "learning_rate": 0.1,
        "num_leaves": 31,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "reg_lambda": 0.0,
        "random_state": seed,
        "n_jobs": -1,
    }
    booster = LGBMClassifier(**params)
    booster.fit(X_tr, y_tr)
    scores = booster.predict_proba(X_te)[:, 1]
    auc = float(roc_auc_score(y_te, scores))
    pr_auc = float(average_precision_score(y_te, scores))
    return auc, pr_auc


def _cv_scores_for_model(model_clf, model_name, X, y):
    """Return ``(cv_auc, cv_pr_auc)`` for one model on one feature set.

    Dispatch order:
      1. If a ``run_model_cv`` callable exists in the module namespace and the
         model is (a subclass of) ``RandomForestClassifier`` or ``SVC``, use it.
         Neither ``run_model_cv`` nor those classes are defined or imported in
         this file, so they are resolved by name at call time instead of being
         referenced directly (which raised ``NameError``).
      2. Otherwise use ``cv_scores_te`` when ``TE_CAT`` is set, else ``cv_scores_ohe``.
    """
    dedicated_cv = globals().get("run_model_cv")
    mro_names = {klass.__name__ for klass in type(model_clf).__mro__}
    if dedicated_cv is not None and mro_names & {"RandomForestClassifier", "SVC"}:
        return dedicated_cv(X, y, model_clf, model_name, CV=CV)
    if TE_CAT:
        return cv_scores_te(X, y, C=C_LOGREG, CV=CV, clf=model_clf)
    # `clf` is a required argument of cv_scores_ohe; it was previously omitted here.
    return cv_scores_ohe(X, y, clf=model_clf, C=C_LOGREG, CV=CV)


def main():
    """Run the feature gate: CV-score every (model, feature config) pair and report.

    Workflow:
      1. Load the data, optionally down-sample to ``N_SAMPLE`` rows, and make a
         stratified 80/20 train/holdout split (indices exported as JSON).
      2. Cross-validate every model from ``get_models`` on every candidate
         feature configuration, using the training split only.
      3. Refit the best (model, config) by CV AUC on the full training split and
         score it on the holdout, next to an all-features LogisticRegression baseline.
      4. Optionally run a LightGBM holdout check (``GBM_CHECK``).
      5. Write scores, selected features, PR-curve plot and metadata to
         ``ROOT/reports_Hany`` and print a summary.
    """
    reports = ROOT / "reports_Hany"
    reports.mkdir(parents=True, exist_ok=True)

    models_to_test = get_models(C_LOGREG, RND)

    # Load data
    df_all = load_and_save_data().replace(-1, np.nan)
    n_rows_total = len(df_all)
    df = df_all
    if N_SAMPLE and N_SAMPLE < len(df_all):
        df = df_all.sample(N_SAMPLE, random_state=RND).sort_index()
    y = df["target"].astype(int)

    # Consistent holdout split (export indices)
    X_tr_all, X_te_all, y_tr, y_te = train_test_split(
        df.drop(columns=["target"]), y, test_size=0.2, stratify=y, random_state=RND
    )
    df_tr = pd.concat([X_tr_all, y_tr], axis=1)
    df_te = pd.concat([X_te_all, y_te], axis=1)

    split_indices = {"train": df_tr.index.tolist(), "test": df_te.index.tolist()}
    (reports / "split_indices.json").write_text(json.dumps(split_indices, indent=2))

    # Candidate configs
    configs = [
        {"name": "all_features", "drop_calc": False, "extra_drop": [], "add_extras": False},
        {"name": "drop_calc+opt+extras", "drop_calc": True, "extra_drop": ["ps_ind_14", "ps_car_10_cat"], "add_extras": True},
        {"name": "drop_calc_only", "drop_calc": True, "extra_drop": [], "add_extras": False},
        {"name": "drop_calc+extras", "drop_calc": True, "extra_drop": [], "add_extras": True},
    ]

    rows = []

    for model_name, model_clf in models_to_test.items():
        print(f"Starte Kreuzvalidierung für Modell: {model_name}...")
        for cfg in configs:
            print(f"  Konfiguration: {cfg['name']}...")
            X_tr_cfg = make_feature_set(
                df_tr, drop_calc=cfg["drop_calc"], extra_drop=cfg["extra_drop"], add_extras=cfg["add_extras"]
            )
            auc_cv, pr_cv = _cv_scores_for_model(model_clf, model_name, X_tr_cfg, y_tr.loc[X_tr_cfg.index])

            rows.append({
                "model_name": model_name, "config_name": cfg["name"], "n_features": int(X_tr_cfg.shape[1]),
                "cv_auc": float(auc_cv), "cv_pr_auc": float(pr_cv),
                "drop_calc": cfg["drop_calc"], "extra_drop": cfg["extra_drop"],
                "add_extras": cfg["add_extras"], "te_cat": TE_CAT
            })

    # Best row first: sort by CV AUC, ties broken by CV PR-AUC.
    res = pd.DataFrame(rows).sort_values(["cv_auc", "cv_pr_auc"], ascending=False)
    res_path = reports / "feature_gate_scores.csv"
    res.to_csv(res_path, index=False)

    best_cv = res.iloc[0].to_dict()
    X_tr_best = make_feature_set(df_tr, drop_calc=best_cv["drop_calc"], extra_drop=best_cv["extra_drop"], add_extras=best_cv["add_extras"])
    X_te_best = make_feature_set(df_te, drop_calc=best_cv["drop_calc"], extra_drop=best_cv["extra_drop"], add_extras=best_cv["add_extras"])

    # Let pandas write the file itself: routing the CSV string through
    # write_text() would translate line endings a second time on Windows.
    features_path = reports / "features_selected.csv"
    pd.Series(pd.Index(X_tr_best.columns), name="raw_feature").to_csv(features_path, index=False)

    best_model_name = best_cv["model_name"]
    best_model_clf = models_to_test[best_model_name]

    # Refit the CV winner on the full training split and score the holdout.
    if TE_CAT:
        Xtr_cat, Xva_cat, Xtr_bin, Xva_bin, Xtr_num, Xva_num = _prep_te_blocks(X_tr_best, X_te_best)
        tr_te, va_te = _kfold_target_encode(Xtr_cat, y_tr.loc[X_tr_best.index], Xva_cat, n_splits=CV, alpha=TE_ALPHA, seed=RND)
        Xtr_fin = pd.concat([Xtr_num, Xtr_bin, tr_te], axis=1)
        Xva_fin = pd.concat([Xva_num, Xva_bin, va_te], axis=1)
        if hasattr(best_model_clf, 'C'):
            best_model_clf.C = C_LOGREG
        best_model_clf.fit(Xtr_fin, y_tr.loc[X_tr_best.index])
        proba_best = best_model_clf.predict_proba(Xva_fin)[:, 1]
    else:
        cat_b, bin_b, num_b = split_cols(X_tr_best.columns)
        pre_b = build_pre(cat_b, bin_b, num_b)
        pipe_b = Pipeline([("pre", pre_b), ("clf", best_model_clf)])
        m_b = pipe_b.fit(X_tr_best, y_tr.loc[X_tr_best.index])
        proba_best = m_b.predict_proba(X_te_best)[:, 1]

    y_true_best = y_te.loc[X_te_best.index]

    # Baseline: all raw features, one-hot pipeline, LogisticRegression.
    # proba_best is already computed above, so refitting a shared estimator here is safe.
    X_tr_allF = make_feature_set(df_tr, drop_calc=False, extra_drop=[], add_extras=False)
    X_te_allF = make_feature_set(df_te, drop_calc=False, extra_drop=[], add_extras=False)
    catA, binA, numA = split_cols(X_tr_allF.columns)
    pipe_all = Pipeline([("pre", build_pre(catA, binA, numA)),
                         ("clf", models_to_test["LogisticRegression"])])
    m_all = pipe_all.fit(X_tr_allF, y_tr.loc[X_tr_allF.index])
    proba_all = m_all.predict_proba(X_te_allF)[:, 1]
    y_true_all = y_te.loc[X_te_allF.index]

    hold_auc_best = roc_auc_score(y_true_best, proba_best)
    hold_pr_best = average_precision_score(y_true_best, proba_best)
    hold_auc_all = roc_auc_score(y_true_all, proba_all)
    hold_pr_all = average_precision_score(y_true_all, proba_all)

    prec_b, rec_b, _ = precision_recall_curve(y_true_best, proba_best)
    prec_a, rec_a, _ = precision_recall_curve(y_true_all, proba_all)

    plt.figure(figsize=(7, 5))
    plt.plot(rec_b, prec_b, label=f"Best ({best_model_name}, AP={hold_pr_best:.3f}, AUC={hold_auc_best:.3f})")
    plt.plot(rec_a, prec_a, label=f"All-features (LR, AP={hold_pr_all:.3f}, AUC={hold_auc_all:.3f})")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Holdout Precision-Recall Curve")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(reports / "holdout_pr_curve.png", dpi=150)
    plt.close()

    # Optional sanity check: same two feature sets through LightGBM on the one-hot matrices.
    gbm_out = None
    if GBM_CHECK:
        catB, binB, numB = split_cols(X_tr_best.columns)
        preB = build_pre(catB, binB, numB)
        XtrB = preB.fit_transform(X_tr_best)
        XvaB = preB.transform(X_te_best)
        gbm_best = holdout_gbm_check(XtrB, y_tr.loc[X_tr_best.index], XvaB, y_true_best)
        catC, binC, numC = split_cols(X_tr_allF.columns)
        preC = build_pre(catC, binC, numC)
        XtrC = preC.fit_transform(X_tr_allF)
        XvaC = preC.transform(X_te_allF)
        gbm_all = holdout_gbm_check(XtrC, y_tr.loc[X_tr_allF.index], XvaC, y_true_all)
        gbm_out = {"best": gbm_best, "all": gbm_all}

    meta = {
        "random_state": RND, "cv_splits": CV, "C": C_LOGREG,
        "n_rows_total": int(n_rows_total), "sample_n": int(len(df)),
        "te_cat": TE_CAT, "te_alpha": TE_ALPHA, "gbm_check": bool(GBM_CHECK),
        "scores_path": str(res_path),
        "features_path": str(features_path),
        "split_indices_path": str(reports / "split_indices.json"),
        "pr_curve_path": str(reports / "holdout_pr_curve.png"),
        "best_by_cv": best_cv,
        "holdout_scores": {
            "best_auc": float(hold_auc_best), "best_pr_auc": float(hold_pr_best),
            "all_auc": float(hold_auc_all), "all_pr_auc": float(hold_pr_all)
        },
        "gbm_holdout": gbm_out
    }
    (reports / "feature_gate_meta.json").write_text(json.dumps(meta, indent=2))

    print("\nFEATURE-GATE done.")
    print(f"Train n={len(df_tr):,}, Holdout n={len(df_te):,}, CV={CV}, C={C_LOGREG}, TE_CAT={int(TE_CAT)}")
    print("Scores (CV):\n" + res.head(10).to_string(index=False))
    print(f"\nHoldout (Best by CV): AUC={hold_auc_best:.4f}  PR-AUC={hold_pr_best:.4f}")
    print(f"Holdout (All-features): AUC={hold_auc_all:.4f}  PR-AUC={hold_pr_all:.4f}")
    print("\nArtifacts:")
    print("→ features_selected.csv")
    print("→ split_indices.json")
    print("→ holdout_pr_curve.png")
    print("(+ feature_gate_scores.csv, feature_gate_meta.json)")

# Entry point: run the feature gate only when this module is executed directly
# (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Lade Datensatz aus dem Cache.
Starte Kreuzvalidierung für Modell: LogisticRegression...
  Konfiguration: all_features...
    - Fold 1/3 wird trainiert (33%)
