In [14]:
#!/usr/bin/env python3
# Fast Feature-Gate zwischen EDA und ML
# Optional zuschaltbar: Target Encoding (TE) & LightGBM-Check. Standard: AUS (schnell).

import os, sys, json, warnings
warnings.filterwarnings("ignore")
from pathlib import Path
import numpy as np
import pandas as pd

# --- Resolve the repository root for both script and notebook execution.
# Notebooks have no __file__, so fall back to the working directory there.
if "__file__" not in globals():
    CWD = Path.cwd()
    if CWD.name == "notebooks":
        ROOT = CWD.parent
    else:
        ROOT = CWD
else:
    ROOT = Path(__file__).resolve().parents[1]
# Make the project-local `src` package importable.
sys.path.insert(0, str(ROOT))

from src.data_loader import load_and_save_data

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score

# Settings: when running inside IPython/Jupyter, provide defaults for the
# environment switches without overriding values that are already set.
if "get_ipython" in globals():
    os.environ.update({
        key: value
        for key, value in {
            "CV": "3",
            "RND": "42",
            "TE_CAT": "1",
            "GBM_CHECK": "1",
            "TRAIN_SAMPLE_N": "250000",
        }.items()
        if key not in os.environ
    })

# --- Runtime parameter (controllable through the environment)
RND = int(os.environ.get("RND", "42"))

# --- Helpers

def ohe_fallback():
    """Build a sparse OneHotEncoder that works across scikit-learn versions.

    The constructor signatures are tried from newest to oldest; an
    unsupported keyword raises ``TypeError`` and triggers the next attempt:

    1. ``sparse_output=True`` with infrequent-category grouping
       (``min_frequency=0.01``),
    2. the older ``sparse=True`` spelling, still with infrequent-category
       grouping (for releases that know ``min_frequency`` but not
       ``sparse_output``),
    3. ``sparse=True`` with plain ``handle_unknown="ignore"``.

    Returns:
        An unfitted ``OneHotEncoder`` configured for sparse output.

    Raises:
        TypeError: If none of the three signatures is accepted.
    """
    try:
        return OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=0.01, sparse_output=True)
    except TypeError:
        pass
    try:
        # Keep the infrequent-category grouping when only the name of the
        # sparsity flag differs.
        return OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=0.01, sparse=True)
    except TypeError:
        # A further retry with sparse=False would fail for the same reason
        # as sparse=True, so this is the last meaningful fallback.
        return OneHotEncoder(handle_unknown="ignore", sparse=True)

def split_cols(cols):
    """Partition column names into categorical, binary and numeric groups.

    Grouping is purely suffix based: names ending in ``_cat`` are
    categorical, names ending in ``_bin`` are binary, and everything else
    except the literal ``"target"`` is treated as numeric.

    Args:
        cols: Iterable of column names (strings). It is consumed in a single
            pass, so one-shot iterators are handled correctly.

    Returns:
        Tuple ``(cat, bin_, num)`` of lists, each preserving input order.
    """
    cat, bin_, num = [], [], []
    for name in cols:
        if name.endswith("_cat"):
            cat.append(name)
        elif name.endswith("_bin"):
            bin_.append(name)
        elif name != "target":
            num.append(name)
    return cat, bin_, num

def fe_simple(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with simple row-level summary features added.

    Added columns:
        missing_count: Number of NaN cells in the row, counted on the
            original columns.
        sum_all_bin: Row-wise sum over all columns whose name ends in
            ``_bin``; only added when at least one such column exists.

    The input frame is left unmodified.
    """
    out = X.copy()
    out["missing_count"] = out.isna().sum(axis=1)
    binary_cols = [name for name in out.columns if name.endswith("_bin")]
    if not binary_cols:
        return out
    out["sum_all_bin"] = out[binary_cols].sum(axis=1)
    return out

def build_pre(cat, bin_, num):
    """Assemble the preprocessing ColumnTransformer for the three column groups.

    Args:
        cat: Categorical column names; most-frequent imputation, then one-hot
            encoding via ``ohe_fallback()``.
        bin_: Binary column names; most-frequent imputation only.
        num: Numeric column names; median imputation, then standard scaling.

    Returns:
        An unfitted ``ColumnTransformer``; columns not listed are dropped.
    """
    categorical = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", ohe_fallback()),
    ])
    binary = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
    ])
    numeric = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
    ])
    transformers = [
        ("cat", categorical, cat),
        ("bin", binary, bin_),
        ("num", numeric, num),
    ]
    return ColumnTransformer(transformers, remainder="drop")

def make_feature_set(df, drop_calc=True, extra_drop=None, add_extras=True, drop_groups=None):
    """Derive one candidate feature matrix from the raw frame.

    Args:
        df: Raw data; a ``target`` column, if present, is removed.
        drop_calc: Drop every column whose name starts with ``ps_calc_``.
        extra_drop: Optional iterable of further column names to drop;
            names that are not present are ignored.
        add_extras: Append the ``fe_simple`` summary columns
            (``missing_count`` and, when binary columns exist,
            ``sum_all_bin``).
        drop_groups: Optional mapping with truthy flags under the keys
            ``"cat"``, ``"bin"``, ``"num"`` and ``"extras"`` selecting whole
            column groups to remove.

    Returns:
        A new DataFrame with ``-1`` replaced by NaN; ``df`` is not modified.
    """
    # drop() and replace() each return a new frame, so no explicit copy is needed.
    X = df.drop(columns=["target"], errors="ignore").replace(-1, np.nan)
    if drop_calc:
        X = X.drop(columns=[c for c in X.columns if c.startswith("ps_calc_")], errors="ignore")
    if extra_drop:
        X = X.drop(columns=list(extra_drop), errors="ignore")

    extras_cols = []
    if add_extras:
        X = fe_simple(X)
        extras_cols = [c for c in ("missing_count", "sum_all_bin") if c in X.columns]

    if drop_groups:
        # Group only the original columns. "sum_all_bin" ends in "_bin" and
        # "missing_count" has no suffix, so suffix-based grouping would put
        # them into the bin/num groups and drop them together with those
        # groups instead of leaving them under the "extras" switch.
        base_cols = [c for c in X.columns if c not in extras_cols]
        cat, bin_, num = split_cols(base_cols)
        to_drop = []
        if drop_groups.get("cat"):
            to_drop.extend(cat)
        if drop_groups.get("bin"):
            to_drop.extend(bin_)
        if drop_groups.get("num"):
            to_drop.extend(num)
        if drop_groups.get("extras"):
            to_drop.extend(extras_cols)
        X = X.drop(columns=to_drop, errors="ignore")
    return X

def cv_scores_ohe(X, y, C=1.0, CV=3, seed=RND):
    """Out-of-fold CV scores for the one-hot + logistic-regression baseline.

    Args:
        X: Feature frame; column groups are derived with ``split_cols``.
        y: Binary target as a pandas Series, positionally aligned with ``X``.
        C: Inverse regularisation strength of the logistic regression.
        CV: Number of stratified folds.
        seed: Random state for fold shuffling and the solver.

    Returns:
        Tuple ``(roc_auc, average_precision)`` computed on the out-of-fold
        probabilities of all rows.
    """
    cat_cols, bin_cols, num_cols = split_cols(X.columns)
    model = Pipeline([
        ("pre", build_pre(cat_cols, bin_cols, num_cols)),
        ("clf", LogisticRegression(penalty="l2", solver="saga", C=C, class_weight="balanced",
                                   max_iter=4000, random_state=seed)),
    ])
    folds = StratifiedKFold(n_splits=CV, shuffle=True, random_state=seed)
    oof = np.zeros(len(y), dtype=float)
    for fit_idx, eval_idx in folds.split(X, y):
        # The pipeline is refit from scratch on every fold's training part.
        model.fit(X.iloc[fit_idx], y.iloc[fit_idx])
        oof[eval_idx] = model.predict_proba(X.iloc[eval_idx])[:, 1]
    return roc_auc_score(y, oof), average_precision_score(y, oof)

# --- Target Encoding (TE) – optional

def _kfold_target_encode(train_cat: pd.DataFrame, y_tr: pd.Series, valid_cat: pd.DataFrame, n_splits=3, alpha=10, seed=RND):
    """Smoothed target encoding: out-of-fold for train, full-fit for validation.

    For each category the encoded value is
    ``(mean * count + global_mean * alpha) / (count + alpha)``, where
    ``global_mean`` is the mean of ``y_tr``. Categories without a mapping
    fall back to ``global_mean``.

    Args:
        train_cat: Categorical training columns.
        y_tr: Training target, sharing the index of ``train_cat``.
        valid_cat: Validation rows with the same columns as ``train_cat``.
        n_splits: Number of inner stratified folds for the out-of-fold
            training encoding.
        alpha: Smoothing strength (pseudo-count given to the global mean).
        seed: Random state for the inner fold shuffle.

    Returns:
        Tuple ``(tr_enc, va_enc)`` of DataFrames indexed like the inputs,
        with one ``te_<column>`` column per input column.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    global_mean = y_tr.mean()

    def _smoothed_means(categories, target):
        # Per-category target mean, shrunk towards the global mean.
        stats = target.groupby(categories).agg(['mean', 'count'])
        return (stats['mean'] * stats['count'] + global_mean * alpha) / (stats['count'] + alpha)

    # The folds depend only on y_tr and the seed, not on the column being
    # encoded, so they are computed once instead of once per column.
    folds = list(skf.split(train_cat, y_tr)) if len(train_cat.columns) else []

    tr_enc = pd.DataFrame(index=train_cat.index)
    va_enc = pd.DataFrame(index=valid_cat.index)
    for col in train_cat.columns:
        col_values = train_cat[col]
        oof = pd.Series(index=train_cat.index, dtype=float)
        for tr_idx, va_idx in folds:
            mapping = _smoothed_means(col_values.iloc[tr_idx], y_tr.iloc[tr_idx])
            # Positional assignment: the mapped values are in va_idx order.
            oof.iloc[va_idx] = col_values.iloc[va_idx].map(mapping).to_numpy()
        tr_enc[f"te_{col}"] = oof.fillna(global_mean)
        # Validation rows are encoded with statistics from all training rows.
        full_mapping = _smoothed_means(col_values, y_tr)
        va_enc[f"te_{col}"] = valid_cat[col].map(full_mapping).fillna(global_mean)
    return tr_enc, va_enc

def _prep_te_blocks(X_tr, X_va):
    """Impute (and scale) the column groups for the target-encoding path.

    Column groups come from ``split_cols(X_tr.columns)``. Categorical and
    binary columns are imputed with the most frequent value, numeric columns
    with the median followed by standard scaling. Every transformer is
    fitted on ``X_tr`` only and then applied to ``X_va``.

    Returns:
        Tuple ``(Xtr_cat, Xva_cat, Xtr_bin, Xva_bin, Xtr_num, Xva_num)`` of
        DataFrames that keep the original indices; a group without columns
        yields frames with zero columns.
    """
    cat, bin_, num = split_cols(X_tr.columns)

    def _as_frame(values, cols, index):
        return pd.DataFrame(values, columns=cols, index=index)

    def _impute_group(cols, strategy):
        # Empty groups become zero-column frames so later concatenation works.
        if not cols:
            return (_as_frame(np.empty((len(X_tr), 0)), cols, X_tr.index),
                    _as_frame(np.empty((len(X_va), 0)), cols, X_va.index))
        imputer = SimpleImputer(strategy=strategy)
        fitted = _as_frame(imputer.fit_transform(X_tr[cols]), cols, X_tr.index)
        applied = _as_frame(imputer.transform(X_va[cols]), cols, X_va.index)
        return fitted, applied

    cat_tr, cat_va = _impute_group(cat, "most_frequent")
    bin_tr, bin_va = _impute_group(bin_, "most_frequent")
    num_tr, num_va = _impute_group(num, "median")

    if num_tr.shape[1]:
        scaler = StandardScaler(with_mean=True, with_std=True)
        num_tr = _as_frame(scaler.fit_transform(num_tr), num_tr.columns, num_tr.index)
        num_va = _as_frame(scaler.transform(num_va), num_va.columns, num_va.index)
    return (cat_tr, cat_va, bin_tr, bin_va, num_tr, num_va)

def cv_scores_te(X, y, C=1.0, CV=3, seed=RND, alpha=10):
    """Out-of-fold CV scores for the target-encoding + logistic-regression path.

    Inside every outer fold the categorical columns are target encoded using
    the fold's training part only (``_kfold_target_encode``), then combined
    with the imputed binary and scaled numeric blocks.

    Args:
        X: Feature frame.
        y: Binary target as a pandas Series, positionally aligned with ``X``.
        C: Inverse regularisation strength of the logistic regression.
        CV: Number of outer folds; also used for the inner encoding folds.
        seed: Random state for fold shuffling and the classifier.
        alpha: Smoothing strength passed to the target encoder.

    Returns:
        Tuple ``(roc_auc, average_precision)`` on the out-of-fold predictions.
    """
    outer = StratifiedKFold(n_splits=CV, shuffle=True, random_state=seed)
    oof = np.zeros(len(y), dtype=float)
    for fit_idx, eval_idx in outer.split(X, y):
        X_fit = X.iloc[fit_idx]
        X_eval = X.iloc[eval_idx]
        y_fit = y.iloc[fit_idx]
        cat_fit, cat_eval, bin_fit, bin_eval, num_fit, num_eval = _prep_te_blocks(X_fit, X_eval)
        if cat_fit.shape[1] == 0:
            # No categorical columns: contribute empty encoded blocks.
            te_fit = pd.DataFrame(index=X_fit.index)
            te_eval = pd.DataFrame(index=X_eval.index)
        else:
            te_fit, te_eval = _kfold_target_encode(cat_fit, y_fit, cat_eval, n_splits=CV, alpha=alpha, seed=seed)
        design_fit = pd.concat([num_fit, bin_fit, te_fit], axis=1)
        design_eval = pd.concat([num_eval, bin_eval, te_eval], axis=1)
        model = LogisticRegression(penalty="l2", solver="lbfgs", C=C, class_weight="balanced",
                                   max_iter=4000, random_state=seed)
        model.fit(design_fit, y_fit)
        oof[eval_idx] = model.predict_proba(design_eval)[:, 1]
    return roc_auc_score(y, oof), average_precision_score(y, oof)

# --- Optional: schneller GBM-Check (LightGBM)

def holdout_gbm_check(X_tr, y_tr, X_te, y_te, seed=RND):
    """Train a quick LightGBM model and score it on the holdout split.

    Args:
        X_tr: Preprocessed training features.
        y_tr: Training target.
        X_te: Preprocessed holdout features.
        y_te: Holdout target.
        seed: Random state for the classifier.

    Returns:
        Tuple ``(roc_auc, average_precision)`` as floats, or ``None`` when
        LightGBM cannot be imported.
    """
    try:
        from lightgbm import LGBMClassifier
    except Exception:
        # Deliberately broad: the check is optional and must never abort the
        # gate, whatever the reason the import fails.
        print("[GBM] LightGBM nicht verfügbar – überspringe.")
        return None
    # subsample_freq must be > 0 for LightGBM to perform row subsampling;
    # with the default of 0 the subsample=0.8 setting is silently ignored.
    clf = LGBMClassifier(n_estimators=300, learning_rate=0.1, num_leaves=31,
                         subsample=0.8, subsample_freq=1, colsample_bytree=0.8, reg_lambda=0.0,
                         random_state=seed, n_jobs=-1)
    clf.fit(X_tr, y_tr)
    p = clf.predict_proba(X_te)[:, 1]
    return float(roc_auc_score(y_te, p)), float(average_precision_score(y_te, p))

# --- Main

def _holdout_logreg_proba(X_fit, y_fit, X_hold, C, CV, te_cat):
    """Fit the gate's logistic regression on ``X_fit`` and score ``X_hold``.

    With ``te_cat`` the categorical columns are target encoded
    (``_prep_te_blocks`` + ``_kfold_target_encode``) and an lbfgs model is
    used; otherwise the one-hot preprocessing pipeline with a saga model is
    used. Returns the positive-class probabilities for ``X_hold``.
    """
    if te_cat:
        fit_cat, hold_cat, fit_bin, hold_bin, fit_num, hold_num = _prep_te_blocks(X_fit, X_hold)
        fit_te, hold_te = _kfold_target_encode(fit_cat, y_fit, hold_cat, n_splits=CV, alpha=10)
        fit_mat = pd.concat([fit_num, fit_bin, fit_te], axis=1)
        hold_mat = pd.concat([hold_num, hold_bin, hold_te], axis=1)
        clf = LogisticRegression(penalty="l2", solver="lbfgs", C=C, class_weight="balanced",
                                 max_iter=4000, random_state=RND)
        clf.fit(fit_mat, y_fit)
        return clf.predict_proba(hold_mat)[:, 1]

    cat, bin_, num = split_cols(X_fit.columns)
    pipe = Pipeline([
        ("pre", build_pre(cat, bin_, num)),
        ("clf", LogisticRegression(penalty="l2", solver="saga", C=C, class_weight="balanced",
                                   max_iter=4000, random_state=RND)),
    ])
    pipe.fit(X_fit, y_fit)
    return pipe.predict_proba(X_hold)[:, 1]


def main():
    """Run the feature gate and write its reports.

    Steps:
        1. Load the data, optionally subsample it, and split off a 20 %
           stratified holdout.
        2. Score every candidate feature configuration with cross-validation
           on the training part and save the table to
           ``reports/feature_gate_scores.csv``.
        3. Evaluate the best configuration and the ``all_features`` baseline
           on the holdout; fall back to ``all_features`` when the best
           configuration is worse by more than 0.002 in AUC or PR-AUC.
        4. Optionally run a LightGBM check on both feature sets.
        5. Write ``reports/features_selected.csv`` and
           ``reports/feature_gate_meta.json`` and print a console summary.

    Environment variables read: ``CV``, ``C``, ``TRAIN_SAMPLE_N``
    (0 = use all rows), ``TE_CAT`` and ``GBM_CHECK`` (``1`` enables).
    """
    CV = int(os.getenv("CV", "3"))
    C = float(os.getenv("C", "1.0"))
    N = int(os.getenv("TRAIN_SAMPLE_N", "250000"))  # 0 = use all rows
    TE_CAT = int(os.getenv("TE_CAT", "0")) == 1
    GBM_CHECK = int(os.getenv("GBM_CHECK", "0")) == 1

    reports = ROOT / "reports"
    reports.mkdir(parents=True, exist_ok=True)

    df = load_and_save_data().replace(-1, np.nan)
    # Remember the full size before sampling so the meta file can report both.
    n_rows_total = len(df)
    if N and N < len(df):
        df = df.sample(N, random_state=RND).sort_index()
    y = df["target"].astype(int)

    # Holdout
    X_tr_all, X_te_all, y_tr, y_te = train_test_split(
        df.drop(columns=["target"]), y, test_size=0.2, stratify=y, random_state=RND
    )
    df_tr = pd.concat([X_tr_all, y_tr], axis=1)
    df_te = pd.concat([X_te_all, y_te], axis=1)

    # Candidate feature configurations
    configs = [
        {"name":"all_features",          "drop_calc":False, "extra_drop":[],                          "add_extras":False},
        {"name":"drop_calc+opt+extras", "drop_calc":True,  "extra_drop":["ps_ind_14","ps_car_10_cat"], "add_extras":True},
        {"name":"drop_calc_only",       "drop_calc":True,  "extra_drop":[],                          "add_extras":False},
        {"name":"drop_calc+extras",     "drop_calc":True,  "extra_drop":[],                          "add_extras":True},
    ]

    cv_scorer = cv_scores_te if TE_CAT else cv_scores_ohe
    rows = []
    for cfg in configs:
        X_tr = make_feature_set(df_tr, drop_calc=cfg["drop_calc"], extra_drop=cfg["extra_drop"], add_extras=cfg["add_extras"])
        auc_cv, pr_cv = cv_scorer(X_tr, y_tr.loc[X_tr.index], C=C, CV=CV)
        rows.append({"name":cfg["name"], "n_features":int(X_tr.shape[1]), "cv_auc":float(auc_cv), "cv_pr_auc":float(pr_cv),
                     "drop_calc":cfg["drop_calc"], "extra_drop":cfg["extra_drop"], "add_extras":cfg["add_extras"],
                     "te_cat": TE_CAT})

    res = pd.DataFrame(rows).sort_values(["cv_auc","cv_pr_auc"], ascending=False)
    res_path = reports / "feature_gate_scores.csv"
    res.to_csv(res_path, index=False)

    # Best configuration -> holdout
    best = res.iloc[0].to_dict()
    X_tr_best = make_feature_set(df_tr, drop_calc=best["drop_calc"], extra_drop=best["extra_drop"], add_extras=best["add_extras"])
    X_te_best = make_feature_set(df_te, drop_calc=best["drop_calc"], extra_drop=best["extra_drop"], add_extras=best["add_extras"])
    y_tr_best = y_tr.loc[X_tr_best.index]
    y_te_best = y_te.loc[X_te_best.index]

    proba_hold = _holdout_logreg_proba(X_tr_best, y_tr_best, X_te_best, C, CV, TE_CAT)
    hold_auc = roc_auc_score(y_te_best, proba_hold)
    hold_pr = average_precision_score(y_te_best, proba_hold)

    # Cross-check: all_features baseline on the same holdout
    X_tr_all = make_feature_set(df_tr, drop_calc=False, extra_drop=[], add_extras=False)
    X_te_all = make_feature_set(df_te, drop_calc=False, extra_drop=[], add_extras=False)
    y_tr_base = y_tr.loc[X_tr_all.index]
    y_te_base = y_te.loc[X_te_all.index]

    proba_hold_all = _holdout_logreg_proba(X_tr_all, y_tr_base, X_te_all, C, CV, TE_CAT)
    hold_auc_all = roc_auc_score(y_te_base, proba_hold_all)
    hold_pr_all = average_precision_score(y_te_base, proba_hold_all)

    # Fallback rule: keep all features if the reduced set loses > 0.002 on either metric
    if (hold_auc + 1e-12) < (hold_auc_all - 0.002) or (hold_pr + 1e-12) < (hold_pr_all - 0.002):
        chosen = {"name":"all_features", "drop_calc":False, "extra_drop":[], "add_extras":False, "cv_auc":None, "cv_pr_auc":None,
                  "holdout_auc":float(hold_auc_all), "holdout_pr_auc":float(hold_pr_all), "te_cat": TE_CAT}
        X_choose = X_tr_all.columns
    else:
        chosen = {"name":best["name"], "drop_calc":best["drop_calc"], "extra_drop":best["extra_drop"], "add_extras":best["add_extras"],
                  "cv_auc":float(best["cv_auc"]), "cv_pr_auc":float(best["cv_pr_auc"]), "holdout_auc":float(hold_auc), "holdout_pr_auc":float(hold_pr),
                  "te_cat": TE_CAT}
        X_choose = X_tr_best.columns

    # Optional GBM check (always uses the one-hot preprocessing)
    gbm_out = None
    if GBM_CHECK:
        catB, binB, numB = split_cols(X_tr_best.columns)
        preB = build_pre(catB, binB, numB)
        XtrB = preB.fit_transform(X_tr_best)
        XvaB = preB.transform(X_te_best)
        gbm_best = holdout_gbm_check(XtrB, y_tr_best, XvaB, y_te_best)
        catC, binC, numC = split_cols(X_tr_all.columns)
        preC = build_pre(catC, binC, numC)
        XtrC = preC.fit_transform(X_tr_all)
        XvaC = preC.transform(X_te_all)
        gbm_all = holdout_gbm_check(XtrC, y_tr_base, XvaC, y_te_base)
        gbm_out = {"best": gbm_best, "all": gbm_all}

    # Outputs
    feat_path = reports / "features_selected.csv"
    pd.Series(pd.Index(X_choose), name="raw_feature").to_csv(feat_path, index=False)

    # n_rows is the size of the loaded data set, sample_n the rows actually used.
    meta = {"random_state":RND, "cv_splits":CV, "C":C, "n_rows":int(n_rows_total), "sample_n":int(len(df)),
            "te_cat": TE_CAT, "gbm_check": bool(GBM_CHECK),
            "scores_path":str(res_path), "features_path":str(feat_path), "chosen":chosen,
            "gbm_holdout": gbm_out}
    (reports / "feature_gate_meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")

    # Console report
    print("\nFEATURE-GATE fertig.")
    print(f"Train n={len(df_tr):,}, Holdout n={len(df_te):,}, CV={CV}, C={C}, TE_CAT={int(TE_CAT)}, GBM_CHECK={int(GBM_CHECK)}")
    print("Scores (CV):\n" + res.head(6).to_string(index=False))
    print(f"\nHoldout {best['name']}:   AUC={hold_auc:.4f}  PR-AUC={hold_pr:.4f}")
    print(f"Holdout all_features: AUC={hold_auc_all:.4f}  PR-AUC={hold_pr_all:.4f}")
    if gbm_out:
        print("\n[GBM] Holdout AUC/PR (Best):", gbm_out["best"])
        print("[GBM] Holdout AUC/PR (All): ", gbm_out["all"])
    print(f"\nGewählt: {chosen['name']}")
    print(f"→ Spalten: {feat_path}")
    print(f"→ Scores:  {res_path}")

# Run the feature gate when executed as a script or as a notebook cell.
if __name__ == "__main__":
    main()


Lade Datensatz aus dem Cache.

FEATURE-GATE fertig.
Train n=200,000, Holdout n=50,000, CV=3, C=1.0, TE_CAT=0, GBM_CHECK=0
Scores (CV):
                name  n_features   cv_auc  cv_pr_auc  drop_calc                 extra_drop  add_extras  te_cat
drop_calc+opt+extras          37 0.620888   0.059965       True [ps_ind_14, ps_car_10_cat]        True   False
    drop_calc+extras          39 0.620864   0.059968       True                         []        True   False
      drop_calc_only          37 0.620800   0.059986       True                         []       False   False
        all_features          57 0.619775   0.059835      False                         []       False   False

Holdout drop_calc+opt+extras:   AUC=0.6325  PR-AUC=0.0651
Holdout all_features: AUC=0.6284  PR-AUC=0.0645

Gewählt: drop_calc+opt+extras
→ Spalten: /Users/lucasbeseler/ada_portoSeguro/reports/features_selected.csv
→ Scores:  /Users/lucasbeseler/ada_portoSeguro/reports/feature_gate_scores.csv
