In [None]:
#!/usr/bin/env python3
# Fast Feature-Gate zwischen EDA und ML: vergleicht Feature-Sets per CrossValidation (CV), bestätigt auf Holdout,
# wählt das beste Set, speichert Auswahl + Scores und zeigt Kurzreport in der Konsole.

import os, sys, json, warnings
warnings.filterwarnings("ignore")
from pathlib import Path
import numpy as np
import pandas as pd

# --- Resolve the repository root for both script and notebook execution
try:
    # Script run: ROOT is the parent of the directory that contains this file.
    ROOT = Path(__file__).resolve().parents[1]
except NameError:
    # Notebook run: __file__ is undefined, so derive ROOT from the working
    # directory (stepping up one level when started inside "notebooks").
    CWD = Path.cwd()
    ROOT = CWD if CWD.name != "notebooks" else CWD.parent
# Make the project-local `src` package importable.
sys.path.insert(0, str(ROOT))

from src.data_loader import load_and_save_data

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score

# Single seed shared by subsampling, the train/holdout split, the CV folds and the classifier.
RND = 42

# --- Helpers (kurz & pragmatisch)
def ohe_fallback():
    """Build a OneHotEncoder using whichever constructor keywords are accepted.

    Attempts, in order:
      1. infrequent-category grouping (``min_frequency=0.01``) with ``sparse_output=True``;
      2. ``handle_unknown="ignore"`` with the ``sparse=True`` keyword;
      3. ``handle_unknown="ignore"`` with no sparsity keyword at all.

    Returns:
        An unfitted ``OneHotEncoder`` instance.
    """
    try:
        return OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=0.01, sparse_output=True)
    except TypeError:
        pass  # constructor rejected one of the keywords; try the next spelling
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse=True)
    except TypeError:
        # The `sparse` keyword itself was rejected, so retrying it with another
        # value would fail the same way; let the encoder use its own default.
        return OneHotEncoder(handle_unknown="ignore")

def split_cols(cols):
    """Partition column names into (categorical, binary, numeric) lists.

    Names ending in ``_cat`` are categorical, names ending in ``_bin`` are
    binary, and everything else except ``"target"`` is numeric. Input order
    is preserved within each list.
    """
    categorical, binary, numeric = [], [], []
    for name in cols:
        if name.endswith("_cat"):
            categorical.append(name)
        elif name.endswith("_bin"):
            binary.append(name)
        elif name != "target":
            numeric.append(name)
    return categorical, binary, numeric

def fe_simple(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with simple row-wise aggregate features appended.

    Adds ``missing_count`` (number of NaN cells per row) and, when at least
    one ``*_bin`` column exists, ``sum_all_bin`` (row sum over those columns).
    The input frame is not modified.
    """
    enriched = X.copy()
    enriched["missing_count"] = enriched.isna().sum(axis=1)
    binary_cols = [name for name in enriched.columns if name.endswith("_bin")]
    if not binary_cols:
        return enriched
    enriched["sum_all_bin"] = enriched[binary_cols].sum(axis=1)
    return enriched

def build_pre(cat, bin_, num):
    """Assemble the preprocessing ColumnTransformer for the three column groups.

    Args:
        cat: categorical columns -> most-frequent imputation, then one-hot encoding.
        bin_: binary columns -> most-frequent imputation only.
        num: numeric columns -> median imputation, then standard scaling.

    Returns:
        An unfitted ``ColumnTransformer``; columns not listed are dropped.
    """
    transformers = [
        (
            "cat",
            Pipeline([
                ("imp", SimpleImputer(strategy="most_frequent")),
                ("ohe", ohe_fallback()),
            ]),
            cat,
        ),
        (
            "bin",
            Pipeline([("imp", SimpleImputer(strategy="most_frequent"))]),
            bin_,
        ),
        (
            "num",
            Pipeline([
                ("imp", SimpleImputer(strategy="median")),
                ("sc", StandardScaler()),
            ]),
            num,
        ),
    ]
    return ColumnTransformer(transformers, remainder="drop")

def make_feature_set(df, drop_calc=True, extra_drop=None, add_extras=True, drop_groups=None):
    """Derive one candidate feature matrix from the raw frame.

    Args:
        df: input frame; a ``target`` column, if present, is removed.
        drop_calc: drop every column whose name starts with ``ps_calc_``.
        extra_drop: optional iterable of further column names to drop
            (names that are absent are ignored).
        add_extras: append the engineered columns produced by ``fe_simple``.
        drop_groups: optional mapping with truthy flags under the keys
            ``"cat"``, ``"bin"``, ``"num"`` and/or ``"extras"`` selecting whole
            column groups to remove.

    Returns:
        A new DataFrame with ``-1`` replaced by NaN; ``df`` is left untouched.
    """
    # drop() and replace() each return a new frame, so no explicit copy is needed.
    X = df.drop(columns=["target"], errors="ignore").replace(-1, np.nan)
    if drop_calc:
        X = X.drop(columns=[c for c in X.columns if c.startswith("ps_calc_")], errors="ignore")
    if extra_drop:
        X = X.drop(columns=list(extra_drop), errors="ignore")

    extras_cols = []
    if add_extras:
        X = fe_simple(X)
        # sum_all_bin only exists when the frame has *_bin columns.
        extras_cols = [c for c in ("missing_count", "sum_all_bin") if c in X.columns]

    if drop_groups:
        cat, bin_, num = split_cols(X.columns)
        # split_cols files the engineered extras under "num"; keep them out of
        # that group so dropping "num" does not silently remove them and the
        # "extras" switch stays independent.
        num = [c for c in num if c not in extras_cols]
        groups = {"cat": cat, "bin": bin_, "num": num, "extras": extras_cols}
        doomed = [c for key, members in groups.items() if drop_groups.get(key) for c in members]
        X = X.drop(columns=doomed, errors="ignore")
    return X

def cv_scores(X, y, C=1.0, CV=3, seed=RND):
    """Score a feature matrix with out-of-fold logistic-regression predictions.

    Args:
        X: feature frame; column groups are derived from its column names.
        y: target Series positionally aligned with ``X``.
        C: ``C`` value handed to ``LogisticRegression``.
        CV: number of stratified, shuffled folds.
        seed: random state for both the fold split and the classifier.

    Returns:
        Tuple ``(roc_auc, average_precision)`` computed over the out-of-fold
        positive-class probabilities of all rows.
    """
    cat_cols, bin_cols, num_cols = split_cols(X.columns)
    model = Pipeline([
        ("pre", build_pre(cat_cols, bin_cols, num_cols)),
        ("clf", LogisticRegression(penalty="l2", solver="saga", C=C, class_weight="balanced",
                                   max_iter=4000, random_state=seed)),
    ])
    folds = StratifiedKFold(n_splits=CV, shuffle=True, random_state=seed)

    # Each row is predicted exactly once, by the model that did not train on it.
    oof = np.zeros(len(y), dtype=float)
    for train_idx, test_idx in folds.split(X, y):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        oof[test_idx] = model.predict_proba(X.iloc[test_idx])[:, 1]

    return roc_auc_score(y, oof), average_precision_score(y, oof)

# --- Main

def _holdout_scores(X_train, y_train, X_hold, y_hold, C):
    """Fit the gate pipeline on the training split and score it on the holdout.

    Returns ``(roc_auc, average_precision)`` of the positive-class probabilities.
    """
    cat, bin_, num = split_cols(X_train.columns)
    clf = LogisticRegression(penalty="l2", solver="saga", C=C, class_weight="balanced",
                             max_iter=4000, random_state=RND)
    pipe = Pipeline([("pre", build_pre(cat, bin_, num)), ("clf", clf)])
    pipe.fit(X_train, y_train.loc[X_train.index])
    proba = pipe.predict_proba(X_hold)[:, 1]
    y_true = y_hold.loc[X_hold.index]
    return roc_auc_score(y_true, proba), average_precision_score(y_true, proba)


def main():
    """Run the feature gate: compare candidate sets by CV, confirm on a holdout, persist the choice.

    Environment variables read:
        CV: number of CV folds (default 3).
        C: ``C`` value passed to LogisticRegression (default 1.0).
        TRAIN_SAMPLE_N: size of the random subsample (default 250000); a value
            <= 0 or >= the dataset size disables subsampling.

    Writes ``feature_gate_scores.csv``, ``features_selected.csv`` and
    ``feature_gate_meta.json`` into ``ROOT/reports`` and prints a short report.
    """
    CV = int(os.getenv("CV", "3"))                  # fast & stable
    C = float(os.getenv("C", "1.0"))                # LogisticRegression C
    N = int(os.getenv("TRAIN_SAMPLE_N", "250000"))  # default: fast subsample

    reports = ROOT / "reports"
    reports.mkdir(parents=True, exist_ok=True)

    df = load_and_save_data().replace(-1, np.nan)
    n_rows_total = len(df)  # remembered before subsampling so the meta file can report both sizes
    if 0 < N < len(df):
        df = df.sample(N, random_state=RND).sort_index()
    y = df["target"].astype(int)

    # Holdout used to confirm the CV winner.
    X_tr_all, X_te_all, y_tr, y_te = train_test_split(
        df.drop(columns=["target"]), y, test_size=0.2, stratify=y, random_state=RND
    )
    df_tr = pd.concat([X_tr_all, y_tr], axis=1)
    df_te = pd.concat([X_te_all, y_te], axis=1)

    # Candidates (few, fast).
    configs = [
        {"name":"all_features",                 "drop_calc":False, "extra_drop":[],                         "add_extras":False},
        {"name":"drop_calc+opt+extras",        "drop_calc":True,  "extra_drop":["ps_ind_14","ps_car_10_cat"], "add_extras":True},
        {"name":"drop_calc_only",              "drop_calc":True,  "extra_drop":[],                         "add_extras":False},
        {"name":"drop_calc+extras",            "drop_calc":True,  "extra_drop":[],                         "add_extras":True},
    ]

    rows = []
    for cfg in configs:
        X_tr = make_feature_set(df_tr, drop_calc=cfg["drop_calc"], extra_drop=cfg["extra_drop"], add_extras=cfg["add_extras"])
        auc_cv, pr_cv = cv_scores(X_tr, y_tr.loc[X_tr.index], C=C, CV=CV)
        rows.append({"name":cfg["name"], "n_features":int(X_tr.shape[1]), "cv_auc":float(auc_cv), "cv_pr_auc":float(pr_cv),
                     "drop_calc":cfg["drop_calc"], "extra_drop":cfg["extra_drop"], "add_extras":cfg["add_extras"]})

    res = pd.DataFrame(rows).sort_values(["cv_auc","cv_pr_auc"], ascending=False)
    res_path = reports/"feature_gate_scores.csv"
    res.to_csv(res_path, index=False)

    # Best set by CV. Values coming back from the DataFrame row may be NumPy
    # scalars, which json.dumps cannot serialise, so cast them to builtins.
    best = res.iloc[0].to_dict()
    best_cfg = {"drop_calc": bool(best["drop_calc"]), "extra_drop": list(best["extra_drop"]),
                "add_extras": bool(best["add_extras"])}
    all_cfg = {"drop_calc": False, "extra_drop": [], "add_extras": False}

    # Final fit on train, evaluation on holdout.
    X_tr_best = make_feature_set(df_tr, **best_cfg)
    X_te_best = make_feature_set(df_te, **best_cfg)
    hold_auc, hold_pr = _holdout_scores(X_tr_best, y_tr, X_te_best, y_te, C)

    # Cross-check: all_features on the holdout.
    X_tr_all = make_feature_set(df_tr, **all_cfg)
    if best["name"] == "all_features":
        # Same data, same seed: reuse the scores instead of refitting.
        hold_auc_all, hold_pr_all = hold_auc, hold_pr
    else:
        X_te_all = make_feature_set(df_te, **all_cfg)
        hold_auc_all, hold_pr_all = _holdout_scores(X_tr_all, y_tr, X_te_all, y_te, C)

    # Fall back to all_features if the CV winner is worse on the holdout by more than 0.002.
    if (hold_auc + 1e-12) < (hold_auc_all - 0.002) or (hold_pr + 1e-12) < (hold_pr_all - 0.002):
        # The CV scores of all_features were already computed above; keep them in the record.
        base = res.loc[res["name"] == "all_features"].iloc[0]
        chosen = {"name":"all_features", **all_cfg,
                  "cv_auc":float(base["cv_auc"]), "cv_pr_auc":float(base["cv_pr_auc"]),
                  "holdout_auc":float(hold_auc_all), "holdout_pr_auc":float(hold_pr_all)}
        X_choose = X_tr_all.columns
    else:
        chosen = {"name":best["name"], **best_cfg,
                  "cv_auc":float(best["cv_auc"]), "cv_pr_auc":float(best["cv_pr_auc"]),
                  "holdout_auc":float(hold_auc), "holdout_pr_auc":float(hold_pr)}
        X_choose = X_tr_best.columns

    feat_path = reports/"features_selected.csv"
    pd.Series(pd.Index(X_choose), name="raw_feature").to_csv(feat_path, index=False)

    # n_rows = size of the loaded dataset, sample_n = rows actually used after subsampling.
    meta = {"random_state":RND, "cv_splits":CV, "C":C, "n_rows":int(n_rows_total), "sample_n":int(len(df)),
            "scores_path":str(res_path), "features_path":str(feat_path), "chosen":chosen}
    (reports/"feature_gate_meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")

    # --- Console report (short)
    print("\nFEATURE-GATE fertig.")
    print(f"Train n={len(df_tr):,}, Holdout n={len(df_te):,}, CV={CV}, C={C}")
    print("Scores (CV):\n", res.head(6).to_string(index=False))
    print(f"\nHoldout {best['name']}:   AUC={hold_auc:.4f}  PR-AUC={hold_pr:.4f}")
    print(f"Holdout all_features: AUC={hold_auc_all:.4f}  PR-AUC={hold_pr_all:.4f}")
    print(f"\nGewählt: {chosen['name']}")
    print(f"→ Spalten: {feat_path}")
    print(f"→ Scores:  {res_path}")

# Run the feature gate only when this module is the entry point, not when it is imported.
if __name__ == "__main__":
    main()


Lade Datensatz lokal...
Datensatz erfolgreich geladen

FEATURE-GATE fertig.
Train n=200,000, Holdout n=50,000, CV=3, C=1.0
Scores (CV):
                 name  n_features   cv_auc  cv_pr_auc  drop_calc                 extra_drop  add_extras
drop_calc+opt+extras          37 0.620888   0.059965       True [ps_ind_14, ps_car_10_cat]        True
    drop_calc+extras          39 0.620864   0.059968       True                         []        True
      drop_calc_only          37 0.620800   0.059986       True                         []       False
        all_features          57 0.619775   0.059835      False                         []       False

Holdout drop_calc+opt+extras:   AUC=0.6325  PR-AUC=0.0651
Holdout all_features: AUC=0.6284  PR-AUC=0.0645

Gewählt: drop_calc+opt+extras
→ Spalten: /Users/lucasbeseler/ada_portoSeguro/reports/features_selected.csv
→ Scores:  /Users/lucasbeseler/ada_portoSeguro/reports/feature_gate_scores.csv
