In [None]:
import os, sys, time, json, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

# --- Resolve the project ROOT robustly (works as a notebook and as a script)
if "__file__" not in globals():
    # No __file__ (e.g. notebook session): use the working directory, stepping
    # up one level when the session was started inside "notebooks".
    CWD = Path.cwd()
    ROOT = CWD if CWD.name != "notebooks" else CWD.parent
else:
    # Script: the directory one level above this file's folder is the root.
    ROOT = Path(__file__).resolve().parents[1]
# Put ROOT first on the import path so the `src` package resolves from it.
sys.path.insert(0, str(ROOT))

# Optional theme: best effort; any failure while importing or applying it is
# deliberately ignored so the analysis still runs without it.
try:
    from src import theme
    theme.set_project_theme()
except Exception:
    pass

from src.data_loader import load_and_save_data

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFECV, mutual_info_classif
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.inspection import permutation_importance

# Seed passed to row sampling, CV splitters and estimators in this file.
RANDOM_STATE = 42

# ------------ kleine Tools ------------
def pbar(iterable, total=None, desc=""):
    """Yield the items of *iterable* while printing a one-line progress bar.

    Args:
        iterable: Any iterable to pass through unchanged.
        total: Expected number of items. When falsy, ``len(iterable)`` is used
            if available; without a total nothing is printed.
        desc: Label printed in front of the bar.

    Yields:
        The items of *iterable*, in order.
    """
    if not total:
        total = len(iterable) if hasattr(iterable, "__len__") else None
    started = time.time()
    done = 0
    for done, item in enumerate(iterable, start=1):
        yield item
        if not total:
            continue
        # The bar is redrawn after the consumer has finished with the item.
        pct = int(100 * done / total)
        filled = pct // 4  # 25 cells, each worth 4 percent
        bar = "█" * filled + "·" * (25 - filled)
        rate = done / max(1e-9, time.time() - started)
        print(f"\r{desc} [{bar}] {pct:3d}% {done}/{total} {rate:.1f} it/s", end="")
    if total:
        # Final line always shows a full bar and ends with a newline.
        print(f"\r{desc} [{'█'*25}] 100% {done}/{total} done       ")

from contextlib import contextmanager
@contextmanager
def timer(name):
    """Context manager that prints the wall-clock duration of its body.

    The elapsed time is reported even when the body raises (including
    ``KeyboardInterrupt``), so aborted or failed steps still show how long
    they ran; the exception then propagates unchanged.

    Args:
        name: Label printed in front of the measured duration.
    """
    t0 = time.perf_counter()
    try:
        yield
    finally:
        dt = time.perf_counter() - t0
        print(f"[t] {name}: {dt:.1f}s")

def gini_from_auc(auc):
    """Convert a ROC-AUC value into the Gini coefficient (``2 * AUC - 1``)."""
    return 2 * auc - 1
def ensure_dir(p: Path):
    """Create directory *p* with any missing parents; no error if it exists."""
    p.mkdir(parents=True, exist_ok=True)

# ------------ leichtes FE ------------
def kaggle_style_features(df):
    """Return a copy of *df* with two row-wise helper features added.

    * ``missing_count``: number of NaN cells in the row.
    * ``sum_all_bin``: sum over all columns whose name ends with ``_bin``
      (only added when such columns exist).

    The input frame is not modified.
    """
    out = df.assign(missing_count=df.isna().sum(axis=1))
    flag_cols = [name for name in out.columns if name.endswith("_bin")]
    if flag_cols:
        out["sum_all_bin"] = out[flag_cols].sum(axis=1)
    return out

def add_missing_indicators(df):
    """Return a copy of *df* with a 0/1 ``<col>_isna`` flag per column with NaNs.

    The ``target`` column is never flagged, and columns without missing
    values get no indicator. The input frame is not modified.
    """
    out = df.copy()
    # Snapshot the names so the newly added flag columns are not revisited.
    for col in list(out.columns):
        if col == "target":
            continue
        na_mask = out[col].isna()
        if na_mask.any():
            out[f"{col}_isna"] = na_mask.astype(int)
    return out

def drop_near_zero_variance(df, thresh=1e-6):
    """Drop numeric columns whose population variance is not above *thresh*.

    Non-numeric columns are always kept. ``target`` is excluded from the check
    and, when present, placed last in the result.

    Args:
        df: Input frame.
        thresh: Variance (``ddof=0``) a numeric column must exceed to be kept.

    Returns:
        A frame restricted to the surviving columns.
    """
    def _survives(col):
        series = df[col]
        if not pd.api.types.is_numeric_dtype(series):
            return True
        # A NaN variance (e.g. an all-NaN column) compares False and is dropped.
        return series.var(ddof=0) > thresh

    kept = [col for col in df.columns if col != "target" and _survives(col)]
    if "target" in df.columns:
        kept.append("target")
    return df[kept]

def drop_high_corr_numeric(df, thr=0.98):
    """Drop numeric columns that are highly correlated with an earlier column.

    Uses the absolute pairwise correlation of all numeric columns except
    ``target``. Of each pair above *thr*, the column appearing later in the
    frame is removed.

    Args:
        df: Input frame.
        thr: Absolute-correlation threshold above which a column is dropped.

    Returns:
        *df* itself when fewer than two numeric columns exist, otherwise a
        frame without the redundant columns.
    """
    numeric_cols = [
        col for col in df.columns
        if col != "target" and pd.api.types.is_numeric_dtype(df[col])
    ]
    if len(numeric_cols) < 2:
        return df
    abs_corr = df[numeric_cols].corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and
    # a column is only compared against the columns that precede it.
    above_diag = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper = abs_corr.where(above_diag)
    too_similar = (upper > thr).any(axis=0)
    redundant = too_similar[too_similar].index.tolist()
    return df.drop(columns=redundant, errors="ignore")

def split_columns(cols):
    """Split column names into categorical, binary and remaining groups.

    Names ending in ``_cat`` are categorical, names ending in ``_bin`` are
    binary, everything else except ``target`` goes into the third group.

    Returns:
        Tuple ``(cat, bin_, other)`` of lists, each in input order.
    """
    cat, bin_, other = [], [], []
    for col in cols:
        if col.endswith("_cat"):
            cat.append(col)
        elif col.endswith("_bin"):
            bin_.append(col)
        elif col != "target":
            other.append(col)
    return cat, bin_, other

def _ohe_dense_minfreq():
    """Build a dense, unknown-tolerant OneHotEncoder across sklearn versions.

    Keyword sets are tried from most to least capable; a ``TypeError`` from
    the constructor means the installed scikit-learn does not know one of the
    keywords. Unlike a plain ``sparse_output`` -> ``sparse`` fallback, the
    combination ``sparse=False`` + ``min_frequency`` is tried as well, so
    releases that support ``min_frequency`` but still use the old ``sparse``
    keyword keep the infrequent-category grouping.

    Returns:
        A ``OneHotEncoder`` with ``handle_unknown="ignore"`` and dense output.

    Raises:
        TypeError: if none of the keyword combinations is accepted.
    """
    attempts = (
        {"sparse_output": False, "min_frequency": 0.01},
        {"sparse": False, "min_frequency": 0.01},
        {"sparse_output": False},
        {"sparse": False},
    )
    last_err = None
    for kwargs in attempts:
        try:
            return OneHotEncoder(handle_unknown="ignore", **kwargs)
        except TypeError as err:
            last_err = err
    raise last_err

def build_preprocessor(cat, bin_, num):
    """Assemble the ColumnTransformer used in front of every model.

    Args:
        cat: Categorical columns; mode-imputed, then one-hot encoded.
        bin_: Binary columns; mode-imputed only.
        num: Remaining columns; median-imputed, then standardised.

    Returns:
        An unfitted ``ColumnTransformer`` emitting the groups in the order
        cat, bin, num; columns not listed are dropped.
    """
    categorical = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", _ohe_dense_minfreq()),
    ])
    numeric = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
    ])
    binary = Pipeline([("imp", SimpleImputer(strategy="most_frequent"))])
    transformers = [
        ("cat", categorical, cat),
        ("bin", binary, bin_),
        ("num", numeric, num),
    ]
    return ColumnTransformer(transformers, remainder="drop")

def get_feature_names(pre, cat, bin_, num):
    """Return the output column names of a fitted preprocessor.

    Args:
        pre: Fitted preprocessor exposing ``transformers_`` as
            ``(name, transformer, columns)`` triples.
        cat, bin_, num: Unused; kept for call compatibility (the column lists
            are read from ``pre.transformers_``).

    Returns:
        List of names: expanded one-hot names for the ``"cat"`` group and the
        original column names for ``"bin"`` and ``"num"``. Entries with other
        names (e.g. the remainder) are ignored.
    """
    names = []
    for name, trans, cols in pre.transformers_:
        if name not in ("cat", "bin", "num"):
            continue
        # A group with no columns contributes no output columns; its
        # transformer is left unfitted, so asking the encoder for names
        # would raise instead of returning an empty list.
        if len(cols) == 0:
            continue
        if name == "cat":
            names.extend(list(trans.named_steps["ohe"].get_feature_names_out(cols)))
        else:
            names.extend(cols)
    return names

def cv_predict_proba(pipeline, X, y, n_splits=5, desc="CV"):
    """Compute out-of-fold positive-class probabilities with stratified K-fold.

    Note that *pipeline* itself is refitted on every fold (no clone is made),
    so after the call it holds the fit of the last fold.

    Args:
        pipeline: Estimator with ``fit`` and ``predict_proba``.
        X: Feature frame (indexed positionally via ``iloc``).
        y: Target series aligned with *X*.
        n_splits: Number of shuffled, stratified folds.
        desc: Label for the progress bar.

    Returns:
        Array of length ``len(y)`` with the probability of class 1 for each
        row, predicted by the model that did not see that row.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    folds = list(splitter.split(X, y))
    oof = np.zeros(len(y))
    for train_idx, test_idx in pbar(folds, total=n_splits, desc=desc):
        fitted = pipeline.fit(X.iloc[train_idx], y.iloc[train_idx])
        oof[test_idx] = fitted.predict_proba(X.iloc[test_idx])[:, 1]
    return oof

# ------------ einmal C tunen ------------
def tune_C_once(X, y, penalty="l1", sample_n=150_000):
    """Grid-search the logistic-regression ``C`` once and return the best value.

    Nine values, log-spaced between 1e-3 and 10**1.5, are scored by ROC-AUC
    with 3-fold stratified CV on a row sample.

    Args:
        X: Feature frame.
        y: Target series sharing the index of *X*.
        penalty: ``"l1"`` (fitted with saga) or anything else (fitted with lbfgs).
        sample_n: Maximum number of rows to tune on; falsy means all rows.

    Returns:
        The best ``C`` as a Python float.
    """
    if sample_n:
        Xs = X.sample(min(sample_n, len(X)), random_state=RANDOM_STATE)
    else:
        Xs = X
    ys = y.loc[Xs.index]

    cat, bin_, num = split_columns(X.columns)
    solver = "saga" if penalty == "l1" else "lbfgs"
    estimator = LogisticRegression(
        penalty=penalty,
        solver=solver,
        class_weight="balanced",
        max_iter=4000,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    pipe = Pipeline([("pre", build_preprocessor(cat, bin_, num)), ("clf", estimator)])
    param_grid = {"clf__C": np.logspace(-3, 1.5, 9)}
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
    search = GridSearchCV(pipe, param_grid, cv=folds, scoring="roc_auc", n_jobs=-1)
    search.fit(Xs, ys)
    return float(search.best_params_["clf__C"])

# ------------ plotting ------------
def save_auc_plot(summary_df, out_png):
    """Save a horizontal bar chart of CV-AUC per feature set.

    Args:
        summary_df: Frame with ``feature_set`` and ``auc`` columns.
        out_png: Target path of the PNG file.
    """
    fig, ax = plt.subplots(figsize=(9, 5))
    # Ascending sort places the best feature set at the top of the chart.
    ranked = summary_df.sort_values("auc", ascending=True)
    ax.barh(ranked["feature_set"], ranked["auc"], color="#402D6F")
    for row, value in enumerate(ranked["auc"]):
        ax.text(value + 0.001, row, f"{value:.3f}", va="center", fontsize=9)
    ax.set_xlabel("AUC")
    ax.set_title("Feature-Set Vergleich (CV-AUC)")
    fig.tight_layout()
    fig.savefig(out_png, dpi=150)
    plt.close(fig)

def save_vote_plot(vote_df, out_png, topn=40):
    """Save a horizontal bar chart of the *topn* most-voted raw features.

    Args:
        vote_df: Frame with ``raw_feature`` and ``votes`` columns.
        out_png: Target path of the PNG file.
        topn: Number of highest-voted features to show.
    """
    fig, ax = plt.subplots(figsize=(9, 10))
    # Ascending sort + tail keeps the top entries, highest vote count on top.
    shown = vote_df.sort_values("votes", ascending=True).tail(topn)
    ax.barh(shown["raw_feature"], shown["votes"], color="#402D6F")
    for row, value in enumerate(shown["votes"]):
        ax.text(value + 0.02, row, f"{value:.1f}", va="center", fontsize=8)
    ax.set_xlabel("Votes (Methoden)")
    ax.set_title(f"Feature-Votes (Top {topn})")
    fig.tight_layout()
    fig.savefig(out_png, dpi=150)
    plt.close(fig)

# ------------ OHE→Raw Mapping (fix) ------------
def raw_from_ohe(name: str) -> str:
    """Map a transformed feature name back to its raw source column.

    * Names containing ``_isna`` (missing-value flags) are kept as they are.
    * One-hot dummies ``<col>_cat_<level>`` collapse to ``<col>_cat``.
    * Everything else (numeric / binary, e.g. ``ps_calc_12``,
      ``ps_ind_17_bin``) maps to itself.
    """
    stem, marker, _level = name.partition("_cat_")
    if marker and "_isna" not in name:
        return stem + "_cat"
    return name

# ------------ Hauptlauf ------------
def main():
    """Compare feature sets / selection methods and write all reports.

    Steps: load and prepare the data, tune ``C`` once per penalty, evaluate
    the L1 baseline, the "Kaggle-style" reduced set, optional RFECV, mutual
    information, optional HistGradientBoosting + permutation importance and
    an L2 model; then aggregate votes per feature and write CSV / PNG / JSON /
    TXT reports into ``ROOT / "reports"``.

    Environment variables:
        FS_PROFILE: ``FAST`` (default) or ``FULL``; any other value selects
            the MEDIUM settings.
        FS_VOTE_THRESHOLD: Integer vote threshold overriding the automatic one.

    Raises:
        ValueError: If the loader returns no data, or ``FS_VOTE_THRESHOLD``
            is set but not an integer.
    """
    # Run profile: CV folds, optional steps and per-step sample sizes.
    PROFILE = os.environ.get("FS_PROFILE", "FAST")
    if PROFILE == "FAST":
        CV_SPLITS = 3; RUN_RFECV = False; RUN_PI = False
        N_BASE, N_KAG, N_RFE, N_PI = 100_000, 100_000, 60_000, 60_000
    elif PROFILE == "FULL":
        CV_SPLITS = 5; RUN_RFECV = True; RUN_PI = True
        N_BASE = N_KAG = N_RFE = N_PI = None
    else:  # MEDIUM
        CV_SPLITS = 5; RUN_RFECV = True; RUN_PI = True
        N_BASE, N_KAG, N_RFE, N_PI = 200_000, 200_000, 150_000, 150_000

    DROP_EXTRA = True
    EXTRA_TO_DROP = ["ps_ind_14", "ps_car_10_cat"]  # frequently dropped

    reports = ROOT / "reports"
    ensure_dir(reports)

    print(f"ROOT={ROOT}")
    print(f"PROFILE={PROFILE} (CV={CV_SPLITS})")

    # Data
    with timer("load & prep"):
        df = load_and_save_data()
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if df is None or len(df) == 0:
            raise ValueError("load_and_save_data() returned no data")
        df = df.replace(-1, np.nan)  # -1 is treated as the missing-value marker
        y = df["target"].astype(int)
        X = df.drop(columns=["target"])
        X = kaggle_style_features(X)
        X = add_missing_indicators(X)
        Xc = pd.concat([X, y], axis=1)
        Xc = drop_near_zero_variance(Xc)
        Xc = drop_high_corr_numeric(Xc)
        X = Xc.drop(columns=["target"])

    # Tune C once per penalty
    with timer("tune C (L1)"):
        C_L1 = tune_C_once(X, y, penalty="l1", sample_n=150_000)
    with timer("tune C (L2)"):
        C_L2 = tune_C_once(X, y, penalty="l2", sample_n=150_000)

    # Models (fixed C). NOTE: lr_l1 is shared by the baseline and the
    # Kaggle-style pipeline; the baseline coefficients are read before the
    # Kaggle-style CV refits the same estimator object.
    lr_l1 = LogisticRegression(penalty="l1", solver="saga", C=C_L1,
                               class_weight="balanced", max_iter=4000, n_jobs=-1, random_state=RANDOM_STATE)
    lr_l2 = LogisticRegression(penalty="l2", solver="lbfgs", C=C_L2,
                               class_weight="balanced", max_iter=4000, n_jobs=-1, random_state=RANDOM_STATE)
    hgb = HistGradientBoostingClassifier(learning_rate=0.08, max_leaf_nodes=31, random_state=RANDOM_STATE)

    # Helper: preprocessing + classifier pipeline for a given column set
    def mkpipe(Xcols, clf):
        c,b,n = split_columns(Xcols)
        pre = build_preprocessor(c,b,n)
        return Pipeline([("pre", pre), ("clf", clf)]), pre, (c,b,n)

    # Baseline (L1)
    print("baseline (L1 fixed C)…")
    Xb = X.sample(min(len(X), N_BASE), random_state=RANDOM_STATE) if N_BASE else X
    yb = y.loc[Xb.index]
    pipe_b, _, (cb,bb,nb) = mkpipe(Xb.columns, lr_l1)
    with timer("CV baseline"):
        proba_b = cv_predict_proba(pipe_b, Xb, yb, CV_SPLITS, "CV baseline")
        auc_b = roc_auc_score(yb, proba_b)
    rep_base = {"feature_set":"Baseline (L1 tuned once)","auc":auc_b,"gini":gini_from_auc(auc_b)}
    # Top-K coefficients of a fit on the whole baseline sample
    model_b = pipe_b.fit(Xb, yb)
    fn_b = get_feature_names(model_b.named_steps["pre"], cb, bb, nb)
    coef_b = model_b.named_steps["clf"].coef_.ravel()
    imp_b = (pd.DataFrame({"feature":fn_b,"coef":coef_b,"abs_coef":np.abs(coef_b)})
             .sort_values("abs_coef", ascending=False).head(80))
    imp_b.to_csv(reports/"topcoef_logreg_l1_baseline.csv", index=False)

    # Kaggle-style (drop ps_calc_* + optional extra columns)
    print("kaggle-style …")
    drop_calc = [c for c in X.columns if c.startswith("ps_calc_")]
    Xk = X.drop(columns=drop_calc, errors="ignore")
    if DROP_EXTRA: Xk = Xk.drop(columns=[c for c in EXTRA_TO_DROP if c in Xk.columns], errors="ignore")
    Xk_eval = Xk.sample(min(len(Xk), N_KAG), random_state=RANDOM_STATE) if N_KAG else Xk
    yk_eval = y.loc[Xk_eval.index]
    pipe_k, _, _ = mkpipe(Xk_eval.columns, lr_l1)
    with timer("CV kaggle"):
        proba_k = cv_predict_proba(pipe_k, Xk_eval, yk_eval, CV_SPLITS, "CV kaggle")
        auc_k = roc_auc_score(yk_eval, proba_k)
    rep_kaggle = {"feature_set":"Kaggle-Style (L1 tuned once)","auc":auc_k,"gini":gini_from_auc(auc_k)}

    # RFECV (optional)
    if RUN_RFECV:
        print("rfecv …")
        Xr = X.sample(min(len(X), N_RFE), random_state=RANDOM_STATE) if N_RFE else X
        yr = y.loc[Xr.index]
        cr, br, nr = split_columns(Xr.columns)
        pre_r = build_preprocessor(cr, br, nr)
        Xr_mat = pre_r.fit_transform(Xr, yr)
        fn_r = get_feature_names(pre_r, cr, br, nr)
        rfecv = RFECV(
            estimator=LogisticRegression(penalty="l1", solver="saga", C=C_L1,
                                         class_weight="balanced", max_iter=4000, n_jobs=-1, random_state=RANDOM_STATE),
            step=0.2, cv=3, scoring="roc_auc", n_jobs=-1
        )
        with timer("RFECV fit"):
            rfecv.fit(Xr_mat, yr)
        mask = rfecv.support_
        sel_names = [f for f,keep in zip(fn_r, mask) if keep]
        pd.Series(sel_names, name="selected_feature").to_csv(reports/"rfe_selected_features.csv", index=False)
        # CV restricted to the selected columns
        skf = StratifiedKFold(n_splits=CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        proba_rfe = np.zeros(len(yr))
        for (tr, te) in pbar(list(skf.split(Xr_mat[:,mask], yr)), total=CV_SPLITS, desc="CV rfe"):
            clf = LogisticRegression(penalty="l1", solver="saga", C=C_L1,
                                     class_weight="balanced", max_iter=4000, n_jobs=-1, random_state=RANDOM_STATE)
            clf.fit(Xr_mat[tr][:,mask], yr.iloc[tr])
            proba_rfe[te] = clf.predict_proba(Xr_mat[te][:,mask])[:,1]
        auc_rfe = roc_auc_score(yr, proba_rfe)
        rep_rfe = {"feature_set":f"RFECV ({mask.sum()} feats)","auc":auc_rfe,"gini":gini_from_auc(auc_rfe)}
    else:
        rep_rfe = {"feature_set":"RFECV (skipped)","auc":np.nan,"gini":np.nan}
        sel_names = []

    # Mutual information (on the baseline sample)
    print("mutual information …")
    pre_mi = build_preprocessor(cb, bb, nb)
    with timer("MI"):
        Xmat_full = pre_mi.fit_transform(Xb, yb)
        fn_mi = get_feature_names(pre_mi, cb, bb, nb)
        mi_vals = mutual_info_classif(Xmat_full, yb, random_state=RANDOM_STATE)
        mi_rank = (pd.DataFrame({"feature":fn_mi,"mi":mi_vals})
                   .sort_values("mi", ascending=False).head(80))
        mi_rank.to_csv(reports/"mi_rank_baseline.csv", index=False)

    # HGB + permutation importance (optional)
    if RUN_PI:
        print("histgradientboosting + PI …")
        Xpi = X.sample(min(len(X), N_PI), random_state=RANDOM_STATE) if N_PI else X
        ypi = y.loc[Xpi.index]
        cp, bp, np_ = split_columns(Xpi.columns)
        pre_h = build_preprocessor(cp, bp, np_)
        pipe_hgb = Pipeline([("pre", pre_h), ("clf", hgb)])
        with timer("CV hgb"):
            proba_hgb = cv_predict_proba(pipe_hgb, Xpi, ypi, CV_SPLITS, "CV hgb")
            auc_hgb = roc_auc_score(ypi, proba_hgb)
        model_hgb = pipe_hgb.fit(Xpi, ypi)
        pre_fit = model_hgb.named_steps["pre"]
        fn_h = get_feature_names(pre_fit, cp, bp, np_)
        Xpi_mat = pre_fit.transform(Xpi)
        # Importance is measured on the same rows the model was fitted on.
        with timer("Permutation Importance"):
            pi = permutation_importance(model_hgb.named_steps["clf"], Xpi_mat, ypi,
                                        n_repeats=3, random_state=RANDOM_STATE, n_jobs=-1, scoring="roc_auc")
        pi_rank = (pd.DataFrame({"feature":fn_h,"pi":pi.importances_mean})
                   .sort_values("pi", ascending=False).head(80))
        pi_rank.to_csv(reports/"permutation_importance_hgb.csv", index=False)
        rep_hgb = {"feature_set":"HGB + PI","auc":auc_hgb,"gini":gini_from_auc(auc_hgb)}
    else:
        pi_rank = pd.DataFrame(columns=["feature","pi"])
        rep_hgb = {"feature_set":"HGB + PI (skipped)","auc":np.nan,"gini":np.nan}

    # Ridge (L2) with fixed C; evaluated on all rows, independent of the profile
    print("ridge (L2 fixed C) …")
    pipe_ridge, _, (c2,b2,n2) = mkpipe(X.columns, lr_l2)
    with timer("CV ridge"):
        proba_ridge = cv_predict_proba(pipe_ridge, X, y, CV_SPLITS, "CV ridge")
        auc_ridge = roc_auc_score(y, proba_ridge)
    rep_ridge = {"feature_set":"Ridge (L2 tuned once)","auc":auc_ridge,"gini":gini_from_auc(auc_ridge)}
    # Coefficient ranking
    model_ridge = pipe_ridge.fit(X, y)
    fn_ridge = get_feature_names(model_ridge.named_steps["pre"], c2,b2,n2)
    coef_r = model_ridge.named_steps["clf"].coef_.ravel()
    ridge_rank = (pd.DataFrame({"feature":fn_ridge,"coef":coef_r,"abs_coef":np.abs(coef_r)})
                  .sort_values("abs_coef", ascending=False).head(80))
    ridge_rank.to_csv(reports/"topcoef_ridge.csv", index=False)

    # --- Stable candidates (transformed names) & votes per raw feature
    top_l1 = set(imp_b["feature"])
    top_mi = set(mi_rank["feature"])
    top_pi = set(pi_rank["feature"]) if not pi_rank.empty else set()
    top_rfe = set(sel_names)

    from collections import Counter
    # One vote per method that lists the transformed feature.
    vote_ohe = Counter()
    for s in [top_l1, top_mi, top_pi, top_rfe]:
        for f in s: vote_ohe[f] += 1

    stable_ohe = [f for f, v in vote_ohe.items() if v >= 2]
    pd.Series(stable_ohe, name="feature").to_csv(reports/"feature_candidates_stable_ohe.csv", index=False)

    # Votes of all dummies of a categorical column are summed onto the raw
    # column, so multi-level categoricals can collect more votes than there
    # are methods.
    vote_raw = Counter()
    for f, v in vote_ohe.items():
        vote_raw[raw_from_ohe(f)] += v
    votes_df = pd.DataFrame(sorted(vote_raw.items(), key=lambda x: (-x[1], x[0])),
                            columns=["raw_feature","votes"])
    votes_df.to_csv(reports/"feature_votes_raw.csv", index=False)

    # Threshold: taken from the environment if set, otherwise automatic
    # (>= 4 when there are many candidates, else >= 2). The automatic value is
    # capped by the number of methods that actually ran; otherwise a
    # single-column feature could never reach it when optional steps are
    # skipped (e.g. FAST runs only L1 + MI, i.e. at most 2 votes).
    n_methods = 2 + int(RUN_RFECV) + int(RUN_PI)
    th_env = os.environ.get("FS_VOTE_THRESHOLD")
    if th_env is not None:
        THRESHOLD = int(th_env)
    else:
        THRESHOLD = min(4 if len(votes_df)>=80 else 2, n_methods)
    selected_raw = votes_df.loc[votes_df["votes"]>=THRESHOLD, "raw_feature"].tolist()
    pd.Series(selected_raw, name="raw_feature").to_csv(reports/"selected_features_raw.csv", index=False)

    # Summary + plots
    summary = pd.DataFrame([rep_base, rep_kaggle, rep_rfe, rep_hgb, rep_ridge])\
                .sort_values("auc", ascending=False)
    summary.to_csv(reports/"feature_set_cv_summary.csv", index=False)
    save_auc_plot(summary, reports/"feature_set_cv_summary.png")
    save_vote_plot(votes_df, reports/"feature_votes_raw.png", topn=40)

    # Write the decision. Skipped steps carry NaN scores; they are written as
    # null because json.dumps would otherwise emit the non-standard token NaN.
    summary_records = [
        {k: (None if pd.isna(v) else v) for k, v in rec.items()}
        for rec in summary.to_dict(orient="records")
    ]
    decision = {
        "profile": PROFILE, "cv_splits": CV_SPLITS,
        "C_L1": C_L1, "C_L2": C_L2,
        "threshold": THRESHOLD,
        "selected_raw_n": len(selected_raw),
        "selected_raw": selected_raw[:200],
        "summary_auc": summary_records
    }
    (reports/"final_feature_decision.json").write_text(json.dumps(decision, indent=2))
    (reports/"final_feature_decision.txt").write_text(
        "Empfehlung: nutze diese Roh-Variablen (regel: votes >= %d)\n\n%s\n" %
        (THRESHOLD, "\n".join(selected_raw))
    )

    print("\n== CV performance (AUC/Gini) ==")
    print(summary.to_string(index=False))
    print(f"\nSelected raw features: {len(selected_raw)}  -> {reports/'selected_features_raw.csv'}")
    print(f"Votes plot: {reports/'feature_votes_raw.png'}")
    print(f"AUC plot:   {reports/'feature_set_cv_summary.png'}")
    print(f"Decision:   {reports/'final_feature_decision.json'}")

# Run the whole workflow only when this module is executed directly.
if __name__ == "__main__":
    main()


ROOT=/Users/lucasbeseler/ada_portoSeguro
PROFILE=MEDIUM (CV=5)
Lade Datensatz aus dem Cache.
[t] load & prep: 4.8s


KeyboardInterrupt: 