In [None]:
#!/usr/bin/env python3

import os, sys, time, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pathlib import Path
import sys

# Robust: works both as a .py script and inside a notebook.
# A script defines __file__, so the repo root is taken relative to this file;
# a notebook kernel does not, so the current working directory is used instead.
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[1]  # <repo>/
else:
    CWD = Path.cwd()
    ROOT = CWD.parent if CWD.name == "notebooks" else CWD  # <repo>/notebooks -> <repo>/
# Put the repo root first on sys.path so the `src` imports below resolve.
sys.path.insert(0, str(ROOT))

# optional: sanity check
print("ROOT =", ROOT)
assert (ROOT / "src").exists(), "src/ nicht gefunden – stimmt dein Repo-Pfad?"

# Optional theme: best effort, any failure to import or apply it is ignored.
try:
    from src import theme
    theme.set_project_theme()
except Exception:
    pass

from src.data_loader import load_and_save_data

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFECV, mutual_info_classif
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt  # Plot als PNG speichern

# Seed passed as `random_state` to the samplers, CV splitters and estimators below.
RANDOM_STATE = 42

# --- kleine Tools
def pbar(iterable, total=None, desc=""):
    """Yield the items of *iterable* while printing a one-line progress bar.

    Args:
        iterable: Any iterable; its items are yielded unchanged and in order.
        total: Expected number of items. When falsy, ``len(iterable)`` is used
            if available; if no total can be determined nothing is printed.
        desc: Text printed in front of the bar.

    Yields:
        The items of *iterable*.

    The bar is refreshed after the consumer has finished with each item, i.e.
    when the generator is resumed, and a final "done" line is printed once the
    iterable is exhausted.
    """
    bar_width = 25
    total = total or (len(iterable) if hasattr(iterable, "__len__") else None)
    count = 0
    # perf_counter is monotonic, so the rate cannot go negative on clock changes.
    start = time.perf_counter()
    for x in iterable:
        yield x
        count += 1
        if total:
            pct = int(100 * count / total)
            # Clamp so an understated `total` cannot widen the bar past its frame.
            filled = min(bar_width, pct // 4)
            bar = "█" * filled + "·" * (bar_width - filled)
            rate = count / max(1e-9, time.perf_counter() - start)
            # flush: the line has no trailing newline, so a line-buffered
            # stdout would otherwise hold it back until the final print.
            print(f"\r{desc} [{bar}] {pct:3d}% {count}/{total} {rate:.1f} it/s",
                  end="", flush=True)
    if total:
        print(f"\r{desc} [{'█' * bar_width}] 100% {count}/{total} done        ", flush=True)

from contextlib import contextmanager
@contextmanager
def timer(name):
    """Context manager that prints the wall-clock duration of its body.

    Args:
        name: Label printed in front of the measured time.

    The line ``[t] <name>: <seconds>s`` is printed when the block exits, also
    when the body raises; the exception then propagates unchanged.
    """
    t0 = time.perf_counter()
    try:
        yield
    finally:
        # Report even on failure so a crashing step still shows how long it ran.
        dt = time.perf_counter() - t0
        print(f"[t] {name}: {dt:.1f}s")

def gini_from_auc(auc: float) -> float:
    """Convert a ROC AUC into the Gini coefficient, ``2 * auc - 1``."""
    doubled = 2.0 * auc
    return doubled - 1.0

# --- leichtes FE
def kaggle_style_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with two row-wise aggregate columns added.

    * ``missing_count``: number of NaN cells in the row.
    * ``sum_all_bin``: sum over all columns whose name ends in ``_bin``
      (only added when at least one such column exists).

    The input frame is not modified.
    """
    enriched = df.assign(missing_count=df.isna().sum(axis=1))
    binary_flags = [name for name in enriched.columns if name.endswith("_bin")]
    if not binary_flags:
        return enriched
    enriched["sum_all_bin"] = enriched[binary_flags].sum(axis=1)
    return enriched

def add_missing_indicators(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with a 0/1 ``<col>_isna`` flag per incomplete column.

    A flag column is appended for every column (except ``target``) that
    contains at least one NaN. The input frame is not modified.
    """
    out = df.copy()
    # Snapshot the column list: indicator columns appended below are not revisited.
    for col in list(out.columns):
        if col == "target":
            continue
        na_flags = out[col].isna()
        if na_flags.any():
            out[f"{col}_isna"] = na_flags.astype(int)
    return out

def drop_near_zero_variance(df: pd.DataFrame, thresh=1e-6) -> pd.DataFrame:
    """Drop numeric columns whose population variance is not above *thresh*.

    Non-numeric columns are always kept. ``target``, when present, is never
    tested and is placed last in the returned frame.
    """
    def informative(name):
        column = df[name]
        if not pd.api.types.is_numeric_dtype(column):
            return True
        return column.var(ddof=0) > thresh

    kept = [name for name in df.columns if name != "target" and informative(name)]
    if "target" in df.columns:
        kept.append("target")
    return df[kept]

def drop_high_corr_numeric(df: pd.DataFrame, thr=0.98) -> pd.DataFrame:
    """Drop numeric columns that are almost collinear with an earlier column.

    For every pair of numeric columns (``target`` excluded) whose absolute
    correlation exceeds *thr*, the column that appears later is removed.
    With fewer than two numeric columns *df* is returned unchanged.
    """
    numeric = [
        name for name in df.columns
        if name != "target" and pd.api.types.is_numeric_dtype(df[name])
    ]
    if len(numeric) < 2:
        return df

    abs_corr = df[numeric].corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and
    # a column is only ever compared against the columns before it.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper = abs_corr.where(above_diagonal)
    exceeds = (upper > thr).any(axis=0)
    redundant = exceeds[exceeds].index.tolist()
    return df.drop(columns=redundant, errors="ignore")

def split_columns(cols):
    """Partition column names into categorical, binary and remaining groups.

    Args:
        cols: Iterable of column names (consumed once, so generators work too).

    Returns:
        A tuple ``(cat, bin_, other)`` of lists, each in input order: names
        ending in ``_cat``, names ending in ``_bin``, and everything else
        except ``target``.
    """
    cat, bin_, other = [], [], []
    # Single pass: no repeated list-membership scans and safe for one-shot iterators.
    for c in cols:
        if c.endswith("_cat"):
            cat.append(c)
        elif c.endswith("_bin"):
            bin_.append(c)
        elif c != "target":
            other.append(c)
    return cat, bin_, other

def _ohe_dense_minfreq():
    """Build a dense, unknown-tolerant OneHotEncoder across sklearn versions.

    Keyword sets are tried from newest to oldest API; the first one the
    installed ``OneHotEncoder`` accepts is used. ``min_frequency=0.01`` is
    kept whenever the installed version supports it.

    Raises:
        TypeError: If none of the keyword combinations is accepted.
    """
    candidates = (
        {"sparse_output": False, "min_frequency": 0.01},  # current API
        {"sparse": False, "min_frequency": 0.01},         # min_frequency but still `sparse`
        {"sparse_output": False},
        {"sparse": False},                                # oldest API
    )
    last_error = None
    for kwargs in candidates:
        try:
            return OneHotEncoder(handle_unknown="ignore", **kwargs)
        except TypeError as err:
            # Unknown keyword for this sklearn version: try the next variant.
            last_error = err
    raise last_error

def build_preprocessor(cat_cols, bin_cols, num_cols) -> ColumnTransformer:
    """Assemble the (unfitted) preprocessing ColumnTransformer.

    Args:
        cat_cols: Columns that are mode-imputed and one-hot encoded.
        bin_cols: Columns that are mode-imputed only.
        num_cols: Columns that are median-imputed and standard-scaled.

    Returns:
        A ColumnTransformer with the branches ``cat``, ``bin`` and ``num`` in
        that order; columns not listed are dropped.
    """
    categorical = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", _ohe_dense_minfreq()),
    ])
    binary = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ])
    numeric = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    branches = [
        ("cat", categorical, cat_cols),
        ("bin", binary, bin_cols),
        ("num", numeric, num_cols),
    ]
    return ColumnTransformer(branches, remainder="drop")

def get_feature_names(pre: ColumnTransformer, cat_cols, bin_cols, num_cols):
    """Return output feature names of a fitted preprocessor, in column order.

    Args:
        pre: Fitted ColumnTransformer as produced by ``build_preprocessor``.
        cat_cols, bin_cols, num_cols: Unused; the column lists are read from
            ``pre.transformers_``. Kept for call compatibility.

    Returns:
        A list with the one-hot names of the ``cat`` branch followed by the
        pass-through names of the ``bin`` and ``num`` branches, in the order
        the branches appear in ``pre.transformers_``.
    """
    names = []
    for name, trans, cols in pre.transformers_:
        # Entries other than our three branches (e.g. the remainder) add nothing.
        # Branches with no columns are skipped because their transformer is
        # left unfitted, so asking the encoder for names would raise.
        if name not in ("cat", "bin", "num") or len(cols) == 0:
            continue
        if name == "cat":
            names.extend(trans.named_steps["ohe"].get_feature_names_out(cols))
        else:
            names.extend(cols)
    return names

def cv_predict_proba_manual(pipeline: Pipeline, X, y, n_splits=5, desc="CV"):
    """Compute out-of-fold positive-class probabilities with a progress bar.

    Args:
        pipeline: Estimator pipeline; it is refit in place on every fold.
        X: Feature frame, indexed positionally via ``.iloc``.
        y: Target series aligned with *X*, indexed positionally via ``.iloc``.
        n_splits: Number of stratified, shuffled folds.
        desc: Label shown in the progress bar.

    Returns:
        A float array of length ``len(y)`` holding, for every row, the
        probability of class 1 predicted by the model that did not see it.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    folds = list(splitter.split(X, y))
    oof = np.zeros(len(y), dtype=float)
    for train_idx, test_idx in pbar(folds, total=n_splits, desc=desc):
        fitted = pipeline.fit(X.iloc[train_idx], y.iloc[train_idx])
        fold_proba = fitted.predict_proba(X.iloc[test_idx])
        oof[test_idx] = fold_proba[:, 1]
    return oof

def ensure_dir(p: Path):
    """Create directory *p* including missing parents; no error if it exists."""
    os.makedirs(p, exist_ok=True)

def save_auc_plot(summary_df: pd.DataFrame, out_png: Path):
    """Save a horizontal bar chart of the AUC per feature set as a PNG.

    Args:
        summary_df: Frame with the columns ``feature_set`` and ``auc``.
        out_png: Destination file of the image.

    Bars are sorted by ascending AUC. Rows with a NaN AUC keep their slot on
    the axis but get no value label. The figure is always closed, even when
    plotting or saving fails.
    """
    order = summary_df.sort_values("auc", ascending=True)
    fig, ax = plt.subplots(figsize=(7.5, 4.5))
    try:
        ax.barh(order["feature_set"], order["auc"])
        for i, v in enumerate(order["auc"]):
            # A label cannot be positioned at a NaN coordinate.
            if pd.notna(v):
                ax.text(v + 0.001, i, f"{v:.3f}", va="center", fontsize=9)
        ax.set_xlabel("AUC")
        ax.set_title("Feature-Set Vergleich (CV-AUC)")
        fig.tight_layout()
        fig.savefig(out_png, dpi=150)
    finally:
        # Release the figure even if drawing or writing the file raised.
        plt.close(fig)

def main():
    """Run the feature-selection comparison end to end and write the reports.

    Steps, in order:
      1. Load the data, treat -1 as missing, add engineered columns and drop
         near-constant / highly correlated numeric columns.
      2. Cross-validate a tuned L1 logistic regression (baseline) and store
         its largest coefficients.
      3. Repeat with the ``ps_calc_*`` (and optional extra) columns removed.
      4. Optionally run RFECV and cross-validate on the selected columns.
      5. Rank features by mutual information.
      6. Optionally cross-validate HistGradientBoosting and compute
         permutation importance.
      7. Cross-validate a tuned L2 logistic regression and store its
         largest coefficients.
      8. Write the features picked by at least two methods, plus a summary
         CSV and PNG.

    The workload is chosen with the ``FS_PROFILE`` environment variable
    (``FAST``, ``MEDIUM`` or ``FULL``, case-insensitive; default and fallback
    is ``MEDIUM``). All files are written to ``ROOT / "reports"``.

    Raises:
        AssertionError: If the loader returns no data.
    """
    # --- Profile (adjustable via FS_PROFILE)
    # Tuple layout: (baseline_n, kaggle_n, rfe_n, pi_n, cv_splits, run_rfecv, run_pi);
    # a sample size of None means "use every row".
    profiles = {
        "FAST": (100_000, 100_000, 60_000, 60_000, 3, False, False),
        "MEDIUM": (200_000, 200_000, 150_000, 150_000, 5, True, True),
        "FULL": (None, None, None, None, 5, True, True),
    }
    requested = os.environ.get("FS_PROFILE", "MEDIUM")
    PROFILE = requested.strip().upper()
    if PROFILE not in profiles:
        # Unknown values have always run the MEDIUM settings; say so explicitly.
        print(f"unknown FS_PROFILE={requested!r}, using MEDIUM")
        PROFILE = "MEDIUM"
    (BASELINE_SAMPLE_N, KAGGLE_SAMPLE_N, RFE_SAMPLE_N, PI_SAMPLE_N,
     CV_SPLITS, RUN_RFECV, RUN_PI) = profiles[PROFILE]

    def take_sample(frame, n):
        # Falsy n keeps every row. n is capped at the frame length because
        # sampling more rows than exist (without replacement) raises.
        if not n:
            return frame
        return frame.sample(min(n, len(frame)), random_state=RANDOM_STATE)

    # Optional: additional drops (used by some Kaggle solutions)
    DROP_EXTRA = True
    EXTRA_TO_DROP = ["ps_ind_14", "ps_car_10_cat"]

    reports_dir = ROOT / "reports"
    ensure_dir(reports_dir)

    # --- Load & prepare data
    print(f"PROFILE={PROFILE}  (CV={CV_SPLITS})")
    with timer("load & prep"):
        df = load_and_save_data()
        # Explicit raise instead of `assert`, which is removed under `python -O`.
        if df is None or len(df) == 0:
            raise AssertionError("Kein DataFrame erhalten.")
        df = df.replace(-1, np.nan)  # treat -1 as missing
        y = df["target"].astype(int)
        X = df.drop(columns=["target"])
        X = kaggle_style_features(X)
        X = add_missing_indicators(X)
        # The filters expect/keep a `target` column, so re-attach it temporarily.
        Xc = pd.concat([X, y], axis=1)
        Xc = drop_near_zero_variance(Xc)
        Xc = drop_high_corr_numeric(Xc)
        X = Xc.drop(columns=["target"])

    # --- Models (L1/L2 tuned via inner CV)
    # NOTE(review): the inner CV always uses 5 folds, independent of CV_SPLITS.
    cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    # NOTE(review): this one estimator instance is shared by the baseline and
    # the Kaggle-style pipeline, so each fit overwrites the previous state.
    l1cv = LogisticRegressionCV(
        Cs=np.logspace(-3, 2, 10), cv=cv5, scoring="roc_auc",
        penalty="l1", solver="saga", class_weight="balanced",
        max_iter=4000, n_jobs=-1, refit=True
    )
    l2cv = LogisticRegressionCV(
        Cs=np.logspace(-3, 2, 10), cv=cv5, scoring="roc_auc",
        penalty="l2", solver="lbfgs", class_weight="balanced",
        max_iter=4000, n_jobs=-1, refit=True
    )
    hgb = HistGradientBoostingClassifier(
        learning_rate=0.08, max_leaf_nodes=31, random_state=RANDOM_STATE
    )

    # --- Baseline (L1 tuned)
    print("baseline (L1 tuned)…")
    Xb = take_sample(X, BASELINE_SAMPLE_N)
    yb = y.loc[Xb.index]
    cb, bb, nb = split_columns(Xb.columns)
    pre_b = build_preprocessor(cb, bb, nb)
    pipe_b = Pipeline([("pre", pre_b), ("clf", l1cv)])
    with timer("CV baseline"):
        proba_b = cv_predict_proba_manual(pipe_b, Xb, yb, n_splits=CV_SPLITS, desc="CV baseline")
        auc_b = roc_auc_score(yb, proba_b)
    rep_base = {"feature_set": "Baseline (L1 tuned)", "auc": auc_b, "gini": gini_from_auc(auc_b)}
    # Top-K coefficients (from a fit on the whole baseline sample)
    with timer("fit coef baseline (L1 tuned)"):
        model_b = pipe_b.fit(Xb, yb)
        fn_b = get_feature_names(model_b.named_steps["pre"], cb, bb, nb)
        coef_b = model_b.named_steps["clf"].coef_.ravel()
        imp_b = (pd.DataFrame({"feature": fn_b, "coef": coef_b, "abs_coef": np.abs(coef_b)})
                 .sort_values("abs_coef", ascending=False).head(60))
        imp_b.to_csv(reports_dir / "topcoef_logreg_l1_tuned_baseline.csv", index=False)

    # --- Kaggle-style (drop ps_calc_* [+ optional extra])
    print("kaggle-style …")
    drop_calc = [c for c in X.columns if c.startswith("ps_calc_")]
    Xk = X.drop(columns=drop_calc, errors="ignore")
    if DROP_EXTRA:
        Xk = Xk.drop(columns=[c for c in EXTRA_TO_DROP if c in Xk.columns], errors="ignore")
    ck, bk, nk = split_columns(Xk.columns)
    pre_k = build_preprocessor(ck, bk, nk)
    pipe_k = Pipeline([("pre", pre_k), ("clf", l1cv)])
    with timer("CV kaggle"):
        Xk_eval = take_sample(Xk, KAGGLE_SAMPLE_N)
        yk_eval = y.loc[Xk_eval.index]
        proba_k = cv_predict_proba_manual(pipe_k, Xk_eval, yk_eval, n_splits=CV_SPLITS, desc="CV kaggle")
        auc_k = roc_auc_score(yk_eval, proba_k)
    rep_kaggle = {"feature_set": "Kaggle-Style (L1 tuned)", "auc": auc_k, "gini": gini_from_auc(auc_k)}

    # --- RFECV (L1, on a sample)
    if RUN_RFECV:
        print("rfecv …")
        Xr = take_sample(X, RFE_SAMPLE_N)
        yr = y.loc[Xr.index]
        cr, br, nr = split_columns(Xr.columns)
        pre_r = build_preprocessor(cr, br, nr)
        # NOTE(review): the preprocessor and the selection below are fit on the
        # whole sample before the out-of-fold loop, so that AUC is optimistic.
        Xr_mat = pre_r.fit_transform(Xr, yr)
        fn_r = get_feature_names(pre_r, cr, br, nr)
        rfecv = RFECV(
            estimator=LogisticRegression(penalty="l1", solver="saga", max_iter=4000,
                                         class_weight="balanced", random_state=RANDOM_STATE, n_jobs=-1),
            step=0.2, cv=3, scoring="roc_auc", n_jobs=-1
        )
        with timer("RFECV fit"):
            rfecv.fit(Xr_mat, yr)
        mask = rfecv.support_
        sel_names = [f for f, keep in zip(fn_r, mask) if keep]
        pd.Series(sel_names, name="selected_feature").to_csv(reports_dir / "rfe_selected_features.csv", index=False)
        # Out-of-fold predictions on the selected columns of the matrix
        skf = StratifiedKFold(n_splits=CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        proba_rfe = np.zeros(len(yr))
        for (tr, te) in pbar(list(skf.split(Xr_mat[:, mask], yr)), total=CV_SPLITS, desc="CV rfe"):
            clf = LogisticRegression(penalty="l1", solver="saga", max_iter=4000,
                                     class_weight="balanced", random_state=RANDOM_STATE, n_jobs=-1)
            clf.fit(Xr_mat[tr][:, mask], yr.iloc[tr])
            proba_rfe[te] = clf.predict_proba(Xr_mat[te][:, mask])[:, 1]
        auc_rfe = roc_auc_score(yr, proba_rfe)
        rep_rfe = {"feature_set": f"RFECV ({mask.sum()} feats)", "auc": auc_rfe, "gini": gini_from_auc(auc_rfe)}
    else:
        rep_rfe = {"feature_set": "RFECV (skipped)", "auc": np.nan, "gini": np.nan}
        sel_names = []

    # --- Mutual information (on the baseline view)
    print("mutual information …")
    pre_mi = build_preprocessor(cb, bb, nb)
    with timer("MI"):
        Xmat_full = pre_mi.fit_transform(Xb, yb)
        fn_mi = get_feature_names(pre_mi, cb, bb, nb)
        mi_vals = mutual_info_classif(Xmat_full, yb, random_state=RANDOM_STATE)
        mi_rank = (pd.DataFrame({"feature": fn_mi, "mi": mi_vals})
                   .sort_values("mi", ascending=False).head(60))
        mi_rank.to_csv(reports_dir / "mi_rank_baseline.csv", index=False)

    # --- HGB + permutation importance (optional)
    if RUN_PI:
        print("histgradientboosting + PI …")
        Xpi = take_sample(X, PI_SAMPLE_N)
        ypi = y.loc[Xpi.index]
        cp, bp, np_ = split_columns(Xpi.columns)  # np_ avoids shadowing numpy's `np`
        pre_h = build_preprocessor(cp, bp, np_)
        pipe_hgb = Pipeline([("pre", pre_h), ("clf", hgb)])
        with timer("CV hgb"):
            proba_hgb = cv_predict_proba_manual(pipe_hgb, Xpi, ypi, n_splits=CV_SPLITS, desc="CV hgb")
            auc_hgb = roc_auc_score(ypi, proba_hgb)
        model_hgb = pipe_hgb.fit(Xpi, ypi)
        pre_fit = model_hgb.named_steps["pre"]
        fn_h = get_feature_names(pre_fit, cp, bp, np_)
        Xpi_mat = pre_fit.transform(Xpi)
        # Importance is measured on the same rows the model was fit on.
        with timer("Permutation Importance"):
            pi = permutation_importance(model_hgb.named_steps["clf"], Xpi_mat, ypi,
                                        n_repeats=3, random_state=RANDOM_STATE, n_jobs=-1,
                                        scoring="roc_auc")
        pi_rank = (pd.DataFrame({"feature": fn_h, "pi": pi.importances_mean})
                   .sort_values("pi", ascending=False).head(60))
        pi_rank.to_csv(reports_dir / "permutation_importance_hgb.csv", index=False)
        rep_hgb = {"feature_set": "HGB + PI", "auc": auc_hgb, "gini": gini_from_auc(auc_hgb)}
    else:
        pi_rank = pd.DataFrame(columns=["feature", "pi"])
        rep_hgb = {"feature_set": "HGB + PI (skipped)", "auc": np.nan, "gini": np.nan}

    # --- Ridge (L2 tuned)
    # NOTE(review): this step runs on all rows in every profile, including FAST.
    print("ridge (L2 tuned) …")
    c2, b2, n2 = split_columns(X.columns)
    pre_ridge = build_preprocessor(c2, b2, n2)
    pipe_ridge = Pipeline([("pre", pre_ridge), ("clf", l2cv)])
    with timer("CV ridge"):
        proba_ridge = cv_predict_proba_manual(pipe_ridge, X, y, n_splits=CV_SPLITS, desc="CV ridge")
        auc_ridge = roc_auc_score(y, proba_ridge)
    rep_ridge = {"feature_set": "Ridge (L2 tuned)", "auc": auc_ridge, "gini": gini_from_auc(auc_ridge)}
    with timer("fit coef ridge"):
        model_ridge = pipe_ridge.fit(X, y)
        fn_ridge = get_feature_names(model_ridge.named_steps["pre"], c2, b2, n2)
        coef_r = model_ridge.named_steps["clf"].coef_.ravel()
        ridge_rank = (pd.DataFrame({"feature": fn_ridge, "coef": coef_r, "abs_coef": np.abs(coef_r)})
                      .sort_values("abs_coef", ascending=False).head(60))
        ridge_rank.to_csv(reports_dir / "topcoef_ridge_tuned.csv", index=False)

    # --- Stable candidate list (features picked by at least two methods)
    top_l1 = set(imp_b["feature"])
    top_mi = set(mi_rank["feature"].head(60))
    top_pi = set(pi_rank["feature"].head(60)) if not pi_rank.empty else set()
    top_rfe = set(sel_names)

    def in_methods(f):
        return int(f in top_l1) + int(f in top_mi) + int(f in top_pi) + int(f in top_rfe)

    # Sorted so the CSV is identical between runs (set order is not stable).
    all_feats = sorted(top_l1 | top_mi | top_pi | top_rfe)
    stable = [f for f in all_feats if in_methods(f) >= 2]
    pd.Series(stable, name="feature").to_csv(reports_dir / "feature_candidates_stable.csv", index=False)
    print(f"stable candidates: {len(stable)} -> {reports_dir/'feature_candidates_stable.csv'}")

    # --- Summary + plot
    summary = pd.DataFrame([rep_base, rep_kaggle, rep_rfe, rep_hgb, rep_ridge]).sort_values("auc", ascending=False)
    summary.to_csv(reports_dir / "feature_set_cv_summary.csv", index=False)
    save_auc_plot(summary, reports_dir / "feature_set_cv_summary.png")

    print("\n== CV performance (AUC/Gini) ==")
    print(summary.to_string(index=False))
    print(f"\nReports & Plot -> {reports_dir}")

# Run the study only when executed directly (script or notebook cell), not on import.
if __name__ == "__main__":
    main()


ROOT = /Users/lucasbeseler/ada_portoSeguro
PROFILE=MEDIUM  (CV=5)
Lade Datensatz aus dem Cache.
[t] load & prep: 5.0s
baseline (L1 tuned)…
