In [2]:
import os, sys, json, warnings, time
warnings.filterwarnings("ignore")

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss, precision_recall_curve
from sklearn.calibration import calibration_curve

# ---------- Paths ----------
# Resolve the project root. When `__file__` is defined (script execution), ROOT is the
# directory one level above the folder containing this file. Otherwise (notebook kernel)
# ROOT is the current working directory, or its parent when the kernel was started
# inside one of the listed sub-folders.
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[1]
else:
    ROOT = Path.cwd() if Path.cwd().name not in ("notebooks","tools","tests") else Path.cwd().parent

# Both report directories can be overridden through environment variables of the same
# name; an unset or empty variable falls back to the ROOT-relative default.
REPORTS_IN  = Path(os.getenv("REPORTS_IN")  or (ROOT / "reports"))         # shared inputs (split, features)
REPORTS_OUT = Path(os.getenv("REPORTS_OUT") or (ROOT / "reports_Lucas"))   # your outputs
# Side effect when this cell runs: the output directory is created if it does not exist.
REPORTS_OUT.mkdir(parents=True, exist_ok=True)

# ROOT goes first on sys.path so the project-local `src` package imported below resolves.
sys.path.insert(0, str(ROOT))
from src.data_loader import load_and_save_data

# ---------- Speed/Profile ----------
SPEED = os.getenv("SPEED", "MEDIUM").upper().strip()


def speed_cfg():
    """Return the default run configuration for the active SPEED profile.

    Returns:
        dict with keys ``CV``, ``N_EST``, ``EARLY_STOP``, ``MODELS`` and ``LR``.
        A SPEED value that is not FAST / MEDIUM / FULL leaves the base values
        untouched.
    """
    # Per-profile overrides applied on top of the base profile below.
    overrides = {
        "FAST":   {"CV": 3, "N_EST": 2000, "EARLY_STOP": 50, "MODELS": ["lgbm"], "LR": 0.05},
        "MEDIUM": {"CV": 5, "N_EST": 4000, "EARLY_STOP": 100},
        "FULL":   {"CV": 5, "N_EST": 8000, "EARLY_STOP": 300},
    }
    profile = {"CV": 5, "N_EST": 6000, "EARLY_STOP": 200, "MODELS": ["lgbm", "xgb"], "LR": 0.03}
    profile.update(overrides.get(SPEED, {}))
    return profile

# Effective run configuration: each value comes from the environment variable of the
# same name when set, otherwise from the SPEED profile returned by speed_cfg().
CFG        = speed_cfg()
RND        = int(os.getenv("RND", "42"))                      # random seed used for CV splits, models and the sampler
CV         = int(os.getenv("CV", str(CFG["CV"])))             # number of stratified folds
N_EST      = int(os.getenv("N_EST", str(CFG["N_EST"])))       # upper bound on boosting rounds
ESR        = int(os.getenv("EARLY_STOP", str(CFG["EARLY_STOP"])))  # early-stopping patience (rounds)
# Comma-separated model list; blanks and empty entries are dropped.
MODELS     = [m.strip() for m in os.getenv("MODELS", ",".join(CFG["MODELS"])).split(",") if m.strip()]
# NOTE(review): in the LGBM branches of oof_cv / fit_final only `is_unbalance` is derived
# from IMB; with the default "spw" LightGBM therefore receives no class weighting at all,
# while XGBoost gets a per-fold scale_pos_weight. Confirm that this asymmetry is intended.
IMB        = os.getenv("IMB", "spw").lower()   # 'iso' (LGBM is_unbalance) or 'spw' (scale_pos_weight)
LR         = float(os.getenv("LR", str(CFG["LR"])))           # learning rate shared by both model families
MEMBER     = os.getenv("MEMBER", "Lucas")                     # name written to the team summary row

# ---------- Utils ----------
def split_cols(cols):
    """Partition column names by naming convention.

    Args:
        cols: iterable of column labels.

    Returns:
        ``(cat, bin_, num)`` lists in input order: labels ending in ``_cat``,
        labels ending in ``_bin``, and everything else except ``"target"``.
    """
    categorical, binary, numeric = [], [], []
    for label in cols:
        text = str(label)
        if text.endswith("_cat"):
            categorical.append(label)
        elif text.endswith("_bin"):
            binary.append(label)
        elif label != "target":
            numeric.append(label)
    return categorical, binary, numeric

def load_selected_feature_list():
    """Read the selected feature names from ``REPORTS_IN/features_selected.csv``.

    Returns:
        list[str]: values of the ``raw_feature`` column, cast to ``str``.

    Raises:
        FileNotFoundError: the CSV does not exist.
        ValueError: the CSV has no ``raw_feature`` column.
    """
    csv_path = REPORTS_IN / "features_selected.csv"
    if not csv_path.exists():
        raise FileNotFoundError(f"Missing {csv_path}. Run feature-gate first.")

    table = pd.read_csv(csv_path)
    if "raw_feature" not in table.columns:
        raise ValueError("features_selected.csv must have column 'raw_feature'.")

    names = table["raw_feature"].astype(str)
    return names.tolist()

def fe_extras(X, selected):
    """Add the optional engineered columns that appear in ``selected``.

    Works on a copy; the input frame is not modified.

    Args:
        X: feature DataFrame.
        selected: container of selected feature names.

    Returns:
        Copy of ``X`` with ``missing_count`` (per-row NaN count) and/or
        ``sum_all_bin`` (per-row sum of the ``*_bin`` columns, or 0 when there
        are none) added when requested.
    """
    out = X.copy()

    if "missing_count" in selected:
        out["missing_count"] = out.isna().sum(axis=1)

    if "sum_all_bin" in selected:
        bin_cols = [name for name in out.columns if str(name).endswith("_bin")]
        if bin_cols:
            out["sum_all_bin"] = out[bin_cols].sum(axis=1)
        else:
            out["sum_all_bin"] = 0

    return out

def prep_for_trees(X: pd.DataFrame, selected_cols):
    """Build the model matrix for the tree models.

    Adds the optional engineered columns via ``fe_extras``, keeps the selected
    columns that exist (in ``selected_cols`` order) and casts every ``*_cat``
    column to pandas ``category`` dtype.

    Args:
        X: raw feature frame (without the target column).
        selected_cols: feature names to keep.

    Returns:
        ``(X_prepared, cat_cols)`` where ``cat_cols`` lists the ``*_cat``
        columns of the prepared frame.
    """
    X = fe_extras(X, selected_cols)
    keep = [c for c in selected_cols if c in X.columns]
    missing = [c for c in selected_cols if c not in X.columns]
    if missing:
        print(f"[WARN] ignoring {len(missing)} missing selected feature(s).")
    X = X[keep].copy()
    cat, _, _ = split_cols(X.columns)
    for c in cat:
        try:
            X[c] = X[c].astype("category")
        except Exception as ex:
            # Still best-effort (the column keeps its dtype and stays in `cat`), but
            # the failure is now reported, and KeyboardInterrupt / SystemExit are no
            # longer swallowed as they were by the previous bare `except`.
            print(f"[WARN] could not cast '{c}' to category "
                  f"({type(ex).__name__}: {ex}); keeping dtype {X[c].dtype}.")
    return X, cat

def scale_pos_weight(y):
    """Return the negative-to-positive count ratio of a binary label vector.

    The positive count is floored at 1, so a vector without positives returns
    the number of negatives instead of dividing by zero.
    """
    n_pos = int((y == 1).sum())
    n_neg = int((y == 0).sum())
    denominator = n_pos if n_pos > 1 else 1
    return float(n_neg / denominator)

# ---------- XGB helpers ----------
def xgb_train_predict(Xtr, ytr, Xva, yva, Xte=None, params=None, seed=RND, lr=LR):
    """Train an XGBoost booster with early stopping and predict validation/test rows.

    Args:
        Xtr, ytr: training features / labels.
        Xva, yva: validation features / labels; used as the early-stopping eval set.
        Xte: optional test features to predict.
        params: optional overrides using sklearn-style names (``learning_rate``,
            ``reg_lambda``, ``reg_alpha``, ...). Only the keys read below are used;
            the caller's dict is not mutated.
        seed: fallback seed when ``params`` carries none.
        lr: fallback learning rate when ``params`` carries none.

    Returns:
        ``(booster, pred_va, pred_te, fi, meta)`` where ``pred_te`` is ``None`` when
        ``Xte`` is ``None``, ``fi`` is a gain-importance Series sorted descending and
        ``meta`` records whether the one-hot fallback was used (``use_ohe``) and, if
        so, the dummy column layout (``ohe_cols``).
    """
    import xgboost as xgb
    params = dict(params or {})
    p = {
        "objective": "binary:logistic",
        "eval_metric": "aucpr",
        "tree_method": params.pop("tree_method", "hist"),
        "eta": params.pop("learning_rate", lr),
        "max_depth": int(params.pop("max_depth", 6)),
        "min_child_weight": float(params.pop("min_child_weight", 2.0)),
        "subsample": float(params.pop("subsample", 0.9)),
        "colsample_bytree": float(params.pop("colsample_bytree", 0.9)),
        "lambda": float(params.pop("reg_lambda", 1.0)),
        "alpha": float(params.pop("reg_alpha", 0.0)),
        "gamma": float(params.pop("gamma", 0.0)),
        "seed": int(params.pop("seed", seed)),
        "nthread": -1,
    }
    # Class weighting is only forwarded in 'spw' mode; otherwise XGBoost's default applies.
    if IMB == "spw":
        p["scale_pos_weight"] = float(params.pop("scale_pos_weight", 1.0))

    meta = {"use_ohe": False, "ohe_cols": None}
    cats = [c for c in Xtr.columns if str(c).endswith("_cat")]
    try:
        # Preferred path: native categorical support on pandas `category` columns.
        dtr = xgb.DMatrix(Xtr, label=ytr, enable_categorical=True)
        dva = xgb.DMatrix(Xva, label=yva, enable_categorical=True)
        dte = xgb.DMatrix(Xte, enable_categorical=True) if Xte is not None else None
        p["enable_categorical"] = True
    except Exception:
        # Fallback: one-hot encode the *_cat columns, aligning validation/test
        # frames to the training dummy layout.
        meta["use_ohe"] = True
        dXtr = pd.get_dummies(Xtr, columns=cats, dummy_na=True)
        dXva = pd.get_dummies(Xva, columns=cats, dummy_na=True).reindex(columns=dXtr.columns, fill_value=0)
        dtr = xgb.DMatrix(dXtr, label=ytr)
        dva = xgb.DMatrix(dXva, label=yva)
        dte = None
        if Xte is not None:
            dXte = pd.get_dummies(Xte, columns=cats, dummy_na=True).reindex(columns=dXtr.columns, fill_value=0)
            dte = xgb.DMatrix(dXte)
        meta["ohe_cols"] = list(dXtr.columns)

    booster = xgb.train(
        params=p, dtrain=dtr, num_boost_round=N_EST,
        evals=[(dva, "valid")], early_stopping_rounds=ESR, verbose_eval=False
    )

    # `best_iteration` is 0-based while `iteration_range` is half-open [begin, end),
    # so the best round itself is only included with `best_iteration + 1`. The old
    # `best_iteration or N_EST` also treated a best round of 0 as "not set" and used
    # every tree. (0, 0) is XGBoost's convention for "all rounds".
    best_it = getattr(booster, "best_iteration", None)
    it_range = (0, int(best_it) + 1) if best_it is not None else (0, 0)
    pred_va = booster.predict(dva, iteration_range=it_range)
    pred_te = booster.predict(dte, iteration_range=it_range) if Xte is not None else None

    imp = booster.get_score(importance_type="gain")
    if meta["use_ohe"]:
        fi = pd.Series(imp, name="gain").sort_values(ascending=False)
        fi.index.name = "feature"
    else:
        # Map positional names ("f0", "f1", ...) back to the training column names
        # when the booster reports them that way.
        names = list(Xtr.columns)
        pairs = []
        for k, v in imp.items():
            if k.startswith("f") and k[1:].isdigit():
                idx = int(k[1:]); name = names[idx] if 0 <= idx < len(names) else k
            else:
                name = k
            pairs.append((name, v))
        fi = pd.Series(dict(pairs), name="gain").sort_values(ascending=False)

    return booster, pred_va, pred_te, fi, meta

def xgb_predict_time_ms_per_1k(booster, X, meta):
    """Measure XGBoost prediction latency in milliseconds per 1,000 rows.

    The timed section includes building the DMatrix (and the one-hot encoding when
    the fallback encoder was used at training time), not only ``predict``.

    Args:
        booster: trained ``xgboost.Booster``.
        X: feature frame to score.
        meta: dict returned by ``xgb_train_predict`` (``use_ohe`` / ``ohe_cols``).

    Returns:
        float: milliseconds per 1,000 rows, or ``nan`` when ``X`` is empty
        (previously a ZeroDivisionError).
    """
    n_rows = len(X)
    if n_rows == 0:
        return float("nan")

    import xgboost as xgb
    # Same range as used at training time: best_iteration is 0-based and the range
    # end is exclusive; (0, 0) means "all rounds" when no best iteration is recorded.
    best_it = getattr(booster, "best_iteration", None)
    it_range = (0, int(best_it) + 1) if best_it is not None else (0, 0)

    t0 = time.perf_counter()
    if meta["use_ohe"]:
        cats = [c for c in X.columns if str(c).endswith("_cat")]
        dX = pd.get_dummies(X, columns=cats, dummy_na=True).reindex(columns=meta["ohe_cols"], fill_value=0)
        d = xgb.DMatrix(dX)
    else:
        d = xgb.DMatrix(X, enable_categorical=True)
    _ = booster.predict(d, iteration_range=it_range)
    dt = time.perf_counter() - t0
    return 1000 * dt / (n_rows / 1000)

# ---------- CV (OOF) ----------
def oof_cv(model_name, base_params, X, y, cat_cols):
    """Run stratified K-fold CV and score the pooled out-of-fold predictions.

    Each fold's validation part is used both for early stopping and as the
    source of that fold's out-of-fold predictions.

    Args:
        model_name: ``"lgbm"`` or ``"xgb"``.
        base_params: optional hyper-parameter overrides merged over the defaults.
        X, y: training features / labels (positionally indexed per fold).
        cat_cols: names of categorical columns passed to LightGBM.

    Returns:
        dict with ``pr_auc``, ``roc_auc`` and ``brier`` computed once over all
        out-of-fold predictions, plus the ``oof`` prediction array itself.

    Raises:
        ValueError: ``model_name`` is neither ``"lgbm"`` nor ``"xgb"``.
    """
    folds = StratifiedKFold(n_splits=CV, shuffle=True, random_state=RND)
    oof = np.zeros(len(y), dtype=float)

    for fit_idx, val_idx in folds.split(X, y):
        X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        if model_name == "lgbm":
            from lightgbm import LGBMClassifier, early_stopping, log_evaluation

            # NOTE(review): only `is_unbalance` is derived from IMB here, so in
            # 'spw' mode LightGBM trains without any class weighting.
            lgbm_kwargs = dict(
                n_estimators=N_EST, random_state=RND, n_jobs=-1,
                learning_rate=LR, num_leaves=128, max_depth=-1,
                min_child_samples=20, subsample=0.9, colsample_bytree=0.9,
                reg_lambda=1.0, reg_alpha=0.0, max_bin=511,
                feature_pre_filter=False,
                is_unbalance=(IMB == "iso"),
            )
            if base_params:
                lgbm_kwargs.update(base_params)   # caller overrides win over defaults

            model = LGBMClassifier(**lgbm_kwargs)
            native_cats = [c for c in cat_cols if c in X_fit.columns]
            model.fit(
                X_fit, y_fit, eval_set=[(X_val, y_val)], eval_metric="auc",
                categorical_feature=native_cats,
                callbacks=[early_stopping(ESR), log_evaluation(0)]
            )
            fold_pred = model.predict_proba(X_val)[:, 1]

        elif model_name == "xgb":
            # The weight is recomputed from each fold's training labels.
            fold_spw = scale_pos_weight(y_fit) if IMB == "spw" else 1.0
            fold_params = {**(base_params or {}), "scale_pos_weight": fold_spw}
            fold_pred = xgb_train_predict(X_fit, y_fit, X_val, y_val, params=fold_params)[1]

        else:
            raise ValueError("Unknown model")

        oof[val_idx] = fold_pred

    return {
        "pr_auc": float(average_precision_score(y, oof)),
        "roc_auc": float(roc_auc_score(y, oof)),
        "brier": float(brier_score_loss(y, oof)),
        "oof": oof,
    }

# ---------- Final fit ----------
def fit_final(model_name, params, Xtr, ytr, Xte, yte, cat_cols):
    """Fit the final model and evaluate it on the holdout set.

    10% of the training data (stratified) is held back purely as the
    early-stopping validation set; the model is fitted on the remaining 90%.

    Args:
        model_name: ``"lgbm"``; any other value takes the XGBoost branch.
        params: optional hyper-parameter overrides merged over the defaults.
        Xtr, ytr: training features / labels.
        Xte, yte: holdout features / labels.
        cat_cols: names of categorical columns passed to LightGBM.

    Returns:
        ``(proba, hold, fi, meta)``: holdout probabilities, dict of holdout metrics
        (``pr_auc``, ``roc_auc``, ``brier``), feature-importance Series, and a meta
        dict (encoder label, best iteration, tree count, timings, fitted model).
    """
    # small val split for ES
    X_tr, X_val, y_tr, y_val = train_test_split(Xtr, ytr, test_size=0.1, stratify=ytr, random_state=RND)

    if model_name == "lgbm":
        from lightgbm import LGBMClassifier, early_stopping, log_evaluation
        imb_kwargs = {"is_unbalance": True} if IMB == "iso" else {"is_unbalance": False}

        defaults = {
            "n_estimators": N_EST, "random_state": RND, "n_jobs": -1,
            "learning_rate": LR, "num_leaves": 128, "max_depth": -1,
            "min_child_samples": 20, "subsample": 0.9, "colsample_bytree": 0.9,
            "reg_lambda": 1.0, "reg_alpha": 0.0, "max_bin": 511,
            "feature_pre_filter": False, **imb_kwargs
        }
        if params:
            defaults.update(params)  # safe merge

        clf = LGBMClassifier(**defaults)
        t0 = time.perf_counter()
        clf.fit(
            X_tr, y_tr, eval_set=[(X_val, y_val)], eval_metric="auc",
            categorical_feature=[c for c in cat_cols if c in X_tr.columns],
            callbacks=[early_stopping(ESR), log_evaluation(0)]
        )
        fit_time_s = time.perf_counter() - t0

        t1 = time.perf_counter()
        proba = clf.predict_proba(Xte)[:,1]
        pred_ms_per_1k = 1000 * (time.perf_counter() - t1) / (len(Xte)/1000)

        try:
            gain = clf.booster_.feature_importance(importance_type="gain")
            names = clf.booster_.feature_name()
            fi = pd.Series(gain, index=names, name="gain").sort_values(ascending=False)
        except Exception:
            # Fall back to the sklearn-wrapper importances if the booster API fails.
            fi = pd.Series(clf.feature_importances_, index=Xtr.columns, name="split").sort_values(ascending=False)

        meta = {
            "encoder": "native(LGBM)",
            "best_iteration": getattr(clf, "best_iteration_", None),
            "n_trees": getattr(clf, "n_estimators_", None),
            "fit_time_s": float(fit_time_s),
            "predict_time_ms_per_1k": float(pred_ms_per_1k),
            "model_obj": clf
        }

    else:  # xgb
        spw = scale_pos_weight(y_tr) if IMB == "spw" else 1.0
        # Note: this timing also covers DMatrix construction and the validation/test
        # predictions performed inside xgb_train_predict.
        t0 = time.perf_counter()
        booster, _, proba, fi, xmeta = xgb_train_predict(
            X_tr, y_tr, X_val, y_val, Xte, params={**(params or {}), "scale_pos_weight": spw}
        )
        fit_time_s = time.perf_counter() - t0
        pred_ms_per_1k = xgb_predict_time_ms_per_1k(booster, Xte, xmeta)

        # `best_ntree_limit` no longer exists on XGBoost >= 2.0, so the summary
        # column was always empty there. Use it when present (older releases) and
        # otherwise derive the count from the 0-based best iteration, falling back
        # to the total number of boosted rounds.
        best_it = getattr(booster, "best_iteration", None)
        n_trees = getattr(booster, "best_ntree_limit", None)
        if n_trees is None:
            n_trees = int(best_it) + 1 if best_it is not None else int(booster.num_boosted_rounds())

        meta = {
            "encoder": "native(XGB)" if not xmeta["use_ohe"] else "OHE(XGB-fallback)",
            "best_iteration": best_it,
            "n_trees": n_trees,
            "fit_time_s": float(fit_time_s),
            "predict_time_ms_per_1k": float(pred_ms_per_1k),
            "model_obj": booster
        }

    hold = dict(
        pr_auc=float(average_precision_score(yte, proba)),
        roc_auc=float(roc_auc_score(yte, proba)),
        brier=float(brier_score_loss(yte, proba))
    )
    return proba, hold, fi, meta

# ---------- Plots ----------
def save_pr_curve(y_true, proba, out_path):
    """Plot the precision-recall curve (legend shows average precision) and save it as PNG."""
    import matplotlib.pyplot as plt

    precision, recall, _ = precision_recall_curve(y_true, proba)
    avg_precision = average_precision_score(y_true, proba)

    fig, ax = plt.subplots(figsize=(7,5))
    ax.plot(recall, precision, label=f'AP={avg_precision:.4f}')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall')
    ax.set_xlim([0,1])
    ax.set_ylim([0,1])
    ax.grid(True, alpha=0.3)
    ax.legend()
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)  # release the figure so repeated calls do not accumulate open figures

def save_calibration(y_true, proba, out_path):
    """Plot a 20-bin quantile calibration curve against the diagonal and save it as PNG."""
    import matplotlib.pyplot as plt

    observed, predicted = calibration_curve(y_true, proba, n_bins=20, strategy="quantile")

    fig, ax = plt.subplots(figsize=(6,6))
    ax.plot([0,1],[0,1],'--',label='Perfect')
    ax.plot(predicted, observed, marker='o', label='Model')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Observed')
    ax.set_title('Calibration')
    ax.grid(True, alpha=0.3)
    ax.legend()
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

def save_top20_importance(fi: pd.Series, out_path):
    """Save a horizontal bar chart of the first 20 entries of ``fi``.

    Does nothing when ``fi`` is ``None`` or empty. The Series is expected to be
    sorted descending already; the slice is reversed so the largest bar is on top.
    """
    import matplotlib.pyplot as plt

    if fi is None or fi.empty:
        return

    leaders = fi.head(20).iloc[::-1]
    fig, ax = plt.subplots(figsize=(8,6))
    ax.barh(leaders.index, leaders.values)
    ax.set_xlabel('Gain')
    ax.set_title('Top-20 Feature Importance')
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

# ---------- Param sampler ----------
def sample_params(model_name, n):
    rng = np.random.default_rng(RND); out = []
    if model_name == "lgbm":
        for _ in range(n):
            out.append(dict(
                learning_rate=LR,
                num_leaves=int(rng.integers(64, 256)),
                max_depth=int(rng.integers(-1, 12)),
                min_child_samples=int(rng.integers(10, 80)),
                subsample=float(rng.uniform(0.7, 1.0)),
                colsample_bytree=float(rng.uniform(0.7, 1.0)),
                reg_lambda=float(rng.uniform(0.0, 2.0)),
                reg_alpha=float(rng.uniform(0.0, 1.0)),
                is_unbalance=(IMB=="iso"),
                max_bin=511, feature_pre_filter=False
            ))
    elif model_name == "xgb":
        for _ in range(n):
            out.append(dict(
                learning_rate=LR, tree_method="hist",
                max_depth=int(rng.integers(3, 10)),
                min_child_weight=float(rng.uniform(1.0, 8.0)),
                subsample=float(rng.uniform(0.7, 1.0)),
                colsample_bytree=float(rng.uniform(0.7, 1.0)),
                reg_lambda=float(rng.uniform(0.0, 2.0)),
                reg_alpha=float(rng.uniform(0.0, 1.0)),
                gamma=float(rng.uniform(0.0, 2.0)),
                scale_pos_weight=1.0  # set per-fold if IMB=="spw"
            ))
    return out

# ---------- Main ----------
def main():
    """Run the full pipeline: OOF baselines, small random search, final fit, reports.

    Reads the shared split and feature list from ``REPORTS_IN`` and writes OOF
    predictions, baseline/tuning/holdout CSVs, plots and one appended row of
    ``team_model_summary.csv`` to ``REPORTS_OUT``.

    Raises:
        AssertionError: the split or feature-list input file is missing.
        RuntimeError: none of the requested model libraries can be imported.
    """
    print(f"[SETUP] SPEED={SPEED} CV={CV} N_EST={N_EST} EARLY_STOP={ESR} MODELS={MODELS} IMB={IMB}")

    split_p = REPORTS_IN / "split_indices.json"
    feats_p = REPORTS_IN / "features_selected.csv"
    assert split_p.exists() and feats_p.exists(), "Missing split and/or features files in reports."

    split = json.loads(split_p.read_text())
    selected = load_selected_feature_list()

    # -1 is replaced by NaN in the whole frame before splitting.
    df = load_and_save_data().replace(-1, np.nan)
    X_tr_all = df.loc[split["train"]].drop(columns=["target"])
    y_tr     = df.loc[split["train"], "target"].astype(int)
    X_te_all = df.loc[split["test"]].drop(columns=["target"])
    y_te     = df.loc[split["test"], "target"].astype(int)

    Xtr, cat_cols = prep_for_trees(X_tr_all, selected)
    Xte, _        = prep_for_trees(X_te_all, selected)

    # Train and test were cast to `category` independently, so the same value can
    # receive a different integer code in each frame whenever the observed category
    # sets differ. Re-encode the test columns with the training categories so both
    # frames share one encoding; values unseen in training become NaN.
    for c in cat_cols:
        if c in Xte.columns and isinstance(Xtr[c].dtype, pd.CategoricalDtype):
            Xte[c] = Xte[c].astype(Xtr[c].dtype)

    available = []
    for m in MODELS:
        if m == "lgbm":
            try: import lightgbm  # noqa
            except Exception: print("[WARN] LightGBM not available. Skipping.")
            else: available.append("lgbm")
        elif m == "xgb":
            try: import xgboost  # noqa
            except Exception: print("[WARN] XGBoost not available. Skipping.")
            else: available.append("xgb")
    if not available:
        raise RuntimeError("No models available.")

    # OOF baselines
    baselines = {}
    for m in available:
        res = oof_cv(m, {}, Xtr, y_tr, cat_cols)
        baselines[m] = {k: float(v) for k, v in res.items() if k in ("pr_auc","roc_auc","brier")}
        pd.DataFrame({"oof": res["oof"]}).to_csv(REPORTS_OUT/f"oof_{m}.csv", index=False)
    pd.DataFrame.from_dict(baselines, orient="index").reset_index().rename(
        columns={"index":"model"}).to_csv(REPORTS_OUT/"baselines_summary.csv", index=False)

    # pick best by PR-AUC
    best_model = max(baselines.items(), key=lambda kv: kv[1]["pr_auc"])[0]

    # tiny random search around best
    n_iters = 4 if SPEED == "FAST" else (8 if SPEED == "MEDIUM" else 20)

    # Start from the default-parameter baseline (scored on the same folds) instead of
    # -1.0: a sampled configuration now only replaces the defaults when it actually
    # beats them. The baseline is logged as a row with empty params so the tuning
    # log's best row always matches the configuration used for the final fit.
    base_scores = baselines[best_model]
    tuning_rows = [{
        "model": best_model, "pr_auc": base_scores["pr_auc"], "roc_auc": base_scores["roc_auc"],
        "brier": base_scores["brier"], "params": json.dumps({})
    }]
    best_cv = {"score": float(base_scores["pr_auc"]), "model": best_model, "params": {}}
    for params in sample_params(best_model, n_iters):
        res = oof_cv(best_model, params, Xtr, y_tr, cat_cols)
        tuning_rows.append({
            "model": best_model, "pr_auc": res["pr_auc"], "roc_auc": res["roc_auc"],
            "brier": res["brier"], "params": json.dumps(params)
        })
        if res["pr_auc"] > best_cv["score"]:
            best_cv = {"score": float(res["pr_auc"]), "model": best_model, "params": params}

    pd.DataFrame(tuning_rows).to_csv(REPORTS_OUT/"tuning_log.csv", index=False)

    best_params = best_cv["params"]

    # final fit + holdout
    proba, hold, fi, meta = fit_final(best_model, best_params, Xtr, y_tr, Xte, y_te, cat_cols)

    # save predictions, metrics, FI, plots
    pd.DataFrame({"proba": proba, "y_true": y_te.values}).to_csv(REPORTS_OUT/"holdout_preds.csv", index=False)
    pd.DataFrame([{
        "model": best_model, "params": json.dumps(best_params),
        "pr_auc": hold["pr_auc"], "roc_auc": hold["roc_auc"], "brier": hold["brier"]
    }]).to_csv(REPORTS_OUT/"holdout_metrics.csv", index=False)

    if fi is not None and not fi.empty:
        fi.reset_index().rename(columns={"index":"feature"}).to_csv(REPORTS_OUT/"fi_gain.csv", index=False)

    save_pr_curve(y_te.values, proba, REPORTS_OUT/"plot_pr.png")
    save_calibration(y_te.values, proba, REPORTS_OUT/"plot_calibration.png")
    save_top20_importance(fi, REPORTS_OUT/"plot_fi_top20.png")

    # team summary row
    row = {
        "member": MEMBER,
        "model_name": best_model.upper(),
        "encoder": meta["encoder"],
        "split_path": str(split_p),
        "feature_recipe": "selected_from_feature_gate",
        "seed": RND, "cv_folds": CV,
        "hold_auc": hold["roc_auc"], "hold_ap": hold["pr_auc"], "hold_brier": hold["brier"],
        "cv_auc_mean": None, "cv_ap_mean": None,
        "early_stopping": True,
        "best_iteration": meta["best_iteration"],
        "n_trees": meta["n_trees"],
        "fit_time_s": meta["fit_time_s"],
        "predict_time_ms_per_1k": meta["predict_time_ms_per_1k"],
        "params_json": json.dumps(best_params),
    }
    # Appended on every run; the header is written only when the file is new.
    out_csv = REPORTS_OUT/"team_model_summary.csv"
    pd.DataFrame([row]).to_csv(out_csv, mode="a", index=False, header=not out_csv.exists())

    print("\n[BASELINES]", json.dumps(baselines, indent=2))
    if best_cv["model"]:
        print(f"[CV] tuned={best_model}  PR-AUC={best_cv['score']:.5f}")
    print(f"[HOLDOUT] PR-AUC={hold['pr_auc']:.5f}  ROC-AUC={hold['roc_auc']:.5f}  Brier={hold['brier']:.5f}")
    print(f"Saved to: {REPORTS_OUT}")

# Entry point: run the pipeline and turn any failure into a one-line message plus
# exit status 1 (no traceback is printed).
if __name__ == "__main__":
    try:
        main()
    except Exception as err:
        print("ERROR:", type(err).__name__, "-", err)
        sys.exit(1)


[SETUP] SPEED=MEDIUM CV=5 N_EST=4000 EARLY_STOP=100 MODELS=['lgbm', 'xgb'] IMB=spw
Loading dataset from local file: /Users/lucasbeseler/ada_portoSeguro/data/raw/porto_seguro_safe_driver_prediction.csv
Dataset loaded successfully.
[LightGBM] [Info] Number of positive: 5853, number of negative: 154147
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006730 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1884
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036581 -> initscore=-3.270952
[LightGBM] [Info] Start training from score -3.270952
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[57]	valid_0's auc: 0.613508	valid_0's binary_logloss: 0.153924
[LightGBM] [Info] Number of positive: 5853, number

In [None]:
# ===== Logbuch 7: Feature recipe evaluation (table + bar chart) =====

import json, os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- helpers specific to this block ---
def build_selected_for_recipe(recipe: str, X_cols):
    """Return the feature list for a named feature recipe.

    Args:
        recipe: one of ``"all_features"``, ``"drop_calc_only"``,
            ``"drop_calc+extras"``, ``"drop_calc+opt+extras"``; any other value
            behaves like ``"all_features"``.
        X_cols: available column labels.

    Returns:
        list of feature names (never containing ``"target"``).
    """
    without_calc = recipe in ("drop_calc_only", "drop_calc+extras", "drop_calc+opt+extras")
    with_extras = recipe in ("drop_calc+extras", "drop_calc+opt+extras")
    without_optional = recipe == "drop_calc+opt+extras"

    chosen = [col for col in X_cols if col != "target"]
    if without_calc:
        chosen = [col for col in chosen if not str(col).startswith("ps_calc_")]
    if with_extras:
        # fe_extras() will create these if selected
        chosen.extend(["missing_count", "sum_all_bin"])
    if without_optional:
        optional = ("ps_ind_14", "ps_car_10_cat")
        chosen = [col for col in chosen if col not in optional]
    return chosen

def pick_best_model_and_params(Xtr, y_tr, cat_cols):
    """Choose the model (and params) used for the feature-recipe comparison.

    Preference order:
      1. the row with the highest ``pr_auc`` in ``REPORTS_OUT/tuning_log.csv``;
      2. otherwise the available model with the best default-parameter OOF
         PR-AUC, recomputed via ``oof_cv`` (returned with empty params).

    A tuning log that is unreadable as CSV, has no rows, lacks the ``model`` /
    ``pr_auc`` columns or contains no numeric ``pr_auc`` now falls through to
    step 2 instead of raising.

    Args:
        Xtr, y_tr, cat_cols: training data, only used by the baseline fallback.

    Returns:
        ``(model_name, params_dict)``.

    Raises:
        RuntimeError: the fallback is needed but no model library is importable.
    """
    # Try to use tuning_log if available; else fall back to baseline best by PR-AUC.
    tl = REPORTS_OUT / "tuning_log.csv"
    if tl.exists() and tl.stat().st_size > 0:
        try:
            df_tl = pd.read_csv(tl)
        except pd.errors.EmptyDataError:
            df_tl = pd.DataFrame()
        if {"model", "pr_auc"} <= set(df_tl.columns):
            scores = pd.to_numeric(df_tl["pr_auc"], errors="coerce").dropna()
            if not scores.empty:
                # idxmax() returns an index *label*, so the row is fetched with .loc
                best = df_tl.loc[scores.idxmax()]
                model = str(best["model"])
                try:
                    params = json.loads(best["params"])
                except Exception:
                    params = {}
                if not isinstance(params, dict):
                    params = {}
                return model, params

    # Baseline fallback (recompute quickly with default params)
    available = []
    for m in MODELS:
        if m == "lgbm":
            try: import lightgbm  # noqa
            except Exception: pass
            else: available.append("lgbm")
        elif m == "xgb":
            try: import xgboost  # noqa
            except Exception: pass
            else: available.append("xgb")
    if not available:
        raise RuntimeError("No models available for feature-recipe eval.")

    scores = {}
    for m in available:
        res = oof_cv(m, {}, Xtr, y_tr, cat_cols)
        scores[m] = res["pr_auc"]
    model = max(scores.items(), key=lambda kv: kv[1])[0]
    return model, {}

# --- data (reload to be independent of main()) ---
# Relies on names defined by the first cell (REPORTS_IN, REPORTS_OUT, CV, RND,
# load_and_save_data, load_selected_feature_list, prep_for_trees, oof_cv).
split_p = REPORTS_IN / "split_indices.json"
feats_p = REPORTS_IN / "features_selected.csv"
assert split_p.exists() and feats_p.exists(), "Missing split/features input."

split = json.loads(split_p.read_text())
df = load_and_save_data().replace(-1, np.nan)

# Only the training part of the split is used in this cell.
X_tr_all = df.loc[split["train"]].drop(columns=["target"])
y_tr     = df.loc[split["train"], "target"].astype(int)

# Use current selected list to encode cats consistently
# (this matrix is only handed to pick_best_model_and_params for its baseline fallback)
selected_current = load_selected_feature_list()
Xtr_tmp, cat_cols_tmp = prep_for_trees(X_tr_all, selected_current)

# --- choose best model/params (from tuning_log if present) ---
best_model, best_params = pick_best_model_and_params(Xtr_tmp, y_tr, cat_cols_tmp)

# --- run the four recipes with identical model/params ---
recipes = [
    "all_features",
    "drop_calc_only",
    "drop_calc+extras",
    "drop_calc+opt+extras",
]
rows = []
for r in recipes:
    # Each recipe gets its own feature list and freshly prepared matrix; the model,
    # its parameters, the folds (CV) and the seed (RND) stay fixed across recipes.
    selected = build_selected_for_recipe(r, X_tr_all.columns)
    Xr, cat_cols = prep_for_trees(X_tr_all, selected)
    res = oof_cv(best_model, best_params or {}, Xr, y_tr, cat_cols)
    rows.append({
        "feature_set": r,
        "pr_auc": float(res["pr_auc"]),
        "roc_auc": float(res["roc_auc"]),
        "brier": float(res["brier"]),
        "n_features": int(Xr.shape[1]),
        "model": best_model.upper(),
        "params_json": json.dumps(best_params or {}),
        "cv_folds": int(CV),
        "seed": int(RND),
    })

df_out = pd.DataFrame(rows)
csv_path = REPORTS_OUT / "feature_recipes_cv.csv"
df_out.to_csv(csv_path, index=False)

# --- bar chart (ROC-AUC) ---
# NOTE(review): the y-axis label says "CV Mean", but oof_cv computes each metric once
# over the pooled out-of-fold predictions rather than averaging per-fold scores.
# `order` repeats the `recipes` list above to fix the bar order.
order = ["all_features","drop_calc_only","drop_calc+extras","drop_calc+opt+extras"]
dfp = df_out.set_index("feature_set").loc[order].reset_index()
plt.figure(figsize=(7,5))
plt.bar(dfp["feature_set"], dfp["roc_auc"])
plt.ylabel("ROC AUC (CV Mean)")
plt.title("Feature-set comparison (ROC AUC)")
plt.xticks(rotation=20)
plt.tight_layout()
png_path = REPORTS_OUT / "feature_recipes_auc.png"
plt.savefig(png_path, dpi=150)
plt.close()

# --- console summary ---
print("\n[FEATURE-RECIPES] Model:", best_model.upper())
print("[FEATURE-RECIPES] Params:", json.dumps(best_params or {}))
print("[FEATURE-RECIPES] Saved CSV:", csv_path)
print("[FEATURE-RECIPES] Saved Plot:", png_path)
print("\nTable preview:\n", df_out[["feature_set","n_features","roc_auc","pr_auc","brier"]])


Loading dataset from in-memory cache.
[LightGBM] [Info] Number of positive: 5853, number of negative: 154147
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008341 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2057
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036581 -> initscore=-3.270952
[LightGBM] [Info] Start training from score -3.270952
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[109]	valid_0's auc: 0.616041	valid_0's binary_logloss: 0.153724
[LightGBM] [Info] Number of positive: 5853, number of negative: 154147
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if m

In [10]:
# ===== Robust cleaner for team_model_summary.csv (standalone, fixed) =====
import os, json
from pathlib import Path
import pandas as pd
import numpy as np

# ---------- locate paths ----------
def find_reports_out():
    """Locate (or create) the personal output directory ``reports_Lucas``.

    Resolution order:
      1. ``$REPORTS_OUT`` when set (created if missing);
      2. an existing ``reports_Lucas`` in the working directory or one of its
         first five ancestors;
      3. ``reports_Lucas`` under an ancestor named ``ada_portoSeguro`` (searched
         among the working directory and its first seven ancestors), created if
         missing;
      4. ``reports_Lucas`` under the working directory, created if missing.
    """
    def ensure_dir(path):
        path.mkdir(parents=True, exist_ok=True)
        return path

    override = os.getenv("REPORTS_OUT")
    if override:
        return ensure_dir(Path(override))

    cwd = Path.cwd()
    lineage = [cwd, *cwd.parents]

    for folder in lineage[:6]:
        existing = folder / "reports_Lucas"
        if existing.exists():
            return existing

    project_root = next((folder for folder in lineage[:8] if folder.name == "ada_portoSeguro"), None)
    if project_root is not None:
        return ensure_dir(project_root / "reports_Lucas")

    return ensure_dir(cwd / "reports_Lucas")

def find_reports_in(reports_out: Path):
    """Locate the shared ``reports`` directory relative to the output directory.

    Looks for an existing ``reports`` folder next to ``reports_out`` and then in up
    to seven further ancestors; when none exists, returns the sibling path
    ``reports_out.parent / "reports"`` without creating it.
    """
    anchor = reports_out.parent
    # The first candidate is the sibling of reports_out, so no separate check is needed.
    for folder in [anchor, *anchor.parents][:8]:
        shared = folder / "reports"
        if shared.exists():
            return shared
    return anchor / "reports"

# Resolve directories locally so this cell can run without the training cell.
# This rebinds the notebook-level REPORTS_OUT / REPORTS_IN names.
REPORTS_OUT = find_reports_out()
REPORTS_IN  = find_reports_in(REPORTS_OUT)
CSV_PATH    = REPORTS_OUT / "team_model_summary.csv"
METRICS_CSV = REPORTS_OUT / "holdout_metrics.csv"
TUNING_CSV  = REPORTS_OUT / "tuning_log.csv"
SPLIT_JSON  = REPORTS_IN  / "split_indices.json"

print("[PATHS]")
print("  REPORTS_OUT:", REPORTS_OUT)
print("  REPORTS_IN :", REPORTS_IN)
print("  team_model_summary.csv:", CSV_PATH)

# ---------- init from holdout_metrics if missing ----------
# When no summary exists yet, build a one-row summary from the first row of
# holdout_metrics.csv. Fields that file does not carry (CV means, best iteration,
# tree count, timings) are left as NaN.
if not CSV_PATH.exists():
    if not METRICS_CSV.exists():
        raise FileNotFoundError(
            f"Not found: {CSV_PATH}\n"
            f"Also missing holdout metrics at {METRICS_CSV}. "
            "Run your main training script first."
        )
    m = pd.read_csv(METRICS_CSV).iloc[0].to_dict()
    model_name  = str(m.get("model", "")).upper() or "LGBM"
    params_json = m.get("params", m.get("params_json", "{}"))
    # Keep the params string only if it parses as JSON; anything else becomes "{}".
    try: json.loads(params_json)
    except Exception: params_json = "{}"
    row = {
        "member": os.getenv("MEMBER", "Lucas"),
        "model_name": model_name,
        # The encoder label is inferred from the model name only.
        "encoder": "native(LGBM)" if "LGBM" in model_name else ("native(XGB)" if "XGB" in model_name else ""),
        "split_path": str(SPLIT_JSON) if SPLIT_JSON.exists() else "",
        # NOTE(review): this default differs from the "selected_from_feature_gate"
        # label written by main(); confirm which recipe name is intended here.
        "feature_recipe": os.getenv("FEATURE_RECIPE", "drop_calc+opt+extras"),
        "seed": int(os.getenv("RND", "42")),
        "cv_folds": int(os.getenv("CV", "5")),
        "hold_auc": float(m.get("roc_auc", np.nan)),
        "hold_ap": float(m.get("pr_auc", np.nan)),
        "hold_brier": float(m.get("brier", np.nan)),
        "cv_auc_mean": np.nan,
        "cv_ap_mean": np.nan,
        "early_stopping": True,
        "best_iteration": np.nan,
        "n_trees": np.nan,
        "fit_time_s": np.nan,
        "predict_time_ms_per_1k": np.nan,
        "params_json": params_json,
    }
    pd.DataFrame([row]).to_csv(CSV_PATH, index=False)
    print("[INIT] Created team_model_summary.csv from holdout_metrics.csv")

# ---------- load & normalize ----------
df = pd.read_csv(CSV_PATH)

# Coerce numeric columns; unparseable entries become NaN instead of raising.
num_cols = ["seed","cv_folds","hold_auc","hold_ap","hold_brier",
            "cv_auc_mean","cv_ap_mean","best_iteration","n_trees",
            "fit_time_s","predict_time_ms_per_1k"]
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# Missing/empty model names default to "LGBM".
if "model_name" in df.columns:
    mask_empty = df["model_name"].isna() | (df["model_name"].astype(str).str.len()==0)
    df.loc[mask_empty, "model_name"] = "LGBM"
# Missing/empty encoder labels are inferred from the model name.
if "encoder" in df.columns:
    enc_empty = df["encoder"].isna() | (df["encoder"].astype(str).str.len()==0)
    if enc_empty.any():
        # Without a model_name column every row is treated as LGBM, matching the
        # default used for empty model names above.
        if "model_name" in df.columns:
            model_names = df["model_name"].astype(str)
        else:
            model_names = pd.Series("", index=df.index)
        infer = np.where(model_names.str.contains("XGB", case=False),
                         "native(XGB)", "native(LGBM)")
        # An all-NaN column is read as float; make it object before writing strings.
        df["encoder"] = df["encoder"].astype(object)
        # `infer` has one entry per row of df, so it must be filtered with the same
        # mask: assigning the full-length array to a partial selection raises a
        # length-mismatch ValueError.
        df.loc[enc_empty, "encoder"] = infer[enc_empty.to_numpy()]

# ---------- consolidate duplicates ----------
# Columns that identify a logical run; they are the grouping keys for consolidation.
req_cols = ["member","split_path","feature_recipe"]
for k in req_cols:
    if k not in df.columns:
        raise ValueError(f"Missing column '{k}' in {CSV_PATH.name}")

def consolidate_group(g: pd.DataFrame) -> pd.Series:
    """Collapse the duplicate summary rows of one group into a single row.

    The winning row is the one with the most filled meta fields; ties are broken
    by higher ``hold_auc`` and then by larger index value. Fields that are missing
    in the winner are filled with the last non-null value found in the group.
    """
    tracked = ("best_iteration", "n_trees", "fit_time_s", "predict_time_ms_per_1k")
    present = [name for name in tracked if name in g.columns]

    if present:
        completeness = g[present].notna().sum(axis=1)
    else:
        completeness = pd.Series(0, index=g.index)

    if "hold_auc" in g.columns:
        auc_key = g["hold_auc"].fillna(-np.inf)
    else:
        auc_key = pd.Series(-np.inf, index=g.index)

    # np.lexsort treats its LAST key as the primary one, so the keys are listed
    # from least to most significant: index value, AUC, completeness.
    ranking = np.lexsort((g.index.to_numpy(), auc_key.to_numpy(), completeness.to_numpy()))
    winner_pos = ranking[-1]  # ascending sort: the preferred row comes last
    best = g.iloc[winner_pos].copy()

    # backfill missing fields from other rows in group
    for column in g.columns:
        if not pd.isna(best.get(column)):
            continue
        known = g[column].dropna()
        if len(known) > 0:
            best[column] = known.iloc[-1]
    return best

# One consolidated row per (member, split_path, feature_recipe); dropna=False keeps
# groups whose key contains NaN and sort=False keeps first-appearance group order.
# NOTE(review): consolidate_group reads the grouping columns from each group frame;
# verify this apply() call still passes them on the pandas version in use.
out = (
    df.groupby(req_cols, dropna=False, as_index=False, sort=False, group_keys=False)
      .apply(consolidate_group)
      .reset_index(drop=True)
)

# Stable sort so rows with equal sort keys keep their consolidated order.
sort_cols = [c for c in ["member","feature_recipe","cv_folds"] if c in out.columns]
if sort_cols:
    out = out.sort_values(sort_cols, kind="stable").reset_index(drop=True)

# Overwrites the summary file in place with the cleaned table.
out.to_csv(CSV_PATH, index=False)

# ---------- preview ----------
def one_liner(x):
    """Format one summary row (Series or dict) as a single preview line.

    Missing or NaN numeric fields no longer raise: ``cv_folds`` is shown as 0
    (the same value used when the key is absent) and the metrics as ``nan``.
    """
    def as_float(key):
        value = x.get(key, np.nan)
        return float("nan") if pd.isna(value) else float(value)

    folds = x.get("cv_folds", 0)
    # int(NaN) raises ValueError; NaN occurs when the column was coerced or backfill found nothing.
    folds = 0 if pd.isna(folds) else int(folds)

    return (
        f'{x.get("member","")} | {x.get("model_name","")} | {x.get("encoder","")} | '
        f'{x.get("feature_recipe","")} | folds={folds} | '
        f'ROC-AUC={as_float("hold_auc"):.6f} | '
        f'PR-AUC={as_float("hold_ap"):.6f} | '
        f'Brier={as_float("hold_brier"):.6f}'
    )

# Print one preview line per consolidated row of the cleaned summary.
print("\n[CLEANED] ->", CSV_PATH)
for _, r in out.iterrows():
    print("  ", one_liner(r))


[PATHS]
  REPORTS_OUT: /Users/lucasbeseler/ada_portoSeguro/reports_Lucas
  REPORTS_IN : /Users/lucasbeseler/ada_portoSeguro/reports
  team_model_summary.csv: /Users/lucasbeseler/ada_portoSeguro/reports_Lucas/team_model_summary.csv

[CLEANED] -> /Users/lucasbeseler/ada_portoSeguro/reports_Lucas/team_model_summary.csv
   Lucas | LGBM | native(LGBM) | drop_calc+opt+extras | folds=5 | ROC-AUC=0.632201 | PR-AUC=0.066425 | Brier=0.034880
   Lucas | LGBM | native(LGBM) | selected_from_feature_gate | folds=5 | ROC-AUC=0.632201 | PR-AUC=0.066425 | Brier=0.034880


In [11]:
import platform
import importlib

# Collect the installed version of each library used by the notebook; packages that
# cannot be imported (or expose no __version__) are reported as "n/a".
pkgs = ["numpy","pandas","sklearn","lightgbm","xgboost"]
vers = {}
for p in pkgs:
    try:
        m = importlib.import_module(p)
        v = getattr(m, "__version__", "n/a")
    except Exception:
        v = "n/a"
    vers[p] = v

PY = platform.python_version()
# Report the seed / fold count the kernel actually holds (RND and CV, when a
# configuration cell defined them) instead of hard-coded values. In particular the
# global CV is no longer forced back to 5, which would silently change the fold
# count of any CV run executed after this cell. The literals are only defaults for
# a kernel in which neither name exists yet.
SEED = int(globals().get("RND", 42))
CV = int(globals().get("CV", 5))

print("**Tools & Reproduzierbarkeit.**")
print(f"Umgebung: Python {PY}, NumPy {vers['numpy']}, pandas {vers['pandas']}, "
      f"scikit-learn {vers['sklearn']}, LightGBM {vers['lightgbm']}, XGBoost {vers['xgboost']}. "
      f"Läufe reproduzierbar (Seed = {SEED}), CV = {CV}, fixe Splits über `split_indices.json`. "
      "Code, Pfade und Ergebnisse (CSV/PNG) sind im Repository dokumentiert.")


**Tools & Reproduzierbarkeit.**
Umgebung: Python 3.13.3, NumPy 2.3.2, pandas 2.3.1, scikit-learn 1.7.1, LightGBM 4.6.0, XGBoost 3.0.4. Läufe reproduzierbar (Seed = 42), CV = 5, fixe Splits über `split_indices.json`. Code, Pfade und Ergebnisse (CSV/PNG) sind im Repository dokumentiert.
