In [13]:
#!/usr/bin/env python3
# Final, klein & schnell: Kaggle-Preset (calc drop + Extras) + LogReg L2 + Summary-Text

import os, sys, json, warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from pathlib import Path

# --- Resolve the project ROOT so it works both as a script and in a notebook
if "__file__" not in globals():
    # No __file__ (e.g. interactive/notebook session): fall back to the
    # working directory, stepping up one level when it is named "notebooks".
    CWD = Path.cwd()
    ROOT = CWD if CWD.name != "notebooks" else CWD.parent
else:
    # Script run: ROOT is the directory two levels above this file.
    ROOT = Path(__file__).resolve().parents[1]
# Put ROOT first on the import path so the `src` package below is importable.
sys.path.insert(0, str(ROOT))

from src.data_loader import load_and_save_data

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import joblib

# Shared random seed: used for the optional row sample, the logistic
# regression and the stratified K-fold shuffle in main().
RND = 42

def ohe_fallback():
    """Build a dense, unknown-tolerant ``OneHotEncoder`` for any scikit-learn version.

    Keyword sets are tried from the newest API to the oldest and the first
    one the installed ``OneHotEncoder`` accepts is returned:

    1. ``sparse_output=False, min_frequency=0.01``
    2. ``sparse=False, min_frequency=0.01``
    3. ``sparse_output=False``
    4. ``sparse=False``

    Candidate 2 covers releases that already know ``min_frequency`` but still
    use the old ``sparse`` keyword (scikit-learn 1.1.x); without it the
    rare-category grouping would be silently lost on those versions.

    Returns:
        An unfitted ``OneHotEncoder`` with ``handle_unknown="ignore"`` and
        dense output.

    Raises:
        TypeError: if even the oldest keyword set is rejected.
    """
    candidates = (
        {"sparse_output": False, "min_frequency": 0.01},
        {"sparse": False, "min_frequency": 0.01},
        {"sparse_output": False},
    )
    for kwargs in candidates:
        try:
            return OneHotEncoder(handle_unknown="ignore", **kwargs)
        except TypeError:
            # Unknown keyword for this scikit-learn version: try the next set.
            continue
    # Last resort; a TypeError here is allowed to propagate to the caller.
    return OneHotEncoder(handle_unknown="ignore", sparse=False)

def split_cols(cols):
    """Partition column names into categorical, binary and numeric groups.

    Names ending in ``_cat`` are categorical, names ending in ``_bin`` are
    binary, and every remaining name except ``"target"`` is numeric. The
    input order is preserved inside each group.

    The input is walked exactly once, so one-shot iterables (generators,
    iterators) are handled correctly and no quadratic list-membership tests
    are needed.

    Args:
        cols: Iterable of column-name strings.

    Returns:
        Tuple ``(cat, bin_, num)`` of three lists of names.
    """
    cat, bin_, num = [], [], []
    for name in cols:
        if name.endswith("_cat"):
            cat.append(name)
        elif name.endswith("_bin"):
            bin_.append(name)
        elif name != "target":
            num.append(name)
    return cat, bin_, num

def fe_simple(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *X* extended with two row-wise helper features.

    * ``missing_count``: number of NaN cells in each row.
    * ``sum_all_bin``: row sum over every column whose name ends in
      ``_bin``; only added when at least one such column exists.

    The frame passed in is not modified; all work happens on a copy.
    """
    enriched = X.copy()
    enriched["missing_count"] = enriched.isna().sum(axis=1)

    flag_cols = [name for name in enriched.columns if name.endswith("_bin")]
    if not flag_cols:
        return enriched

    enriched["sum_all_bin"] = enriched[flag_cols].sum(axis=1)
    return enriched

def build_pre(cat, bin_, num):
    """Assemble the ``ColumnTransformer`` that preprocesses the three column groups.

    Args:
        cat: Categorical columns; imputed with the most frequent value, then
            one-hot encoded via ``ohe_fallback()``.
        bin_: Binary columns; imputed with the most frequent value only.
        num: Numeric columns; median-imputed, then standard-scaled.

    Returns:
        An unfitted ``ColumnTransformer``; columns not listed in any group
        are dropped (``remainder="drop"``).
    """
    categorical_steps = [
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", ohe_fallback()),
    ]
    numeric_steps = [
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
    ]
    binary_steps = [("imp", SimpleImputer(strategy="most_frequent"))]

    transformers = [
        ("cat", Pipeline(categorical_steps), cat),
        ("bin", Pipeline(binary_steps), bin_),
        ("num", Pipeline(numeric_steps), num),
    ]
    return ColumnTransformer(transformers, remainder="drop")

def main():
    """Train, cross-validate and persist the "Kaggle preset" logistic regression.

    Steps:
        1. Load the data via ``load_and_save_data`` and replace ``-1`` by NaN.
        2. Drop all ``ps_calc_*`` columns plus ``ps_ind_14`` /
           ``ps_car_10_cat`` when present.
        3. Add the ``fe_simple`` features and optionally subsample rows.
        4. Compute an out-of-fold ROC-AUC with a shuffled stratified K-fold.
        5. Refit on all rows and write the model, the raw feature list and a
           JSON summary below ``ROOT``; finally print a German summary.

    Environment variables (all optional):
        CV: number of stratified folds (default ``3``).
        C: ``C`` parameter of the L2 logistic regression (default ``1.0``).
        TRAIN_SAMPLE_N: number of rows to sample, e.g. ``200000``;
            ``0`` uses every row (default ``0``).

    Raises:
        AssertionError: if a ``ps_calc_*`` column is still present after the
            drop step.
    """
    # Simple optional configuration through environment variables.
    CV = int(os.getenv("CV", "3"))             # 3 folds: fast and stable
    C  = float(os.getenv("C", "1.0"))          # L2 regularisation setting
    N  = int(os.getenv("TRAIN_SAMPLE_N", "0")) # 0 = all rows

    reports = ROOT / "reports"
    reports.mkdir(parents=True, exist_ok=True)
    models = ROOT / "models"
    models.mkdir(parents=True, exist_ok=True)

    # Each artefact path is defined exactly once so that the files written,
    # the JSON metadata and the printed summary cannot drift apart.
    model_path = models / "kagglepreset_logreg_l2.joblib"
    feat_path = reports / "selected_features_kagglepreset.csv"
    decision_path = reports / "kagglepreset_decision.json"

    # a) Load and prepare the data (-1 is treated as the missing marker).
    df = load_and_save_data()
    df = df.replace(-1, np.nan)
    y  = df["target"].astype(int)
    X  = df.drop(columns=["target"])
    # NOTE(review): any other non-feature column returned by the loader
    # (e.g. a row id) would end up in the numeric group - confirm the loader
    # does not return one.

    # Drop ps_calc_* plus the optional extra columns.
    calc_cols = [c for c in X.columns if c.startswith("ps_calc_")]
    extra_drop = [c for c in ["ps_ind_14", "ps_car_10_cat"] if c in X.columns]
    X = X.drop(columns=calc_cols + extra_drop, errors="ignore")

    # Engineered features; record what was really added instead of assuming
    # it (sum_all_bin is only created when *_bin columns exist).
    cols_before = set(X.columns)
    X = fe_simple(X)
    extras_added = [c for c in X.columns if c not in cols_before]

    # Optional row sample; y is aligned through the sampled index.
    if N:
        X = X.sample(min(N, len(X)), random_state=RND)
        y = y.loc[X.index]

    # Guard: no calc columns may be left. An explicit raise is used because
    # an `assert` statement is stripped when Python runs with -O.
    if any(c.startswith("ps_calc_") for c in X.columns):
        raise AssertionError("ps_calc_* nicht komplett entfernt")

    # Preprocessing + model
    cat, bin_, num = split_cols(X.columns)
    pre  = build_pre(cat, bin_, num)
    clf  = LogisticRegression(penalty="l2", solver="lbfgs", C=C,
                              class_weight="balanced", max_iter=4000, random_state=RND)
    pipe = Pipeline([("pre", pre), ("clf", clf)])

    # Out-of-fold predictions -> one AUC over all rows. The same pipeline
    # object is refitted for every fold (warm_start is not enabled).
    skf = StratifiedKFold(n_splits=CV, shuffle=True, random_state=RND)
    proba = np.zeros(len(y), dtype=float)
    for tr, te in skf.split(X, y):
        pipe.fit(X.iloc[tr], y.iloc[tr])
        proba[te] = pipe.predict_proba(X.iloc[te])[:, 1]
    auc = roc_auc_score(y, proba)

    # Final fit on all rows + persist
    final_model = pipe.fit(X, y)
    joblib.dump(final_model, model_path)

    # Feature list (raw columns) + metadata
    pd.Series(X.columns, name="raw_feature").to_csv(feat_path, index=False)

    meta = {
        "cv_auc": float(auc),
        "n_rows": int(len(y)),
        "cv_splits": CV,
        "C": C,
        "dropped": {"ps_calc_*": True, "extra": extra_drop},
        "extras_added": extras_added,
        "model_path": str(model_path),
        "features_path": str(feat_path)
    }
    decision_path.write_text(json.dumps(meta, indent=2))

    # --- Closing summary: explanatory text (a/b), printed in German ---
    print()
    print("a) Was macht das Skript?")
    print("- lädt den Datensatz, ersetzt -1→NaN")
    print("- dropt ps_calc_* (+ optional ps_ind_14, ps_car_10_cat)")
    print("- fügt missing_count & sum_all_bin hinzu")
    print("- OHE für *_cat, Impute/Scale für numerische, Impute für binär")
    print("- trainiert LogReg L2 und misst CV-AUC")
    print("- speichert:")
    print(f"  • Modell: {model_path}")
    print(f"  • Featureliste: {feat_path}")
    print(f"  • Meta/Entscheidung: {decision_path}")

    print()
    print("b) Was sagen die Ergebnisse?")
    print(f"- AUC (CV={CV}, N={len(y):,}): {auc:.4f}")
    print("- Interpretation: Der Kaggle-Preset (calc raus + einfache FE + OHE) trägt.")
    print("- Die Datei selected_features_kagglepreset.csv ist jetzt eine schlanke Feature-Basis für weitere Modelle.")

# Run the whole pipeline when this module is the entry point (script run or
# executed notebook cell); importing the module does not trigger training.
if __name__ == "__main__":
    main()


Lade Datensatz aus dem Cache.

a) Was macht das Skript?
- lädt den Datensatz, ersetzt -1→NaN
- dropt ps_calc_* (+ optional ps_ind_14, ps_car_10_cat)
- fügt missing_count & sum_all_bin hinzu
- OHE für *_cat, Impute/Scale für numerische, Impute für binär
- trainiert LogReg L2 und misst CV-AUC
- speichert:
  • Modell: /Users/lucasbeseler/ada_portoSeguro/models/kagglepreset_logreg_l2.joblib
  • Featureliste: /Users/lucasbeseler/ada_portoSeguro/reports/selected_features_kagglepreset.csv
  • Meta/Entscheidung: /Users/lucasbeseler/ada_portoSeguro/reports/kagglepreset_decision.json

b) Was sagen die Ergebnisse?
- AUC (CV=3, N=595,212): 0.6266
- Interpretation: Der Kaggle-Preset (calc raus + einfache FE + OHE) trägt.
- Die Datei selected_features_kagglepreset.csv ist jetzt eine schlanke Feature-Basis für weitere Modelle.
