In [11]:
import os, sys, json, warnings, time
warnings.filterwarnings("ignore")

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss, precision_recall_curve
from sklearn.calibration import calibration_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# ---------- Pfade ----------
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[1]
else:
    ROOT = Path.cwd() if Path.cwd().name not in ("notebooks","tools","tests") else Path.cwd().parent
sys.path.insert(0, str(ROOT))
from src.data_loader import load_and_save_data

REPORTS_IN  = Path(os.getenv("REPORTS_IN")  or (ROOT / "reports"))
REPORTS_OUT = Path(os.getenv("REPORTS_OUT") or (ROOT / "reports_Hany"))
REPORTS_OUT.mkdir(parents=True, exist_ok=True)

# ---------- Konfiguration ----------
SPEED = os.getenv("SPEED", "MEDIUM").upper().strip()
def speed_cfg():
    cfg = dict(CV=5, N_EST=100)
    if SPEED == "FAST":
        cfg.update(CV=3, N_EST=50)
    elif SPEED == "MEDIUM":
        cfg.update(CV=5, N_EST=100)
    elif SPEED == "FULL":
        cfg.update(CV=5, N_EST=200)
    return cfg

CFG        = speed_cfg()
RND        = int(os.getenv("RND", "42"))
CV         = int(os.getenv("CV", str(CFG["CV"])))
N_EST      = int(os.getenv("N_EST", str(CFG["N_EST"])))
MEMBER     = os.getenv("MEMBER", "Hany")

# ---------- Hilfsfunktionen zur Datenaufbereitung ----------
def split_cols(cols):
    cat = [c for c in cols if str(c).endswith("_cat")]
    bin_ = [c for c in cols if str(c).endswith("_bin")]
    num  = [c for c in cols if c not in cat and c not in bin_ and c != "target"]
    return cat, bin_, num

def load_selected_feature_list():
    f = REPORTS_IN / "features_selected.csv"
    if not f.exists():
        raise FileNotFoundError(f"Missing {f}. Run feature-gate first.")
    s = pd.read_csv(f)
    if "raw_feature" not in s.columns:
        raise ValueError("features_selected.csv must have column 'raw_feature'.")
    return s["raw_feature"].astype(str).tolist()

def fe_extras(X, selected):
    X = X.copy()
    if "missing_count" in selected:
        X["missing_count"] = X.isna().sum(axis=1)
    if "sum_all_bin" in selected:
        b = [c for c in X.columns if str(c).endswith("_bin")]
        X["sum_all_bin"] = X[b].sum(axis=1)
    return X

def prep_for_model(X: pd.DataFrame, selected_cols):
    keep = [c for c in selected_cols if c in X.columns]
    missing = [c for c in selected_cols if c not in X.columns]
    if missing: print(f"[WARN] ignoring {len(missing)} missing selected feature(s).")
    X = X[keep].copy()

    cat, bin_, num = split_cols(X.columns)

    pre = ColumnTransformer([
        ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))]), cat),
        ("bin", SimpleImputer(strategy="most_frequent"), bin_),
        ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num)
    ], remainder="drop")

    return pre

def oof_cv(model_name, X, y, selected, model_params=None):
    skf = StratifiedKFold(n_splits=CV, shuffle=True, random_state=RND)
    oof = np.zeros(len(y), dtype=float)
    pre = prep_for_model(X, selected)

    defaults = {
        'n_estimators': N_EST,
        'random_state': RND,
        'n_jobs': -1,
        'class_weight': 'balanced'
    }
    defaults.update(model_params or {})

    clf = RandomForestClassifier(**defaults)
    pipe = Pipeline([("pre", pre), ("clf", clf)])

    for tr, va in skf.split(X, y):
        Xtr, Xva = X.iloc[tr], X.iloc[va]; ytr, yva = y.iloc[tr], y.iloc[va]
        pipe.fit(Xtr, ytr)
        oof[va] = pipe.predict_proba(Xva)[:,1]

    pr  = average_precision_score(y, oof)
    roc = roc_auc_score(y, oof)
    brier = brier_score_loss(y, oof)
    return dict(pr_auc=float(pr), roc_auc=float(roc), brier=float(brier), oof=oof)

def fit_final(Xtr, ytr, Xte, yte, selected_cols, model_params=None):
    pre = prep_for_model(Xtr, selected_cols)

    defaults = {
        'n_estimators': N_EST,
        'random_state': RND,
        'n_jobs': -1,
        'class_weight': 'balanced'
    }
    defaults.update(model_params or {})

    clf = RandomForestClassifier(**defaults)
    pipe = Pipeline([("pre", pre), ("clf", clf)])

    t0 = time.perf_counter()
    pipe.fit(Xtr, ytr)
    fit_time_s = time.perf_counter() - t0

    t1 = time.perf_counter()
    proba = pipe.predict_proba(Xte)[:,1]
    pred_ms_per_1k = 1000 * (time.perf_counter() - t1) / (len(Xte)/1000)

    try:
        feature_names = pipe.named_steps['pre'].get_feature_names_out()
        fi = pd.Series(pipe.named_steps['clf'].feature_importances_, index=feature_names, name="gain").sort_values(ascending=False)
    except:
        fi = None

    meta = {
        "encoder": "OHE",
        "n_trees": defaults['n_estimators'],
        "fit_time_s": float(fit_time_s),
        "predict_time_ms_per_1k": float(pred_ms_per_1k),
        "params": model_params
    }
    hold = dict(pr_auc=float(average_precision_score(yte, proba)), roc_auc=float(roc_auc_score(yte, proba)), brier=float(brier_score_loss(yte, proba)))

    return proba, hold, fi, meta

# ---------- Berichtsfunktionen ----------
def save_pr_curve(y_true, proba, out_path):
    prec, rec, _ = precision_recall_curve(y_true, proba); ap = average_precision_score(y_true, proba)
    plt.figure(figsize=(7,5)); plt.plot(rec, prec, label=f'AP={ap:.4f}')
    plt.xlabel('Recall'); plt.ylabel('Precision'); plt.title('Precision-Recall')
    plt.xlim([0,1]); plt.ylim([0,1]); plt.grid(True, alpha=0.3); plt.legend()
    plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

def save_calibration(y_true, proba, out_path):
    prob_true, prob_pred = calibration_curve(y_true, proba, n_bins=20, strategy="quantile")
    plt.figure(figsize=(6,6)); plt.plot([0,1],[0,1],'--',label='Perfect')
    plt.plot(prob_pred, prob_true, marker='o', label='Model')
    plt.xlabel('Predicted'); plt.ylabel('Observed'); plt.title('Calibration')
    plt.grid(True, alpha=0.3); plt.legend(); plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

def save_top20_importance(fi: pd.Series, out_path):
    if fi is None or fi.empty: return
    top = fi.head(20).iloc[::-1]
    plt.figure(figsize=(8,6)); plt.barh(top.index, top.values)
    plt.xlabel('Gain'); plt.title('Top-20 Feature Importance')
    plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

def log_summary(row, filename="team_model_summary.csv"):
    out_csv = REPORTS_OUT/filename
    pd.DataFrame([row]).to_csv(out_csv, mode="a", index=False, header=not out_csv.exists())

# Daten laden und aufteilen
print(f"[SETUP] SPEED={SPEED} CV={CV} N_EST={N_EST} MODEL=RandomForest")

split_p = REPORTS_IN / "split_indices.json"
feats_p = REPORTS_IN / "features_selected.csv"
assert split_p.exists() and feats_p.exists(), "Missing split and/or features files in reports."

split = json.loads(split_p.read_text())
selected = load_selected_feature_list()

df = load_and_save_data().replace(-1, np.nan)
X_tr_all = df.loc[split["train"]].drop(columns=["target"])
y_tr     = df.loc[split["train"], "target"].astype(int)
X_te_all = df.loc[split["test"]].drop(columns=["target"])
y_te     = df.loc[split["test"], "target"].astype(int)

# Erstellen der neuen Features auf den DataFrames
X_tr_fe = fe_extras(X_tr_all.copy(), selected)
X_te_fe = fe_extras(X_te_all.copy(), selected)

# Update der Featureliste, um die neuen Features einzuschliessen
selected_fe = list(set(selected + ["missing_count", "sum_all_bin"]))
selected_fe = [c for c in selected_fe if c in X_tr_fe.columns]

[SETUP] SPEED=MEDIUM CV=5 N_EST=100 MODEL=RandomForest
Loading dataset from in-memory cache.


In [13]:
print(f"\n[EXPERIMENT 1] Baseline RandomForest with default params")

base_params = {}
res = oof_cv("rf_baseline", X_tr_fe, y_tr, selected_fe, model_params=base_params)
proba, hold, fi, meta = fit_final(X_tr_fe, y_tr, X_te_fe, y_te, selected_fe, model_params=base_params)

report_name = "rf_baseline"
pd.DataFrame({"oof": res["oof"]}).to_csv(REPORTS_OUT/f"oof_{report_name}.csv", index=False)
pd.DataFrame({"proba": proba, "y_true": y_te.values}).to_csv(REPORTS_OUT/f"holdout_preds_{report_name}.csv", index=False)
if fi is not None and not fi.empty:
    fi.reset_index().rename(columns={"index":"feature"}).to_csv(REPORTS_OUT/f"fi_gain_{report_name}.csv", index=False)
save_pr_curve(y_te.values, proba, REPORTS_OUT/f"plot_pr_{report_name}.png")
save_calibration(y_te.values, proba, REPORTS_OUT/f"plot_calibration_{report_name}.png")
if fi is not None: save_top20_importance(fi, REPORTS_OUT/f"plot_fi_top20_{report_name}.png")

row = {
    "member": MEMBER, "model_name": "RF_Baseline", "encoder": "OHE",
    "split_path": str(split_p), "feature_recipe": "selected",
    "seed": RND, "cv_folds": CV, "hold_auc": hold["roc_auc"], "hold_ap": hold["pr_auc"],
    "hold_brier": hold["brier"], "cv_auc_mean": res["roc_auc"], "cv_ap_mean": res["pr_auc"],
    "early_stopping": False, "best_iteration": None,
    "n_trees": N_EST, "fit_time_s": meta["fit_time_s"],
    "predict_time_ms_per_1k": meta["predict_time_ms_per_1k"], "params_json": json.dumps(base_params)
}
log_summary(row)

print(f"\n[BASELINES] PR-AUC={res['pr_auc']:.5f}  ROC-AUC={res['roc_auc']:.5f}  Brier={res['brier']:.5f}")
print(f"[HOLDOUT] PR-AUC={hold['pr_auc']:.5f}  ROC-AUC={hold['roc_auc']:.5f}  Brier={hold['brier']:.5f}")
print(f"Reports saved to: {REPORTS_OUT}")


[EXPERIMENT 1] Baseline RandomForest with default params

[BASELINES] PR-AUC=0.04754  ROC-AUC=0.57511  Brier=0.03557
[HOLDOUT] PR-AUC=0.05037  ROC-AUC=0.59361  Brier=0.03545
Reports saved to: D:\AdA_Project25\158_portoSeguro\reports_Hany


In [15]:
print(f"\n[EXPERIMENT 2] Hyperparameter Tuning with Randomized Search")

param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [10, 20, None],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__class_weight': [None, 'balanced']
}

pre = prep_for_model(X_tr_fe, selected_fe)
clf = RandomForestClassifier(random_state=RND, n_jobs=-1)
pipe = Pipeline([("pre", pre), ("clf", clf)])

random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_grid,
    n_iter=20,
    cv=CV,
    scoring='average_precision',
    random_state=RND,
    n_jobs=-1,
    verbose=1
)

random_search.fit(X_tr_fe, y_tr)

best_params = {k.replace('clf__', ''): v for k, v in random_search.best_params_.items()}
best_score = random_search.best_score_
print(f"Best parameters found: {best_params}")
print(f"Best cross-validation PR-AUC: {best_score:.4f}")

print("[INFO] Training final model with best parameters...")
proba, hold, fi, meta = fit_final(X_tr_fe, y_tr, X_te_fe, y_te, selected_fe, model_params=best_params)

report_name_tuned = "rf_tuned"
pd.DataFrame({"proba": proba, "y_true": y_te.values}).to_csv(REPORTS_OUT/f"holdout_preds_{report_name_tuned}.csv", index=False)
if fi is not None and not fi.empty:
    fi.reset_index().rename(columns={"index":"feature"}).to_csv(REPORTS_OUT/f"fi_gain_{report_name_tuned}.csv", index=False)
save_pr_curve(y_te.values, proba, REPORTS_OUT/f"plot_pr_{report_name_tuned}.png")
save_calibration(y_te.values, proba, REPORTS_OUT/f"plot_calibration_{report_name_tuned}.png")
if fi is not None: save_top20_importance(fi, REPORTS_OUT/f"plot_fi_top20_{report_name_tuned}.png")

row = {
    "member": MEMBER, "model_name": "RF_Tuned", "encoder": meta["encoder"],
    "split_path": str(split_p), "feature_recipe": "selected",
    "seed": RND, "cv_folds": CV, "hold_auc": hold["roc_auc"], "hold_ap": hold["pr_auc"],
    "hold_brier": hold["brier"], "cv_auc_mean": random_search.cv_results_['mean_test_score'].max(), "cv_ap_mean": best_score,
    "early_stopping": False, "best_iteration": None,
    "n_trees": best_params.get('n_estimators', N_EST), "fit_time_s": meta["fit_time_s"],
    "predict_time_ms_per_1k": meta["predict_time_ms_per_1k"], "params_json": json.dumps(best_params)
}
log_summary(row)

print(f"\n[HOLDOUT] (Tuned) PR-AUC={hold['pr_auc']:.5f}  ROC-AUC={hold['roc_auc']:.5f}  Brier={hold['brier']:.5f}")
print(f"Reports saved to: {REPORTS_OUT}")


[EXPERIMENT 2] Hyperparameter Tuning with Randomized Search
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'n_estimators': 300, 'min_samples_leaf': 2, 'max_depth': 10, 'class_weight': None}
Best cross-validation PR-AUC: 0.0643
[INFO] Training final model with best parameters...

[HOLDOUT] (Tuned) PR-AUC=0.06903  ROC-AUC=0.63688  Brier=0.03491
Reports saved to: D:\AdA_Project25\158_portoSeguro\reports_Hany
