In [6]:
import os, sys, json, warnings, time
warnings.filterwarnings("ignore")

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss, precision_recall_curve
from sklearn.calibration import calibration_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


# ---------- Pfade und Konfiguration ---------->
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[1]
else:
    ROOT = Path.cwd() if Path.cwd().name not in ("notebooks","tools","tests") else Path.cwd().parent
sys.path.insert(0, str(ROOT))
from src.data_loader import load_and_save_data

REPORTS_IN  = Path(os.getenv("REPORTS_IN")  or (ROOT / "reports"))
REPORTS_OUT = Path(os.getenv("REPORTS_OUT") or (ROOT / "reports_Hany/RandomForest"))
REPORTS_OUT.mkdir(parents=True, exist_ok=True)

SPEED = os.getenv("SPEED", "FAST").upper().strip()
def speed_cfg():
    cfg = dict(CV=5, N_ITER_SEARCH=20)
    if SPEED == "FAST":
        cfg.update(CV=5, N_EST=200, EN_ITER_SEARCH=5)
    elif SPEED == "MEDIUM":
        cfg.update(CV=5, N_EST=1000, N_ITER_SEARCH=10)
    elif SPEED == "FULL":
        cfg.update(CV=5, N_EST=2000, N_ITER_SEARCH=20)
    return cfg

CFG        = speed_cfg()
RND        = int(os.getenv("RND", "42"))
CV         = int(os.getenv("CV", str(CFG["CV"])))
N_EST      = int(os.getenv("N_EST", str(CFG["N_EST"])))
MEMBER     = os.getenv("MEMBER", "Hany")

print(f"[SETUP] SPEED={SPEED} CV={CV} N_EST={N_EST} MODEL=RandomForest")

# Hilfsfunktionen zur Datenaufbereitung
def split_cols(cols):
    #Trenne Spalten in kategorische, binäre und numerische."""
    cat = [c for c in cols if str(c).endswith("_cat")]
    bin_ = [c for c in cols if str(c).endswith("_bin")]
    num  = [c for c in cols if c not in cat and c not in bin_ and c != "target"]
    return cat, bin_, num

def load_selected_feature_list():
    #Lade die Liste der ausgewählten Features.
    f = REPORTS_IN / "features_selected.csv"
    if not f.exists():
        raise FileNotFoundError(f"Missing {f}. Run feature-gate first.")
    s = pd.read_csv(f)
    if "raw_feature" not in s.columns:
        raise ValueError("features_selected.csv must have column 'raw_feature'.")
    return s["raw_feature"].astype(str).tolist()

def fe_extras(X, selected):
    #Erstelle zusätzliche Features.
    X = X.copy()
    if "missing_count" in selected:
        X["missing_count"] = X.isna().sum(axis=1)
    if "sum_all_bin" in selected:
        b = [c for c in X.columns if str(c).endswith("_bin")]
        X["sum_all_bin"] = X[b].sum(axis=1)
    return X

def prep_for_model(X: pd.DataFrame, selected_cols):
    #Preprocessor-Schritt für die OHE-Pipeline
    keep = [c for c in selected_cols if c in X.columns]
    missing = [c for c in selected_cols if c not in X.columns]
    if missing: print(f"[WARN] ignoring {len(missing)} missing selected feature(s).")
    X = X[keep].copy()

    cat, bin_, num = split_cols(X.columns)

    pre = ColumnTransformer([
        ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))]), cat),
        ("bin", SimpleImputer(strategy="most_frequent"), bin_),
        ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num)
    ], remainder="drop")

    return pre

# Hilfsfunktionen für OOF und finales Training
def oof_cv(model_name, X, y, preprocessor, model_params=None):
    #Kreuzvalidierung.
    skf = StratifiedKFold(n_splits=CV, shuffle=True, random_state=RND)
    oof = np.zeros(len(y), dtype=float)

    defaults = {
        'n_estimators': N_EST,
        'random_state': RND,
        'n_jobs': -1,
        'class_weight': 'balanced'
    }
    defaults.update(model_params or {})

    clf = RandomForestClassifier(**defaults)
    pipe = Pipeline([("pre", preprocessor), ("clf", clf)])

    for tr, va in skf.split(X, y):
        Xtr, Xva = X.iloc[tr], X.iloc[va]; ytr, yva = y.iloc[tr], y.iloc[va]
        pipe.fit(Xtr, ytr)
        oof[va] = pipe.predict_proba(Xva)[:,1]

    pr  = average_precision_score(y, oof)
    roc = roc_auc_score(y, oof)
    brier = brier_score_loss(y, oof)
    return dict(pr_auc=float(pr), roc_auc=float(roc), brier=float(brier), oof=oof)

def fit_final(Xtr, ytr, Xte, yte, preprocessor, model_params=None):
    defaults = {
        'n_estimators': N_EST,
        'random_state': RND,
        'n_jobs': -1,
        'class_weight': 'balanced'
    }
    defaults.update(model_params or {})

    clf = RandomForestClassifier(**defaults)
    pipe = Pipeline([("pre", preprocessor), ("clf", clf)])

    t0 = time.perf_counter()
    pipe.fit(Xtr, ytr)
    fit_time_s = time.perf_counter() - t0

    t1 = time.perf_counter()
    proba = pipe.predict_proba(Xte)[:,1]
    pred_ms_per_1k = 1000 * (time.perf_counter() - t1) / (len(Xte)/1000)

    try:
        feature_names = pipe.named_steps['pre'].get_feature_names_out()
        fi = pd.Series(pipe.named_steps['clf'].feature_importances_, index=feature_names, name="gain").sort_values(ascending=False)
    except:
        fi = None

    meta = {
        "encoder": "OHE",
        "n_trees": defaults['n_estimators'],
        "fit_time_s": float(fit_time_s),
        "predict_time_ms_per_1k": float(pred_ms_per_1k),
        "params": model_params
    }
    hold = dict(pr_auc=float(average_precision_score(yte, proba)), roc_auc=float(roc_auc_score(yte, proba)), brier=float(brier_score_loss(yte, proba)))

    return proba, hold, fi, meta

#Berichtsfunktionen
def save_pr_curve(y_true, proba, out_path):
    prec, rec, _ = precision_recall_curve(y_true, proba); ap = average_precision_score(y_true, proba)
    plt.figure(figsize=(7,5)); plt.plot(rec, prec, label=f'AP={ap:.4f}')
    plt.xlabel('Recall'); plt.ylabel('Precision'); plt.title('Precision-Recall')
    plt.xlim([0,1]); plt.ylim([0,1]); plt.grid(True, alpha=0.3); plt.legend(); plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

def save_calibration(y_true, proba, out_path):
    prob_true, prob_pred = calibration_curve(y_true, proba, n_bins=20, strategy="quantile")
    plt.figure(figsize=(6,6)); plt.plot([0,1],[0,1],'--',label='Perfect')
    plt.plot(prob_pred, prob_true, marker='o', label='Model')
    plt.xlabel('Predicted'); plt.ylabel('Observed'); plt.title('Calibration')
    plt.grid(True, alpha=0.3); plt.legend(); plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

def save_top20_importance(fi: pd.Series, out_path):
    if fi is None or fi.empty: return
    top = fi.head(20).iloc[::-1]
    plt.figure(figsize=(8,6)); plt.barh(top.index, top.values)
    plt.xlabel('Gain'); plt.title('Top-20 Feature Importance')
    plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

def log_summary(row, filename="team_model_summary.csv"):
    out_csv = REPORTS_OUT/filename
    pd.DataFrame([row]).to_csv(out_csv, mode="a", index=False, header=not out_csv.exists())

def plot_model_comparison(results_list, out_path):
    # Vergleicht die PR-AUC, ROC-AUC und den Brier-Score der Modelle
    df_results = pd.DataFrame(results_list)
    df_results.set_index("model_name", inplace=True)

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    df_results['hold_ap'].plot(kind='barh', ax=axes[0], title='PR-AUC')
    df_results['hold_auc'].plot(kind='barh', ax=axes[1], title='ROC-AUC')
    df_results['hold_brier'].plot(kind='barh', ax=axes[2], title='Brier-Score')

    for ax in axes:
        ax.grid(axis='x', alpha=0.3)
        ax.set_ylabel('')
        ax.set_yticklabels(ax.get_yticklabels(), rotation=0, ha='right')

    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()
    print(f"[INFO] Modellvergleichs-Plot gespeichert: {out_path}")

def plot_prediction_distributions_kde(predictions, y_true, out_path):
    all_data = []
    for model_name, proba in predictions.items():
        df_plot = pd.DataFrame({'proba': proba, 'model': model_name, 'y_true': y_true})
        all_data.append(df_plot)

    df_combined = pd.concat(all_data, ignore_index=True)

    plt.figure(figsize=(14, 9))
    sns.kdeplot(
        data=df_combined,
        x='proba',
        hue='y_true',
        fill=True,
        alpha=0.6,
        common_norm=False,
        legend=True
    )
    plt.title('Verteilung der Vorhersagewahrscheinlichkeiten nach Modell und wahrer Klasse')
    plt.xlabel('Vorhergesagte Wahrscheinlichkeit')
    plt.ylabel('Dichte')
    plt.legend(title='Modell | Klasse')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()
    print(f"[INFO] Verteilungs-Plot (KDE) wurde gespeichert: {out_path}")


def plot_pr_curve_all(predictions, y_true, out_path):
    plt.figure(figsize=(10, 8))

    for model_name, proba in predictions.items():
        prec, rec, _ = precision_recall_curve(y_true, proba)
        ap_score = average_precision_score(y_true, proba)
        plt.plot(rec, prec, label=f'{model_name} (AP={ap_score:.4f})')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Vergleich der Precision-Recall-Kurven')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.grid(True, alpha=0.3)
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()
    print(f"[INFO] Precision-Recall-Kurven-Vergleichsplot wurde gespeichert: {out_path}")

# Daten laden und aufteilen
split_p = REPORTS_IN / "split_indices.json"
feats_p = REPORTS_IN / "features_selected.csv"
assert split_p.exists() and feats_p.exists(), "Missing split and/or features files in reports."

split = json.loads(split_p.read_text())
selected = load_selected_feature_list()

df = load_and_save_data().replace(-1, np.nan)
X_tr_all = df.loc[split["train"]].drop(columns=["target"])
y_tr     = df.loc[split["train"], "target"].astype(int)
X_te_all = df.loc[split["test"]].drop(columns=["target"])
y_te     = df.loc[split["test"], "target"].astype(int)

X_tr_fe = fe_extras(X_tr_all.copy(), selected)
X_te_fe = fe_extras(X_te_all.copy(), selected)

selected_fe = list(set(selected + ["missing_count", "sum_all_bin"]))
selected_fe = [c for c in selected_fe if c in X_tr_fe.columns]


[SETUP] SPEED=FAST CV=5 N_EST=200 MODEL=RandomForest
Loading dataset from in-memory cache.


In [3]:
print(f"\n[EXPERIMENT 1] Baseline RandomForest with default params")

preprocessor_ohe = prep_for_model(X_tr_fe, selected_fe)
base_params = {}

res_base = oof_cv("rf_baseline", X_tr_fe, y_tr, preprocessor_ohe, model_params=base_params)
proba_base, hold_base, fi_base, meta_base = fit_final(X_tr_fe, y_tr, X_te_fe, y_te, preprocessor_ohe, model_params=base_params)

report_name_base = "rf_baseline"
pd.DataFrame({"oof": res_base["oof"]}).to_csv(REPORTS_OUT/f"oof_{report_name_base}.csv", index=False)

pd.DataFrame({"proba": proba_base, "y_true": y_te.values}).to_csv(REPORTS_OUT/f"holdout_preds_{report_name_base}.csv", index=False)

if fi_base is not None and not fi_base.empty:
    fi_base.reset_index().rename(columns={"index":"feature"}).to_csv(REPORTS_OUT/f"fi_gain_{report_name_base}.csv", index=False)

save_pr_curve(y_te.values, proba_base, REPORTS_OUT/f"plot_pr_{report_name_base}.png")

save_calibration(y_te.values, proba_base, REPORTS_OUT/f"plot_calibration_{report_name_base}.png")

if fi_base is not None: save_top20_importance(fi_base, REPORTS_OUT/f"plot_fi_top20_{report_name_base}.png")

row_base = {
    "member": MEMBER, "model_name": "RF_Baseline", "encoder": "OHE",
    "split_path": str(split_p), "feature_recipe": "selected",
    "seed": RND, "cv_folds": CV, "hold_auc": hold_base["roc_auc"], "hold_ap": hold_base["pr_auc"],
    "hold_brier": hold_base["brier"], "cv_auc_mean": res_base["roc_auc"], "cv_ap_mean": res_base["pr_auc"],
    "early_stopping": False, "best_iteration": None,
    "n_trees": N_EST, "fit_time_s": meta_base["fit_time_s"],
    "predict_time_ms_per_1k": meta_base["predict_time_ms_per_1k"], "params_json": json.dumps(base_params)
}
log_summary(row_base)
print(f"\n[BASELINES] PR-AUC={res_base['pr_auc']:.5f}  ROC-AUC={res_base['roc_auc']:.5f}  Brier={res_base['brier']:.5f}")
print(f"[HOLDOUT] PR-AUC={hold_base['pr_auc']:.5f}  ROC-AUC={hold_base['roc_auc']:.5f}  Brier={hold_base['brier']:.5f}")
print(f"Reports saved to: {REPORTS_OUT}")


[EXPERIMENT 1] Baseline RandomForest with default params

[BASELINES] PR-AUC=0.04908  ROC-AUC=0.58584  Brier=0.03541
[HOLDOUT] PR-AUC=0.05352  ROC-AUC=0.60310  Brier=0.03526
Reports saved to: D:\AdA_Project25\158_portoSeguro\reports_Hany\RandomForest


In [4]:
print(f"\n[EXPERIMENT 2] Hyperparameter Tuning with Randomized Search (OHE)")

param_grid_ohe = {
    'clf__n_estimators': [int(N_EST*0.5), N_EST, int(N_EST*1.5)],
    'clf__max_depth': [10, 20, None],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__class_weight': [None, 'balanced']
}

clf_ohe = RandomForestClassifier(random_state=RND, n_jobs=-1)
pipe_ohe = Pipeline([("pre", preprocessor_ohe), ("clf", clf_ohe)])

random_search_ohe = RandomizedSearchCV(
    pipe_ohe,
    param_distributions=param_grid_ohe,
    n_iter=20,
    cv=CV,
    scoring='average_precision',
    random_state=RND,
    n_jobs=-1,
    verbose=1
)

random_search_ohe.fit(X_tr_fe, y_tr)

best_params_ohe = {k.replace('clf__', ''): v for k, v in random_search_ohe.best_params_.items()}
best_score_ohe = random_search_ohe.best_score_

print(f"Best parameters found: {best_params_ohe}")
print(f"Best cross-validation PR-AUC: {best_score_ohe:.4f}")

print("[INFO] Training final model with best parameters...")

proba_tuned_ohe, hold_tuned_ohe, fi_tuned_ohe, meta_tuned_ohe = fit_final(X_tr_fe, y_tr, X_te_fe, y_te, preprocessor_ohe, model_params=best_params_ohe)


report_name_tuned_ohe = "rf_tuned_ohe"
pd.DataFrame({"proba": proba_tuned_ohe, "y_true": y_te.values}).to_csv(REPORTS_OUT/f"holdout_preds_{report_name_tuned_ohe}.csv", index=False)
if fi_tuned_ohe is not None and not fi_tuned_ohe.empty:
    fi_tuned_ohe.reset_index().rename(columns={"index":"feature"}).to_csv(REPORTS_OUT/f"fi_gain_{report_name_tuned_ohe}.csv", index=False)
save_pr_curve(y_te.values, proba_tuned_ohe, REPORTS_OUT/f"plot_pr_{report_name_tuned_ohe}.png")
save_calibration(y_te.values, proba_tuned_ohe, REPORTS_OUT/f"plot_calibration_{report_name_tuned_ohe}.png")
if fi_tuned_ohe is not None: save_top20_importance(fi_tuned_ohe, REPORTS_OUT/f"plot_fi_top20_{report_name_tuned_ohe}.png")

row_tuned_ohe = {
    "member": MEMBER, "model_name": "RF_Tuned_OHE", "encoder": meta_tuned_ohe["encoder"],
    "split_path": str(split_p), "feature_recipe": "selected",
    "seed": RND, "cv_folds": CV, "hold_auc": hold_tuned_ohe["roc_auc"], "hold_ap": hold_tuned_ohe["pr_auc"],
    "hold_brier": hold_tuned_ohe["brier"], "cv_auc_mean": random_search_ohe.cv_results_['mean_test_score'].max(), "cv_ap_mean": best_score_ohe,
    "early_stopping": False, "best_iteration": None,
    "n_trees": best_params_ohe.get('n_estimators', N_EST), "fit_time_s": meta_tuned_ohe["fit_time_s"],
    "predict_time_ms_per_1k": meta_tuned_ohe["predict_time_ms_per_1k"], "params_json": json.dumps(best_params_ohe)
}
log_summary(row_tuned_ohe)
print(f"\n[HOLDOUT] (Tuned, OHE) PR-AUC={hold_tuned_ohe['pr_auc']:.5f}  ROC-AUC={hold_tuned_ohe['roc_auc']:.5f}  Brier={hold_tuned_ohe['brier']:.5f}")
print(f"Reports saved to: {REPORTS_OUT}")



[EXPERIMENT 2] Hyperparameter Tuning with Randomized Search (OHE)
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'n_estimators': 300, 'min_samples_leaf': 4, 'max_depth': 10, 'class_weight': None}
Best cross-validation PR-AUC: 0.0642
[INFO] Training final model with best parameters...

[HOLDOUT] (Tuned, OHE) PR-AUC=0.06872  ROC-AUC=0.63874  Brier=0.03491
Reports saved to: D:\AdA_Project25\158_portoSeguro\reports_Hany\RandomForest


In [5]:
print("\\n[EXPERIMENT 3] RandomForest mit Target Encoding")

class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, smoothing=10.0):
        self.smoothing = smoothing
        self.mappings = {}
        self.global_mean = 0

    def fit(self, X, y):
        X_temp = X.copy()
        y_temp = y.copy()
        X_temp['target'] = y_temp
        self.global_mean = y_temp.mean()

        for col in X.columns:
            agg = X_temp.groupby(col)['target'].agg(['count', 'mean'])
            counts = agg['count']
            means = agg['mean']
            smooth = (counts * means + self.smoothing * self.global_mean) / (counts + self.smoothing)
            self.mappings[col] = smooth
        return self

    def transform(self, X):
        X_new = X.copy()
        for col, mapping in self.mappings.items():
            X_new[col] = X_new[col].map(mapping).fillna(self.global_mean)
        return X_new

# Datenvorverearveitung
cat_cols, _, _ = split_cols(X_tr_fe.columns)
X_tr_te = X_tr_fe.copy()
X_te_te = X_te_fe.copy()

target_encoder = TargetEncoder()
target_encoder.fit(X_tr_te[cat_cols], y_tr)

X_tr_te[cat_cols] = target_encoder.transform(X_tr_te[cat_cols])
X_te_te[cat_cols] = target_encoder.transform(X_te_te[cat_cols])

print("Target Encoding wurde auf die Daten angewendet.")

# Pipeline und Hyperparameter-Suche anpassen ----------
preprocessor_te = SimpleImputer(strategy='median')
pipe_te = Pipeline([
    ("pre", preprocessor_te),
    ("clf", RandomForestClassifier(random_state=RND, n_jobs=-1))
])

param_grid_te = {
    'clf__n_estimators': [int(N_EST*0.5), N_EST, int(N_EST*1.5)],
    'clf__max_depth': [10, 20, None],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__class_weight': [None, 'balanced']
}

random_search_te = RandomizedSearchCV(
    pipe_te,
    param_distributions=param_grid_te,
    n_iter=20,
    cv=CV,
    scoring='average_precision',
    random_state=RND,
    n_jobs=-1,
    verbose=1
)

print("\\nStarte Hyperparameter-Suche für Target-Encoded Modell")
t0 = time.perf_counter()

random_search_te.fit(X_tr_te, y_tr)
fit_time_te = time.perf_counter() - t0
best_params_te = {k.replace('clf__', ''): v for k, v in random_search_te.best_params_.items()}
best_score_te = random_search_te.best_score_

print(f"\nBeste Parameter (Target Encoding): {best_params_te}")
print(f"Bester Cross-Validation PR-AUC (Target Encoding): {best_score_te:.4f}")

# Finale Auswertung und Ergebnisse
final_model_te = random_search_te.best_estimator_
t1 = time.perf_counter()
proba_te = final_model_te.predict_proba(X_te_te)[:, 1]
pred_time_te = 1000 * (time.perf_counter() - t1) / (len(X_te_te)/1000)

hold_te = {
    "pr_auc": average_precision_score(y_te, proba_te),
    "roc_auc": roc_auc_score(y_te, proba_te),
    "brier": brier_score_loss(y_te, proba_te),
}
print(f"\n[HOLDOUT] (Tuned, Target Encoding) PR-AUC={hold_te['pr_auc']:.5f}  ROC-AUC={hold_te['roc_auc']:.5f}  Brier={hold_te['brier']:.5f}")

# Reporting, Plots und Logging
try:
    fi_te = pd.Series(final_model_te.named_steps['clf'].feature_importances_, index=X_tr_te.columns, name="gain").sort_values(ascending=False)
except Exception as e:
    print(f"[WARN] Feature Importance konnte nicht extrahiert werden: {e}")
    fi_te = None

report_name_te = "rf_tuned_targetenc"
pd.DataFrame({"proba": proba_te, "y_true": y_te.values}).to_csv(REPORTS_OUT/f"holdout_preds_{report_name_te}.csv", index=False)
if fi_te is not None and not fi_te.empty:
    fi_te.reset_index().rename(columns={"index":"feature"}).to_csv(REPORTS_OUT/f"fi_gain_{report_name_te}.csv", index=False)
save_pr_curve(y_te.values, proba_te, REPORTS_OUT/f"plot_pr_{report_name_te}.png")
save_calibration(y_te.values, proba_te, REPORTS_OUT/f"plot_calibration_{report_name_te}.png")
if fi_te is not None:
    save_top20_importance(fi_te, REPORTS_OUT/f"plot_fi_top20_{report_name_te}.png")

# Da RandomizedSearchCV keine CV-Metriken für ROC-AUC speichert - AP-Score als Platzhalter
cv_results_te = {"pr_auc": best_score_te, "roc_auc": best_score_te}
row_te = {
    "member": MEMBER, "model_name": "RF_Tuned_TargetEnc", "encoder": "TargetEnc",
    "split_path": str(split_p), "feature_recipe": "selected",
    "seed": RND, "cv_folds": CV, "hold_auc": hold_te["roc_auc"], "hold_ap": hold_te["pr_auc"],
    "hold_brier": hold_te["brier"], "cv_auc_mean": best_score_te, "cv_ap_mean": best_score_te,
    "early_stopping": False, "best_iteration": None,
    "n_trees": best_params_te.get('n_estimators', N_EST),
    "fit_time_s": fit_time_te,
    "predict_time_ms_per_1k": pred_time_te,
    "params_json": json.dumps(best_params_te)
}
log_summary(row_te)
print("Ergebnisse wurden erfolgreich in die Zusammenfassungs-CSV geloggt.")
print(f"Reports saved to: {REPORTS_OUT}")

\n[EXPERIMENT 3] RandomForest mit Target Encoding
Target Encoding wurde auf die Daten angewendet.
\nStarte Hyperparameter-Suche für Target-Encoded Modell
Fitting 5 folds for each of 20 candidates, totalling 100 fits

Beste Parameter (Target Encoding): {'n_estimators': 300, 'min_samples_leaf': 4, 'max_depth': 10, 'class_weight': None}
Bester Cross-Validation PR-AUC (Target Encoding): 0.0650

[HOLDOUT] (Tuned, Target Encoding) PR-AUC=0.06645  ROC-AUC=0.63437  Brier=0.03490
Ergebnisse wurden erfolgreich in die Zusammenfassungs-CSV geloggt.
Reports saved to: D:\AdA_Project25\158_portoSeguro\reports_Hany\RandomForest


In [7]:
print("\n--- Zusätzliche Analysen und Visualisierungen ---")

try:
    # Sammeln der Hold-out-Ergebnisse für den Vergleichsplot
    all_holdout_results = [
        {"model_name": "RF_Baseline", "hold_ap": hold_base["pr_auc"], "hold_auc": hold_base["roc_auc"], "hold_brier": hold_base["brier"]},
        {"model_name": "RF_Tuned_OHE", "hold_ap": hold_tuned_ohe["pr_auc"], "hold_auc": hold_tuned_ohe["roc_auc"], "hold_brier": hold_tuned_ohe["brier"]},
        {"model_name": "RF_Tuned_TargetEnc", "hold_ap": hold_te["pr_auc"], "hold_auc": hold_te["roc_auc"], "hold_brier": hold_te["brier"]}
    ]

    all_predictions = {
        "RF_Baseline": proba_base,
        "RF_Tuned_OHE": proba_tuned_ohe,
        "RF_Tuned_TargetEnc": proba_te
    }
    # Plot 1: KDE-Plot der Vorhersage-Verteilungen
    plot_prediction_distributions_kde(all_predictions, y_te, REPORTS_OUT / "plot_proba_distribution_comparison.png")

    # Plot 2: Vergleich der Precision-Recall-Kurven
    plot_pr_curve_all(all_predictions, y_te, REPORTS_OUT / "plot_pr_curve_comparison.png")

except NameError as e:
    print(f"Fehler: Eine der Variablen ist nicht definiert. Stellen Sie sicher, dass alle vorherigen Schritte zur Modellauswertung erfolgreich ausgeführt wurden. Details: {e}")

print(f"Reports saved to: {REPORTS_OUT}")



--- Zusätzliche Analysen und Visualisierungen ---
[INFO] Verteilungs-Plot (KDE) wurde gespeichert: D:\AdA_Project25\158_portoSeguro\reports_Hany\RandomForest\plot_proba_distribution_comparison.png
[INFO] Precision-Recall-Kurven-Vergleichsplot wurde gespeichert: D:\AdA_Project25\158_portoSeguro\reports_Hany\RandomForest\plot_pr_curve_comparison.png
Reports saved to: D:\AdA_Project25\158_portoSeguro\reports_Hany\RandomForest
