In [3]:
import os, sys, json, warnings, time
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.kernel_approximation import Nystroem
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.exceptions import NotFittedError

warnings.filterwarnings("ignore")

# Pfade und Konfiguration
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[1]
else:
    ROOT = Path.cwd() if Path.cwd().name not in ("notebooks","tools","tests") else Path.cwd().parent
sys.path.insert(0, str(ROOT))
try:
    from src.data_loader import load_and_save_data
except ImportError:
    print("[ERROR] 'src.data_loader' konnte nicht importiert werden. Sicherstellen, dass der Pfad stimmt.")
    # Platzhalter-Funktionen für Testzwecke, falls src.data_loader nicht verfügbar ist
    def load_selected_feature_list():
        return [f'feature_{i}' for i in range(37)] + [f'binary_feature_{i}_bin' for i in range(10)] + [f'categorical_feature_{i}_cat' for i in range(10)]

    def load_and_save_data():
        data_len = 500
        n_features = 37
        X = pd.DataFrame(np.random.rand(data_len, n_features), columns=[f'feature_{i}' for i in range(n_features)])
        for i in range(10):
            X[f'binary_feature_{i}_bin'] = np.random.randint(0, 2, data_len)
            X[f'categorical_feature_{i}_cat'] = np.random.randint(0, 5, data_len)
        X['target'] = np.random.randint(0, 2, data_len)
        return X

BASE_REPORTS_OUT = Path(os.getenv("REPORTS_OUT") or (ROOT / "reports_Hany/SVM"))
REPORTS_IN = Path(os.getenv("REPORTS_IN") or (ROOT / "reports"))

SPEED = os.getenv("SPEED", "MEDIUM").upper().strip()
def speed_cfg():
    cfg = dict(CV=5, N_ITER_SEARCH=20)
    if SPEED == "FAST":
        cfg.update(CV=3, N_ITER_SEARCH=5)
    elif SPEED == "MEDIUM":
        cfg.update(CV=5, N_ITER_SEARCH=10)
    elif SPEED == "FULL":
        cfg.update(CV=5, N_ITER_SEARCH=20)
    return cfg

CFG = speed_cfg()
RND = int(os.getenv("RND", "42"))
CV = int(os.getenv("CV", str(CFG["CV"])))
MEMBER = os.getenv("MEMBER", "Hany")
N_ITER_SEARCH = int(os.getenv("N_ITER_SEARCH", str(CFG["N_ITER_SEARCH"])))

print(f"[SETUP] SPEED={SPEED} CV={CV} N_ITER_SEARCH={N_ITER_SEARCH} MODEL=SVM")


# Hilfsfunktionen
def split_cols(cols):
    # Trennt Spalten in kategorische, binäre und numerische.
    cat = [c for c in cols if str(c).endswith("_cat")]
    bin_ = [c for c in cols if str(c).endswith("_bin")]
    num = [c for c in cols if c not in cat and c not in bin_ and c != "target"]
    return cat, bin_, num

def load_selected_feature_list():
    f = REPORTS_IN / "features_selected.csv"
    if not f.exists():
        raise FileNotFoundError(f"Missing {f}. Run feature-gate first.")
    s = pd.read_csv(f)
    if "raw_feature" not in s.columns:
        raise ValueError("features_selected.csv must have column 'raw_feature'.")
    return s["raw_feature"].astype(str).tolist()

class CustomFeatureGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, selected_features):
        self.selected_features = selected_features
        self.bin_cols = [c for c in selected_features if str(c).endswith("_bin")]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Behandle Pandas DataFrame und NumPy Array Inputs.
        if isinstance(X, pd.DataFrame):
            X_df = X.copy()
        elif isinstance(X, np.ndarray):
            X_df = pd.DataFrame(X, columns=[c for c in self.selected_features if c != 'target'])
        else:
            raise TypeError("Input must be a pandas DataFrame or a numpy array.")

        # Erzeuge 'missing_count'
        if "missing_count" in self.selected_features and "missing_count" not in X_df.columns:
            if isinstance(X, pd.DataFrame):
                X_df["missing_count"] = X_df.isna().sum(axis=1)
            else:
                X_df["missing_count"] = np.sum(np.isnan(X), axis=1)

        # Erzeuge 'sum_all_bin'
        if "sum_all_bin" in self.selected_features:
            bin_cols_in_df = [c for c in self.bin_cols if c in X_df.columns]
            if bin_cols_in_df:
                X_df["sum_all_bin"] = X_df[bin_cols_in_df].sum(axis=1)
            else:
                X_df["sum_all_bin"] = 0

        if isinstance(X, np.ndarray):
            return X_df.values

        return X_df

def create_preprocessor(all_features):
    # Preprocessor-Schritt für die Pipeline
    cat, bin_, num = split_cols(all_features)
    preprocessor = ColumnTransformer([
        ("num", Pipeline([("imp", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num),
        ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))]), cat),
        ("bin", SimpleImputer(strategy="most_frequent"), bin_)
    ], remainder="drop")
    return preprocessor

def get_pipeline(selected_cols, clf_model):
    # Pipeline zusammenbauen.
    all_features = list(set(selected_cols + ["missing_count", "sum_all_bin"]))
    all_features = [c for c in all_features if c != 'target']
    preprocessor = create_preprocessor(all_features)

    return Pipeline([
        ("feature_gen", CustomFeatureGenerator(all_features)),
        ("pre", preprocessor),
        ("nystroem", Nystroem(kernel='rbf', random_state=RND)),
        ("clf", clf_model)
    ]), all_features

def oof_cv(X, y, pipeline, model_params=None):
    # OOF Kreuzvalidierung
    skf = StratifiedKFold(n_splits=CV, shuffle=True, random_state=RND)
    oof = np.zeros(len(y), dtype=float)

    if model_params:
        pipeline.set_params(**{f'clf__{k}': v for k, v in model_params.items()})

    for tr, va in skf.split(X, y):
        Xtr, Xva = X.iloc[tr], X.iloc[va]; ytr, yva = y.iloc[tr], y.iloc[va]

        pipe_fold = clone(pipeline).fit(Xtr, ytr)

        calibrator = CalibratedClassifierCV(estimator=pipe_fold, method='sigmoid', cv='prefit')
        calibrator.fit(Xva, yva)
        oof[va] = calibrator.predict_proba(Xva)[:, 1]

    pr = average_precision_score(y, oof)
    roc = roc_auc_score(y, oof)
    brier = brier_score_loss(y, oof)
    return dict(pr_auc=float(pr), roc_auc=float(roc), brier=float(brier), oof=oof)

def fit_final(Xtr, ytr, Xte, yte, pipeline, model_params=None):
    # finale Modell trainieren + bewerten.
    if model_params:
        pipeline.set_params(**{f'clf__{k}': v for k, v in model_params.items()})

    t0 = time.perf_counter()
    pipeline.fit(Xtr, ytr)
    fit_time_s = time.perf_counter() - t0

    calibrator = CalibratedClassifierCV(estimator=pipeline, method='sigmoid', cv='prefit')
    calibrator.fit(Xtr, ytr)

    t1 = time.perf_counter()
    proba = calibrator.predict_proba(Xte)[:, 1]
    pred_ms_per_1k = 1000 * (time.perf_counter() - t1) / (len(Xte) if len(Xte) > 0 else 1)

    try:
        if isinstance(pipeline.named_steps['clf'], LinearSVC):
            n_components = pipeline.named_steps['nystroem'].n_components
            coefs = pipeline.named_steps['clf'].coef_[0]
            fi = pd.Series(coefs, index=[f'nystroem_component_{i}' for i in range(n_components)], name="gain").abs().sort_values(ascending=False)
        else:
            fi = None
    except (NotFittedError, KeyError):
        fi = None

    meta = {
        "encoder": "OHE + StandardScaler + Nystroem + Calibrator",
        "fit_time_s": float(fit_time_s),
        "predict_time_ms_per_1k": float(pred_ms_per_1k),
        "params": model_params
    }
    hold = dict(pr_auc=float(average_precision_score(yte, proba)), roc_auc=float(roc_auc_score(yte, proba)), brier=float(brier_score_loss(yte, proba)))

    return proba, hold, fi, meta

# Berichtsfunktionen
def save_pr_curve(y_true, proba, out_path):
    # Precision-Recall-Kurve.
    prec, rec, _ = precision_recall_curve(y_true, proba); ap = average_precision_score(y_true, proba)
    plt.figure(figsize=(7,5)); plt.plot(rec, prec, label=f'AP={ap:.4f}')
    plt.xlabel('Recall'); plt.ylabel('Precision'); plt.title('Precision-Recall'); plt.xlim([0,1]); plt.ylim([0,1]); plt.grid(True, alpha=0.3); plt.legend(); plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

def save_calibration(y_true, proba, out_path):
    # Kalibrierungskurve.
    prob_true, prob_pred = calibration_curve(y_true, proba, n_bins=20, strategy="quantile")
    plt.figure(figsize=(6,6)); plt.plot([0,1],[0,1],'--',label='Perfect')
    plt.plot(prob_pred, prob_true, marker='o', label='Model')
    plt.xlabel('Predicted'); plt.ylabel('Observed'); plt.title('Calibration')
    plt.grid(True, alpha=0.3); plt.legend(); plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

def save_top20_importance(fi: pd.Series, out_path):
    # Balkendiagramm der 20 wichtigsten Features.
    if fi is None or fi.empty:
        print("[INFO] Feature Importance nicht verfügbar oder leer.")
        return
    top = fi.head(20).iloc[::-1]
    plt.figure(figsize=(8,6)); plt.barh(top.index, top.values)
    plt.xlabel('Absoluter Koeffizient'); plt.title('Top-20 Feature Importance (LinearSVC Koeffizienten)')
    plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

def log_summary(row, filename="team_model_summary.csv"):
    # Modellzusammenfassung
    out_csv = BASE_REPORTS_OUT / filename
    pd.DataFrame([row]).to_csv(out_csv, mode="a", index=False, header=not out_csv.exists())

def plot_svm_metrics_comparison(metrics_data, out_path):
    # Vergleich der SVM-Modelle
    df_metrics = pd.DataFrame(metrics_data)
    df_metrics.set_index("model_name", inplace=True)
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    df_metrics['hold_ap'].plot(kind='barh', ax=axes[0], title='PR-AUC')
    df_metrics['hold_auc'].plot(kind='barh', ax=axes[1], title='ROC-AUC')
    df_metrics['hold_brier'].plot(kind='barh', ax=axes[2], title='Brier-Score')
    for ax in axes:
        ax.grid(axis='x', alpha=0.3)
        ax.set_ylabel('')
        ax.tick_params(axis='y', rotation=0)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()
    print(f"[INFO] SVM-Metriken-Vergleichsplot wurde gespeichert: {out_path}")

def plot_svm_proba_distributions(out_dir, out_path):
    # KDE-Plot -- Vorhersagewahrscheinlichkeiten.
    try:
        df_baseline = pd.read_csv(out_dir / "holdout_preds_baseline.csv")
        df_tuned = pd.read_csv(out_dir / "holdout_preds_tuned_nystroem_calibrated.csv")
        df_baseline['model'] = 'SVM_Baseline'
        df_tuned['model'] = 'SVM_Tuned'
        df_combined = pd.concat([df_baseline, df_tuned], ignore_index=True)
        df_combined['legend_label'] = df_combined.apply(
            lambda row: f"{row['model']} - Klasse {int(row['y_true'])}", axis=1
        )
        plt.figure(figsize=(14, 9))
        sns.kdeplot(
            data=df_combined,
            x='proba',
            hue='legend_label',
            fill=True,
            alpha=0.6,
            common_norm=False,
            legend=True
        )
        plt.title('Verteilung der Vorhersagewahrscheinlichkeiten: SVM-Modelle')
        plt.xlabel('Vorhergesagte Wahrscheinlichkeit')
        plt.ylabel('Dichte')
        plt.legend(title='Modell und Klasse')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(out_path, dpi=150)
        plt.close()
        print(f"[INFO] SVM-Vorhersage-Verteilungsplot wurde gespeichert: {out_path}")
    except FileNotFoundError as e:
        print(f"Fehler: Eine der benötigten Dateien wurde nicht gefunden. {e}")

def plot_pr_curve_all(predictions, y_true, out_path):
    # Vergleicht die Precision-Recall-Kurven mehrerer Modelle in einem Plot.
    plt.figure(figsize=(10, 8))

    for model_name, proba in predictions.items():
        prec, rec, _ = precision_recall_curve(y_true, proba)
        ap_score = average_precision_score(y_true, proba)
        plt.plot(rec, prec, label=f'{model_name} (AP={ap_score:.4f})')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Vergleich der Precision-Recall-Kurven')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.grid(True, alpha=0.3)
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()
    print(f"[INFO] Precision-Recall-Kurven-Vergleichsplot wurde gespeichert: {out_path}")

[SETUP] SPEED=MEDIUM CV=5 N_ITER_SEARCH=10 MODEL=SVM


In [4]:
if __name__ == "__main__":
    print(f"[SETUP] Starting data loading and splitting.")
    split_p = REPORTS_IN / "split_indices.json"
    feats_p = REPORTS_IN / "features_selected.csv"
    assert split_p.exists() and feats_p.exists(), "Missing split and/or features files in reports."

    split = json.loads(split_p.read_text())
    selected = load_selected_feature_list()

    df = load_and_save_data().replace(-1, np.nan)
    X_tr_all = df.loc[split["train"]].drop(columns=["target"])
    y_tr = df.loc[split["train"], "target"].astype(int)
    X_te_all = df.loc[split["test"]].drop(columns=["target"])
    y_te = df.loc[split["test"], "target"].astype(int)

    selected_fe = selected.copy()
    print("[SETUP] Data loaded and split successfully.")

    REPORTS_OUT_MODEL = BASE_REPORTS_OUT
    REPORTS_OUT_MODEL.mkdir(parents=True, exist_ok=True)

    print(f"\n[EXPERIMENT 1] Baseline LinearSVC with Nystroem using default params")
    base_params = {'C': 1.0, 'class_weight': 'balanced'}
    clf_model_baseline = LinearSVC(random_state=RND, max_iter=8000, dual=False, **base_params)

    pipe_baseline, all_features = get_pipeline(selected_fe, clf_model_baseline)
    pipe_baseline.set_params(nystroem__gamma=0.1, nystroem__n_components=1000)

    res_baseline = oof_cv(X_tr_all, y_tr, pipe_baseline)
    proba_baseline, hold_baseline, fi_baseline, meta_baseline = fit_final(X_tr_all, y_tr, X_te_all, y_te, pipe_baseline)

    report_name_baseline = "baseline"
    pd.DataFrame({"oof": res_baseline["oof"]}).to_csv(REPORTS_OUT_MODEL / f"oof_{report_name_baseline}.csv", index=False)
    pd.DataFrame({"proba": proba_baseline, "y_true": y_te.values}).to_csv(REPORTS_OUT_MODEL / f"holdout_preds_{report_name_baseline}.csv", index=False)
    if fi_baseline is not None:
        fi_baseline.reset_index().rename(columns={"index": "feature"}).to_csv(REPORTS_OUT_MODEL / f"fi_gain_{report_name_baseline}.csv", index=False)
    save_pr_curve(y_te.values, proba_baseline, REPORTS_OUT_MODEL / f"plot_pr_{report_name_baseline}.png")
    save_calibration(y_te.values, proba_baseline, REPORTS_OUT_MODEL / f"plot_calibration_{report_name_baseline}.png")
    if fi_baseline is not None:
        save_top20_importance(fi_baseline, REPORTS_OUT_MODEL / f"plot_fi_top20_{report_name_baseline}.png")

    row_baseline = {
        "member": MEMBER, "model_name": "SVM_Baseline", "encoder": meta_baseline["encoder"],
        "split_path": str(split_p), "feature_recipe": "selected_fe",
        "seed": RND, "cv_folds": CV, "hold_auc": hold_baseline["roc_auc"], "hold_ap": hold_baseline["pr_auc"],
        "hold_brier": hold_baseline["brier"], "cv_auc_mean": res_baseline["roc_auc"], "cv_ap_mean": res_baseline["pr_auc"],
        "early_stopping": False, "best_iteration": None,
        "n_trees": None, "fit_time_s": meta_baseline["fit_time_s"],
        "predict_time_ms_per_1k": meta_baseline["predict_time_ms_per_1k"], "params_json": json.dumps(base_params)
    }
    log_summary(row_baseline)
    print(f"\n[BASELINE] PR-AUC={res_baseline['pr_auc']:.5f}  ROC-AUC={res_baseline['roc_auc']:.5f}  Brier={res_baseline['brier']:.5f}")
    print(f"[HOLDOUT] PR-AUC={hold_baseline['pr_auc']:.5f}  ROC-AUC={hold_baseline['roc_auc']:.5f}  Brier={hold_baseline['brier']:.5f}")
    print(f"Reports saved to: {REPORTS_OUT_MODEL}")



[SETUP] Starting data loading and splitting.
Loading dataset from local file: D:\AdA_Project25\158_portoSeguro\data\raw\porto_seguro_safe_driver_prediction.csv
Dataset loaded successfully.
[SETUP] Data loaded and split successfully.

[EXPERIMENT 1] Baseline LinearSVC with Nystroem using default params

[BASELINE] PR-AUC=0.05538  ROC-AUC=0.61295  Brier=0.03503
[HOLDOUT] PR-AUC=0.05998  ROC-AUC=0.62776  Brier=0.03500
Reports saved to: D:\AdA_Project25\158_portoSeguro\reports_Hany\SVM


In [6]:
print(f"\n[EXPERIMENT 2] Running RandomizedSearchCV for Nystroem tuning...")
param_grid = {
    'nystroem__gamma': [0.01, 0.1, 1],
    'nystroem__n_components': [100, 250, 500],
    'clf__C': [0.1, 1, 10]
}
clf_model_search = LinearSVC(random_state=RND, class_weight='balanced', max_iter=8000, dual=False)
pipe_search, _ = get_pipeline(selected_fe, clf_model_search)
calibrated_pipe_search = CalibratedClassifierCV(estimator=pipe_search, method='sigmoid', cv=CV)

random_search = RandomizedSearchCV(
    estimator=calibrated_pipe_search,
    param_distributions={f'estimator__{k}': v for k, v in param_grid.items() if k.startswith('clf__') or k.startswith('nystroem__')},
    n_iter=N_ITER_SEARCH,
    cv=CV,
    scoring='average_precision',
    random_state=RND,
    n_jobs=-1,
    verbose=1
)

random_search.fit(X_tr_all, y_tr)
best_params_tuned = {k.replace('estimator__', ''): v for k, v in random_search.best_params_.items()}
best_score_tuned = random_search.best_score_
best_pipeline_tuned = random_search.best_estimator_

print(f"\nBest Parameters found: {best_params_tuned}")
print(f"Best PR-AUC from CV: {best_score_tuned:.4f}")

t0_pred = time.perf_counter()
proba_tuned = best_pipeline_tuned.predict_proba(X_te_all)[:, 1]
pred_ms_per_1k_tuned = 1000 * (time.perf_counter() - t0_pred) / (len(X_te_all) if len(X_te_all) > 0 else 1)

hold_tuned = {
    "pr_auc": average_precision_score(y_te, proba_tuned),
    "roc_auc": roc_auc_score(y_te, proba_tuned),
    "brier": brier_score_loss(y_te, proba_tuned),
}

try:
    best_clf = best_pipeline_tuned.estimator.named_steps['clf']
    nystroem_features = best_pipeline_tuned.estimator.named_steps['nystroem'].n_components
    fi_tuned = pd.Series(best_clf.coef_[0], index=[f'nystroem_component_{i}' for i in range(nystroem_features)], name="gain").abs().sort_values(ascending=False)
except Exception as e:
    print(f"[WARN] Feature Importance konnte nicht extrahiert werden: {e}")
    fi_tuned = None

report_name_tuned = "tuned_nystroem_calibrated"
pd.DataFrame({"proba": proba_tuned, "y_true": y_te.values}).to_csv(REPORTS_OUT_MODEL / f"holdout_preds_{report_name_tuned}.csv", index=False)
if fi_tuned is not None:
    fi_tuned.reset_index().rename(columns={"index": "feature"}).to_csv(REPORTS_OUT_MODEL / f"fi_gain_{report_name_tuned}.csv", index=False)
save_pr_curve(y_te.values, proba_tuned, REPORTS_OUT_MODEL / f"plot_pr_{report_name_tuned}.png")
save_calibration(y_te.values, proba_tuned, REPORTS_OUT_MODEL / f"plot_calibration_{report_name_tuned}.png")
if fi_tuned is not None:
    save_top20_importance(fi_tuned, REPORTS_OUT_MODEL / f"plot_fi_top20_{report_name_tuned}.png")

row_tuned = {
    "member": MEMBER, "model_name": "SVM_Tuned", "encoder": "OHE + StandardScaler + Nystroem + Calibrator",
    "split_path": str(split_p), "feature_recipe": "selected_fe",
    "seed": RND, "cv_folds": CV, "hold_auc": hold_tuned["roc_auc"], "hold_ap": hold_tuned["pr_auc"],
    "hold_brier": hold_tuned["brier"], "cv_auc_mean": best_score_tuned, "cv_ap_mean": best_score_tuned,
    "early_stopping": False, "best_iteration": None,
    "n_trees": None, "fit_time_s": 0,
    "predict_time_ms_per_1k": pred_ms_per_1k_tuned, "params_json": json.dumps(best_params_tuned)
}
log_summary(row_tuned)

print(f"\n[TUNED] PR-AUC from CV: {best_score_tuned:.5f}")
print(f"[HOLDOUT] PR-AUC: {hold_tuned['pr_auc']:.5f} ROC-AUC: {hold_tuned['roc_auc']:.5f} Brier: {hold_tuned['brier']:.5f}")
print(f"Reports saved to: {REPORTS_OUT_MODEL}")


[EXPERIMENT 2] Running RandomizedSearchCV for Nystroem tuning...
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best Parameters found: {'nystroem__n_components': 100, 'nystroem__gamma': 0.01, 'clf__C': 0.1}
Best PR-AUC from CV: 0.0618
[WARN] Feature Importance konnte nicht extrahiert werden: 'LinearSVC' object has no attribute 'coef_'

[TUNED] PR-AUC from CV: 0.06179
[HOLDOUT] PR-AUC: 0.06567 ROC-AUC: 0.63904 Brier: 0.03489
Reports saved to: D:\AdA_Project25\158_portoSeguro\reports_Hany\SVM


In [7]:
REPORTS_OUT = BASE_REPORTS_OUT


svm_metrics = [
    {"model_name": "SVM_Baseline", "hold_ap": hold_baseline["pr_auc"], "hold_auc": hold_baseline["roc_auc"], "hold_brier": hold_baseline["brier"]},
    {"model_name": "SVM_Tuned", "hold_ap": hold_tuned["pr_auc"], "hold_auc": hold_tuned["roc_auc"], "hold_brier": hold_tuned["brier"]}
]

plot_svm_metrics_comparison(svm_metrics, REPORTS_OUT_MODEL / "plot_svm_metrics_comparison.png")
plot_svm_proba_distributions(REPORTS_OUT_MODEL, REPORTS_OUT_MODEL / "plot_svm_proba_distributions.png")

# Plotten der PR-Kurven beider Modelle
print("\n[INFO] Generiere Vergleichsplot für PR-Kurven...")
preds_for_plot = {
    'SVM_Baseline': proba_baseline,
    'SVM_Tuned': proba_tuned
}
plot_pr_curve_all(preds_for_plot, y_te, REPORTS_OUT_MODEL / "plot_pr_curve_comparison.png")


[INFO] SVM-Metriken-Vergleichsplot wurde gespeichert: D:\AdA_Project25\158_portoSeguro\reports_Hany\SVM\plot_svm_metrics_comparison.png
[INFO] SVM-Vorhersage-Verteilungsplot wurde gespeichert: D:\AdA_Project25\158_portoSeguro\reports_Hany\SVM\plot_svm_proba_distributions.png

[INFO] Generiere Vergleichsplot für PR-Kurven...
[INFO] Precision-Recall-Kurven-Vergleichsplot wurde gespeichert: D:\AdA_Project25\158_portoSeguro\reports_Hany\SVM\plot_pr_curve_comparison.png
