In [1]:
import os, sys, json, warnings, time
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.kernel_approximation import Nystroem
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# ---------- Paths ----------
# Project root: two levels above this file when it runs as a script. Without
# __file__ (interactive session) the working directory is used, or its parent
# when the session was started from one of the known sub-folders.
if "__file__" not in globals():
    ROOT = Path.cwd().parent if Path.cwd().name in ("notebooks", "tools", "tests") else Path.cwd()
else:
    ROOT = Path(__file__).resolve().parents[1]

# The root must be on sys.path before the project-local loader is imported.
sys.path.insert(0, str(ROOT))
try:
    from src.data_loader import load_and_save_data
except ImportError:
    print("[ERROR] 'src.data_loader' konnte nicht importiert werden. Sicherstellen, dass der Pfad stimmt.")
    sys.exit(1)

# Base report folder under which per-model sub-folders are created, and the
# folder holding upstream artefacts. An empty or unset environment variable
# falls back to the default below ROOT.
BASE_REPORTS_OUT = Path(os.environ.get("REPORTS_OUT") or (ROOT / "reports_Hany"))
REPORTS_IN = Path(os.environ.get("REPORTS_IN") or (ROOT / "reports"))

# ---------- Configuration ----------
# Speed profile taken from the environment, normalised to upper case.
SPEED = os.getenv("SPEED", "MEDIUM").upper().strip()


def speed_cfg():
    """Return the settings dictionary for the active ``SPEED`` profile.

    Returns:
        dict: ``{"CV": n}`` where ``n`` is 3 for ``"FAST"`` and 5 for
        ``"MEDIUM"``, ``"FULL"`` and any unrecognised profile.
    """
    folds_by_profile = {"FAST": 3, "MEDIUM": 5, "FULL": 5}
    return {"CV": folds_by_profile.get(SPEED, 5)}


CFG = speed_cfg()
# Each value below can be overridden through an environment variable.
RND = int(os.getenv("RND", "42"))
CV = int(os.getenv("CV", str(CFG["CV"])))
MEMBER = os.getenv("MEMBER", "Hany")

# ---------- Hilfsfunktionen zur Datenaufbereitung ----------
def split_cols(cols):
    """Partition column labels into categorical, binary and numeric groups.

    A label whose string form ends in ``"_cat"`` is categorical, one ending in
    ``"_bin"`` is binary, and every other label except ``"target"`` is numeric.

    Args:
        cols: Iterable of column labels.

    Returns:
        tuple: ``(cat, bin_, num)`` lists, each in input order.
    """
    categorical, binary, numeric = [], [], []
    for label in cols:
        text = str(label)
        if text.endswith("_cat"):
            categorical.append(label)
        elif text.endswith("_bin"):
            binary.append(label)
        elif label != "target":
            numeric.append(label)
    return categorical, binary, numeric

def load_selected_feature_list():
    """Read the selected feature names from ``REPORTS_IN/features_selected.csv``.

    Returns:
        list[str]: Values of the ``raw_feature`` column, cast to ``str``.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        ValueError: If the CSV has no ``raw_feature`` column.
    """
    csv_path = REPORTS_IN / "features_selected.csv"
    if not csv_path.exists():
        raise FileNotFoundError(f"Missing {csv_path}. Run feature-gate first.")

    table = pd.read_csv(csv_path)
    if "raw_feature" in table.columns:
        return table["raw_feature"].astype(str).tolist()
    raise ValueError("features_selected.csv must have column 'raw_feature'.")

class CustomFeatureGenerator(BaseEstimator, TransformerMixin):
    """Transformer that appends row-wise engineered columns to a DataFrame.

    ``missing_count`` and ``sum_all_bin`` are only produced when their name is
    listed in ``selected_features``.

    Attributes:
        selected_features: Collection of feature names that are wanted.
        bin_cols: Selected ``*_bin`` columns found in the frame passed to
            ``fit``; empty until ``fit`` has run.
    """

    def __init__(self, selected_features):
        self.selected_features = selected_features
        self.bin_cols = []

    def fit(self, X, y=None):
        """Record which selected ``*_bin`` columns exist in ``X``; return self."""
        wanted = self.selected_features
        self.bin_cols = [col for col in X.columns if col in wanted and str(col).endswith("_bin")]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the requested engineered columns added."""
        out = X.copy()
        wanted = self.selected_features

        if "missing_count" in wanted:
            # Number of NaN cells per row over all columns present in the copy.
            out["missing_count"] = out.isna().sum(axis=1)

        if "sum_all_bin" in wanted:
            # Fall back to a constant 0 when no binary column was recorded or
            # the binary sub-frame is empty.
            has_bin_data = bool(self.bin_cols) and not out[self.bin_cols].empty
            out["sum_all_bin"] = out[self.bin_cols].sum(axis=1) if has_bin_data else 0

        return out

def create_preprocessor(all_features):
    """Build the column-wise preprocessing step for the given feature names.

    The names are grouped with ``split_cols``. Numeric columns are
    median-imputed and standardised, categorical columns are imputed with the
    most frequent value and one-hot encoded (dense output, unknown categories
    ignored), and binary columns are imputed with the most frequent value.
    Columns outside the three groups are dropped.

    Args:
        all_features: Feature names the model should consume.

    Returns:
        ColumnTransformer: Unfitted preprocessing transformer.
    """
    cat_cols, bin_cols, num_cols = split_cols(all_features)

    numeric_steps = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    categorical_steps = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    binary_steps = SimpleImputer(strategy="most_frequent")

    return ColumnTransformer(
        [
            ("num", numeric_steps, num_cols),
            ("cat", categorical_steps, cat_cols),
            ("bin", binary_steps, bin_cols),
        ],
        remainder="drop",
    )

def create_full_pipeline(clf_model, selected_cols):
    """Assemble feature generation, preprocessing, Nystroem map and classifier.

    Args:
        clf_model: Estimator used as the final ``"clf"`` step (not cloned here).
        selected_cols: Iterable of selected raw feature names.

    Returns:
        tuple: ``(pipeline, all_features)``. ``all_features`` is
        ``selected_cols`` plus ``"missing_count"`` and ``"sum_all_bin"``,
        de-duplicated, without ``"target"``, in first-seen order.
    """
    # dict.fromkeys de-duplicates while keeping insertion order. The previous
    # list(set(...)) produced an order that could change between interpreter
    # runs, so the column order fed to the model was not reproducible.
    candidates = dict.fromkeys([*selected_cols, "missing_count", "sum_all_bin"])
    all_features = [c for c in candidates if c != 'target']

    preprocessor = create_preprocessor(all_features)

    pipeline = Pipeline([
        ("feature_gen", CustomFeatureGenerator(all_features)),
        ("pre", preprocessor),
        ("nystroem", Nystroem(kernel='rbf', gamma=0.1, random_state=RND, n_components=1000)),
        ("clf", clf_model)
    ])

    return pipeline, all_features

def oof_cv(X, y, selected_cols, clf_model, model_params=None):
    """Compute out-of-fold calibrated probabilities with stratified K-fold CV.

    Each fold fits a fresh pipeline from ``create_full_pipeline`` on the
    training part, wraps it in a prefit isotonic calibrator, and predicts the
    validation part.

    Args:
        X: Feature DataFrame (indexed positionally with ``.iloc``).
        y: Target Series aligned with ``X``.
        selected_cols: List of selected raw feature names.
        clf_model: Unfitted classifier; it is cloned for every fold.
        model_params: Optional dict of classifier parameters, applied to the
            ``"clf"`` step.

    Returns:
        dict: ``pr_auc``, ``roc_auc`` and ``brier`` computed on the pooled
        out-of-fold predictions, plus the ``oof`` array itself.
    """
    skf = StratifiedKFold(n_splits=CV, shuffle=True, random_state=RND)
    oof = np.zeros(len(y), dtype=float)

    for tr, va in skf.split(X, y):
        Xtr, Xva = X.iloc[tr], X.iloc[va]
        ytr = y.iloc[tr]

        # Reuse the shared builder so the feature generator and the
        # preprocessor work from the same feature list (the generator used to
        # receive only selected_cols, while the preprocessor also expected
        # "missing_count" and "sum_all_bin"). It also gives every fold its own
        # preprocessor instance.
        pipe_base, _ = create_full_pipeline(clone(clf_model), selected_cols)
        if model_params:
            pipe_base.set_params(**{f'clf__{k}': v for k, v in model_params.items()})

        pipe_base.fit(Xtr, ytr)

        # Simplified calibration.
        # NOTE(review): the calibrator is fitted on the same rows the pipeline
        # was trained on; a separate calibration split would be less optimistic.
        clf_calibrated = CalibratedClassifierCV(estimator=pipe_base, method='isotonic', cv='prefit')
        clf_calibrated.fit(Xtr, ytr)
        oof[va] = clf_calibrated.predict_proba(Xva)[:, 1]

    pr = average_precision_score(y, oof)
    roc = roc_auc_score(y, oof)
    brier = brier_score_loss(y, oof)
    return dict(pr_auc=float(pr), roc_auc=float(roc), brier=float(brier), oof=oof)


def fit_final(Xtr, ytr, Xte, yte, selected_cols, clf_model, model_params=None):
    """Fit the pipeline on the training set and score the hold-out set.

    Args:
        Xtr, ytr: Training features and target.
        Xte, yte: Hold-out features and target.
        selected_cols: List of selected raw feature names.
        clf_model: Unfitted classifier; a clone is fitted.
        model_params: Optional dict of classifier parameters, applied to the
            ``"clf"`` step.

    Returns:
        tuple: ``(proba, hold, fi, meta)``. ``proba`` holds the calibrated
        positive-class probabilities for ``Xte``; ``hold`` is a dict with
        ``pr_auc``, ``roc_auc`` and ``brier``; ``fi`` is always ``None`` (no
        feature importance is computed); ``meta`` holds the encoder
        description, timings and ``model_params``.
    """
    # Shared builder: the feature generator and the preprocessor are created
    # from the same feature list (the generator used to receive only
    # selected_cols).
    pipe_base, _ = create_full_pipeline(clone(clf_model), selected_cols)
    if model_params:
        pipe_base.set_params(**{f'clf__{k}': v for k, v in model_params.items()})

    t0 = time.perf_counter()
    pipe_base.fit(Xtr, ytr)
    fit_time_s = time.perf_counter() - t0

    # NOTE(review): the calibrator is fitted on the same rows the pipeline was
    # trained on; a separate calibration split would be less optimistic.
    calibrator = CalibratedClassifierCV(estimator=pipe_base, method='isotonic', cv='prefit')
    calibrator.fit(Xtr, ytr)

    t1 = time.perf_counter()
    proba = calibrator.predict_proba(Xte)[:, 1]
    elapsed_s = time.perf_counter() - t1
    # seconds -> milliseconds (x1000), per row (/n), per 1000 rows (x1000).
    # The earlier formula stopped at milliseconds per row.
    n_rows = len(Xte) if len(Xte) > 0 else 1
    pred_ms_per_1k = elapsed_s * 1000.0 * 1000.0 / n_rows

    fi = None
    meta = {
        "encoder": "OHE + StandardScaler + Nystroem + Calibrator",
        "fit_time_s": float(fit_time_s),
        "predict_time_ms_per_1k": float(pred_ms_per_1k),
        "params": model_params
    }
    hold = dict(
        pr_auc=float(average_precision_score(yte, proba)),
        roc_auc=float(roc_auc_score(yte, proba)),
        brier=float(brier_score_loss(yte, proba)),
    )

    return proba, hold, fi, meta

def save_pr_curve(y_true, proba, out_path):
    """Plot the precision-recall curve and save it as an image.

    Args:
        y_true: True binary labels.
        proba: Predicted scores for the positive class.
        out_path: Destination file for the figure (saved at 150 dpi).
    """
    precision, recall, _thresholds = precision_recall_curve(y_true, proba)
    avg_precision = average_precision_score(y_true, proba)

    fig, ax = plt.subplots(figsize=(7, 5))
    ax.plot(recall, precision, label=f'AP={avg_precision:.4f}')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall')
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.grid(True, alpha=0.3)
    ax.legend()
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)


def save_calibration(y_true, proba, out_path):
    """Plot a reliability (calibration) curve and save it as an image.

    The function used to be a stub that ignored its arguments, although callers
    pass calibrated probabilities. Scores outside ``[0, 1]`` (for example raw
    decision-function values) are still skipped with the original message.

    Args:
        y_true: True binary labels.
        proba: Predicted probabilities for the positive class.
        out_path: Destination file for the figure (saved at 150 dpi).
    """
    scores = np.asarray(proba, dtype=float)
    if scores.size == 0:
        print("[INFO] Calibration plot skipped: no predictions supplied.")
        return
    if scores.min() < 0.0 or scores.max() > 1.0:
        print("[INFO] Kalibrierungsplot für LinearSVC nicht erstellt, da die Ausgabe keine Wahrscheinlichkeit ist.")
        return

    # Quantile bins hold roughly the same number of samples each, which keeps
    # every point populated when the scores are concentrated in a narrow range.
    frac_pos, mean_pred = calibration_curve(y_true, scores, n_bins=10, strategy="quantile")

    # Draw the diagonal only as far as the data reaches so that low
    # probabilities are not squeezed into a corner of the plot.
    upper = float(max(mean_pred.max(), frac_pos.max()))

    fig, ax = plt.subplots(figsize=(7, 5))
    ax.plot([0, upper], [0, upper], linestyle="--", color="gray", label="Perfectly calibrated")
    ax.plot(mean_pred, frac_pos, marker="o", label="Model")
    ax.set_xlabel('Mean predicted probability')
    ax.set_ylabel('Fraction of positives')
    ax.set_title('Calibration')
    ax.grid(True, alpha=0.3)
    ax.legend()
    fig.tight_layout()
    try:
        fig.savefig(out_path, dpi=150)
    finally:
        # Close the figure even when saving fails.
        plt.close(fig)

def save_top20_importance(fi: pd.Series, out_path):
    """Save a horizontal bar chart of the first 20 entries of ``fi``.

    Args:
        fi: Importance values indexed by feature name, or ``None``. The first
            20 entries are plotted in reverse order, so the first entry ends
            up at the top of the chart.
        out_path: Destination file for the figure (saved at 150 dpi).

    Returns:
        None. When ``fi`` is ``None`` or empty only an info message is printed.
    """
    nothing_to_plot = fi is None or fi.empty
    if nothing_to_plot:
        print("[INFO] Feature Importance nicht verfügbar für dieses Modell.")
        return

    ranked = fi.head(20).iloc[::-1]

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.barh(ranked.index, ranked.values)
    ax.set_xlabel('Gain')
    ax.set_title('Top-20 Feature Importance')
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

def log_summary(row, filename="team_model_summary.csv"):
    """Append one result row to the summary CSV under ``BASE_REPORTS_OUT``.

    Args:
        row: Mapping of column name to value for a single model run.
        filename: Name of the CSV file inside ``BASE_REPORTS_OUT``.
    """
    out_csv = BASE_REPORTS_OUT / filename
    # Create the target folder so logging also works before any other report
    # has been written.
    out_csv.parent.mkdir(parents=True, exist_ok=True)
    # Write the header for a new file and for an existing but empty one.
    needs_header = (not out_csv.exists()) or out_csv.stat().st_size == 0
    pd.DataFrame([row]).to_csv(out_csv, mode="a", index=False, header=needs_header)

In [3]:
if __name__ == "__main__":
    # Experiment 1: baseline LinearSVC on a Nystroem feature map.
    print("[SETUP] Starting data loading and splitting.")
    split_p = REPORTS_IN / "split_indices.json"
    feats_p = REPORTS_IN / "features_selected.csv"
    # Explicit check instead of `assert`, which is removed when Python runs
    # with -O and would let the run continue without its inputs.
    if not (split_p.exists() and feats_p.exists()):
        raise FileNotFoundError("Missing split and/or features files in reports.")

    split = json.loads(split_p.read_text())
    selected = load_selected_feature_list()

    # Replace -1 entries with NaN so the imputers in the pipeline handle them.
    df = load_and_save_data().replace(-1, np.nan)
    X_tr_all = df.loc[split["train"]].drop(columns=["target"])
    y_tr = df.loc[split["train"], "target"].astype(int)
    X_te_all = df.loc[split["test"]].drop(columns=["target"])
    y_te = df.loc[split["test"], "target"].astype(int)

    selected_fe = selected.copy()

    print("[SETUP] Data loaded and split successfully.")

    model_name = "LinearSVC_Nystroem"
    print(f"\n--- Running experiments for {model_name} ---")

    REPORTS_OUT_MODEL = BASE_REPORTS_OUT / model_name
    REPORTS_OUT_MODEL.mkdir(parents=True, exist_ok=True)

    # Baseline model run
    print(f"\n[EXPERIMENT 1] Baseline {model_name} with default params")
    base_params = {'C': 1.0, 'class_weight': 'balanced'}
    clf_model_baseline = LinearSVC(random_state=RND, max_iter=1000, **base_params)

    res = oof_cv(X_tr_all, y_tr, selected_fe, clf_model_baseline)
    proba, hold, fi, meta = fit_final(X_tr_all, y_tr, X_te_all, y_te, selected_fe, clf_model_baseline)

    report_name = "baseline"
    pd.DataFrame({"oof": res["oof"]}).to_csv(REPORTS_OUT_MODEL / f"oof_{report_name}.csv", index=False)
    pd.DataFrame({"proba": proba, "y_true": y_te.values}).to_csv(
        REPORTS_OUT_MODEL / f"holdout_preds_{report_name}.csv", index=False
    )
    save_pr_curve(y_te.values, proba, REPORTS_OUT_MODEL / f"plot_pr_{report_name}.png")
    # No calibration plot is produced for the baseline run.
    if fi is not None:
        # Only reached when fit_final returns a feature-importance Series.
        fi.reset_index().rename(columns={"index": "feature"}).to_csv(
            REPORTS_OUT_MODEL / f"fi_gain_{report_name}.csv", index=False
        )
        save_top20_importance(fi, REPORTS_OUT_MODEL / f"plot_fi_top20_{report_name}.png")

    row = {
        "member": MEMBER, "model_name": f"{model_name}_{report_name}", "encoder": meta["encoder"],
        "split_path": str(split_p), "feature_recipe": "selected_fe",
        "seed": RND, "cv_folds": CV, "hold_auc": hold["roc_auc"], "hold_ap": hold["pr_auc"],
        "hold_brier": hold["brier"], "cv_auc_mean": res["roc_auc"], "cv_ap_mean": res["pr_auc"],
        "early_stopping": False, "best_iteration": None,
        "n_trees": None, "fit_time_s": meta["fit_time_s"],
        "predict_time_ms_per_1k": meta["predict_time_ms_per_1k"], "params_json": json.dumps(base_params)
    }
    log_summary(row)

    print(f"\n[BASELINES] PR-AUC={res['pr_auc']:.5f}  ROC-AUC={res['roc_auc']:.5f}  Brier={res['brier']:.5f}")
    print(f"[HOLDOUT] PR-AUC={hold['pr_auc']:.5f}  ROC-AUC={hold['roc_auc']:.5f}  Brier={hold['brier']:.5f}")
    print(f"Reports saved to: {REPORTS_OUT_MODEL}")

[SETUP] Starting data loading and splitting.
Loading dataset from local file: D:\AdA_Project25\158_portoSeguro\data\raw\porto_seguro_safe_driver_prediction.csv
Dataset loaded successfully.
[SETUP] Data loaded and split successfully.

--- Running experiments for LinearSVC_Nystroem ---

[EXPERIMENT 1] Baseline LinearSVC_Nystroem with default params

[BASELINES] PR-AUC=0.05486  ROC-AUC=0.61256  Brier=0.03512
[HOLDOUT] PR-AUC=0.05884  ROC-AUC=0.62792  Brier=0.03499
Reports saved to: D:\AdA_Project25\158_portoSeguro\reports_Hany\LinearSVC_Nystroem


In [None]:
if __name__ == "__main__":
    # Experiment 2: randomized hyper-parameter search over the Nystroem map and
    # the LinearSVC regularisation strength. Relies on names defined by the
    # experiment 1 cell (X_tr_all, y_tr, X_te_all, y_te, selected_fe,
    # model_name, REPORTS_OUT_MODEL, split_p).
    print("\n[EXPERIMENT 2] Running RandomizedSearchCV for Nystroem tuning...")

    # Order-preserving de-duplication keeps the column order identical between
    # runs; list(set(...)) did not guarantee that.
    search_all_features = [
        c for c in dict.fromkeys([*selected_fe, "missing_count", "sum_all_bin"]) if c != 'target'
    ]
    preprocessor_search = create_preprocessor(search_all_features)

    # Search space for Nystroem and LinearSVC
    param_grid = {
        'nystroem__gamma': [0.01, 0.1, 1],
        'nystroem__n_components': [500, 1000, 2000],
        'clf__estimator__C': [0.1, 1, 10]
    }

    pipe_search = Pipeline([
        # The generator receives the same feature list as the preprocessor so
        # the engineered columns the preprocessor selects are always created.
        ("feature_gen", CustomFeatureGenerator(search_all_features)),
        ("pre", preprocessor_search),
        ("nystroem", Nystroem(kernel='rbf', random_state=RND)),
        ("clf", CalibratedClassifierCV(
            LinearSVC(random_state=RND, class_weight='balanced', max_iter=8000),
            method='sigmoid',
        )),
    ])

    random_search = RandomizedSearchCV(
        pipe_search,
        param_distributions=param_grid,
        n_iter=5,
        cv=CV,
        scoring='average_precision',
        random_state=RND,
        n_jobs=2,
        verbose=1
    )

    random_search.fit(X_tr_all, y_tr)

    best_params = random_search.best_params_

    print("\nBest Parameters found:", best_params)
    print("Best PR-AUC from CV:", random_search.best_score_)

    best_pipeline = random_search.best_estimator_

    # Real out-of-fold predictions: refit a clone of the best pipeline on each
    # training fold and predict the held-out fold. Predicting X_tr_all with
    # best_pipeline itself would score rows the model was trained on.
    oof_tuned = np.zeros(len(y_tr), dtype=float)
    skf_oof = StratifiedKFold(n_splits=CV, shuffle=True, random_state=RND)
    for tr_idx, va_idx in skf_oof.split(X_tr_all, y_tr):
        fold_pipe = clone(best_pipeline)
        fold_pipe.fit(X_tr_all.iloc[tr_idx], y_tr.iloc[tr_idx])
        oof_tuned[va_idx] = fold_pipe.predict_proba(X_tr_all.iloc[va_idx])[:, 1]
    cv_auc_tuned = float(roc_auc_score(y_tr, oof_tuned))

    t_pred = time.perf_counter()
    proba_tuned = best_pipeline.predict_proba(X_te_all)[:, 1]
    # seconds -> milliseconds per 1000 rows
    pred_ms_per_1k = (time.perf_counter() - t_pred) * 1000.0 * 1000.0 / max(len(X_te_all), 1)

    hold_tuned = {
        "pr_auc": float(average_precision_score(y_te, proba_tuned)),
        "roc_auc": float(roc_auc_score(y_te, proba_tuned)),
        "brier": float(brier_score_loss(y_te, proba_tuned)),
    }

    report_name = "tuned_nystroem_calibrated"

    pd.DataFrame({"oof": oof_tuned}).to_csv(REPORTS_OUT_MODEL / f"oof_{report_name}.csv", index=False)
    pd.DataFrame({"proba": proba_tuned, "y_true": y_te.values}).to_csv(
        REPORTS_OUT_MODEL / f"holdout_preds_{report_name}.csv", index=False
    )
    save_pr_curve(y_te.values, proba_tuned, REPORTS_OUT_MODEL / f"plot_pr_{report_name}.png")
    save_calibration(y_te.values, proba_tuned, REPORTS_OUT_MODEL / f"plot_calibration_{report_name}.png")

    row = {
        "member": MEMBER, "model_name": f"{model_name}_{report_name}",
        "encoder": "OHE + StandardScaler + Nystroem + CalibratedClassifierCV(sigmoid)",
        "split_path": str(split_p), "feature_recipe": "selected_fe",
        "seed": RND, "cv_folds": CV, "hold_auc": hold_tuned["roc_auc"], "hold_ap": hold_tuned["pr_auc"],
        # best_score_ is the search's mean average precision, so it belongs in
        # the AP column only; the AUC column uses the out-of-fold ROC-AUC.
        "hold_brier": hold_tuned["brier"], "cv_auc_mean": cv_auc_tuned,
        "cv_ap_mean": float(random_search.best_score_),
        "early_stopping": False, "best_iteration": None,
        # refit_time_: seconds the search spent refitting the best estimator.
        "n_trees": None, "fit_time_s": float(random_search.refit_time_),
        "predict_time_ms_per_1k": float(pred_ms_per_1k),
        "params_json": json.dumps(random_search.best_params_)
    }
    log_summary(row)

    print(f"\n[TUNED] PR-AUC from CV: {random_search.best_score_:.5f}")
    print(f"[HOLDOUT] PR-AUC: {hold_tuned['pr_auc']:.5f} ROC-AUC: {hold_tuned['roc_auc']:.5f} Brier: {hold_tuned['brier']:.5f}")
    print(f"Reports saved to: {REPORTS_OUT_MODEL}")


[EXPERIMENT 2] Running RandomizedSearchCV for Nystroem tuning...
Fitting 5 folds for each of 5 candidates, totalling 25 fits
