In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from utils import load_series_dfs
from pathlib import Path
import time
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os, re, time
import pickle
import json, numpy as np, pandas as pd
from tslearn.metrics import cdist_dtw
from tslearn.preprocessing import TimeSeriesResampler

In [2]:
with open("../data/series_by_bin.pkl", "rb") as f:
    series_by_bin_loaded = pickle.load(f)

In [3]:
series_by_bin_loaded.keys()

dict_keys(['0–5%', '10–15%', '5–10%', '20–25%', '>30%', '25–30%'])

In [7]:
assignment_auto = pd.read_csv("../data/assignments/assignment_auto.csv")

assignment_auto = assignment_auto [['bin', 'series_key', 'assigned_medoid', 'distance']]

assignment_auto.head()

Unnamed: 0,bin,series_key,assigned_medoid,distance
0,0–5%,FL_00024360.PLAN.MengeHH.2,CV_00003432.PLAN.Menge,37.91466
1,0–5%,FL_00024360.PLAN.MengeGA,FL_00024360.PLAN.MengeGA,5.527229e-07
2,0–5%,FL_00352244.PLAN.MengeKW,CV_00003432.PLAN.Menge,12.1262
3,0–5%,FL_00352244.PLAN.MengeGA,CV_00003432.PLAN.Menge,20.05376
4,0–5%,FL_00352251.PLAN.MengeGA,CV_00003432.PLAN.Menge,18.80089


In [10]:
assignment_fixed = pd.read_csv("../data/assignments/assignment_minimaler_rmse.csv")

assignment_fixed = assignment_fixed[['bin', 'series_key', 'assigned_medoid', 'distance']]

assignment_fixed.head()

Unnamed: 0,bin,series_key,assigned_medoid,distance
0,0–5%,FL_00024360.PLAN.MengeHH.2,CV_00003432.PLAN.Menge,37.91466
1,0–5%,FL_00024360.PLAN.MengeGA,FL_00024360.PLAN.MengeGA,5.527229e-07
2,0–5%,FL_00352244.PLAN.MengeKW,CV_00003432.PLAN.Menge,12.1262
3,0–5%,FL_00352244.PLAN.MengeGA,CV_00003432.PLAN.Menge,20.05376
4,0–5%,FL_00352251.PLAN.MengeGA,CV_00003432.PLAN.Menge,18.80089


In [14]:
# -*- coding: utf-8 -*-
import os
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, Tuple, List

# =========================
# CONFIG
# =========================
BASE_DIR = Path("../data")
MODELS_DIR = BASE_DIR / "sarimax_models"
ASSIGNMENTS = {
    "auto": BASE_DIR / "assignments" / "assignment_auto.csv",
    "fixed": BASE_DIR / "assignments" / "assignment_minimaler_rmse.csv",
}
# drei Varianten + zugehörige Feature-Sets
FEATURE_SETS = {
    "rmse_toleranz": ['hour', 'w_tl', 'w_ff', 'w_tb10', 'w_tb20', 'CEGH_WAP', 'THE_WAP'],
    "bester_score":  ['w_tl','w_tb10'],
    "minimaler_rmse":['hour', 'weekday', 'month', 'is_weekend', 'w_tl', 'w_rf',
                      'w_ff', 'w_ffx', 'w_cglo', 'w_so_h', 'w_rr',
                      'w_tb10', 'w_tb20', 'CEGH_WAP', 'THE_WAP'],
}
TARGET_COL = "consumption"        # Name deiner Zielspalte in jeder Serie
FREQ = "H"                        # Datenfrequenz
OUT_DIR = BASE_DIR / "experiment_results"
SAVE_IMPUTED_SERIES = True        # auf False setzen, wenn du nur die Metriken willst
RANDOM_SEED = 42

# Synthetische Lücken: (n_windows, window_lengths_in_hours)
GAP_WINDOWS = {
    "short6h":  (3, 6),    # 3 Lücken à 6h
    "day24h":   (2, 24),   # 2 Lücken à 24h
    "long72h":  (1, 72),   # 1 Lücke à 72h
}
# =========================


# ---------- Utils
def ensure_outdir(p: Path):
    p.mkdir(parents=True, exist_ok=True)

def load_assignments(paths: Dict[str, Path]) -> pd.DataFrame:
    dfs = []
    for name, p in paths.items():
        if p and p.exists():
            df = pd.read_csv(p)
            df["assignment_source"] = name
            dfs.append(df)
    if not dfs:
        raise FileNotFoundError("Keine Assignment-Dateien gefunden.")
    out = pd.concat(dfs, ignore_index=True)
    # Normalisiere Spaltennamen erwartungsgemäß:
    cols = {"series_key":"series_key", "assigned_medoid":"assigned_medoid", "bin":"bin"}
    # naiv absichern
    out.rename(columns={k:v for k,v in cols.items() if k in out.columns and v!=k}, inplace=True)
    return out

def model_path_for(medoid: str, variant: str) -> Path:
    # Dateien heißen z.B. "CV_00003432.PLAN.Menge__minimaler_rmse.pkl"
    fname = f"{medoid}__{variant}.pkl"
    return MODELS_DIR / fname

def load_model(medoid: str, variant: str):
    p = model_path_for(medoid, variant)
    if not p.exists():
        raise FileNotFoundError(f"Modell fehlt: {p}")
    with open(p, "rb") as f:
        res = pickle.load(f)   # statsmodels SARIMAXResults
    return res

def select_exog(df: pd.DataFrame, features: List[str]) -> pd.DataFrame:
    missing = [c for c in features if c not in df.columns]
    if missing:
        raise KeyError(f"Exogene fehlen in Serie: {missing}")
    return df[features]

def metrics(y_true: pd.Series, y_pred: pd.Series) -> Dict[str, float]:
    mask = y_true.notna() & y_pred.notna()
    if mask.sum() == 0:
        return {"rmse": np.nan, "mae": np.nan, "mape": np.nan}
    e = (y_true[mask] - y_pred[mask])
    rmse = float(np.sqrt(np.mean(np.square(e))))
    mae  = float(np.mean(np.abs(e)))
    # MAPE robust (vermeide Nullen)
    denom = y_true[mask].replace(0, np.nan)
    mape = float(np.mean(np.abs(e / denom))) * 100.0
    return {"rmse": rmse, "mae": mae, "mape": mape}

def random_gap_mask(idx: pd.DatetimeIndex, n_windows: int, win_len_hours: int, rng: np.random.Generator) -> pd.Series:
    """Erzeuge bool-Maske (True = Lücke) über den Index."""
    mask = pd.Series(False, index=idx)
    if len(idx) == 0:
        return mask
    # mögliche Startpunkte so wählen, dass Fenster reinpasst
    if len(idx) < win_len_hours:
        return mask
    positions = np.arange(0, len(idx) - win_len_hours + 1)
    rng.shuffle(positions)
    starts = []
    for pos in positions:
        # vermeide starke Überschneidungen
        if all(abs(pos - s) >= win_len_hours for s in starts):
            starts.append(pos)
            if len(starts) >= n_windows:
                break
    for s in starts:
        sel = idx[s : s + win_len_hours]
        mask.loc[sel] = True
    return mask

def inject_gaps(series_df: pd.DataFrame,
                patterns: Dict[str, Tuple[int, int]],
                rng: np.random.Generator) -> Dict[str, pd.Series]:
    """Gibt dict mit Masken (pro Pattern) zurück, welche Zeitpunkte genullt werden sollen."""
    idx = series_df.index
    masks = {}
    for name, (n, L) in patterns.items():
        masks[name] = random_gap_mask(idx, n_windows=n, win_len_hours=L, rng=rng)
    return masks

def impute_with_sarimax(res, endog: pd.Series, exog: pd.DataFrame) -> pd.Series:
    """
    Nutzt ein trainiertes SARIMAXResults-Objekt (vom Medoid) um Werte für endog zu erzeugen.
    Erwartung: endog enthält NaNs an zu imputierenden Stellen, exog deckt gesamten Zeitraum ab.
    """
    # Für statsmodels: get_prediction oder predict – hier sauber mit get_prediction, damit Index passt
    start = endog.index.min()
    end = endog.index.max()
    pred = res.get_prediction(start=start, end=end, exog=exog).predicted_mean
    # nur dort ersetzen, wo endog fehlt:
    out = endog.copy()
    needs = endog.isna()
    out.loc[needs] = pred.loc[needs]
    return out

# ---------- Hauptlauf
def run_experiments(series_by_bin_loaded: Dict[str, Dict[str, pd.DataFrame]]):
    ensure_outdir(OUT_DIR)
    results_rows = []
    rng = np.random.default_rng(RANDOM_SEED)
    assignments = load_assignments(ASSIGNMENTS)

    # schneller Lookup
    assign_lookup = assignments.set_index("series_key")["assigned_medoid"].to_dict()
    bin_lookup = assignments.set_index("series_key")["bin"].to_dict()

    for bin_key, series_map in series_by_bin_loaded.items():
        for series_key, df in series_map.items():
            # Ziel & Exogene prüfen
            if TARGET_COL not in df.columns:
                # ggf. in manchen Strukturen heißt target anders
                continue

            # Serie auf regelmäßige Frequenz bringen (optional; kommentiere aus, wenn schon sauber)
            df = df.sort_index()
            try:
                df = df.asfreq(FREQ)
            except Exception:
                pass

            # Medoid finden – erst aus Assignment, fallback: übers Bin-Assignment matchen
            medoid = assign_lookup.get(series_key)
            if medoid is None:
                # Notfall: suche im gleichen Bin, wo assigned_medoid==series_key (falls es ein Medoid selbst ist)
                # oder überspringe
                # print(f"[WARN] Kein Medoid für {series_key} – übersprungen.")
                continue

            # synthetische Lücken erzeugen
            gap_masks = inject_gaps(df, GAP_WINDOWS, rng=rng)

            # je Variante/Feature-Set testen
            for variant, feats in FEATURE_SETS.items():
                # Modell laden
                try:
                    res = load_model(medoid, variant)
                except FileNotFoundError as e:
                    # print(e)
                    continue

                # Exog wählen
                try:
                    exog_full = select_exog(df, feats)
                except KeyError as e:
                    # print(f"[{series_key}] {e}")
                    continue

                # Für jede Gap-Konfiguration eine Kopie mit NaNs im Target bauen
                for gap_name, mask in gap_masks.items():
                    y_true = df[TARGET_COL].copy()
                    y_with_nans = y_true.copy()
                    # echte NaNs beibehalten + synthetische hinzufügen
                    y_with_nans.loc[mask.index[mask]] = np.nan

                    # imputieren
                    try:
                        y_imputed = impute_with_sarimax(res, y_with_nans, exog_full)
                    except Exception as e:
                        # z.B. Dimensionskonflikte
                        # print(f"[{series_key} | {variant} | {gap_name}] Imputation fehlgeschlagen: {e}")
                        continue

                    # Metriken nur auf synthetischen Lücken auswerten (wo wir Ground Truth kennen)
                    eval_mask = mask & y_true.notna()
                    m = metrics(y_true[eval_mask], y_imputed[eval_mask])

                    results_rows.append({
                        "bin": bin_key,
                        "series_key": series_key,
                        "assignment_bin": bin_lookup.get(series_key, np.nan),
                        "assigned_medoid": medoid,
                        "variant": variant,
                        "features": ",".join(feats),
                        "gap_pattern": gap_name,
                        "n_eval_points": int(eval_mask.sum()),
                        **m
                    })

                    # optional: imputierte Gesamtreihe speichern
                    if SAVE_IMPUTED_SERIES:
                        out_series_dir = OUT_DIR / "imputed_series" / variant / bin_key
                        ensure_outdir(out_series_dir)
                        fn = out_series_dir / f"{series_key}__{gap_name}.parquet"
                        pd.DataFrame({
                            TARGET_COL: y_true,
                            f"{TARGET_COL}_imputed": y_imputed
                        }, index=df.index).to_parquet(fn)

    # Gesamtergebnis als CSV
    res_df = pd.DataFrame(results_rows)
    ensure_outdir(OUT_DIR)
    res_path = OUT_DIR / "experiment_summary.csv"
    res_df.to_csv(res_path, index=False)

    # kleine Pivot-Übersicht (Median-Metrik je variant/gap)
    if not res_df.empty:
        pivot = (
            res_df
            .groupby(["variant", "gap_pattern"])
            .agg(rmse_median=("rmse","median"),
                 mae_median=("mae","median"),
                 mape_median=("mape","median"),
                 n=("n_eval_points","sum"))
            .reset_index()
            .sort_values(["variant","gap_pattern"])
        )
        pivot_path = OUT_DIR / "experiment_pivot.csv"
        pivot.to_csv(pivot_path, index=False)
        print("Fertig. Ergebnisse in:")
        print(f"- {res_path}")
        print(f"- {pivot_path}")
    else:
        print("Keine Ergebnisse erzeugt – bitte Logs/Annahmen prüfen.")

# =========================
# AUFRUF
# =========================
# Erwartet: 'series_by_bin_loaded' ist schon vorhanden; sonst hier laden.
run_experiments(series_by_bin_loaded)


  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.asfreq(FREQ)
  df = df.a

Keine Ergebnisse erzeugt – bitte Logs/Annahmen prüfen.


In [13]:
from pathlib import Path

BASE_DIR = Path("../data")  # <- eine Ebene hoch zur Repo-Root
MODELS_DIR = BASE_DIR / "sarimax_models"

ASSIGNMENTS = {
    "auto": BASE_DIR / "assignments" / "assignment_auto.csv",
    # falls vorhanden, kannst du weitere Dateien ergänzen:
    # "fixed": BASE_DIR / "assignments" / "assignment_minimaler_rmse.csv",
}

print("Exists?", (BASE_DIR / "assignments" / "assignment_auto.csv").exists())

Exists? True


In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("../data/experiment_results/experiment_summary.csv")

# Boxplot: RMSE nach Modellvariante
plt.figure(figsize=(8,5))
sns.boxplot(data=df, x="variant", y="rmse")
plt.title("RMSE Verteilung nach Modellvariante")
plt.show()

# Grouped Barplot: median RMSE pro Gap-Pattern + Modell
pivot = (
    df.groupby(["gap_pattern","variant"])
      .agg(rmse_median=("rmse","median"))
      .reset_index()
)
plt.figure(figsize=(8,5))
sns.barplot(data=pivot, x="gap_pattern", y="rmse_median", hue="variant")
plt.title("Median RMSE pro Gap-Pattern und Modellvariante")
plt.show()

# Heatmap
heat = pivot.pivot(index="gap_pattern", columns="variant", values="rmse_median")
plt.figure(figsize=(6,4))
sns.heatmap(heat, annot=True, fmt=".2f", cmap="viridis")
plt.title("Median RMSE (Heatmap)")
plt.show()

EmptyDataError: No columns to parse from file