In [11]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from utils import load_series_dfs
from pathlib import Path
import time
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os, re, time
import pickle
import json, numpy as np, pandas as pd
from tslearn.metrics import cdist_dtw
from tslearn.preprocessing import TimeSeriesResampler

In [3]:
# Load the dictionary of series grouped by bin from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles that come from a trusted source.
with open("../data/series_by_bin.pkl", "rb") as f:
    series_by_bin_loaded = pickle.load(f)

In [4]:
# Top-level keys of the loaded dictionary (the bin labels).
series_by_bin_loaded.keys()

dict_keys(['0–5%', '10–15%', '5–10%', '20–25%', '>30%', '25–30%'])

In [5]:
# Series identifiers stored under the '10–15%' bin.
series_by_bin_loaded['10–15%'].keys()

dict_keys(['FL_00024360.PLAN.MengeHH.1', 'FL_00024360.PLAN.MengeKW', 'FL_00025289.PLAN.MengeGA', 'FL_00025287.PLAN.MengeGA', 'FL_00025297.PLAN.MengeGA', 'FL_00352205.PLAN.MengeKW', 'FL_00352205.PLAN.MengeGA', 'FL_00352218.PLAN.MengeKW', 'FL_00352218.PLAN.MengeGA', 'FL_00024673.PLAN.MengeKW', 'FL_00352217.PLAN.MengeGA', 'FL_00352430.PLAN.MengeGA', 'FL_00352230.PLAN.MengeHH', 'FL_00352206.PLAN.MengeHH', 'FL_00025273.PLAN.MengeHH', 'FL_00025289.PLAN.MengeHH', 'FL_00025287.PLAN.MengeHH', 'FL_00025297.PLAN.MengeHH', 'FL_00352205.PLAN.MengeHH', 'FL_00025252.PLAN.MengeHH', 'FL_00024668.PLAN.MengeHH', 'FL_00024607.PLAN.MengeHH', 'FL_00352251.PLAN.MengeHH', 'FL_00352207.PLAN.MengeHH', 'FL_00352298.PLAN.MengeHH', 'KN_00005139.PLAN.Menge', 'KN_00005138.PLAN.Menge', 'KN_00005137.PLAN.Menge', 'KN_00005136.PLAN.Menge', 'KN_00005135.PLAN.Menge', 'KN_00005134.PLAN.Menge', 'KN_00005133.PLAN.Menge', 'KN_00005132.PLAN.Menge', 'KN_00005130.PLAN.Menge', 'KN_00005129.PLAN.Menge', 'KN_00005128.PLAN.Menge', '

In [18]:
# Directory holding the saved medoid-classifier artifacts (npz blob + params.json).
ARTIFACT_DIR = Path("../data_public/cluster_mediods/artifacts_medoid_classifier")

# Load the artifacts.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code — only use it with trusted files.
blob = np.load(ARTIFACT_DIR / "medoid_classifier.npz", allow_pickle=True)
MEDOID_TS   = blob["medoid_series"]        # (k, target_len)
MEDOID_KEYS = blob["medoid_keys"].tolist() # list[str]
PARAMS      = json.loads((ARTIFACT_DIR / "params.json").read_text())
# NOTE(review): the functions visible in this notebook read the radius from
# PARAMS directly; RADIUS itself is not referenced by them.
RADIUS      = PARAMS["sakoe_chiba_radius"]

def zscore_safe(x):
    """Z-score an array-like while tolerating NaNs and degenerate input.

    Parameters
    ----------
    x : array-like
        Values to standardise; converted to a float ndarray.

    Returns
    -------
    np.ndarray
        ``(x - nanmean) / nanstd``. An empty or all-NaN input yields zeros of
        the same shape. If the standard deviation is zero or non-finite, the
        values are only centred (divided by 1.0).
    """
    values = np.asarray(x, dtype=float)
    if values.size == 0 or np.isnan(values).all():
        return np.zeros_like(values)

    center = np.nanmean(values)
    spread = np.nanstd(values)
    # A constant or otherwise degenerate series has no usable scale.
    scale = spread if (np.isfinite(spread) and spread != 0.0) else 1.0
    return (values - center) / scale

def preprocess_series_1d(s: pd.Series, params=PARAMS) -> np.ndarray:
    """Turn a raw series into a fixed-length vector suitable for DTW.

    Steps: coerce the index to naive datetimes, sort, drop rows with an
    unparsable timestamp, resample to ``params["resample_rule"]`` (default
    "H"), fill gaps, optionally z-score, pad short series with their last
    value and finally resample to ``params["target_len"]`` points.

    Parameters
    ----------
    s : pd.Series
        Input values indexed by something ``pd.to_datetime`` can parse.
    params : dict
        Preprocessing settings; reads ``target_len`` (required) and the
        optional ``resample_rule``, ``interpolate_limit`` and ``zscore`` keys.

    Returns
    -------
    np.ndarray
        1-D array of length ``params["target_len"]`` without non-finite
        values. A zero vector is returned when no usable data remains.
    """
    def _zero_vector() -> np.ndarray:
        # Looked up lazily so the key is only required where it is used.
        return np.zeros(params["target_len"], dtype="float32")

    series = s.copy()

    # Normalise the index: parse timestamps, convert tz-aware values to naive UTC.
    series.index = pd.to_datetime(series.index, errors="coerce")
    if getattr(series.index, "tz", None) is not None:
        series.index = series.index.tz_convert("UTC").tz_localize(None)
    series = series.sort_index()
    has_timestamp = ~series.index.isna()
    series = series[has_timestamp].astype("float64", copy=False)

    # Nothing left after cleanup: return zeros (and do NOT resample).
    if series.empty:
        return _zero_vector()

    # Regular grid, then interpolate and fill the edges.
    rule = params.get("resample_rule", "H")
    regular = series.resample(rule).mean()
    gap_limit = params.get("interpolate_limit", None)
    regular = regular.interpolate(limit=gap_limit, limit_direction="both")
    regular = regular.ffill().bfill()

    # Still all NaN: return zeros.
    if regular.dropna().empty:
        return _zero_vector()

    values = regular.values
    if params.get("zscore", True):
        values = zscore_safe(values)
    values = values.astype("float32")
    if values.size == 0:
        return _zero_vector()

    # Too short: extend with the last observed value up to target_len.
    target_len = params["target_len"]
    shortfall = target_len - values.shape[0]
    if shortfall > 0:
        tail = np.full((shortfall,), values[-1], dtype="float32")
        values = np.concatenate([values, tail])

    # The resampler expects (n_series, length, n_dims); only non-empty input reaches it.
    resampler = TimeSeriesResampler(sz=target_len)
    resampled = resampler.fit_transform(values.reshape(1, -1, 1))[0][:, 0]

    # Safety net: never hand NaN/inf to the DTW.
    resampled[~np.isfinite(resampled)] = 0.0
    return resampled

def assign_series_to_medoid(new_df: pd.DataFrame) -> dict:
    """Find the medoid closest to a series under DTW.

    The 'consumption' column of ``new_df`` is preprocessed with
    ``preprocess_series_1d`` and compared against every row of ``MEDOID_TS``.
    Sakoe-Chiba constrained DTW is tried with increasingly tolerant radii; if
    none produces a finite distance, unconstrained DTW is used, and if that
    call raises, plain Euclidean distance.

    Returns
    -------
    dict
        ``assigned_medoid_key`` (key of the nearest medoid), ``distance``
        (its distance) and ``ranked_candidates`` (list of
        ``(medoid_key, distance)`` sorted by ascending distance).
    """
    query = preprocess_series_1d(new_df['consumption'], PARAMS)   # (target_len,)
    query_2d = query.reshape(1, -1)
    medoids = MEDOID_TS                                           # (k, target_len)

    n_points = len(query)
    candidate_radii = [
        PARAMS.get("sakoe_chiba_radius", 5),
        max(5, int(0.05 * n_points)),    # 5% of the length
        max(10, int(0.1 * n_points)),    # 10% of the length
    ]

    for radius in candidate_radii:
        banded = cdist_dtw(query_2d, medoids,
                           global_constraint="sakoe_chiba",
                           sakoe_chiba_radius=radius)[0]
        if np.isfinite(banded).any():
            dists = banded
            break
    else:
        # Emergency path: no radius gave a finite distance.
        try:
            dists = cdist_dtw(query_2d, medoids)[0]
        except Exception:
            # Fallback: Euclidean distance.
            dists = np.linalg.norm(medoids - query_2d, axis=1)

    ranking = np.argsort(dists).tolist()
    winner = int(ranking[0])
    return {
        "assigned_medoid_key": MEDOID_KEYS[winner],
        "distance": float(dists[winner]),
        "ranked_candidates": [(MEDOID_KEYS[idx], float(dists[idx])) for idx in ranking],
    }

In [19]:
import os
import pandas as pd
from pathlib import Path
# Load the per-model results table; the helpers below read its
# medoid, feature_set, RMSE and model_path columns.
df_results = pd.read_csv("../data/sarimax_models/df_results.csv")
# ---------------------------
# 1) Helper: pick the best model per medoid from df_results
# ---------------------------
def best_model_per_medoid(df_results: pd.DataFrame) -> pd.DataFrame:
    """Return, for every medoid, the row of ``df_results`` with the lowest RMSE.

    Parameters
    ----------
    df_results : pd.DataFrame
        Must contain the columns ``medoid``, ``feature_set``, ``RMSE`` and
        ``model_path``.

    Returns
    -------
    pd.DataFrame
        One row per medoid (sorted by medoid) with the columns ``medoid``,
        ``chosen_feature_set``, ``chosen_model_rmse`` and
        ``chosen_model_path``. Rows without a medoid are ignored.
    """
    wanted = ["medoid", "feature_set", "RMSE", "model_path"]
    best = (
        df_results
        .dropna(subset=["medoid"])
        # NaN RMSE values sort last, so a scored model always wins.
        .sort_values(["medoid", "RMSE"], ascending=[True, True])
        # Keep the whole winning row. GroupBy.first() would instead take the
        # first non-null value of each column separately and could combine
        # the RMSE of one model with the path of another.
        .drop_duplicates(subset="medoid", keep="first")
        .loc[:, wanted]
        .reset_index(drop=True)
    )
    return best.rename(columns={"feature_set": "chosen_feature_set",
                                "model_path": "chosen_model_path",
                                "RMSE": "chosen_model_rmse"})

# Optional: instead of "best per medoid", force one feature set globally
def fixed_feature_set_per_medoid(df_results: pd.DataFrame, feature_set: str) -> pd.DataFrame:
    """Return, for every medoid, the lowest-RMSE row of a given feature set.

    Parameters
    ----------
    df_results : pd.DataFrame
        Must contain the columns ``medoid``, ``feature_set``, ``RMSE`` and
        ``model_path``.
    feature_set : str
        Only rows whose ``feature_set`` equals this value are considered.

    Returns
    -------
    pd.DataFrame
        One row per medoid that has a model for ``feature_set`` (sorted by
        medoid) with the columns ``medoid``, ``chosen_feature_set``,
        ``chosen_model_rmse`` and ``chosen_model_path``. Empty if the feature
        set does not occur.
    """
    wanted = ["medoid", "feature_set", "RMSE", "model_path"]
    take = (
        df_results.loc[df_results["feature_set"] == feature_set]
        .dropna(subset=["medoid"])
        .sort_values(["medoid", "RMSE"])
        # Keep the whole winning row. GroupBy.first() would pick the first
        # non-null value per column and could mix values from different rows.
        .drop_duplicates(subset="medoid", keep="first")
        .loc[:, wanted]
        .reset_index(drop=True)
    )
    return take.rename(columns={"feature_set": "chosen_feature_set",
                                "model_path": "chosen_model_path",
                                "RMSE": "chosen_model_rmse"})

# ---------------------------
# 2) Assign every series in every bin to its nearest medoid
# ---------------------------
def assign_all_series_to_medoids(series_by_bin_loaded: dict) -> pd.DataFrame:
    """
    Assign each series to a medoid via ``assign_series_to_medoid``.

    Expected input structure:
      { bin_label: { series_key: df_with_consumption, ... }, ... }
    Returns a DataFrame with one row per series and the columns
      bin, series_key, assigned_medoid, distance, ranked_candidates (list).
    An ``error`` column is present only if at least one series failed; failed
    series get ``assigned_medoid=None``, a NaN distance and the error message.
    For empty input the frame has zero rows but still carries the five base
    columns, so downstream merges on ``assigned_medoid`` keep working.
    """
    base_columns = ["bin", "series_key", "assigned_medoid", "distance", "ranked_candidates"]
    rows = []
    for bin_label, subdict in series_by_bin_loaded.items():
        for series_key, df in subdict.items():
            try:
                res = assign_series_to_medoid(df)
                rows.append({
                    "bin": bin_label,
                    "series_key": series_key,
                    "assigned_medoid": res["assigned_medoid_key"],
                    "distance": res["distance"],
                    "ranked_candidates": res["ranked_candidates"],  # list of (medoid, dist)
                })
            except Exception as e:
                # Best effort: one bad series must not abort the whole run;
                # the failure is recorded in the row instead.
                rows.append({
                    "bin": bin_label,
                    "series_key": series_key,
                    "assigned_medoid": None,
                    "distance": float("nan"),
                    "ranked_candidates": [],
                    "error": str(e),
                })
    if not rows:
        # pd.DataFrame([]) would have no columns at all.
        return pd.DataFrame(columns=base_columns)
    return pd.DataFrame(rows)

# ---------------------------
# 3) Combine: medoid assignment + model choice per medoid
# ---------------------------
def build_assignment_table(series_by_bin_loaded: dict,
                           df_results: pd.DataFrame,
                           force_feature_set: str | None = None) -> pd.DataFrame:
    """
    Assign every series to a medoid and attach the model chosen for that medoid.

    force_feature_set=None  -> best feature set per medoid (min RMSE)
    force_feature_set="minimaler_rmse" (or "bester_score" / "rmse_toleranz") -> fixed

    Returns one row per series with the columns bin, series_key,
    assigned_medoid, distance, feature_set, model_rmse, model_path and
    ranked_candidates, followed by any remaining columns.
    """
    assignments = assign_all_series_to_medoids(series_by_bin_loaded)

    model_map = (
        fixed_feature_set_per_medoid(df_results, force_feature_set)
        if force_feature_set
        else best_model_per_medoid(df_results)
    )

    # Left join keeps series whose medoid has no matching model (NaN model columns).
    merged = assignments.merge(model_map, left_on="assigned_medoid",
                               right_on="medoid", how="left")
    merged = merged.drop(columns=["medoid"])
    merged = merged.rename(columns={"chosen_feature_set": "feature_set",
                                    "chosen_model_path": "model_path",
                                    "chosen_model_rmse": "model_rmse"})

    # Fixed leading columns, everything else appended in its existing order.
    leading = ["bin", "series_key", "assigned_medoid", "distance",
               "feature_set", "model_rmse", "model_path", "ranked_candidates"]
    trailing = [name for name in merged.columns if name not in leading]
    return merged[leading + trailing]

# ---------------------------
# 4) Example calls
# ---------------------------
# A) Automatically pick the best model per medoid
assignment_auto = build_assignment_table(series_by_bin_loaded, df_results, force_feature_set=None)
print(assignment_auto.head())

# B) Force one feature set globally (e.g. "minimaler_rmse")
assignment_fixed = build_assignment_table(series_by_bin_loaded, df_results, force_feature_set="minimaler_rmse")
print(assignment_fixed.head())

# Optional: save as CSV
Path("../data/assignments").mkdir(parents=True, exist_ok=True)
assignment_auto.to_csv("../data/assignments/assignment_auto.csv", index=False)
assignment_fixed.to_csv("../data/assignments/assignment_minimaler_rmse.csv", index=False)


    bin                  series_key           assigned_medoid      distance  \
0  0–5%  FL_00024360.PLAN.MengeHH.2    CV_00003432.PLAN.Menge  3.791466e+01   
1  0–5%    FL_00024360.PLAN.MengeGA  FL_00024360.PLAN.MengeGA  5.527229e-07   
2  0–5%    FL_00352244.PLAN.MengeKW    CV_00003432.PLAN.Menge  1.212620e+01   
3  0–5%    FL_00352244.PLAN.MengeGA    CV_00003432.PLAN.Menge  2.005376e+01   
4  0–5%    FL_00352251.PLAN.MengeGA    CV_00003432.PLAN.Menge  1.880089e+01   

      feature_set    model_rmse  \
0  minimaler_rmse  1.126163e+02   
1  minimaler_rmse  2.139510e+07   
2  minimaler_rmse  1.126163e+02   
3  minimaler_rmse  1.126163e+02   
4  minimaler_rmse  1.126163e+02   

                                          model_path  \
0  ../data/sarimax_models\CV_00003432.PLAN.Menge_...   
1  ../data/sarimax_models\FL_00024360.PLAN.MengeG...   
2  ../data/sarimax_models\CV_00003432.PLAN.Menge_...   
3  ../data/sarimax_models\CV_00003432.PLAN.Menge_...   
4  ../data/sarimax_models\CV_00003

In [20]:
# Display the automatic assignment table (pandas truncates long frames when rendering).
assignment_auto

Unnamed: 0,bin,series_key,assigned_medoid,distance,feature_set,model_rmse,model_path,ranked_candidates
0,0–5%,FL_00024360.PLAN.MengeHH.2,CV_00003432.PLAN.Menge,3.791466e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 37.914661593712125),..."
1,0–5%,FL_00024360.PLAN.MengeGA,FL_00024360.PLAN.MengeGA,5.527229e-07,minimaler_rmse,2.139510e+07,../data/sarimax_models\FL_00024360.PLAN.MengeG...,"[(FL_00024360.PLAN.MengeGA, 5.527229189641496e..."
2,0–5%,FL_00352244.PLAN.MengeKW,CV_00003432.PLAN.Menge,1.212620e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 12.126200788734051),..."
3,0–5%,FL_00352244.PLAN.MengeGA,CV_00003432.PLAN.Menge,2.005376e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 20.053757143137755),..."
4,0–5%,FL_00352251.PLAN.MengeGA,CV_00003432.PLAN.Menge,1.880089e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 18.800892315069326),..."
...,...,...,...,...,...,...,...,...
314,>30%,CV_00349016.PLAN.Menge,CV_00003432.PLAN.Menge,3.406638e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 34.06637886324753), ..."
315,>30%,CV_00349013.PLAN.Menge,CV_00003432.PLAN.Menge,2.933625e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 29.336245217206784),..."
316,>30%,CV_00349005.PLAN.Menge,CV_00003432.PLAN.Menge,2.604073e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 26.040727552583633),..."
317,>30%,CV_00348857.PLAN.Menge,CV_00003432.PLAN.Menge,3.178190e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 31.781897244269263),..."


In [17]:
# Distinct feature sets in the forced-feature-set table; NaN appears where the
# left merge found no model row for a series' assigned medoid.
assignment_fixed.feature_set.unique()

array(['minimaler_rmse', nan], dtype=object)

In [21]:
def models_for_medoid(df_results: pd.DataFrame) -> pd.DataFrame:
    """Collect all models of each medoid into one list, best (lowest RMSE) first.

    Parameters
    ----------
    df_results : pd.DataFrame
        Must contain the columns ``medoid``, ``feature_set``, ``RMSE`` and
        ``model_path``; other columns are ignored.

    Returns
    -------
    pd.DataFrame
        Columns ``medoid`` and ``model_candidates``, one row per medoid
        (sorted by medoid). ``model_candidates`` is a list of dicts with the
        keys ``feature_set``, ``rmse`` and ``model_path``, ordered by
        ascending RMSE. Rows without a medoid are ignored. An empty input
        gives an empty frame with the same two columns.
    """
    keep = ["medoid", "feature_set", "RMSE", "model_path"]
    ordered = (
        df_results[keep]
        .dropna(subset=["medoid"])
        # Sort per medoid by RMSE so the best model comes first.
        .sort_values(["medoid", "RMSE"], ascending=[True, True])
    )

    # Group explicitly instead of groupby.apply(): apply() returning lists
    # depends on pandas' result-shape inference and does not yield a
    # well-defined frame for empty input.
    candidates_by_medoid: dict = {}
    for medoid, feature_set, rmse, model_path in ordered.itertuples(index=False, name=None):
        candidates_by_medoid.setdefault(medoid, []).append({
            "feature_set": feature_set,
            "rmse": float(rmse),
            "model_path": model_path,
        })

    return pd.DataFrame(
        {
            "medoid": list(candidates_by_medoid.keys()),
            "model_candidates": list(candidates_by_medoid.values()),
        },
        columns=["medoid", "model_candidates"],
    )

def build_assignment_with_all_models(series_by_bin_loaded: dict, df_results: pd.DataFrame) -> pd.DataFrame:
    """Assign every series to a medoid and attach all models of that medoid.

    Parameters
    ----------
    series_by_bin_loaded : dict
        ``{bin_label: {series_key: df_with_consumption, ...}, ...}``.
    df_results : pd.DataFrame
        Model results with the columns ``medoid``, ``feature_set``, ``RMSE``
        and ``model_path``.

    Returns
    -------
    pd.DataFrame
        One row per series with the columns bin, series_key, assigned_medoid,
        distance, feature_set_best, model_rmse_best, model_path_best,
        ranked_candidates, model_candidates and error. ``error`` is always
        present (NaN where the assignment succeeded). The ``*_best`` columns
        describe the first (lowest-RMSE) entry of ``model_candidates`` and are
        NaN when the medoid has no model.
    """
    assign_columns = ["bin", "series_key", "assigned_medoid", "distance",
                      "ranked_candidates", "error"]

    # 1) Assign all series to their nearest medoid (shared helper, same rows
    #    as the loop it replaces). reindex guarantees every column exists:
    #    "error" is missing when no series failed, and all columns are missing
    #    when there were no series at all.
    assign_df = assign_all_series_to_medoids(series_by_bin_loaded).reindex(columns=assign_columns)

    # 2) Attach the list of models of the assigned medoid.
    model_map = models_for_medoid(df_results)   # medoid -> list of model dicts
    out = assign_df.merge(model_map, left_on="assigned_medoid", right_on="medoid", how="left")
    out = out.drop(columns=["medoid"])

    # Convenience: separate columns for the "best" model (first list entry).
    def first_field(lst, field):
        if isinstance(lst, list) and lst:
            return lst[0].get(field)
        return np.nan

    out["feature_set_best"] = out["model_candidates"].apply(lambda lst: first_field(lst, "feature_set"))
    out["model_rmse_best"]  = out["model_candidates"].apply(lambda lst: first_field(lst, "rmse"))
    out["model_path_best"]  = out["model_candidates"].apply(lambda lst: first_field(lst, "model_path"))

    # Normalise Windows backslashes. Only real strings are touched, so missing
    # paths stay NaN instead of becoming the text "nan", and the replacement
    # is a literal one independent of pandas' regex default.
    out["model_path_best"] = out["model_path_best"].map(
        lambda path: path.replace("\\", "/") if isinstance(path, str) else path
    )

    return out[["bin","series_key","assigned_medoid","distance",
                "feature_set_best","model_rmse_best","model_path_best",
                "ranked_candidates","model_candidates","error"]]


# NOTE(review): these statements repeat the build_assignment_table example
# calls from the earlier cell; build_assignment_with_all_models defined above
# is not called here — confirm whether that was the intent.
# A) Automatically pick the best model per medoid
assignment_auto = build_assignment_table(series_by_bin_loaded, df_results, force_feature_set=None)
print(assignment_auto.head())

# B) Force one feature set globally (e.g. "minimaler_rmse")
assignment_fixed = build_assignment_table(series_by_bin_loaded, df_results, force_feature_set="minimaler_rmse")
print(assignment_fixed.head())

# Optional: save as CSV
Path("../data/assignments").mkdir(parents=True, exist_ok=True)
assignment_auto.to_csv("../data/assignments/assignment_auto.csv", index=False)
assignment_fixed.to_csv("../data/assignments/assignment_minimaler_rmse.csv", index=False)


    bin                  series_key           assigned_medoid      distance  \
0  0–5%  FL_00024360.PLAN.MengeHH.2    CV_00003432.PLAN.Menge  3.791466e+01   
1  0–5%    FL_00024360.PLAN.MengeGA  FL_00024360.PLAN.MengeGA  5.527229e-07   
2  0–5%    FL_00352244.PLAN.MengeKW    CV_00003432.PLAN.Menge  1.212620e+01   
3  0–5%    FL_00352244.PLAN.MengeGA    CV_00003432.PLAN.Menge  2.005376e+01   
4  0–5%    FL_00352251.PLAN.MengeGA    CV_00003432.PLAN.Menge  1.880089e+01   

      feature_set    model_rmse  \
0  minimaler_rmse  1.126163e+02   
1  minimaler_rmse  2.139510e+07   
2  minimaler_rmse  1.126163e+02   
3  minimaler_rmse  1.126163e+02   
4  minimaler_rmse  1.126163e+02   

                                          model_path  \
0  ../data/sarimax_models\CV_00003432.PLAN.Menge_...   
1  ../data/sarimax_models\FL_00024360.PLAN.MengeG...   
2  ../data/sarimax_models\CV_00003432.PLAN.Menge_...   
3  ../data/sarimax_models\CV_00003432.PLAN.Menge_...   
4  ../data/sarimax_models\CV_00003

In [24]:
# Distinct model files referenced by the automatic assignment.
assignment_auto.model_path.unique()

array(['../data/sarimax_models\\CV_00003432.PLAN.Menge__minimaler_rmse.pkl',
       '../data/sarimax_models\\FL_00024360.PLAN.MengeGA__minimaler_rmse.pkl',
       '../data/sarimax_models\\KN_00000067.PLAN.Menge__rmse_toleranz.pkl'],
      dtype=object)

In [23]:
# Display the assignment table built with the forced feature set.
assignment_fixed

Unnamed: 0,bin,series_key,assigned_medoid,distance,feature_set,model_rmse,model_path,ranked_candidates
0,0–5%,FL_00024360.PLAN.MengeHH.2,CV_00003432.PLAN.Menge,3.791466e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 37.914661593712125),..."
1,0–5%,FL_00024360.PLAN.MengeGA,FL_00024360.PLAN.MengeGA,5.527229e-07,minimaler_rmse,2.139510e+07,../data/sarimax_models\FL_00024360.PLAN.MengeG...,"[(FL_00024360.PLAN.MengeGA, 5.527229189641496e..."
2,0–5%,FL_00352244.PLAN.MengeKW,CV_00003432.PLAN.Menge,1.212620e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 12.126200788734051),..."
3,0–5%,FL_00352244.PLAN.MengeGA,CV_00003432.PLAN.Menge,2.005376e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 20.053757143137755),..."
4,0–5%,FL_00352251.PLAN.MengeGA,CV_00003432.PLAN.Menge,1.880089e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 18.800892315069326),..."
...,...,...,...,...,...,...,...,...
314,>30%,CV_00349016.PLAN.Menge,CV_00003432.PLAN.Menge,3.406638e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 34.06637886324753), ..."
315,>30%,CV_00349013.PLAN.Menge,CV_00003432.PLAN.Menge,2.933625e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 29.336245217206784),..."
316,>30%,CV_00349005.PLAN.Menge,CV_00003432.PLAN.Menge,2.604073e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 26.040727552583633),..."
317,>30%,CV_00348857.PLAN.Menge,CV_00003432.PLAN.Menge,3.178190e+01,minimaler_rmse,1.126163e+02,../data/sarimax_models\CV_00003432.PLAN.Menge_...,"[(CV_00003432.PLAN.Menge, 31.781897244269263),..."
