In [7]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from utils import load_series_dfs
import pickle
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose, STL, MSTL
from scipy.signal import periodogram
import logging
from sklearn.preprocessing import StandardScaler
import itertools
import numpy as np

In [8]:
# Load the pickled per-series DataFrames.
# NOTE(review): the data directory is an absolute, machine-specific path —
# consider making it configurable so the notebook runs on other machines.
series_dfs = load_series_dfs(
    filename="final_learning_dfs.pkl",
    data_dir=r"C:\Users\johan\Documents\FH_Master\data",
)
# Last expression of the cell: show which columns this series provides.
series_dfs['FL_00024702.PLAN.MengeHH'].columns

Index(['consumption', 'hour', 'weekday', 'month', 'is_weekend', 'w_tl', 'w_rf',
       'w_ff', 'w_ffx', 'w_cglo', 'w_so_h', 'w_rr', 'w_rrm', 'w_tb10',
       'w_tb20', 'CEGH_WAP', 'THE_WAP'],
      dtype='object')

In [9]:
# Work on a copy: assigning a new index below would otherwise modify the
# DataFrame object stored inside `series_dfs`, leaking state into other cells.
df = series_dfs['FL_00024702.PLAN.MengeHH'].copy()
df.index = pd.to_datetime(df.index)

# 1) Sort chronologically, then drop duplicated timestamps (first occurrence
#    wins). Boolean filtering keeps the row order, so one sort is enough.
df = df.sort_index()
df = df[~df.index.duplicated(keep='first')]

# 2) Resample onto an hourly grid and interpolate the rows this introduces.
df = df.resample('h').interpolate()

# 3) Explicitly pin the hourly frequency on the index (the resample step
#    already sets it; kept as a cheap safeguard).
df = df.asfreq('h')

print(df.index.freq)

# Candidate exogenous regressors: every column except 'consumption'.
# They are used as-is; no standardization is applied in this cell.
features_full = ['hour', 'weekday', 'month', 'is_weekend', 'w_tl', 'w_rf',
            'w_ff', 'w_ffx', 'w_cglo', 'w_so_h', 'w_rr', 'w_rrm',
            'w_tb10', 'w_tb20', 'CEGH_WAP', 'THE_WAP']



<Hour>


In [10]:
import time
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Chronological hold-out: rows up to and including 2017-12-31 are used for
# fitting, rows from 2018-01-01 onwards for evaluation.
test  = df.loc['2018-01-01':]
train = df.loc[:'2017-12-31']
y_true = test['consumption']
n_test = len(test)

# Funktion für Fehler
def calc_metrics(y, y_hat):
    """Return ``(rmse, mae)`` for observed values ``y`` and predictions ``y_hat``.

    RMSE is the square root of ``mean_squared_error(y, y_hat)``; MAE is
    ``mean_absolute_error(y, y_hat)``.
    """
    squared_error = mean_squared_error(y, y_hat)
    absolute_error = mean_absolute_error(y, y_hat)
    return np.sqrt(squared_error), absolute_error

# Exogenous regressors handed to SARIMAX (every column except 'consumption').
features_full = ['hour', 'weekday', 'month', 'is_weekend', 'w_tl', 'w_rf',
            'w_ff', 'w_ffx', 'w_cglo', 'w_so_h', 'w_rr', 'w_rrm',
            'w_tb10', 'w_tb20', 'CEGH_WAP', 'THE_WAP']

# Result rows, one dict per evaluated model.
results = []

# SARIMAX with exogenous regressors; fit and forecast are timed separately:
# t0 -> t1 covers model construction + fit, t1 -> t2 covers the forecast.
t0 = time.perf_counter()
mod = SARIMAX(train['consumption'],
              exog=train[features_full],
              order=(1,0,2), seasonal_order=(1,0,1,24),
              enforce_stationarity=True, enforce_invertibility=True)
res = mod.fit(disp=False, method='powell', maxiter=300)
t1 = time.perf_counter()
# Forecast the whole test period in one call, using the test-period regressors.
yhat = res.get_forecast(steps=n_test, exog=test[features_full]).predicted_mean
t2 = time.perf_counter()

rmse, mae = calc_metrics(y_true, yhat)

results.append({
    'Model': 'SARIMAX',
    'RMSE': rmse, 'MAE': mae,
    'Train Time (s)': t1-t0,
    'Forecast Time (s)': t2-t1
})


KeyboardInterrupt: 

In [12]:
import time
import warnings
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
# NOTE(review): this silences every warning for the rest of the session,
# including any emitted while fitting models — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Rebuild the hourly frame from a copy of the stored series: parse the index
# as datetimes, drop duplicated timestamps (first occurrence wins), sort,
# resample to hourly, interpolate, and pin the hourly frequency.
df = series_dfs['FL_00024702.PLAN.MengeHH'].copy()
df.index = pd.to_datetime(df.index)
df = (
    df.loc[~df.index.duplicated(keep='first')]
    .sort_index()
    .resample('h')
    .interpolate()
    .asfreq('h')
)

# Split by date: fit on rows up to and including 2017-12-31, evaluate on rows
# from 2018-01-01 onwards.
test  = df.loc['2018-01-01':]
y_true = test['consumption']
n_test = len(test)
train = df.loc[:'2017-12-31']

def calc_metrics(y, y_hat):
    """Compute the error pair ``(rmse, mae)`` between ``y`` and ``y_hat``.

    ``rmse`` is ``sqrt(mean_squared_error(y, y_hat))`` and ``mae`` is
    ``mean_absolute_error(y, y_hat)``.
    """
    mse = mean_squared_error(y, y_hat)
    return np.sqrt(mse), mean_absolute_error(y, y_hat)

# Initial feature list: the full set of exogenous regressor columns.
features_full = [
    # calendar-style columns
    'hour', 'weekday', 'month', 'is_weekend',
    # columns prefixed "w_" (presumably weather measurements — TODO confirm)
    'w_tl', 'w_rf', 'w_ff', 'w_ffx', 'w_cglo', 'w_so_h',
    'w_rr', 'w_rrm', 'w_tb10', 'w_tb20',
    # columns suffixed "_WAP" (presumably price series — TODO confirm)
    'CEGH_WAP', 'THE_WAP',
]

# One dict per logged model gets appended here.
results = list()

def fit_forecast_eval(feats):
    """Fit SARIMAX on the training data with ``exog=feats`` and score its forecast.

    Parameters
    ----------
    feats : list of str
        Column names used as exogenous regressors. An empty list fits the
        model without exogenous regressors.

    Returns
    -------
    tuple or None
        ``(rmse, mae, train_time, forecast_time, yhat)`` on success. The two
        times are ``time.perf_counter`` differences in seconds and ``yhat`` is
        the predicted mean over ``n_test`` forecast steps. ``None`` is returned
        when fitting or forecasting raises; the reason is printed so a failed
        candidate is visible instead of silently disappearing.
    """
    X_train = train[feats] if feats else None
    X_test  = test[feats]  if feats else None

    # Progress output; kept outside the timed region.
    print(feats)

    t0 = time.perf_counter()
    try:
        mod = SARIMAX(
            train['consumption'],
            exog=X_train,
            order=(1,0,2), seasonal_order=(1,0,1,24),
            enforce_stationarity=True, enforce_invertibility=True
        )
        res = mod.fit(disp=False, method='powell', maxiter=300)
    except Exception as exc:
        # Best-effort contract: report why this candidate failed, then skip it.
        print(f"SARIMAX fit failed for feats={feats}: {exc!r}")
        return None
    t1 = time.perf_counter()

    try:
        yhat = res.get_forecast(steps=n_test, exog=X_test).predicted_mean
    except Exception as exc:
        print(f"SARIMAX forecast failed for feats={feats}: {exc!r}")
        return None
    t2 = time.perf_counter()

    rmse, mae = calc_metrics(y_true, yhat)
    return rmse, mae, (t1 - t0), (t2 - t1), yhat

# 1) Baseline: fit once with the complete feature list; abort if that fails.
current_features = list(features_full)
baseline = fit_forecast_eval(current_features)
if baseline is None:
    raise RuntimeError("Baseline-Fit mit allen Features fehlgeschlagen.")

# The fifth element (the forecast itself) is not needed for the log entry.
rmse, mae, tr_t, fc_t, _ = baseline
results.append({
    'Model': "SARIMAX | k={} | Start (alle Features)".format(len(current_features)),
    'Included Features': list(current_features),
    'Dropped in this step': None,
    'RMSE': rmse,
    'MAE': mae,
    'Train Time (s)': tr_t,
    'Forecast Time (s)': fc_t,
})

# 2) Greedy backward elimination: every pass removes exactly one feature.
# NOTE(review): candidates are ranked by the RMSE/MAE that fit_forecast_eval
# computes against `y_true` (the test period), so the same data drives both
# the selection and the reported scores. The loop also keeps dropping features
# until none are left, regardless of whether RMSE still improves.
while current_features:
    print(current_features)
    step_candidates = []
    for dropped in current_features:
        remaining = [name for name in current_features if name != dropped]
        outcome = fit_forecast_eval(remaining)
        if outcome is not None:
            rmse_t, mae_t, tr_t, fc_t, _ = outcome
            step_candidates.append({
                'drop_feature': dropped,
                'trial_feats': remaining,
                'RMSE': rmse_t,
                'MAE': mae_t,
                'Train Time (s)': tr_t,
                'Forecast Time (s)': fc_t,
            })

    # No candidate could be fitted in this pass => stop.
    if len(step_candidates) == 0:
        break

    # Pick the candidate with the lowest RMSE (ties broken by MAE).
    step_candidates.sort(key=lambda cand: (cand['RMSE'], cand['MAE']))
    best = step_candidates[0]
    kept = best['trial_feats']

    # Log the winning candidate of this pass.
    results.append({
        'Model': "SARIMAX | k={} | drop: {}".format(len(kept), best['drop_feature']),
        'Included Features': list(kept),
        'Dropped in this step': best['drop_feature'],
        'RMSE': best['RMSE'],
        'MAE': best['MAE'],
        'Train Time (s)': best['Train Time (s)'],
        'Forecast Time (s)': best['Forecast Time (s)'],
    })

    # Continue with one feature fewer.
    current_features = kept


# Collect every logged step into one table and render it.
results_df = pd.DataFrame(results)
display(results_df)


['hour', 'weekday', 'month', 'is_weekend', 'w_tl', 'w_rf', 'w_ff', 'w_ffx', 'w_cglo', 'w_so_h', 'w_rr', 'w_rrm', 'w_tb10', 'w_tb20', 'CEGH_WAP', 'THE_WAP']
['hour', 'weekday', 'month', 'is_weekend', 'w_tl', 'w_rf', 'w_ff', 'w_ffx', 'w_cglo', 'w_so_h', 'w_rr', 'w_rrm', 'w_tb10', 'w_tb20', 'CEGH_WAP', 'THE_WAP']
['weekday', 'month', 'is_weekend', 'w_tl', 'w_rf', 'w_ff', 'w_ffx', 'w_cglo', 'w_so_h', 'w_rr', 'w_rrm', 'w_tb10', 'w_tb20', 'CEGH_WAP', 'THE_WAP']
['hour', 'month', 'is_weekend', 'w_tl', 'w_rf', 'w_ff', 'w_ffx', 'w_cglo', 'w_so_h', 'w_rr', 'w_rrm', 'w_tb10', 'w_tb20', 'CEGH_WAP', 'THE_WAP']
['hour', 'weekday', 'is_weekend', 'w_tl', 'w_rf', 'w_ff', 'w_ffx', 'w_cglo', 'w_so_h', 'w_rr', 'w_rrm', 'w_tb10', 'w_tb20', 'CEGH_WAP', 'THE_WAP']
['hour', 'weekday', 'month', 'w_tl', 'w_rf', 'w_ff', 'w_ffx', 'w_cglo', 'w_so_h', 'w_rr', 'w_rrm', 'w_tb10', 'w_tb20', 'CEGH_WAP', 'THE_WAP']
['hour', 'weekday', 'month', 'is_weekend', 'w_rf', 'w_ff', 'w_ffx', 'w_cglo', 'w_so_h', 'w_rr', 'w_rrm', '

Unnamed: 0,Model,Included Features,Dropped in this step,RMSE,MAE,Train Time (s),Forecast Time (s)
0,SARIMAX | k=16 | Start (alle Features),"[hour, weekday, month, is_weekend, w_tl, w_rf,...",,6.566899,5.226988,451.113628,0.238235
1,SARIMAX | k=15 | drop: month,"[hour, weekday, is_weekend, w_tl, w_rf, w_ff, ...",month,6.467763,5.17485,419.311668,0.215744
2,SARIMAX | k=14 | drop: w_so_h,"[hour, weekday, is_weekend, w_tl, w_rf, w_ff, ...",w_so_h,6.448788,5.160024,553.240734,0.382631
3,SARIMAX | k=13 | drop: w_cglo,"[hour, weekday, is_weekend, w_tl, w_rf, w_ff, ...",w_cglo,6.42936,5.141524,727.640007,0.566107
4,SARIMAX | k=12 | drop: w_ffx,"[hour, weekday, is_weekend, w_tl, w_rf, w_ff, ...",w_ffx,6.365116,5.091095,351.557694,0.249103
5,SARIMAX | k=11 | drop: is_weekend,"[hour, weekday, w_tl, w_rf, w_ff, w_rr, w_rrm,...",is_weekend,6.365689,5.090716,341.14338,0.284294
6,SARIMAX | k=10 | drop: w_rf,"[hour, weekday, w_tl, w_ff, w_rr, w_rrm, w_tb1...",w_rf,6.342824,5.082941,351.862645,0.234021
7,SARIMAX | k=9 | drop: w_rrm,"[hour, weekday, w_tl, w_ff, w_rr, w_tb10, w_tb...",w_rrm,6.328288,5.074058,327.982167,0.269406
8,SARIMAX | k=8 | drop: w_rr,"[hour, weekday, w_tl, w_ff, w_tb10, w_tb20, CE...",w_rr,6.337519,5.080702,313.005122,0.2406
9,SARIMAX | k=7 | drop: weekday,"[hour, w_tl, w_ff, w_tb10, w_tb20, CEGH_WAP, T...",weekday,6.359246,5.094482,295.46736,0.269738
