In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# 1. Load the data (semicolon-separated CSV, "Datetime" parsed as timestamps)
#    and order it chronologically so that later row-based shifts and
#    time-based splits operate on a time-sorted frame.
df = (
    pd.read_csv("data/dataset.csv", sep=";", parse_dates=["Datetime"])
    .sort_values("Datetime")
    .reset_index(drop=True)
)


In [None]:
# 2. Determine the boundaries for the time-based splits.
#    The newest timestamp anchors everything else.
last_ts = df["Datetime"].max()

# Test window: the final 7 days of the data.
test_start = last_ts - pd.Timedelta(weeks=1)
# Validation window: the 2 calendar months directly before the test window.
val_start = test_start - pd.DateOffset(months=2)

print("Letzter Zeitstempel:", last_ts)
print("Val-Start:", val_start)
print("Test-Start:", test_start)

In [None]:
# 3. Boolean masks for the time-based split.
#    train: Datetime <= val_start
#    val:   val_start < Datetime <= test_start
#    test:  Datetime > test_start
# NOTE(review): df_train / df_val / df_test are reassigned in the
# feature-engineering cell further down, so this split only serves as a
# preview of the partition sizes on the raw data.
train_mask = df["Datetime"].le(val_start)
val_mask = df["Datetime"].gt(val_start) & df["Datetime"].le(test_start)
test_mask = df["Datetime"].gt(test_start)

# Independent copies, so later edits to a split never touch `df`.
df_train, df_val, df_test = (
    df.loc[mask].copy() for mask in (train_mask, val_mask, test_mask)
)

print("Train:", df_train.shape)
print("Val:  ", df_val.shape)
print("Test: ", df_test.shape)

In [None]:
def add_time_features(df):
    """Add calendar features derived from the ``Datetime`` column.

    Writes three new columns into ``df`` (the frame is modified in place):

    * ``hour``    - hour of day
    * ``weekday`` - day of week, 0 = Monday
    * ``month``   - month number

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``Datetime`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the columns added.
    """
    stamp_parts = df["Datetime"].dt
    for part in ("hour", "weekday", "month"):
        df[part] = getattr(stamp_parts, part)
    return df

def add_lag_features(df, target_col="Power", lags=(1, 96)):
    """Add row-shifted copies of ``target_col`` as lag features.

    For every ``lag`` in ``lags`` a column ``"<target_col>_lag_<lag>"`` is
    written into ``df`` (the frame is modified in place). Each new column
    holds the target value from ``lag`` rows earlier, so the first ``lag``
    rows of that column are NaN.

    The shift is by row position, not by timestamp.
    NOTE(review): assuming a gap-free 15-minute grid, lag 1 is the previous
    15-minute step and lag 96 is the same time on the previous day - confirm
    the sampling against the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target_col``; expected to be sorted by time.
    target_col : str
        Name of the column to shift.
    lags : iterable of int
        Row offsets to create lag columns for.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the lag columns added.
    """
    target = df[target_col]
    for steps in lags:
        df[f"{target_col}_lag_{steps}"] = target.shift(steps)
    return df

# Build the features on the complete frame (a copy, so `df` stays untouched)
# so that the row shifts are consistent across the later split boundaries.
# The shifts leave NaNs in the first rows; those rows are dropped afterwards.
# NOTE(review): dropna() without `subset` removes rows that have a NaN in
# ANY column, not only in the lag columns - confirm that this is intended.
df_feat = (
    df.copy()
    .pipe(add_time_features)
    .pipe(add_lag_features, target_col="Power", lags=(1, 2, 96, 97))
    .dropna()
    .reset_index(drop=True)
)

# Re-derive the split boundaries on the feature frame
# (same rule as before: last 7 days = test, the 2 months before that = val).
last_ts = df_feat["Datetime"].max()
test_start = last_ts - pd.Timedelta(weeks=1)
val_start = test_start - pd.DateOffset(months=2)

train_mask = df_feat["Datetime"].le(val_start)
val_mask = df_feat["Datetime"].gt(val_start) & df_feat["Datetime"].le(test_start)
test_mask = df_feat["Datetime"].gt(test_start)

# Independent copies of each partition.
df_train, df_val, df_test = (
    df_feat.loc[mask].copy() for mask in (train_mask, val_mask, test_mask)
)

print("Train:", df_train.shape)
print("Val:  ", df_val.shape)
print("Test: ", df_test.shape)

In [None]:
target_col = "Power"

# Columns that must not be used as model inputs: the raw timestamp and the
# target itself. Every other column of the training frame becomes a feature.
drop_cols = ["Datetime", target_col]
feature_cols = [col for col in df_train.columns if col not in drop_cols]

# Feature matrix / target vector for each partition.
X_train, y_train = df_train[feature_cols], df_train[target_col]
X_val, y_val = df_val[feature_cols], df_val[target_col]
X_test, y_test = df_test[feature_cols], df_test[target_col]

# Quick sanity check: number of features and the first few names.
len(feature_cols), feature_cols[:10]

In [None]:
# CatBoost regressor; RMSE is both the training loss and the metric that is
# tracked on the evaluation set.
model = CatBoostRegressor(
    # Boosting budget and tree shape
    iterations=2000,
    learning_rate=0.05,
    depth=8,
    # Objective / monitored metric
    loss_function="RMSE",
    eval_metric="RMSE",
    # Overfitting control and reproducibility
    early_stopping_rounds=100,
    random_seed=1,
    # Logging frequency
    verbose=10,
)

# The validation partition is passed as eval_set for early stopping;
# use_best_model=True asks CatBoost to keep the best iteration found on it.
model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    use_best_model=True,
)

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computed as the square root of ``mean_squared_error`` instead of passing
    ``squared=False``: that keyword was deprecated in scikit-learn 1.4 and
    removed in 1.6, where it raises a ``TypeError``. Taking the square root
    explicitly works on every scikit-learn version.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Validation
# NOTE: the validation set was also used for early stopping / best-model
# selection during fit, so this score is optimistic compared to the test score.
val_pred = model.predict(X_val)
val_rmse = rmse(y_val, val_pred)
print(f"Validation RMSE: {val_rmse:.3f}")

# Test (the last 7 days, never seen during training or model selection)
test_pred = model.predict(X_test)
test_rmse = rmse(y_test, test_pred)
print(f"Test RMSE: {test_rmse:.3f}")

# Overlay of true and predicted values across the test period.
plt.figure(figsize=(12, 4))
plt.plot(df_test["Datetime"], y_test.values, label="True")
plt.plot(df_test["Datetime"], test_pred, label="Pred", alpha=0.8)
plt.legend()
plt.title("Test-Periode: echte vs. vorhergesagte Power")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Feature importances of the fitted model, paired with the training column
# names and listed from most to least important.
# (pandas is already imported in the notebook's first cell, so it is not
# re-imported here.)
feature_names = X_train.columns
importances = model.get_feature_importance()

fi_df = pd.DataFrame(
    {"feature": feature_names, "importance": importances}
).sort_values(by="importance", ascending=False)

fi_df