# 🔁 10 - Predict Price Using Forecasted Features

This notebook simulates a real-world scenario: forecast all required features (solar, wind, load, fuel prices, etc.) a week ahead, and use those to predict energy prices for Jan 1–7, 2020.

In [10]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

# Load cleaned dataset
df = pd.read_csv('../data/processed/merged_energy_data_final_step_1.csv', parse_dates=['utc_timestamp'])
df.set_index('utc_timestamp', inplace=True)
df = df.sort_index()

## 🎯 Step 1: Forecast All Input Features One Week Ahead

In [11]:
# Features to forecast
features_to_forecast = [
    'DE_solar_generation_actual', 'DE_wind_generation_actual',
    'DE_load_actual_entsoe_transparency',
    'Gas_Price', 'Oil_Price',
    'DE_radiation_direct_horizontal', 'DE_radiation_diffuse_horizontal',
    'DE_temperature'
]

# Forecast container
input_predictions = {}
start_forecast = pd.Timestamp('2019-12-30 01:00:00')
forecast_steps = 168

for feature in features_to_forecast:
    print(f'🔮 Forecasting: {feature}')
    
    data = df[[feature]].copy()
    data['hour'] = data.index.hour
    data['dayofweek'] = data.index.dayofweek
    data['month'] = data.index.month

    for lag in [1, 24, 168]:
        data[f'lag_{lag}'] = data[feature].shift(lag)
    data.dropna(inplace=True)

    X = data[[f'lag_{l}' for l in [1, 24, 168]] + ['hour', 'dayofweek', 'month']]
    y = data[feature]

    model = XGBRegressor(n_estimators=100)
    model.fit(X, y)

    preds = []
    current = df.loc[start_forecast - pd.Timedelta(hours=168):start_forecast].copy()

    for i in range(forecast_steps):
        t = start_forecast + pd.Timedelta(hours=i)
        row = {
            'lag_1': current[feature].iloc[-1],
            'lag_24': current[feature].iloc[-24],
            'lag_168': current[feature].iloc[-168],
            'hour': t.hour,
            'dayofweek': t.dayofweek,
            'month': t.month
        }
        x_input = pd.DataFrame([row])
        y_pred = model.predict(x_input)[0]
        preds.append((t, y_pred))
        # Append to current
        new_row = pd.DataFrame({feature: [y_pred]}, index=[t])
        current = pd.concat([current, new_row])

    forecast_df = pd.DataFrame(preds, columns=['utc_timestamp', feature]).set_index('utc_timestamp')
    input_predictions[feature] = forecast_df

🔮 Forecasting: DE_solar_generation_actual
🔮 Forecasting: DE_wind_generation_actual
🔮 Forecasting: DE_load_actual_entsoe_transparency
🔮 Forecasting: Gas_Price
🔮 Forecasting: Oil_Price
🔮 Forecasting: DE_radiation_direct_horizontal
🔮 Forecasting: DE_radiation_diffuse_horizontal
🔮 Forecasting: DE_temperature


## 🧪 Step 2: Evaluate Feature Forecast Accuracy on Last Week of 2019

In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Only evaluate predictions from 2019-12-24 to 2019-12-31 (168 hours)
start_forecast = pd.Timestamp('2019-12-24 00:00:00')
eval_end = pd.Timestamp('2019-12-31 23:00:00')

for feature, forecast_df in input_predictions.items():
    # Restrict forecasted values to where actual data exists
    valid_idx = forecast_df.index.intersection(df.index)

    if len(valid_idx) == 0:
        print(f"⚠️ Skipping {feature}: no overlapping timestamps between predictions and actual data")
        continue

    pred = forecast_df.loc[valid_idx, feature]
    actual = df.loc[valid_idx, feature]

    if pred.empty or actual.empty:
        print(f"⚠️ Skipping {feature}: prediction or actual values are empty after filtering")
        continue

    mae = mean_absolute_error(actual, pred)
    rmse = np.sqrt(mean_squared_error(actual, pred))

    print(f'{feature} → MAE: {mae:.2f}, RMSE: {rmse:.2f}')

    plt.figure(figsize=(12, 3))
    plt.plot(actual.index, actual, label='Actual')
    plt.plot(pred.index, pred, label='Predicted')
    plt.title(f"{feature} Forecast Evaluation")
    plt.legend()
    plt.grid(True)
    plt.show()


⚠️ Skipping DE_solar_generation_actual: no overlapping timestamps between predictions and actual data
⚠️ Skipping DE_wind_generation_actual: no overlapping timestamps between predictions and actual data
⚠️ Skipping DE_load_actual_entsoe_transparency: no overlapping timestamps between predictions and actual data
⚠️ Skipping Gas_Price: no overlapping timestamps between predictions and actual data
⚠️ Skipping Oil_Price: no overlapping timestamps between predictions and actual data
⚠️ Skipping DE_radiation_direct_horizontal: no overlapping timestamps between predictions and actual data
⚠️ Skipping DE_radiation_diffuse_horizontal: no overlapping timestamps between predictions and actual data
⚠️ Skipping DE_temperature: no overlapping timestamps between predictions and actual data


## ⚡ Step 3: Use Forecasted Features to Predict Price (Jan 1–7, 2020)

In [13]:
# Create synthetic feature DataFrame
future_features = pd.concat(input_predictions.values(), axis=1)

# Compute synthetic ratios
future_features['solar_load_ratio'] = future_features['DE_solar_generation_actual'] / future_features['DE_load_actual_entsoe_transparency']
future_features['wind_load_ratio'] = future_features['DE_wind_generation_actual'] / future_features['DE_load_actual_entsoe_transparency']

# Add calendar features
future_features['hour'] = future_features.index.hour
future_features['dayofweek'] = future_features.index.dayofweek
future_features['month'] = future_features.index.month

# Load trained price model and previous price data
price_model = XGBRegressor()
full_df = df.copy()
full_df['price_lag_1h'] = full_df['Price (EUR/MWhe)'].shift(1)
full_df['price_lag_24h'] = full_df['Price (EUR/MWhe)'].shift(24)
full_df['price_lag_168h'] = full_df['Price (EUR/MWhe)'].shift(168)
full_df['price_roll_24h'] = full_df['Price (EUR/MWhe)'].rolling(24).mean()
full_df['price_roll_168h'] = full_df['Price (EUR/MWhe)'].rolling(168).mean()
full_df.dropna(inplace=True)

# Train price model on original features
features = [
    'DE_solar_generation_actual', 'DE_wind_generation_actual',
    'DE_load_actual_entsoe_transparency', 'Gas_Price', 'Oil_Price',
    'DE_radiation_direct_horizontal', 'DE_radiation_diffuse_horizontal', 'DE_temperature',
    'price_lag_1h', 'price_lag_24h', 'price_lag_168h',
    'price_roll_24h', 'price_roll_168h', 'hour', 'dayofweek', 'month',
    'solar_load_ratio', 'wind_load_ratio'
]

X_price = full_df[features]
y_price = full_df['Price (EUR/MWhe)']
price_model.fit(X_price, y_price)

# Add price lags and roll features based on historical prices
last_prices = df['Price (EUR/MWhe)'].loc['2019-12-23':'2019-12-30'].copy()
for i, t in enumerate(future_features.index):
    lags = {
        'price_lag_1h': last_prices.iloc[-1],
        'price_lag_24h': last_prices.iloc[-24],
        'price_lag_168h': last_prices.iloc[-168],
        'price_roll_24h': last_prices[-24:].mean(),
        'price_roll_168h': last_prices[-168:].mean()
    }
    for k, v in lags.items():
        future_features.loc[t, k] = v

# Predict
X_future = future_features[features].copy()
y_pred_price = price_model.predict(X_future)

# Compare with actual price
actual = df['Price (EUR/MWhe)'].loc['2020-01-01':'2020-01-07']
pred_series = pd.Series(y_pred_price[:len(actual)], index=actual.index)

mae = mean_absolute_error(actual, pred_series)
rmse = np.sqrt(mean_squared_error(actual, pred_series))
print(f'📊 Price MAE using forecasted features: {mae:.2f}')
print(f'📊 Price RMSE using forecasted features: {rmse:.2f}')

plt.figure(figsize=(15,5))
plt.plot(actual.index, actual.values, label='Actual')
plt.plot(pred_series.index, pred_series.values, label='Predicted (from forecasted features)')
plt.title('Energy Price Forecast (Jan 1–7, 2020)')
plt.xlabel('Date')
plt.ylabel('Price (EUR/MWhe)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

KeyError: "['hour', 'dayofweek', 'month', 'solar_load_ratio', 'wind_load_ratio'] not in index"

Failed Experiment, also trying to predict features then predict energy prices </br>
Finished by Jad Akra on April 20th 2025