In [1]:
import os
import pickle
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_pinball_loss 
from statsforecast.models import MSTL

# = = = = = =
# own stuff
# = = = = = =
os.chdir("C:/2023_11-PTSFC")
import data_prepro as data_prepro
import model_train as model_train
import model_fcast as model_fcast

  from tqdm.autonotebook import tqdm


In [2]:
os.environ["LOKY_MAX_CPU_COUNT"] = "1"  # Replace "4" with the desired number of cores

quantiles = [0.025, 0.25, 0.5, 0.75, 0.975]
fcast_hor = [36, 40, 44, 60, 64, 68] # in hours

In [3]:
# = = = = = = = = = = = = = 
# get data
# df_energy = data_prepro.get_energy_data_today(to_date=t_wednesday.strftime('%Y%m%d'))

# Read data from file with specified data types
df_energy = pd.read_csv("data/2015-01-01_2024-02-21_energy.csv", index_col=0, parse_dates=[0])
df_energy['timestamp_CET'] = pd.to_datetime(df_energy['timestamp_CET'], utc=True).dt.tz_convert('CET')
print(df_energy.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 80136 entries, 2014-12-31 23:00:00+00:00 to 2024-02-21 22:00:00+00:00
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   timestamp_CET  80136 non-null  datetime64[ns, CET]
 1   gesamt         80136 non-null  float64            
dtypes: datetime64[ns, CET](1), float64(1)
memory usage: 1.8 MB
None


In [4]:
def preprocess_data(df_energy, start_date):

    df_energy_small = df_energy.loc[(df_energy['timestamp_CET'] > start_date)].copy()
    
    df_energy_dummy = data_prepro.create_dummy_df(df_energy_small, hour_method='seasonal', holiday_method='separate')
    df_energy_fturs = data_prepro.create_features_df(df_energy_small, holiday_method='separate')

    X_train_fturs = df_energy_fturs.drop(['gesamt', 'timestamp_CET'], axis=1)
    y_train_fturs = df_energy_fturs['gesamt']

    X_train_dummy = df_energy_dummy.drop(['gesamt', 'timestamp_CET'], axis=1)
    y_train_dummy = df_energy_dummy['gesamt']
    
    return X_train_dummy, y_train_dummy, X_train_fturs, y_train_fturs

In [7]:
# Define the start and end dates
start_date = pd.Timestamp('2023-11-15')
end_date = pd.Timestamp('2024-02-14')

# Generate a list of weekly dates in UTC
fcast_dates_cet = pd.date_range(start=start_date, end=end_date, freq='W-WED').tz_localize('CET').strftime('%Y-%m-%d').tolist()
dict_all_fcasts = {}
dict_all_evals = {}

# Iterate over the forecast dates
for fcast_date in fcast_dates_cet[:]:

    print('= '*30)
    print(f"Forecasting for week starting from {fcast_date} ...")
    dict_weekly_fcasts = {}
    dict_weekly_models = {}

    # = = = = = = = = = = = = = 
    # generate prediction timestamps based on t0 = following thursday 00:00
    # = = = = = = = = = = = = = 

    # Calculate the Thursday and Wednesday of the week
    t_wednesday = pd.Timestamp(fcast_date).replace(hour=0, minute=0, second=0, microsecond=0).tz_localize('CET')
    t_thursday = t_wednesday + pd.Timedelta(days=1)

    # Generate required submission timestamps
    subm_timestamps = [(t_thursday + pd.Timedelta(hours=fcast)) for fcast in fcast_hor]
    print(f"Submission timestamps = {subm_timestamps[0]} to {subm_timestamps[-1]}")

    # parse t_thursday to string
    t_thursday_str = t_thursday.strftime('%Y-%m-%d')
    df_energy_current = df_energy.loc[df_energy['timestamp_CET'] <= t_thursday_str].copy()
    # print(df_energy_current.info())
    print('= '*30)

    # = = = = = = = = = = = = = 
    # XGBoost model
    # = = = = = = = = = = = = = 

    method = 'xgboost_2022'

    start_date = "2022-01-01"
    X_train_dummy, y_train_dummy, X_train_fturs, y_train_fturs = preprocess_data(df_energy_current, start_date)

    # create fcast index for next 68 hours
    fcast_timestamp_CET = pd.date_range(start=t_thursday, periods=68+1, freq='H')
    fcast_timestamp_UTC = fcast_timestamp_CET.tz_convert('UTC')
    
    # create df with fcast timestamps as INPUT for model
    df_temp = pd.DataFrame(index=fcast_timestamp_UTC)
    df_temp['timestamp_CET'] = fcast_timestamp_CET

    all_models = model_train.fit_xgboost(X_train_dummy, y_train_dummy, quantiles)
    df_fcast_dummy = data_prepro.create_dummy_df(df_temp, hour_method='seasonal', holiday_method='separate')

    # create empty OUTPUT df with columns = quantiles
    df_direct_fcast = pd.DataFrame(index=df_fcast_dummy.index)
    df_direct_fcast['timestamp_CET'] = fcast_timestamp_CET

    # Prediction for Quantile Regression
    for name, model in sorted(all_models.items()):
        pred = model.predict(df_fcast_dummy.drop('timestamp_CET', axis=1))
        df_direct_fcast[name] = pred

    dict_weekly_fcasts[method] = df_direct_fcast

    # = = = = = = = = = = = = = 
    # Evaluation based on submission timestamps
    # = = = = = = = = = = = = = 

    # get actual values at every submission timestamp
    df_energy_eval = df_energy.loc[df_energy['timestamp_CET'].isin(subm_timestamps)].copy()
    # display(df_energy_eval)

    # Initialize an empty dictionary to store evaluation results
    evaluation_results = {}

    # Iterate over each model's forecast for the week
    for model_name, forecast_df in dict_weekly_fcasts.items():

        # Initialize an empty DataFrame to store quantile scores
        quantile_scores = pd.DataFrame(index=subm_timestamps, columns=[f"q {q:.3f}" for q in quantiles])
        # take subset of fcast df at submission timestamps
        forecast_df = forecast_df.loc[forecast_df['timestamp_CET'].isin(subm_timestamps)].copy()

        # Iterate over each submission timestamp
        for q_idx, q in enumerate(quantiles):

            qscore = mean_pinball_loss(alpha=q, 
                                       y_true=df_energy_eval['gesamt'].values, 
                                       y_pred=forecast_df.iloc[:,q_idx+1].values) # skip timestamp_CET col
            
            quantile_scores.iloc[:,q_idx] = qscore / 1000
        
        # Store the quantile scores for the model
        evaluation_results[model_name] = quantile_scores
    
    # Calculate mean scores for each quantile over time
    mean_scores = {}
    for model_name, quantile_scores in evaluation_results.items():
        mean_scores[model_name] = quantile_scores.mean()
    
    display(pd.DataFrame(mean_scores).T.iloc[13:].style.highlight_min(color='lightgreen', axis=0))

    # calculate mean scores over all quantiles
    mean_scores_df = pd.DataFrame(mean_scores)
    
    print('- '*15)
    print('scores:')
    print(mean_scores_df.mean(axis=0).sort_values(ascending=True))


= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
Forecasting for week starting from 2023-11-15 ...
Submission timestamps = 2023-11-17 12:00:00+01:00 to 2023-11-18 20:00:00+01:00
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
- - - - - - - - - - - - - - - 
> start fitting XGBoost models ...
> time taken: 3.47 seconds
- - - - - - - - - - - - - - - 


Unnamed: 0,q 0.025,q 0.250,q 0.500,q 0.750,q 0.975


- - - - - - - - - - - - - - - 
scores:
xgboost_2022    0.826299
dtype: object
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
Forecasting for week starting from 2023-11-22 ...
Submission timestamps = 2023-11-24 12:00:00+01:00 to 2023-11-25 20:00:00+01:00
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
- - - - - - - - - - - - - - - 
> start fitting XGBoost models ...
> time taken: 5.47 seconds
- - - - - - - - - - - - - - - 


Unnamed: 0,q 0.025,q 0.250,q 0.500,q 0.750,q 0.975


- - - - - - - - - - - - - - - 
scores:
xgboost_2022    1.263267
dtype: object
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
Forecasting for week starting from 2023-11-29 ...
Submission timestamps = 2023-12-01 12:00:00+01:00 to 2023-12-02 20:00:00+01:00
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
- - - - - - - - - - - - - - - 
> start fitting XGBoost models ...
> time taken: 5.79 seconds
- - - - - - - - - - - - - - - 


Unnamed: 0,q 0.025,q 0.250,q 0.500,q 0.750,q 0.975


- - - - - - - - - - - - - - - 
scores:
xgboost_2022    1.525777
dtype: object
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
Forecasting for week starting from 2023-12-06 ...
Submission timestamps = 2023-12-08 12:00:00+01:00 to 2023-12-09 20:00:00+01:00
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
- - - - - - - - - - - - - - - 
> start fitting XGBoost models ...
> time taken: 6.18 seconds
- - - - - - - - - - - - - - - 


Unnamed: 0,q 0.025,q 0.250,q 0.500,q 0.750,q 0.975


- - - - - - - - - - - - - - - 
scores:
xgboost_2022    1.167636
dtype: object
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
Forecasting for week starting from 2023-12-13 ...
Submission timestamps = 2023-12-15 12:00:00+01:00 to 2023-12-16 20:00:00+01:00
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
- - - - - - - - - - - - - - - 
> start fitting XGBoost models ...
> time taken: 6.17 seconds
- - - - - - - - - - - - - - - 


Unnamed: 0,q 0.025,q 0.250,q 0.500,q 0.750,q 0.975


- - - - - - - - - - - - - - - 
scores:
xgboost_2022    0.799405
dtype: object
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
Forecasting for week starting from 2023-12-20 ...
Submission timestamps = 2023-12-22 12:00:00+01:00 to 2023-12-23 20:00:00+01:00
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
- - - - - - - - - - - - - - - 
> start fitting XGBoost models ...


KeyboardInterrupt: 