In [1]:
import os
import pickle
import itertools
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from statsforecast.models import MSTL
from sklearn.metrics import mean_pinball_loss

os.chdir("C:/2023_11-PTSFC")
import model_train as model_train
import data_prepro as data_prepro

  from tqdm.autonotebook import tqdm


### Params

In [2]:
os.environ["LOKY_MAX_CPU_COUNT"] = "1"  # Replace "4" with the desired number of cores

quantiles = [0.025, 0.25, 0.5, 0.75, 0.975]
fcast_hor = [36, 40, 44, 60, 64, 68] # in hours

### Data Prep

In [3]:
# = = = = = = = = = = = = = 
# get data
# df_energy = data_prepro.get_energy_data_today(to_date=t_wednesday.strftime('%Y%m%d'))

# Read data from file with specified data types
df_energy = pd.read_csv("data/2015-01-01_2024-02-21_energy.csv", index_col=0, parse_dates=[0])
df_energy['timestamp_CET'] = pd.to_datetime(df_energy['timestamp_CET'], utc=True).dt.tz_convert('CET')
print(df_energy.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 80136 entries, 2014-12-31 23:00:00+00:00 to 2024-02-21 22:00:00+00:00
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   timestamp_CET  80136 non-null  datetime64[ns, CET]
 1   gesamt         80136 non-null  float64            
dtypes: datetime64[ns, CET](1), float64(1)
memory usage: 1.8 MB
None


In [4]:
cwd = os.getcwd()
folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
directory = os.path.join(cwd, folder_name)

if not os.path.exists(directory):
    os.makedirs(directory)
os.chdir(directory)

# Define the start and end dates
start_date = pd.Timestamp('2023-11-15')
end_date = pd.Timestamp('2024-02-14')

# Generate a list of weekly dates in UTC
fcast_dates_cet = pd.date_range(start=start_date, end=end_date, freq='W-WED').tz_localize('CET').strftime('%Y-%m-%d').tolist()
dict_all_fcasts = {}
dict_all_evals = {}
dict_all_model_scores = {}

# Iterate over the forecast dates
for fcast_date in fcast_dates_cet[:2]:

    print('= '*30)
    print(f"Forecasting for week starting from {fcast_date} ...")
    
    dict_weekly_fcasts = {}
    dict_weekly_models = {}

    # = = = = = = = = = = = = = 
    # generate prediction timestamps based on t0 = following thursday 00:00
    # = = = = = = = = = = = = = 

    # Calculate the Thursday and Wednesday of the week
    t_wednesday = pd.Timestamp(fcast_date).replace(hour=0, minute=0, second=0, microsecond=0).tz_localize('CET')
    t_thursday = t_wednesday + pd.Timedelta(days=1)

    # Generate required submission timestamps
    subm_timestamps = [(t_thursday + pd.Timedelta(hours=fcast)) for fcast in fcast_hor]
    print(f"Submission timestamps = {subm_timestamps[0]} to {subm_timestamps[-1]}")
    
    # Create df with Information at fcast start date
    t_thursday_str = t_thursday.strftime('%Y-%m-%d')
    df_energy_current = df_energy.loc[df_energy['timestamp_CET'] <= t_thursday_str].copy()
    print('= '*30)

    # = = = = = = = = = = = = = 
    # Data Prep for All Methods
    # = = = = = = = = = = = = = 
    
    # create fcast index for next 68 hours
    fcast_timestamp_CET = pd.date_range(start=t_thursday, periods=68+1, freq='H')
    fcast_timestamp_UTC = fcast_timestamp_CET.tz_convert('UTC')

    # create df with fcast timestamps as INPUT for model
    df_temp = pd.DataFrame(index=fcast_timestamp_UTC)
    df_temp['timestamp_CET'] = fcast_timestamp_CET
    df_fcast_dummy = data_prepro.create_dummy_df(df_temp, hour_method='simple', holiday_method='separate')

    # = = = = = = = = = = = = = 
    # Simple Benchmark
    # = = = = = = = = = = = = = 

    df_benchmark = pd.DataFrame(index=fcast_timestamp_UTC, columns=[f"q {q:.3f}" for q in quantiles])

    # = = = = = = = = = = = = = 
    # MSTL
    # = = = = = = = = = = = = = 

    mstl_train_horizon = 0.5 # in years

    for mstl_train_horizon in [4, 3, 2, 1.5, 1, 0.5, 0.25]:
        
        method = f"mstl_{mstl_train_horizon}"
        print(f"method = {method}")

        df_mstl_train = df_energy_current.iloc[-int(mstl_train_horizon * 365 * 24):].copy()
        mstl_model = MSTL(season_length=[24, 24 * 7]).fit(df_mstl_train["gesamt"])

        n_steps = df_benchmark.shape[0]

        y_hat_dict = mstl_model.predict(h=n_steps, level=[50, 95])
        y_hat_df = pd.DataFrame(y_hat_dict)
        y_hat_df["timestamp_CET"] = pd.date_range(start=t_thursday, periods=len(y_hat_df), freq="H")

        # rename columns
        y_hat_df = y_hat_df.rename(
            columns={
                "mean": "q 0.500",
                "lo-50": "q 0.250",
                "hi-50": "q 0.750",
                "lo-95": "q 0.025",
                "hi-95": "q 0.975",
            }
        )

        # rearrange cols
        y_hat_df = y_hat_df[["timestamp_CET", "q 0.025", "q 0.250", "q 0.500", "q 0.750", "q 0.975"]]

        df_mstl_fcast = y_hat_df
        df_mstl_fcast.index = fcast_timestamp_UTC

        dict_weekly_fcasts[method] = df_mstl_fcast
        
    # = = = = = = = = = = = = = 
    # Evaluation based on submission timestamps
    # = = = = = = = = = = = = = 

    # get actual values at every submission timestamp
    df_energy_eval = df_energy.loc[df_energy['timestamp_CET'].isin(subm_timestamps)].copy()
    evaluation_results = {}
    
    for model_name, forecast_df in dict_weekly_fcasts.items():

        # Initialize an empty DataFrame to store quantile scores
        quantile_scores = pd.DataFrame(index=subm_timestamps, columns=[f"q {q:.3f}" for q in quantiles])
        # take subset of fcast df at submission timestamps
        forecast_df = forecast_df.loc[forecast_df['timestamp_CET'].isin(subm_timestamps)].copy()

        # Iterate over each submission timestamp
        for q_idx, q in enumerate(quantiles):

            qscore = mean_pinball_loss(alpha=q, 
                                       y_true=df_energy_eval['gesamt'].values, 
                                       y_pred=forecast_df.iloc[:,q_idx+1].values) # skip timestamp_CET col
            
            quantile_scores.iloc[:,q_idx] = qscore / 1000
        
        # Store the quantile scores for the model
        evaluation_results[model_name] = quantile_scores
    
    # Calculate mean scores for each quantile over time
    mean_scores = {}
    for model_name, quantile_scores in evaluation_results.items():
        mean_scores[model_name] = quantile_scores.mean()

    # calculate mean scores over all quantiles
    mean_scores_df = pd.DataFrame(mean_scores)
    
    print('- '*15)
    print('scores:')
    print(mean_scores_df.mean(axis=0).sort_values(ascending=True))
    dict_all_model_scores[fcast_date] = mean_scores_df.mean(axis=0).sort_values(ascending=True)
        
    # = = = = = = = = = = = = = 
    # Save all fcasts & trained models for the week
    # = = = = = = = = = = = = = 
    
    dict_all_fcasts[fcast_date] = dict_weekly_fcasts
    dict_all_evals[fcast_date] = evaluation_results

with open('eval.pickle', 'wb') as handle:
    pickle.dump(dict_all_evals, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('fcasts.pickle', 'wb') as handle:
    pickle.dump(dict_all_fcasts, handle, protocol=pickle.HIGHEST_PROTOCOL)

os.chdir(cwd)

= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
Forecasting for week starting from 2023-11-15 ...
Submission timestamps = 2023-11-17 12:00:00+01:00 to 2023-11-18 20:00:00+01:00
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
method = mstl_4
method = mstl_3
method = mstl_2
method = mstl_1.5
method = mstl_1
method = mstl_0.5
method = mstl_0.25
- - - - - - - - - - - - - - - 
scores:
mstl_0.25    0.655269
mstl_0.5     0.660708
mstl_4       0.686128
mstl_3       0.687457
mstl_2       0.692538
mstl_1.5     0.694185
mstl_1       0.702681
dtype: object
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
Forecasting for week starting from 2023-11-22 ...
Submission timestamps = 2023-11-24 12:00:00+01:00 to 2023-11-25 20:00:00+01:00
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
method = mstl_4
method = mstl_3
method = mstl_2
method = mstl_1.5
method = mstl_1
method = mstl_0.5
method = mstl_0.25
- - - - - - - - - - - - - - - 
scores:
mstl_0.5     0

In [5]:
# create empty df to store scores
df_scores = pd.DataFrame(columns=fcast_dates_cet, 
                         index=dict_all_evals[fcast_dates_cet[0]].keys())

for week_key, scores_dict in dict_all_evals.items():
    for model_key, scores_df in scores_dict.items():
        # print(f"{week_key} - {model_key} : {scores_df.values.mean()}")
        df_scores.loc[model_key, week_key] = scores_df.values.mean() 

df_scores['mean'] = df_scores.mean(axis=1)
df_scores.sort_values(by='mean', ascending=True)

Unnamed: 0,2023-11-15,2023-11-22,2023-11-29,2023-12-06,2023-12-13,2023-12-20,2023-12-27,2024-01-03,2024-01-10,2024-01-17,2024-01-24,2024-01-31,2024-02-07,2024-02-14,mean
mstl_0.5,0.660708,0.630093,,,,,,,,,,,,,0.645401
mstl_0.25,0.655269,0.665803,,,,,,,,,,,,,0.660536
mstl_4,0.686128,0.726744,,,,,,,,,,,,,0.706436
mstl_3,0.687457,0.728284,,,,,,,,,,,,,0.70787
mstl_2,0.692538,0.735492,,,,,,,,,,,,,0.714015
mstl_1.5,0.694185,0.735625,,,,,,,,,,,,,0.714905
mstl_1,0.702681,0.736854,,,,,,,,,,,,,0.719767
