In [1]:
import os
import pickle
import itertools
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from statsforecast.models import MSTL
from sklearn.metrics import mean_pinball_loss

os.chdir("C:/2023_11-PTSFC")
import model_train as model_train
import data_prepro as data_prepro

  from tqdm.autonotebook import tqdm


### Params

In [2]:
# Cap the CPU count reported to loky (the joblib backend) at a single core.
# Raise the value, e.g. to "4", to allow more parallel workers.
os.environ["LOKY_MAX_CPU_COUNT"] = "1"

# Quantile levels for which a separate model is trained and scored.
quantiles = [0.025, 0.25, 0.5, 0.75, 0.975]
# Forecast horizons (in hours after the forecast origin) that are evaluated.
fcast_hor = [36, 40, 44, 60, 64, 68] # in hours

### Data Prep

In [3]:
# = = = = = = = = = = = = = 
# get data
# Alternative (live download instead of the cached CSV):
# df_energy = data_prepro.get_energy_data_today(to_date=t_wednesday.strftime('%Y%m%d'))

# Read the cached energy data; the first CSV column becomes a parsed
# DatetimeIndex.
df_energy = pd.read_csv("data/2015-01-01_2024-02-21_energy.csv", index_col=0, parse_dates=[0])
# Re-parse the local-time column as UTC and convert it to CET so that all
# later comparisons operate on timezone-aware timestamps.
df_energy['timestamp_CET'] = pd.to_datetime(df_energy['timestamp_CET'], utc=True).dt.tz_convert('CET')
# DataFrame.info() writes its summary to stdout and returns None, so it must
# not be wrapped in print() (that would emit a trailing "None").
df_energy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 80136 entries, 2014-12-31 23:00:00+00:00 to 2024-02-21 22:00:00+00:00
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   timestamp_CET  80136 non-null  datetime64[ns, CET]
 1   gesamt         80136 non-null  float64            
dtypes: datetime64[ns, CET](1), float64(1)
memory usage: 1.8 MB
None


In [4]:
def preprocess_data(df_energy, start_date):
    """Build the two training design matrices from the energy data.

    Rows whose ``timestamp_CET`` is strictly later than ``start_date`` are
    kept and handed to ``data_prepro.create_dummy_df`` and
    ``data_prepro.create_features_df``. From each resulting frame the target
    column ``gesamt`` is split off and ``timestamp_CET`` is discarded.

    Parameters
    ----------
    df_energy : pandas.DataFrame
        Frame containing at least the columns ``timestamp_CET`` and ``gesamt``.
    start_date
        Lower bound (exclusive) compared against ``timestamp_CET``.

    Returns
    -------
    tuple
        ``(X_train_dummy, y_train_dummy, X_train_fturs, y_train_fturs)``.
    """
    non_feature_cols = ['gesamt', 'timestamp_CET']

    # Work on a copy of the filtered rows so the caller's frame stays untouched.
    after_start = df_energy['timestamp_CET'] > start_date
    recent_energy = df_energy.loc[after_start].copy()

    dummy_frame = data_prepro.create_dummy_df(
        recent_energy, hour_method='simple', holiday_method='separate'
    )
    feature_frame = data_prepro.create_features_df(
        recent_energy, holiday_method='separate'
    )

    X_train_dummy = dummy_frame.drop(columns=non_feature_cols)
    y_train_dummy = dummy_frame['gesamt']

    X_train_fturs = feature_frame.drop(columns=non_feature_cols)
    y_train_fturs = feature_frame['gesamt']

    return X_train_dummy, y_train_dummy, X_train_fturs, y_train_fturs

def generate_param_grids(params):
    """Expand a dict of candidate lists into every parameter combination.

    Parameters
    ----------
    params : dict
        Maps each parameter name to an iterable of candidate values.

    Returns
    -------
    list of dict
        One ``{name: value}`` dict per element of the Cartesian product of
        the candidate lists, in ``itertools.product`` order (the last
        parameter varies fastest).
    """
    names = tuple(params.keys())
    return [
        dict(zip(names, combination))
        for combination in itertools.product(*params.values())
    ]

# LightGBM search space: 2 * 3 * 2 * 2 = 24 combinations.
# Key order matters because it fixes the order of the generated grid.
lgbm_params = dict(
    max_depth=[4, 10],
    num_leaves=[5, 15, 20],
    learning_rate=[0.1, 0.3],
    n_estimators=[100, 200],
    boosting_type=['gbdt'],
    verbose=[-1],
)

# XGBoost search space: 3 * 2 * 3 = 18 combinations; the objective, metric
# and booster entries are single-valued and therefore constant across the grid.
xgb_params = dict(
    objective=['reg:quantileerror'],
    eval_metric=['quantile'],
    booster=['gbtree'],
    max_depth=[4, 10, 15],
    learning_rate=[0.1, 0.3],
    n_estimators=[100, 200, 300],
)

# Expand both search spaces into flat lists of keyword dicts, one entry per
# hyper-parameter combination.
all_lgbm_params, all_xgb_params = (
    generate_param_grids(search_space)
    for search_space in (lgbm_params, xgb_params)
)

In [5]:
# Weekly rolling-origin backtest of the XGBoost quantile models.
# For every Wednesday in the evaluation window the models are trained on the
# data up to the following Thursday 00:00 CET, the next max(fcast_hor) hours
# are forecast, and the forecasts are scored with the pinball loss at the
# submission horizons. Results are pickled into a fresh time-stamped folder.
cwd = os.getcwd()
folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
directory = os.path.join(cwd, folder_name)

# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
os.makedirs(directory, exist_ok=True)
os.chdir(directory)

# Define the start and end dates of the evaluation window
# NOTE: `start_date` is rebound to the training start inside the loop below.
start_date = pd.Timestamp('2023-11-15')
end_date = pd.Timestamp('2024-02-14')

# Generate the list of weekly (Wednesday) forecast dates as 'YYYY-MM-DD'
# strings; they are interpreted as CET calendar dates inside the loop.
fcast_dates_cet = pd.date_range(start=start_date, end=end_date, freq='W-WED').strftime('%Y-%m-%d').tolist()
dict_all_fcasts = {}
dict_all_evals = {}


def _save_backtest_results():
    """Pickle the evaluations and forecasts collected so far into `directory`."""
    with open(os.path.join(directory, 'eval.pickle'), 'wb') as handle:
        pickle.dump(dict_all_evals, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join(directory, 'fcasts.pickle'), 'wb') as handle:
        pickle.dump(dict_all_fcasts, handle, protocol=pickle.HIGHEST_PROTOCOL)


try:
    # Iterate over the forecast dates
    for fcast_date in fcast_dates_cet:

        print('= '*30)
        print(f"Forecasting for week starting from {fcast_date} ...")

        dict_weekly_fcasts = {}
        dict_weekly_models = {}

        # = = = = = = = = = = = = = 
        # generate prediction timestamps based on t0 = following thursday 00:00
        # = = = = = = = = = = = = = 

        # Calculate the Wednesday and Thursday of the week (midnight, CET)
        t_wednesday = pd.Timestamp(fcast_date, tz='CET')
        t_thursday = t_wednesday + pd.Timedelta(days=1)

        # Generate required submission timestamps
        subm_timestamps = [(t_thursday + pd.Timedelta(hours=fcast)) for fcast in fcast_hor]
        print(f"Submission timestamps = {subm_timestamps[0]} to {subm_timestamps[-1]}")

        # Create df with the information available at the forecast start date
        t_thursday_str = t_thursday.strftime('%Y-%m-%d')
        df_energy_current = df_energy.loc[df_energy['timestamp_CET'] <= t_thursday_str].copy()
        print('= '*30)

        # = = = = = = = = = = = = = 
        # Data Prep for All Methods
        # = = = = = = = = = = = = = 

        # create hourly fcast index from t0 up to the longest horizon (inclusive)
        fcast_timestamp_CET = pd.date_range(start=t_thursday, periods=max(fcast_hor)+1, freq='H')
        fcast_timestamp_UTC = fcast_timestamp_CET.tz_convert('UTC')

        # create df with fcast timestamps as INPUT for model
        df_temp = pd.DataFrame(index=fcast_timestamp_UTC)
        df_temp['timestamp_CET'] = fcast_timestamp_CET
        df_fcast_dummy = data_prepro.create_dummy_df(df_temp, hour_method='simple', holiday_method='separate')
        # The design matrix is identical for every model, so build it once
        # instead of once per quantile model.
        X_fcast_dummy = df_fcast_dummy.drop('timestamp_CET', axis=1)

        # = = = = = = = = = = = = = 
        # XGBoost model
        # = = = = = = = = = = = = = 

        yr = 2019
        start_date = f"{yr}-01-01"  # first training date (rebinds the window start)
        X_train_dummy, y_train_dummy, X_train_fturs, y_train_fturs = preprocess_data(df_energy_current, start_date)

        methods = [f"xgboost_dummy_{yr}_{i}" for i in range(len(all_xgb_params))]

        for method, params in zip(methods, all_xgb_params):

            print(f"method = {method}")

            all_models = model_train.fit_xgboost(X_train_dummy, y_train_dummy, quantiles, params)
            dict_weekly_models[method] = all_models

            # create empty OUTPUT df with columns = quantiles
            df_direct_fcast = pd.DataFrame(index=df_fcast_dummy.index)
            df_direct_fcast['timestamp_CET'] = fcast_timestamp_CET

            # Prediction: one column per fitted model, in sorted name order
            for name, model in sorted(all_models.items()):
                df_direct_fcast[name] = model.predict(X_fcast_dummy)

            dict_weekly_fcasts[method] = df_direct_fcast

        # = = = = = = = = = = = = = 
        # Evaluation based on submission timestamps
        # = = = = = = = = = = = = = 

        # get actual values at every submission timestamp
        df_energy_eval = df_energy.loc[df_energy['timestamp_CET'].isin(subm_timestamps)].copy()
        y_true = df_energy_eval['gesamt'].values
        evaluation_results = {}

        for model_name, forecast_df in dict_weekly_fcasts.items():

            # take subset of fcast df at submission timestamps
            forecast_df = forecast_df.loc[forecast_df['timestamp_CET'].isin(subm_timestamps)]

            # Iterate over each quantile: the mean pinball loss over all
            # submission timestamps is stored once per quantile column.
            # NOTE(review): columns are picked by position, which assumes the
            # sorted model names follow the order of `quantiles` - confirm
            # against model_train.fit_xgboost.
            scores_by_quantile = {}
            for q_idx, q in enumerate(quantiles):

                qscore = mean_pinball_loss(alpha=q,
                                           y_true=y_true,
                                           y_pred=forecast_df.iloc[:, q_idx+1].values) # skip timestamp_CET col

                scores_by_quantile[f"q {q:.3f}"] = qscore / 1000

            # Broadcast each score over the submission timestamps; building
            # the frame in one go yields float columns instead of object dtype.
            evaluation_results[model_name] = pd.DataFrame(scores_by_quantile, index=subm_timestamps)

        # Calculate mean scores for each quantile over time
        mean_scores = {
            model_name: quantile_scores.mean()
            for model_name, quantile_scores in evaluation_results.items()
        }

        # calculate mean scores over all quantiles
        mean_scores_df = pd.DataFrame(mean_scores)

        print('- '*15)
        print('scores:')
        print(mean_scores_df.mean(axis=0).sort_values(ascending=True))

        # = = = = = = = = = = = = = 
        # Save all fcasts & trained models for the week
        # = = = = = = = = = = = = = 

        dict_all_fcasts[fcast_date] = dict_weekly_fcasts
        dict_all_evals[fcast_date] = evaluation_results

        # Checkpoint after every week so that a crash in a later week does
        # not discard the weeks that already finished.
        _save_backtest_results()

        # with open(f'{fcast_date}_models.pickle', 'wb') as handle:
        #     pickle.dump(dict_weekly_models, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Final write (also produces the files when there were no forecast dates)
    _save_backtest_results()
finally:
    # Always restore the working directory, even if training failed, so that
    # relative paths in other cells keep working and re-runs do not nest folders.
    os.chdir(cwd)

= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
Forecasting for week starting from 2023-11-15 ...
Submission timestamps = 2023-11-17 12:00:00+01:00 to 2023-11-18 20:00:00+01:00
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
method = xgboost_dummy_2019_0
> time taken: 31.27 seconds
method = xgboost_dummy_2019_1
> time taken: 40.42 seconds
method = xgboost_dummy_2019_2


Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x00000240D31F6B90>>
Traceback (most recent call last):
  File "c:\Users\ytl_c\miniconda3\Lib\site-packages\xgboost\core.py", line 641, in _next_wrapper
    return self._handle_exception(lambda: self.next(input_data), 0)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ytl_c\miniconda3\Lib\site-packages\xgboost\core.py", line 557, in _handle_exception
    return fn()
           ^^^^
  File "c:\Users\ytl_c\miniconda3\Lib\site-packages\xgboost\core.py", line 641, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
                                          ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ytl_c\miniconda3\Lib\site-packages\xgboost\data.py", line 1280, in next
    input_data(**self.kwargs)
  File "c:\Users\ytl_c\miniconda3\Lib\site-packages\xgboost\core.py", line 730, in inner_f
 

XGBoostError: [13:47:26] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0b3782d1791676daf-1\xgboost\xgboost-ci-windows\src\data\proxy_dmatrix.h:158: Unknown type: void