### Hourly forecasting of energy meter readings on BDG2 dataset

- historical data = 1 week (168 data points)
- forecast horizon = 1 day (24 data points)

**Loading TimesFM Model**

In [1]:
import os
import glob
import time
from datetime import datetime
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import islice

from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from skforecast.ForecasterAutoreg import ForecasterAutoreg

import warnings
warnings.filterwarnings('ignore') 

In [2]:
# Data pipelining
def get_batched_data_fn(sub_df,
    batch_size: int = 128, 
    context_len: int = 168, 
    horizon_len: int = 24):
    
    examples = defaultdict(list)
    num_examples = 0
    for start in range(0, len(sub_df) - (context_len + horizon_len), horizon_len):
      num_examples += 1
      #examples["country"].append(country)
      examples["inputs"].append(sub_df["y"][start:(context_end := start + context_len)].tolist())
      #examples["gen_forecast"].append(sub_df["gen_forecast"][start:context_end + horizon_len].tolist())
      #examples["week_day"].append(sub_df["week_day"][start:context_end + horizon_len].tolist())
      examples["outputs"].append(sub_df["y"][context_end:(context_end + horizon_len)].tolist())
      examples['inputs_ts'].append(sub_df.index[start:(context_end := start + context_len)])
      examples["outputs_ts"].append(sub_df.index[context_end:(context_end + horizon_len)])

    return examples

In [None]:
# Benchmark
batch_size = 32
context_len = 168
horizon_len = 24

def process_building(df):
    building_name = df.columns[0]
    df.columns = ['y']
    input_data = get_batched_data_fn(df, batch_size=500)
    # print(input_data)
    
    windows_all = []
    counter = 1
    for inputs_ts, inputs, outputs_ts, outputs in zip(input_data['inputs_ts'], 
                                                      input_data['inputs'], 
                                                      input_data['outputs_ts'], 
                                                      input_data['outputs']):
        
        input_df = pd.DataFrame({'timestamp': inputs_ts, 
                                 'target': inputs})
        
        output_df = pd.DataFrame({'timestamp': outputs_ts, 
                                 'target': outputs})
        combined = pd.concat([input_df, output_df], axis=0)
        combined['item_id'] = str(building_name) + '_' + str(counter)
        combined['item_id_no'] = counter
        counter += 1
        windows_all.append(combined)
        
    windows_all_df = pd.concat(windows_all)
    windows_all_df.timestamp = pd.to_datetime(windows_all_df.timestamp)
    windows_all_df.set_index('timestamp', inplace=True)

    return windows_all_df

In [None]:
def process_file(filename):
    df = pd.read_csv(filename)
    df = df.set_index(['timestamp'])
    df.index = pd.to_datetime(df.index)
    df['month'] = df.index.month
    training_set = df[df.month <= 6]
    test_set = df[df.month > 6]
    training_set = training_set.drop(columns='month')
    test_set = test_set.drop(columns='month')
    df = df.drop(columns='month')
    

    print(f'fine-tune set date range: {training_set.index[0]} {training_set.index[-1]}, '
      f'test set date range: {test_set.index[0]} {test_set.index[-1]}')
            

    if df.shape[1] < 2:
        return None
        
    print(datetime.now(), df.shape, flush=True)

    results_all = []
    c =1
    lag = 168 
    for building_name in df.columns:
        print(f'{datetime.now()} {c} / {len(df.columns)} {building_name}', flush=True)

        windowed_df_train = process_building(training_set[[building_name]])
        windowed_df_test = process_building(test_set[[building_name]])

        forecaster = ForecasterAutoreg(
                    regressor        = LinearRegression(),
                    lags             = 168
                )
        forecaster.fit(y= windowed_df_train['target']
            )

        p = []
        for i in windowed_df_test.item_id_no.unique():#(pred_days):
            # i -= 1           
            # seq_ptr =lag + 24 * i
        
            df_test = windowed_df_test[windowed_df_test.item_id_no == i]
            last_window  = df_test.iloc[0:168]
            ground_truth = df_test.iloc[168:192]
        
            predictions = forecaster.predict(
                steps       = 24,
                last_window = last_window['target']
            )
            # p.append(predictions)
            res = ground_truth.copy()
            res = res[['target']]
            # print(res)
            res.columns = ['y_true']
            res = res.reset_index()
            res.insert(2, 'y_pred', predictions.reset_index()['pred'])
            res.set_index('timestamp', inplace=True)
            # res['y_pred'] = predictions
            p.append(res)
        res = pd.concat(p)
        res['building'] = building_name
        results_all.append(res)
        c+=1
        # if i == 2:
        #    break
        #break
        
    results_all_df = pd.concat(results_all)
    return results_all_df

In [None]:
files_list = glob.glob('/home/user/New_Buildings_Datasets/Enernoc/csv-only/processed/*.csv')

dataset = 'Enernoc-Linear'
os.makedirs(f'forecasts/{dataset}/', exist_ok = True)
os.makedirs(f'results/{dataset}/', exist_ok = True)

for filename in files_list:
    print(datetime.now(), filename)
    results = process_file(filename)
    if results is not None:
        results.to_csv(f'forecasts/{dataset}/{os.path.basename(filename)}')
    print('')

2024-11-13 16:33:17.285832 /home/user/New_Buildings_Datasets/Enernoc/csv-only/processed/enernoc.csv
fine-tune set date range: 2012-01-01 00:00:00 2013-01-01 00:00:00, test set date range: 2012-07-01 00:00:00 2012-12-31 23:00:00
2024-11-13 16:33:17.479177 (8785, 100)
2024-11-13 16:33:17.480031 1 / 100 767
2024-11-13 16:33:19.126214 2 / 100 304
2024-11-13 16:33:21.240201 3 / 100 399
2024-11-13 16:33:23.108617 4 / 100 21
2024-11-13 16:33:25.108701 5 / 100 805
2024-11-13 16:33:26.511250 6 / 100 14
2024-11-13 16:33:28.163668 7 / 100 404
2024-11-13 16:33:29.570692 8 / 100 78
2024-11-13 16:33:31.645060 9 / 100 731
2024-11-13 16:33:33.155178 10 / 100 218
2024-11-13 16:33:35.298626 11 / 100 366
2024-11-13 16:33:37.075126 12 / 100 766
2024-11-13 16:33:38.662921 13 / 100 197
2024-11-13 16:33:40.333569 14 / 100 30
2024-11-13 16:33:42.756863 15 / 100 742
2024-11-13 16:33:44.985771 16 / 100 32
2024-11-13 16:33:46.583006 17 / 100 137
2024-11-13 16:33:49.007446 18 / 100 36
2024-11-13 16:33:50.287010 1

### Metrics

In [6]:
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import root_mean_squared_log_error
from permetrics.regression import RegressionMetric

dataset = 'Enernoc-Linear'
files_list = glob.glob(f'forecasts/{dataset}/*.csv')

metrics_all_files = []

for filename in files_list:
    print(filename)
    res = pd.read_csv(filename)
    metrics_all = []
    for (g, data) in res.groupby(['building']):
        data = data.dropna()
        data = data[data.y_pred >= 0]
        print(g[0]) 
        # print(data)
        if not data.empty:
            rmse= root_mean_squared_error(data.y_true, data.y_pred)
            mae= mean_absolute_error(data.y_true, data.y_pred)
            mape = mean_absolute_percentage_error(data.y_true, data.y_pred)
            mse= mean_squared_error(data.y_true, data.y_pred)
            msle= mean_squared_log_error(data.y_true, data.y_pred)
            rmsle= root_mean_squared_log_error(data.y_true, data.y_pred)
            nrmse = rmse / (data.y_true.mean()) 
    
            evaluator = RegressionMetric(data.y_true.to_list(), data.y_pred.to_list())
            nrmse_eve = evaluator.normalized_root_mean_square_error()
            evaluator = RegressionMetric(data.y_true.to_list(), data.y_pred.to_list())
            smape= evaluator.symmetric_mean_absolute_percentage_error()
        
            metrics = pd.DataFrame({'building_name': [g[0]], 
                               'mae': [mae],
                                'mape': [mape],
                               'mse': [mse], 'rmse': [rmse], 'msle': [msle], 'rmsle': [rmsle], 'nrmse' : [nrmse],
                                  'nrmse_eve':[nrmse_eve] , 'sMAPE' : [smape]})
            metrics_all.append(metrics)
        else:
            continue
    
    metrics_all_df = pd.concat(metrics_all)
    metrics_all_df.to_csv(f'results/{dataset}/{os.path.basename(filename)}')

    metrics_all_df['filename'] = os.path.basename(filename)
    metrics_all_files.append(metrics_all_df)

metrics_all_files_df = pd.concat(metrics_all_files)

forecasts/Enernoc-Linear/enernoc.csv
6
8
9
10
12
13
14
21
22
25
29
30
31
32
36
41
42
44
45
49
51
55
56
65
78
88
92
99
100
101
103
109
111
116
136
137
144
153
186
197
213
214
217
218
224
228
236
259
270
275
281
285
304
339
341
363
366
384
386
391
399
400
401
404
427
454
455
472
474
475
478
484
492
496
512
648
654
673
674
690
697
703
716
718
731
737
742
744
745
755
761
765
766
767
771
786
805
808
832
887


In [7]:
metrics_all_files_df.to_csv(f'results/{dataset}/results_combined.csv')
metrics_all_files_df

Unnamed: 0,building_name,mae,mape,mse,rmse,msle,rmsle,nrmse,nrmse_eve,sMAPE,filename
0,6,34.007943,9.933198e-02,2743.307553,52.376594,0.023224,0.152393,0.143134,0.360877,0.049607,enernoc.csv
0,8,278.272229,4.738674e-01,255622.427267,505.591166,0.372662,0.610460,0.494805,0.692133,0.154037,enernoc.csv
0,9,49.782515,2.901887e+16,8460.852992,91.982895,0.977973,0.988925,0.289578,0.588530,0.094287,enernoc.csv
0,10,400.470635,1.398717e-01,470755.477690,686.116228,0.058012,0.240857,0.176813,0.871555,0.060162,enernoc.csv
0,12,28.557741,6.155772e-02,2768.432145,52.615893,0.010412,0.102040,0.111848,0.306791,0.029509,enernoc.csv
...,...,...,...,...,...,...,...,...,...,...,...
0,786,278.793229,4.872196e-01,414938.305491,644.157050,0.340144,0.583219,0.231482,1.574694,0.071726,enernoc.csv
0,805,23.813834,3.497378e-01,1430.316340,37.819523,0.217752,0.466639,0.323719,0.678161,0.125348,enernoc.csv
0,808,20.474297,2.617917e-01,1359.674858,36.873769,0.132386,0.363849,0.459122,0.528240,0.119504,enernoc.csv
0,832,240.912107,4.738680e-01,191591.320150,437.711458,0.372410,0.610254,0.494805,0.692133,0.154038,enernoc.csv


In [8]:
metrics_all_files_df.describe()*100

Unnamed: 0,building_name,mae,mape,mse,rmse,msle,rmsle,nrmse,nrmse_eve,sMAPE
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,33888.0,9314.012091,3.436915e+17,7003792.0,15525.206307,30.030696,39.594729,29.142987,62.791747,9.852659
std,27637.111925,13219.565004,1.21346e+18,14404410.0,21540.357366,56.727838,38.076571,21.65682,25.727496,9.121447
min,600.0,529.054912,4.385436,5328.323,729.953642,0.323161,5.684721,5.487196,30.471953,2.152652
25%,8550.0,1759.956628,8.407415,84431.94,2902.903959,1.345597,11.598408,11.19244,44.033384,3.80265
50%,27800.0,3056.779099,19.93468,231243.7,4808.511217,7.217853,26.863621,26.06112,56.310067,8.22133
75%,54600.0,10445.147826,47.72063,3382083.0,18381.126038,21.35211,46.207592,39.921397,69.360933,11.576638
max,88700.0,52307.598096,7.172178e+18,54449840.0,73790.13611,308.805926,175.728747,109.895171,165.148227,52.996868
