# DLinear NeuralForecast

### Loading Libraries

In [None]:
# Move the working directory two levels up so the relative paths used below
# (imgs/..., data/..., notebooks/...) resolve.
# NOTE(review): presumably this lands on the repository root — confirm.
# Not idempotent: re-running this cell moves up two more levels.
%cd ../..

In [None]:
# First, install the missing package
!pip install neuralforecast

# Standard library
import json
import os
import shutil
from functools import partial
from itertools import cycle
from pathlib import Path

# Numerical computing & data manipulation
import numpy as np
import pandas as pd

# Persistence
import joblib

# Data visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns

# Notebook helpers
from IPython.display import display, HTML
from tqdm.autonotebook import tqdm

# StatsForecast (used for plotting)
from statsforecast import StatsForecast

# NeuralForecast - will work after installation
from neuralforecast import NeuralForecast
from neuralforecast.auto import AutoDLinear, AutoNBEATS, AutoNHITS
from neuralforecast.losses.pytorch import MQLoss
from neuralforecast.models import DLinear, NBEATS, NHITS

# Evaluation helpers (utilsforecast ships as a dependency of neuralforecast/statsforecast)
from utilsforecast.evaluation import evaluate
from utilsforecast.losses import mae, mase, mse, rmse

In [None]:
# Optional (development only): uncomment both lines to have IPython reload
# edited modules automatically before each cell runs.
# %load_ext autoreload

# %autoreload 2

In [None]:
# Global notebook setup: seed NumPy's legacy global RNG first so later sampling is repeatable.
np.random.seed(42)

# Register tqdm with pandas so pandas operations can show progress bars.
tqdm.pandas()

# Default look for plotly and seaborn figures.
pio.templates.default = "plotly_white"
sns.set_style("whitegrid")

In [None]:
# Make sure the figure folder exists (no error if it is already there).
os.makedirs("imgs/chapter_16", exist_ok=True)

# Data locations, relative to the working directory set by `%cd ../..` at the top of the notebook.
# Kept identical to the paths assigned in the data-loading cell, so the notebook never
# depends on a machine-specific absolute location such as ~/Desktop.
preprocessed = Path("data/london_smart_meters/preprocessed")

output = Path("data/london_smart_meters/output")

In [None]:
from ray import tune

from ray.tune.search.hyperopt import HyperOptSearch

In [None]:
# Run-time switches for this notebook.
TRAIN_SUBSAMPLE = True  # If True, train on a random subsample of IDs to speed up the run

RETUNE = True  # If True, run the hyperparameter search; if False, reuse the saved configuration loaded from JSON

In [None]:
# Data locations, relative to the current working directory.
preprocessed = Path("data/london_smart_meters/preprocessed")

output = Path("data/london_smart_meters/output")

try:
    # Missing-value-imputed, feature-engineered training split.
    train_df = pd.read_parquet(preprocessed/"selected_blocks_train_missing_imputed_feature_engg.parquet")
    # The validation split is loaded as `test_df`: it is the hold-out set predicted in this notebook.
    test_df = pd.read_parquet(preprocessed/"selected_blocks_val_missing_imputed_feature_engg.parquet")
    # test_df = pd.read_parquet(preprocessed/"selected_blocks_test_missing_imputed_feature_engg.parquet")
except FileNotFoundError:
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 01-Feature Engineering.ipynb in Chapter06
    </div>
    """))
    # Stop here: every later cell needs train_df/test_df, so carrying on would only
    # fail one cell later with a much less helpful NameError.
    raise

In [None]:
# Peek at the first two rows to check the loaded columns.
train_df.head(2)

In [None]:
# Number of distinct IDs before subsampling (dropna=False counts exactly what len(unique()) counts).
print("Total # of IDs Pre-Sampling: ", train_df["LCLid"].nunique(dropna=False))

In [None]:
# Optionally restrict both splits to a fixed random subset of IDs for faster iteration.
if TRAIN_SUBSAMPLE:
    print("sub sampling")
    SAMPLE = 10
    # Drop unused categories first so that only IDs actually present in train_df can be drawn;
    # the fixed random_state keeps the drawn subset the same on every run.
    sampled_LCLids = (
        pd.Series(train_df["LCLid"].unique().remove_unused_categories().categories)
        .sample(n=SAMPLE, random_state=99)
        .tolist()
    )
    train_df = train_df[train_df["LCLid"].isin(sampled_LCLids)]
    test_df = test_df[test_df["LCLid"].isin(sampled_LCLids)]

In [None]:
# Number of distinct IDs left in the training data after the optional subsampling.
print("Total # of IDs Post Sampling: ", train_df["LCLid"].nunique(dropna=False))

### Train, Validation, Test Set

In [None]:
# Date coverage of the training data and of the hold-out data loaded as test_df.
print(
    "Training Min Date: ", train_df["timestamp"].min(),
    "\nTraining Max Date: ", train_df["timestamp"].max(),
    "\nTesting Min Date: ", test_df["timestamp"].min(),
    "\nTesting Max Date: ", test_df["timestamp"].max(),
)

In [None]:
# Hold out the final day of the training data as a validation window.
cutoff = train_df["timestamp"].max() - pd.Timedelta(days=1)

# Rows up to and including the cutoff are used for fitting; everything after it is predicted.
training_df = train_df.loc[train_df["timestamp"] <= cutoff].reset_index(drop=True)
validation_df = train_df.loc[train_df["timestamp"] > cutoff].reset_index(drop=True)

print(f"Train Max: {training_df.timestamp.max()} \nValidation Min: {validation_df.timestamp.min()} \nValidation Max: {validation_df.timestamp.max()}")
# Number of distinct timestamps in the validation window, i.e. the horizon to forecast.
print(f"Validation Horizon: {validation_df['timestamp'].nunique(dropna=False)}")

In [None]:
h = 48  # forecast horizon in steps; with the 30-minute frequency used below this is one day

max_steps = 100  # training-step budget passed to the models as `max_steps`

## Training DLinear Model

In [None]:
# Number of distinct IDs in the training window.
# Calling nunique() on the column returns the scalar directly and avoids `Series[0]`,
# whose positional fallback on a label-indexed Series is deprecated in pandas 2.x.
training_df['LCLid'].nunique()

In [None]:
# Same count, computed as the length of the array of distinct IDs.
len(training_df['LCLid'].unique())

In [None]:
# Untuned baseline: one DLinear model with an input window of 48*7 steps
# (one week at the 30-minute frequency), wrapped in a NeuralForecast container.
model_untuned = NeuralForecast(
    models=[DLinear(h=h, input_size=48 * 7, max_steps=max_steps)],
    freq='30min',
)

# Fit on the training window only; the final day stays held out for validation.
model_untuned.fit(
    training_df[['LCLid', 'timestamp', 'energy_consumption']],
    id_col='LCLid',
    time_col='timestamp',
    target_col='energy_consumption',
)

In [None]:
# Forecast the validation window and attach the observed values for comparison.
pred_df = (
    model_untuned
    .predict(futr_df=validation_df[['LCLid', 'timestamp', 'energy_consumption']])
    .reset_index()
    # Left join keeps every forecast row, even where no actual value is available.
    .merge(
        validation_df[['LCLid', 'timestamp', 'energy_consumption']],
        on=['LCLid', 'timestamp'],
        how='left',
    )
)
pred_df.head()

In [None]:
# Plot the validation actuals against the DLinear forecast.
StatsForecast.plot(
    validation_df[['LCLid', 'timestamp', 'energy_consumption']],
    pred_df,
    models=['DLinear'],
    id_col='LCLid',
    time_col='timestamp',
    target_col='energy_consumption',
    engine='matplotlib',
)

#### Evaluate DLinear Forecast

In [None]:
# MASE with a seasonality of 48 steps (one day at the 30-minute frequency).
fcst_mase = partial(mase, seasonality=48)

# Arguments shared by the per-ID and the aggregated evaluation.
# - `models` limits scoring to the DLinear forecast column, so any helper column that
#   ends up in pred_df is never scored as if it were a model.
# - `train_df` is the data the model was fitted on (training_df). The full train_df also
#   contains the validation day being scored, which would leak into the MASE scaling term.
validation_eval_kwargs = dict(
    metrics=[rmse, mae, mse, fcst_mase],
    models=['DLinear'],
    train_df=training_df[['timestamp', 'LCLid', 'energy_consumption']],
    id_col='LCLid',
    time_col='timestamp',
    target_col='energy_consumption',
)

# Get metrics for individual LCLid's
DLinear_metrics = evaluate(pred_df, **validation_eval_kwargs)

# Get aggregated metrics across all LCLid's by model
DLinear_metrics_agg = evaluate(pred_df, agg_fn='mean', **validation_eval_kwargs)

In [None]:
# Validation metrics of the untuned DLinear, averaged across IDs.
DLinear_metrics_agg

### DLinear Tuned

In [None]:
# Location of the hyperparameters saved from a previous tuning run.
config_file_path = 'notebooks/Chapter16/saved_params_config/DLinear_best_config.json'
try:
    with open(config_file_path, 'r') as config_file:
        loaded_config = json.load(config_file)
    print(loaded_config)
except FileNotFoundError:
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. 
    </div>
    """))
    # The saved configuration is optional only while a fresh search is requested.
    # With RETUNE switched off there is nothing to fall back on, so fail here instead of
    # later with a NameError on `loaded_config`.
    if not RETUNE:
        raise

In [None]:
# Search space explored when RETUNE is enabled.
DLinear_config = {
    "max_steps": max_steps,  # fixed training-step budget (adjust if needed)
    "input_size": tune.choice([h, h*7, h*7*2, h*7*3]),  # size of the input window
    "learning_rate": tune.loguniform(1e-4, 1e-1),  # initial learning rate
    "scaler_type": tune.choice(["minmax", "standard"]),
    "batch_size": tune.choice([32, 64, 128, 256]),
    "random_seed": tune.choice([10, 20, 30]),
}

if RETUNE:
    # Fresh hyperparameter search over DLinear_config with HyperOpt (100 trials).
    models = [AutoDLinear(h=h,
                          config=DLinear_config,
                          search_alg=HyperOptSearch(),
                          backend='ray',
                          num_samples=100,
                          )]
else:
    # Reuse the configuration loaded from the saved JSON file.
    models = [AutoDLinear(h=h,
                          config=loaded_config,
                          search_alg=None,
                          backend='ray',
                          )]

model_tuned = NeuralForecast(models=models, freq='30min')

# The last `h` steps of each series (one day) are held out for model selection.
model_tuned.fit(training_df[['LCLid', 'timestamp', 'energy_consumption']],
                id_col='LCLid',
                time_col='timestamp',
                target_col='energy_consumption',
                val_size=h)

In [None]:
# Forecast the first `h` steps of the hold-out set with the tuned model.
# `model_tuned` was fitted on training_df, which stops one day before train_df ends, so the
# complete train_df is passed as `df`: the forecast then starts right after the last
# training timestamp and lines up with test_df in the merge below.
pred_df_test = model_tuned.predict(
    df=train_df[['LCLid', 'timestamp', 'energy_consumption']],
    futr_df=test_df[['LCLid', 'timestamp', 'energy_consumption']],
).reset_index()
# Attach the observed values; the left join keeps every forecast row.
pred_df_test = pred_df_test.merge(test_df[['LCLid', 'timestamp', 'energy_consumption']], on=['LCLid', 'timestamp'], how='left')
pred_df_test.head()

In [None]:
# MASE with a seasonality of 48 steps (one day at the 30-minute frequency).
fcst_mase = partial(mase, seasonality=48)

# Metrics for each individual LCLid on the hold-out forecasts.
DLinear_metrics_test = evaluate(
    pred_df_test,
    metrics=[rmse, mae, mse, fcst_mase],
    target_col='energy_consumption',
    time_col='timestamp',
    id_col='LCLid',
    train_df=train_df[['timestamp', 'LCLid', 'energy_consumption']],
)

# The same metrics, averaged over all LCLids for each model.
DLinear_metrics_agg_test = evaluate(
    pred_df_test,
    metrics=[rmse, mae, mse, fcst_mase],
    target_col='energy_consumption',
    time_col='timestamp',
    id_col='LCLid',
    train_df=train_df[['timestamp', 'LCLid', 'energy_consumption']],
    agg_fn='mean',
)

In [None]:
# Hold-out metrics of the tuned model, averaged across IDs.
DLinear_metrics_agg_test

In [None]:
# First rows of the per-ID hold-out metrics.
DLinear_metrics_test.head()

In [None]:
# Persist the hold-out metrics.
# Create the output folder first: nothing earlier in the notebook guarantees it exists,
# and to_pickle does not create missing parent directories.
output.mkdir(parents=True, exist_ok=True)
DLinear_metrics_agg_test.to_pickle(output/'DLinear_metrics_agg_test.pkl')
DLinear_metrics_test.to_pickle(output/'DLinear_metrics_test.pkl')