# Forecasting with Machine Learning

### Loading Libraries

In [None]:
%cd ../..

In [None]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
from pandas.api.types import is_list_like

# Data Visualization
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

# Warnings
import joblib
import warnings
import humanize

# IO & Requests
import time
import random
import requests
from io import StringIO

# StatsModels
import statsmodels.api as sm
from statsmodels.tsa.seasonal import MSTL , DecomposeResult

# OS
import os
import sys
import pickleshare
import missingno as msno
from itertools import cycle
from typing import List, Tuple

# PyArrow
import pyarrow as pa

# FuncTools
from functools import partial

# Path & Notebook Optimizer
from pathlib import Path
import missingno as msno
from tqdm.auto import tqdm

# Scikit-Learn
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV

# XGBoost
from xgboost import XGBRFRegressor

# LightGBM
from lightgbm import LGBMRegressor

# IPython
from IPython.display import display, HTML

# NIXTLA
from statsforecast.core import StatsForecast
from utilsforecast.plotting import plot_series
from utilsforecast.evaluation import evaluate

# Forecast
# from datasetsforecast.losses import *
from utilsforecast.evaluation import evaluate

# SRC
from src.utils import plotting_utils
from src.utils.general import LogTime
from src.utils.data_utils import _get_32_bit_dtype 
from src.utils.ts_utils_updated import mae, mse, mase
# No trailing comma here: in an unparenthesised `from ... import` list a
# trailing comma is a SyntaxError and stops the whole cell from running.
from src.utils.ts_utils_updated import forecast_bias, metrics_adapter
from src.transforms.target_transformations import AutoStationaryTransformer

In [None]:
# %load_ext autoreload
# %autoreload 2

In [None]:
# Suppress UserWarning and FutureWarning messages for the rest of the notebook.
for _category in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_category)

In [None]:
# Folder that receives the figures exported by this notebook.
Path("imgs/chapter_08").mkdir(parents=True, exist_ok=True)

# Folder the feature-engineered parquet files are read from.
preprocessed = Path.home().joinpath("Desktop", "data", "london_smart_meters", "preprocessed")

# Folder the pickled baseline metrics are read from.
output = Path.home().joinpath("Desktop", "data", "london_smart_meters", "output")

In [None]:
# NOTE(review): machine-specific absolute path, appended only after the
# `src.*` imports earlier in the notebook have already run - confirm it is
# still needed.
sys.path.append('/Users/joaquinromero/Desktop/MTSF')

# Fixed NumPy seed for reproducible runs.
np.random.seed(42)

# Default Plotly theme for every figure created below.
pio.templates.default = "plotly_white"

# Enable tqdm's pandas integration.
tqdm.pandas()

In [None]:
from src.window_ops.rolling import (
    seasonal_rolling_max,
    seasonal_rolling_mean,
    seasonal_rolling_min,
    seasonal_rolling_std,
)

In [None]:
def format_plot(fig, legends=None, xlabel="Time", ylabel="Value", title="", font_size=15):
    """Apply the notebook's standard layout to a Plotly figure.

    Args:
        fig: Figure to restyle. It is updated in place through
            ``for_each_trace`` and ``update_layout``.
        legends: Optional iterable of trace names. Names are assigned to the
            traces in order and reused cyclically when there are more traces
            than names. A falsy value leaves the trace names untouched.
        xlabel: Title of the x axis.
        ylabel: Title of the y axis.
        title: Figure title, centred horizontally.
        font_size: Font size used for the legend, axis titles and tick labels.

    Returns:
        The same ``fig`` object, so the call can be chained.
    """
    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        # Fonts are set through the nested `title.font` attribute: the flat
        # `titlefont` shortcut is deprecated and rejected by recent Plotly
        # releases, while the nested form works on old and new versions.
        title=dict(
            text=title,
            x=0.5,
            xanchor="center",
            yanchor="top",
            font=dict(size=20),
        ),
        legend_title=None,
        # Horizontal legend sitting just above the plot area, right-aligned.
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title=dict(text=ylabel, font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
        xaxis=dict(
            title=dict(text=xlabel, font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
    )
    return fig

In [None]:
# Load the missing-value-imputed, feature-engineered splits. The validation
# split is deliberately bound to `test_df` so the rest of the notebook
# predicts on it; the real test split
# ("block_0-7_test_missing_imputed_feature_engg.parquet") is not read here.
_train_path = preprocessed / "selected_blocks_train_missing_imputed_feature_engg.parquet"
_val_path = preprocessed / "selected_blocks_val_missing_imputed_feature_engg.parquet"
_missing_features_warning = """
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 01-Feature Engineering.ipynb in Chapter06
    </div>
    """
try:
    train_df = pd.read_parquet(_train_path)
    test_df = pd.read_parquet(_val_path)
except FileNotFoundError:
    # Best effort: show a hint instead of a traceback. The data frames stay
    # undefined in that case, so later cells will fail.
    display(HTML(_missing_features_warning))

#### Loading `The Single-Step Backtesting Baselines` for Validation

In [None]:
# Load the single-step backtesting baseline metrics for the validation split:
# one table with per-household rows (it has an `LCLid` column) and one
# aggregate table. The "_test" counterparts of both files are not read here.
# NOTE: unpickling can execute arbitrary code - only load files you produced.
_baseline_metrics_path = output / "single_step_backtesting_baseline_metrics_val_df.pkl"
_baseline_aggregate_path = output / "single_step_backtesting_baseline_aggregate_metrics_val.pkl"
_missing_baselines_warning = """
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 00-Single Step Backtesting Baselines.ipynb in Chapter08
    </div>
    """
try:
    baseline_metrics_df = pd.read_pickle(_baseline_metrics_path)
    baseline_aggregate_metrics_df = pd.read_pickle(_baseline_aggregate_path)
except FileNotFoundError:
    # Best effort: show a hint instead of a traceback.
    display(HTML(_missing_baselines_warning))

In [None]:
# Number of distinct households (LCLid values) in the training split.
len(train_df["LCLid"].unique())

In [None]:
# List the columns available in the training split.
train_df.columns

### Feature Definition

In [None]:
# NOTE(review): `FeatureConfig` is not imported anywhere in the visible part
# of this notebook - confirm it is in scope before this cell runs.

# Weather covariates: used both as continuous and as exogenous features.
_weather_features = [
    "visibility",
    "windBearing",
    "temperature",
    "dewPoint",
    "pressure",
    "apparentTemperature",
    "windSpeed",
    "humidity",
]
# Categorical covariates that are also listed as exogenous features.
_exogenous_categoricals = ["holidays", "precipType", "icon", "summary"]

# Lags 1-5, 46-50 and 334-338 of the target.
_lag_features = [
    f"energy_consumption_lag_{lag}"
    for first_lag in (1, 46, 334)
    for lag in range(first_lag, first_lag + 5)
]
# Rolling mean/std over windows of 3, 6, 12 and 48 steps.
_rolling_features = [
    f"energy_consumption_rolling_{window}_{stat}"
    for window in (3, 6, 12, 48)
    for stat in ("mean", "std")
]
# Seasonal rolling mean/std for seasonal periods 48 and 336.
_seasonal_rolling_features = [
    f"energy_consumption_{period}_seasonal_rolling_3_{stat}"
    for period in (48, 336)
    for stat in ("mean", "std")
]
_ewma_features = [
    f"energy_consumption_ewma_span_{span}" for span in (2880, 336, 48)
]
# sin/cos terms of order 1-5 for month, hour and minute.
_fourier_features = [
    f"timestamp_{unit}_{wave}_{order}"
    for unit in ("Month", "Hour", "Minute")
    for wave in ("sin", "cos")
    for order in range(1, 6)
]

feat_config = FeatureConfig(
    date="timestamp",
    target="energy_consumption",
    continuous_features=[
        *_weather_features,
        *_lag_features,
        *_rolling_features,
        *_seasonal_rolling_features,
        *_ewma_features,
        "timestamp_Elapsed",
        *_fourier_features,
    ],
    categorical_features=[
        *_exogenous_categoricals,
        "timestamp_Month",
        "timestamp_Quarter",
        "timestamp_WeekDay",
        "timestamp_Dayofweek",
        "timestamp_Dayofyear",
        "timestamp_Hour",
        "timestamp_Minute",
    ],
    boolean_features=[
        "timestamp_Is_quarter_end",
        "timestamp_Is_quarter_start",
        "timestamp_Is_year_end",
        "timestamp_Is_year_start",
        "timestamp_Is_month_start",
    ],
    index_cols=["timestamp"],
    exogenous_features=[*_exogenous_categoricals, *_weather_features],
)

### Sample Household

In [None]:
# Select the rows of household MAC000193 and split each subset into
# features / target / original target, with categorical=False and
# exogenous=False. The validation rows play the role of the test set.
_is_sample_train = train_df.LCLid == "MAC000193"
_is_sample_test = test_df.LCLid == "MAC000193"
sample_train_df = train_df.loc[_is_sample_train, :]
sample_test_df = test_df.loc[_is_sample_test, :]

train_features, train_target, train_original_target = feat_config.get_X_y(
    sample_train_df,
    categorical=False,
    exogenous=False,
)
test_features, test_target, test_original_target = feat_config.get_X_y(
    sample_test_df,
    categorical=False,
    exogenous=False,
)

# The intermediate subsets are no longer needed once the splits exist.
del sample_train_df, sample_test_df

In [None]:
### Missing Value Handling

#### Null Check

In [None]:
# Count missing values per training feature and show only the affected ones.
nc = train_features.isna().sum()
nc[nc > 0]

In [None]:
# Same check for the validation features bound to `test_features`.
nc = test_features.isna().sum()
nc[nc > 0]

In [None]:
# Imputation settings: every lag, rolling, seasonal-rolling and EWMA feature
# of the target is listed under `bfill_columns`; `ffill_columns` and
# `zero_fill_columns` are left empty.
# The EWMA names use a single underscore ("ewma_span_") so they match the
# names declared in `feat_config`; with the previous "ewma__span_" spelling
# these three entries referred to columns that are not in the feature set.
# NOTE(review): confirm the single-underscore spelling against the columns of
# the parquet files.
missing_value_config = MissingValueConfig(
    bfill_columns=[
        "energy_consumption_lag_1",
        "energy_consumption_lag_2",
        "energy_consumption_lag_3",
        "energy_consumption_lag_4",
        "energy_consumption_lag_5",
        "energy_consumption_lag_46",
        "energy_consumption_lag_47",
        "energy_consumption_lag_48",
        "energy_consumption_lag_49",
        "energy_consumption_lag_50",
        "energy_consumption_lag_334",
        "energy_consumption_lag_335",
        "energy_consumption_lag_336",
        "energy_consumption_lag_337",
        "energy_consumption_lag_338",
        "energy_consumption_rolling_3_mean",
        "energy_consumption_rolling_3_std",
        "energy_consumption_rolling_6_mean",
        "energy_consumption_rolling_6_std",
        "energy_consumption_rolling_12_mean",
        "energy_consumption_rolling_12_std",
        "energy_consumption_rolling_48_mean",
        "energy_consumption_rolling_48_std",
        "energy_consumption_48_seasonal_rolling_3_mean",
        "energy_consumption_48_seasonal_rolling_3_std",
        "energy_consumption_336_seasonal_rolling_3_mean",
        "energy_consumption_336_seasonal_rolling_3_std",
        "energy_consumption_ewma_span_2880",
        "energy_consumption_ewma_span_336",
        "energy_consumption_ewma_span_48",
    ],
    ffill_columns=[],
    zero_fill_columns=[],
)

### Running ML Models on a Sample Household

In [None]:
# One record of metrics is appended here per evaluated model.
metric_record = []
# Train and validation targets stacked; each model's predictions are joined
# onto this object as a new column further down.
pred_df = pd.concat([train_target, test_target], axis=0)

In [None]:
# Seed the record list with the baseline metrics of the sample household.
_sample_baselines = baseline_metrics_df.loc[baseline_metrics_df.LCLid == "MAC000193"]
metric_record.extend(
    _sample_baselines.drop(columns="LCLid").to_dict(orient="records")
)

In [None]:
# Display the metric records collected so far.
metric_record

In [None]:
# from typing import Optional, Tuple, Union, Sequence, Callable, cast
# from pandas.api.types import is_datetime64_any_dtype as is_datetime
# def is_datetime_dtypes(x):
#     return is_datetime(x)

# def cast_to_series(df):
#     is_pd_dataframe = isinstance(df, pd.DataFrame)    
#     if is_pd_dataframe: 
#         if df.shape[1]==1:
#             df = df.squeeze()
#         else:
#             raise ValueError("Dataframes with more than one columns cannot be converted to pd.Series")
#     return df

# def metrics_adapter(metric_func, actual_series,
#         pred_series,
#         insample = None,
#         m: Optional[int] = 1,
#         intersect: bool = True,
#         reduction: Callable[[np.ndarray], float] = np.mean,
#         inter_reduction: Callable[[np.ndarray], Union[float, np.ndarray]] = lambda x: x,
#         n_jobs: int = 1,
#         verbose: bool = False):
    
#     actual_series, pred_series = cast_to_series(actual_series), cast_to_series(pred_series)
#     if insample is not None:
#         insample = cast_to_series(insample)
#     assert type(actual_series) is type(pred_series), f"actual_series({type(actual_series)}) and pred_series({type(pred_series)}) should be of same type."
#     if insample is not None:
#         assert type(actual_series) is type(insample), "actual_series and insample should be of same type."
#     is_nd_array = isinstance(actual_series, np.ndarray)
#     is_pd_series = isinstance(actual_series, pd.Series)
    
#     if is_pd_series:
#         is_datetime_index = is_datetime_dtypes(actual_series.index) and is_datetime_dtypes(pred_series.index)
#         if insample is not None:
#             is_datetime_index = is_datetime_index and is_datetime_dtypes(insample.index)
#     else:
#         is_datetime_index = False
#     if metric_func.__name__ == "mase":
#         if not is_datetime_index:
#             raise ValueError("MASE needs pandas Series with datetime index as inputs")
    
#     # if is_nd_array or (is_pd_series and not is_datetime_index):
#     #     actual_series, pred_series = TimeSeries.from_values(actual_series.values if is_pd_series else actual_series), TimeSeries.from_values(pred_series.values if is_pd_series else pred_series)
#     #     if insample is not None:
#     #         insample = TimeSeries.from_values(insample.values if is_pd_series else insample)

#     # elif is_pd_series and is_datetime_index:
#     #     actual_series, pred_series = TimeSeries.from_series(actual_series), TimeSeries.from_series(pred_series)
#     #     if insample is not None:
#     #         insample = TimeSeries.from_series(insample)
#     # else:
#     #     raise ValueError()
#     if metric_func.__name__ == "mase":
#         #return metric_func(actual_series=actual_series, pred_series=pred_series, insample=insample, m=m, intersect=intersect, reduction=reduction, inter_reduction=inter_reduction, n_jobs=n_jobs, verbose=verbose)
#         return metric_func(actual_series, pred_series, insample)

#     else:
#         #return metric_func(actual_series=actual_series, pred_series=pred_series, intersect=intersect, reduction=reduction, inter_reduction=inter_reduction, n_jobs=n_jobs, verbose=verbose)
#         return metric_func(actual_series, pred_series)


# def calculate_metrics(
#     y: pd.Series, y_pred: pd.Series, name: str, y_train: pd.Series = None
# ):
#     """Method to calculate the metrics given the actual and predicted series

#     Args:
#         y (pd.Series): Actual target with datetime index
#         y_pred (pd.Series): Predictions with datetime index
#         name (str): Name or identification for the model
#         y_train (pd.Series, optional): Actual train target to calculate MASE with datetime index. Defaults to None.

#     Returns:
#         Dict: Dictionary with MAE, MSE, MASE, and Forecast Bias
#     """
#     return {
#         "Algorithm": name,
#         "MAE": darts_metrics_adapter(mae, actual_series=y, pred_series=y_pred),
#         "MSE": darts_metrics_adapter(mse, actual_series=y, pred_series=y_pred),
#         "MASE": darts_metrics_adapter(
#             mase, actual_series=y, pred_series=y_pred, insample=y_train
#         )
#         if y_train is not None
#         else None,
#         "Forecast Bias": darts_metrics_adapter(
#             forecast_bias, actual_series=y, pred_series=y_pred
#         )
        
#     }

# def mae(actuals, predictions):
#     return np.nanmean(np.abs(actuals-predictions))

# def mse(actuals, predictions):
#     return np.nanmean(np.power(actuals-predictions, 2))

# def mase(actuals, predictions, insample):
#     """
#     Calculate the Mean Absolute Scaled Error (MASE).
    
#     Parameters:
#     actuals : np.ndarray
#         Actual observed values corresponding to the predictions.
#     predictions : np.ndarray
#         Predicted values.
#     insample : np.ndarray
#         In-sample data to calculate the scaling factor based on a naive forecast.

#     Returns:
#     float
#         The MASE metric.
#     """
#     # Calculate MAE of predictions
#     mae_predictions = np.nanmean(np.abs(actuals - predictions))
    
#     # Shift the insample data to create a simple naive forecast
#     naive_forecast = np.roll(insample, 1)
#     # Assuming the first element is not a valid forecast
#     naive_forecast[0] = np.nan 
    
#     # Calculate MAE of the naive forecast
#     mae_naive = np.nanmean(np.abs(insample - naive_forecast))
    
#     # Calculate MASE
#     mase_value = mae_predictions / mae_naive
#     return mase_value


# def _remove_nan_union(array_a: np.ndarray,
#                       array_b: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
#     """
#     Returns the two inputs arrays where all elements are deleted that have an index that corresponds to
#     a NaN value in either of the two input arrays.
#     """

#     isnan_mask = np.logical_or(np.isnan(array_a), np.isnan(array_b))
#     return np.delete(array_a, isnan_mask), np.delete(array_b, isnan_mask)

# def forecast_bias(actual_series: Union[ np.ndarray],
#         pred_series: Union[ np.ndarray],
#         intersect: bool = True,
#         *,
#         reduction: Callable[[np.ndarray], float] = np.mean,
#         inter_reduction: Callable[[np.ndarray], Union[float, np.ndarray]] = lambda x: x,
#         n_jobs: int = 1,
#         verbose: bool = False) -> Union[float, np.ndarray]:
#     """ Forecast Bias (FB).

#     Given a time series of actual values :math:`y_t` and a time series of predicted values :math:`\\hat{y}_t`
#     both of length :math:`T`, it is a percentage value computed as

#     .. math:: 100 \\cdot \\frac{\\sum_{t=1}^{T}{y_t}
#               - \\sum_{t=1}^{T}{\\hat{y}_t}}{\\sum_{t=1}^{T}{y_t}}.

#     If any of the series is stochastic (containing several samples), the median sample value is considered.

#     Parameters
#     ----------
#     actual_series
#         The `TimeSeries` or `Sequence[TimeSeries]` of actual values.
#     pred_series
#         The `TimeSeries` or `Sequence[TimeSeries]` of predicted values.
#     intersect
#         For time series that are overlapping in time without having the same time index, setting `intersect=True`
#         will consider the values only over their common time interval (intersection in time).
#     reduction
#         Function taking as input a `np.ndarray` and returning a scalar value. This function is used to aggregate
#         the metrics of different components in case of multivariate `TimeSeries` instances.
#     inter_reduction
#         Function taking as input a `np.ndarray` and returning either a scalar value or a `np.ndarray`.
#         This function can be used to aggregate the metrics of different series in case the metric is evaluated on a
#         `Sequence[TimeSeries]`. Defaults to the identity function, which returns the pairwise metrics for each pair
#         of `TimeSeries` received in input. Example: `inter_reduction=np.mean`, will return the average of the pairwise
#         metrics.
#     n_jobs
#         The number of jobs to run in parallel. Parallel jobs are created only when a `Sequence[TimeSeries]` is
#         passed as input, parallelising operations regarding different `TimeSeries`. Defaults to `1`
#         (sequential). Setting the parameter to `-1` means using all the available processors.
#     verbose
#         Optionally, whether to print operations progress

#     Raises
#     ------
#     ValueError
#         If :math:`\\sum_{t=1}^{T}{y_t} = 0`.

#     Returns
#     -------
#     float
#         The Forecast Bias (OPE)
#     """
#     assert type(actual_series) is type(pred_series), "actual_series and pred_series should be of same type."
#     if isinstance(actual_series, np.ndarray):
#         y_true, y_pred = actual_series, pred_series
#     else:
#         y_true = actual_series
#         y_pred = pred_series
#     #     y_true, y_pred = _get_values_or_raise(actual_series, pred_series, intersect)
#     #y_true, y_pred = _remove_nan_union(y_true, y_pred)
#     y_true_sum, y_pred_sum = np.sum(y_true), np.sum(y_pred)
#     # raise_if_not(y_true_sum > 0, 'The series of actual value cannot sum to zero when computing OPE.', logger)
#     return ((y_true_sum - y_pred_sum) / y_true_sum) * 100.

In [None]:
def evaluate_model(
    model_config,
    feature_config,
    missing_config,
    train_features,
    train_target,
    test_features,
    test_target,
):
    """Fit one model on the training split and score it on the test split.

    Args:
        model_config: Model configuration; its ``name`` labels the metrics.
        feature_config: Feature configuration handed to ``MLForecast``.
        missing_config: Missing-value configuration handed to ``MLForecast``.
        train_features: Features used for fitting.
        train_target: Target used for fitting; also passed to
            ``calculate_metrics`` as the in-sample series.
        test_features: Features to predict on.
        test_target: Actual values the predictions are scored against.

    Returns:
        Tuple ``(y_pred, metrics, feat_df)``: the predictions, the result of
        ``calculate_metrics`` and the result of
        ``MLForecast.feature_importance()``.
    """
    # Use the configurations passed by the caller. The previous version
    # silently read the notebook globals `feat_config` and
    # `missing_value_config`, so the arguments had no effect.
    ml_model = MLForecast(
        model_config=model_config,
        feature_config=feature_config,
        missing_config=missing_config,
    )
    ml_model.fit(train_features, train_target)
    y_pred = ml_model.predict(test_features)
    feat_df = ml_model.feature_importance()
    metrics = calculate_metrics(test_target, y_pred, model_config.name, train_target)
    return y_pred, metrics, feat_df


from itertools import cycle


def plot_forecast(pred_df, forecast_columns, forecast_display_names=None):
    """Plot the actual consumption together with one or more forecasts.

    Only the rows where the first forecast column is not null are drawn.

    Args:
        pred_df: Frame with an ``energy_consumption`` column and one column
            per forecast; its index is used for the x axis.
        forecast_columns: Names of the forecast columns to draw.
        forecast_display_names: Optional legend labels, one per forecast
            column. Defaults to the column names themselves.

    Returns:
        A ``go.Figure`` with a solid line for the actuals and one dotted
        line per forecast.
    """
    if forecast_display_names is not None:
        assert len(forecast_columns) == len(forecast_display_names)
    else:
        forecast_display_names = forecast_columns

    # Restrict every trace to the rows that have a first-forecast value.
    has_forecast = pred_df[forecast_columns[0]].notnull()

    def _rgba_template(hex_color):
        # "rgba(r,g,b,<alpha>)" with the alpha left as a placeholder.
        channels = ",".join(
            [str(channel) for channel in plotting_utils.hex_to_rgb(hex_color)]
        )
        return "rgba(" + channels + ",<alpha>)"

    palette = [_rgba_template(hex_color) for hex_color in px.colors.qualitative.Plotly]
    # The first palette colour is reserved for the actuals; forecasts cycle
    # through the remaining ones.
    actual_color = palette[0]
    forecast_colors = cycle(palette[1:])

    visible = pred_df[has_forecast]
    fig = go.Figure()
    actual_trace = go.Scatter(
        x=visible.index,
        y=visible.energy_consumption,
        mode="lines",
        line=dict(color=actual_color.replace("<alpha>", "0.9")),
        name="Actual Consumption",
    )
    fig.add_trace(actual_trace)
    for column, label in zip(forecast_columns, forecast_display_names):
        forecast_trace = go.Scatter(
            x=visible.index,
            y=visible[column],
            mode="lines",
            line=dict(
                dash="dot",
                color=next(forecast_colors).replace("<alpha>", "1"),
            ),
            name=label,
        )
        fig.add_trace(forecast_trace)
    return fig

def highlight_abs_min(s, props=''):
    """Styler helper: mark the value(s) closest to zero in a column.

    Args:
        s: Series of numeric values (for example forecast bias per model).
        props: CSS properties to apply to the selected cell(s).

    Returns:
        Array of the same length as ``s`` holding ``props`` where the
        absolute value equals the smallest absolute value (NaNs ignored)
        and ``''`` elsewhere.
    """
    # Compare magnitudes with the minimum magnitude. Comparing the signed
    # values would never match when the value closest to zero is negative.
    magnitudes = np.abs(s.values)
    return np.where(magnitudes == np.nanmin(magnitudes), props, '')

### Linear Models

#### Linear Regression

In [None]:
# Ordinary least squares. Linear models are sensitive to feature scale and
# cannot handle missing values, so both normalisation and imputation are on.
model_config = ModelConfig(
    name="Linear Regression",
    model=LinearRegression(),
    normalize=True,
    fill_missing=True,
)
# Time the fit / predict / score cycle.
with LogTime() as timer:
    y_pred, metrics, feat_df = evaluate_model(
        model_config=model_config,
        feature_config=feat_config,
        missing_config=missing_value_config,
        train_features=train_features,
        train_target=train_target,
        test_features=test_features,
        test_target=test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
# Add the predictions as a new column next to the actuals.
pred_df = pred_df.join(y_pred)

In [None]:
# Display the metrics of the model that was just evaluated.
metrics

In [None]:
# Forecast vs. actuals, titled with the headline metrics, zoomed to the week
# of 2014-01-01 to 2014-01-08, then exported and shown.
fig = format_plot(
    plot_forecast(
        pred_df,
        forecast_columns=[model_config.name],
        forecast_display_names=[model_config.name],
    ),
    title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}",
)
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_08/lin_reg.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of the feature-importance table.
fig = format_plot(
    px.bar(feat_df.head(15), x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_08/lin_reg_fimp.png")
fig.show()

#### Ridge Regression (L2)

In [None]:
# L2-regularised regression with built-in cross-validation (RidgeCV). It is
# scale-sensitive and cannot handle missing values, so normalisation and
# imputation are both on.
model_config = ModelConfig(
    name="Ridge Regression",
    model=RidgeCV(),
    normalize=True,
    fill_missing=True,
)
# Time the fit / predict / score cycle.
with LogTime() as timer:
    y_pred, metrics, feat_df = evaluate_model(
        model_config=model_config,
        feature_config=feat_config,
        missing_config=missing_value_config,
        train_features=train_features,
        train_target=train_target,
        test_features=test_features,
        test_target=test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
# Add the predictions as a new column next to the actuals.
pred_df = pred_df.join(y_pred)

In [None]:
# Display the metrics of the model that was just evaluated.
metrics

In [None]:
# Forecast vs. actuals, titled with the headline metrics, zoomed to the week
# of 2014-01-01 to 2014-01-08, then exported and shown.
fig = format_plot(
    plot_forecast(
        pred_df,
        forecast_columns=[model_config.name],
        forecast_display_names=[model_config.name],
    ),
    title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}",
)
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_08/ridge_reg.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of the feature-importance table.
fig = format_plot(
    px.bar(feat_df.head(15), x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_08/ridge_reg_fimp.png")
fig.show()

#### Lasso Regression (L1)

In [None]:
# L1-regularised regression with built-in cross-validation (LassoCV). It is
# scale-sensitive and cannot handle missing values, so normalisation and
# imputation are both on.
model_config = ModelConfig(
    name="Lasso Regression",
    model=LassoCV(),
    normalize=True,
    fill_missing=True,
)
# Time the fit / predict / score cycle.
with LogTime() as timer:
    y_pred, metrics, feat_df = evaluate_model(
        model_config=model_config,
        feature_config=feat_config,
        missing_config=missing_value_config,
        train_features=train_features,
        train_target=train_target,
        test_features=test_features,
        test_target=test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
# Add the predictions as a new column next to the actuals.
pred_df = pred_df.join(y_pred)

In [None]:
# Forecast vs. actuals, titled with the headline metrics, zoomed to the week
# of 2014-01-01 to 2014-01-08, then exported and shown.
fig = format_plot(
    plot_forecast(
        pred_df,
        forecast_columns=[model_config.name],
        forecast_display_names=[model_config.name],
    ),
    title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}",
)
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_08/lasso_reg.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of the feature-importance table.
fig = format_plot(
    px.bar(feat_df.head(15), x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_08/lasso_reg_fimp.png")
fig.show()

### Decision Tree

In [None]:
# Shallow decision tree (max_depth=4, fixed seed). Trees are not affected by
# feature scaling, so normalisation is off; imputation stays on, as in the
# original configuration.
model_config = ModelConfig(
    name="Decision Tree",
    model=DecisionTreeRegressor(max_depth=4, random_state=42),
    normalize=False,
    fill_missing=True,
)
# Time the fit / predict / score cycle.
with LogTime() as timer:
    y_pred, metrics, feat_df = evaluate_model(
        model_config=model_config,
        feature_config=feat_config,
        missing_config=missing_value_config,
        train_features=train_features,
        train_target=train_target,
        test_features=test_features,
        test_target=test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
# Add the predictions as a new column next to the actuals.
pred_df = pred_df.join(y_pred)

In [None]:
# Forecast vs. actuals, titled with the headline metrics, zoomed to the week
# of 2014-01-01 to 2014-01-08, then exported and shown.
fig = format_plot(
    plot_forecast(
        pred_df,
        forecast_columns=[model_config.name],
        forecast_display_names=[model_config.name],
    ),
    title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}",
)
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_08/dtree.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of the feature-importance table.
fig = format_plot(
    px.bar(feat_df.head(15), x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_08/dtree_fimp.png")
fig.show()

### Bagging & Boosting Trees

#### Random Forest

In [None]:
# Random forest of shallow trees (max_depth=4, fixed seed). Not affected by
# feature scaling, so normalisation is off; imputation stays on, as in the
# original configuration.
model_config = ModelConfig(
    name="Random Forest",
    model=RandomForestRegressor(random_state=42, max_depth=4),
    normalize=False,
    fill_missing=True,
)
# Time the fit / predict / score cycle.
with LogTime() as timer:
    y_pred, metrics, feat_df = evaluate_model(
        model_config=model_config,
        feature_config=feat_config,
        missing_config=missing_value_config,
        train_features=train_features,
        train_target=train_target,
        test_features=test_features,
        test_target=test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
# Add the predictions as a new column next to the actuals.
pred_df = pred_df.join(y_pred)

In [None]:
# Forecast vs. actuals, titled with the headline metrics, zoomed to the week
# of 2014-01-01 to 2014-01-08, then exported and shown.
fig = format_plot(
    plot_forecast(
        pred_df,
        forecast_columns=[model_config.name],
        forecast_display_names=[model_config.name],
    ),
    title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}",
)
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_08/rf.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of the feature-importance table.
fig = format_plot(
    px.bar(feat_df.head(15), x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_08/rf_fimp.png")
fig.show()

#### XGBoost Random Forest

In [None]:
# XGBoost's random-forest regressor (max_depth=4, fixed seed). Not affected
# by feature scaling and able to handle missing values itself, so both
# normalisation and imputation are off.
model_config = ModelConfig(
    name="XGB Random Forest",
    model=XGBRFRegressor(random_state=42, max_depth=4),
    normalize=False,
    fill_missing=False,
)
# Time the fit / predict / score cycle.
with LogTime() as timer:
    y_pred, metrics, feat_df = evaluate_model(
        model_config=model_config,
        feature_config=feat_config,
        missing_config=missing_value_config,
        train_features=train_features,
        train_target=train_target,
        test_features=test_features,
        test_target=test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
# Add the predictions as a new column next to the actuals.
pred_df = pred_df.join(y_pred)

In [None]:
# Forecast vs. actuals, titled with the headline metrics, zoomed to the week
# of 2014-01-01 to 2014-01-08, then exported and shown.
fig = format_plot(
    plot_forecast(
        pred_df,
        forecast_columns=[model_config.name],
        forecast_display_names=[model_config.name],
    ),
    title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}",
)
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_08/xgbrf.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of the feature-importance table.
fig = format_plot(
    px.bar(feat_df.head(15), x="feature", y="importance"),
    xlabel="Features",
    ylabel="Importance",
    title=f"Feature Importance - {model_config.name}",
    font_size=12,
)
fig.write_image("imgs/chapter_08/xgbrf_fimp.png")
fig.show()

#### LightGBM

In [None]:
# LightGBM regressor with default hyper-parameters and a fixed seed. Not
# affected by feature scaling and able to handle missing values itself, so
# both normalisation and imputation are off.
model_config = ModelConfig(
    name="LightGBM",
    model=LGBMRegressor(random_state=42),
    normalize=False,
    fill_missing=False,
)
# Time the fit / predict / score cycle.
with LogTime() as timer:
    y_pred, metrics, feat_df = evaluate_model(
        model_config=model_config,
        feature_config=feat_config,
        missing_config=missing_value_config,
        train_features=train_features,
        train_target=train_target,
        test_features=test_features,
        test_target=test_target,
    )
metrics["Time Elapsed"] = timer.elapsed
metric_record.append(metrics)
# Add the predictions as a new column next to the actuals.
pred_df = pred_df.join(y_pred)

In [None]:
# Forecast vs. actuals, titled with the headline metrics, zoomed to the week
# of 2014-01-01 to 2014-01-08, then exported and shown.
fig = plot_forecast(pred_df, forecast_columns=[model_config.name], forecast_display_names=[model_config.name])
fig = format_plot(fig, title=f"{model_config.name}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f} | Bias: {metrics['Forecast Bias']:.4f}")
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
# Export to "imgs/chapter_08", the folder created at the top of the notebook
# and used by every other figure; "imgs/chapter_8" is never created.
fig.write_image("imgs/chapter_08/lgbm.png")
fig.show()

In [None]:
# Bar chart of the first 15 rows of the feature-importance table.
fig = px.bar(feat_df.head(15), x="feature", y="importance")
format_plot(fig, xlabel="Features", ylabel="Importance", title=f"Feature Importance - {model_config.name}", font_size=12)
# Export to "imgs/chapter_08", the folder created at the top of the notebook
# and used by every other figure; "imgs/chapter_8" is never created.
fig.write_image("imgs/chapter_08/lgbm_fimp.png")
fig.show()

#### Summary

In [None]:
# Summary table for the sample household: error metrics shown with four
# decimals and the bias as a percentage. The lowest MAE/MSE/MASE and the bias
# picked by `highlight_abs_min` are shaded green.
_summary_formats = {
    "MAE": "{:.4f}",
    "MSE": "{:.4f}",
    "MASE": "{:.4f}",
    "Forecast Bias": "{:.2f}%",
}
formatted = pd.DataFrame(metric_record).style.format(_summary_formats)
(
    formatted
    .highlight_min(color='lightgreen', subset=["MAE", "MSE", "MASE"])
    .apply(
        highlight_abs_min,
        props='color:black;background-color:lightgreen',
        axis=0,
        subset=['Forecast Bias'],
    )
)

### Running ML Forecast for All Consumers

In [None]:
# Every household in the training split, in sorted order.
lcl_ids = sorted(train_df["LCLid"].unique())

# Models evaluated for every household, with the same settings as in the
# single-household section above.
models_to_run = [
    ModelConfig(
        name="Lasso Regression",
        model=LassoCV(),
        normalize=True,
        fill_missing=True,
    ),
    ModelConfig(
        name="XGB Random Forest",
        model=XGBRFRegressor(random_state=42, max_depth=4),
        normalize=False,
        fill_missing=False,
    ),
    ModelConfig(
        name="LightGBM",
        model=LGBMRegressor(random_state=42),
        normalize=False,
        fill_missing=False,
    ),
]

In [None]:
all_preds, all_metrics = [], []

# Every household/model pair is handled independently, so this loop is a
# candidate for parallelisation if it becomes too slow.
with LogTime() as timer:
    for lcl_id in tqdm(lcl_ids):
        for model_config in models_to_run:
            # Work on a copy of the configuration for each household.
            model_config = model_config.clone()
            X_train, y_train, _ = feat_config.get_X_y(
                train_df.loc[train_df.LCLid == lcl_id, :],
                categorical=False,
                exogenous=False,
            )
            X_test, y_test, _ = feat_config.get_X_y(
                test_df.loc[test_df.LCLid == lcl_id, :],
                categorical=False,
                exogenous=False,
            )
            # Silence every warning raised while fitting and predicting.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                y_pred, metrics, feat_df = evaluate_model(
                    model_config=model_config,
                    feature_config=feat_config,
                    missing_config=missing_value_config,
                    train_features=X_train,
                    train_target=y_train,
                    test_features=X_test,
                    test_target=y_test,
                )
            # Long-format predictions tagged with household and algorithm,
            # with the actual values alongside.
            y_pred = y_pred.rename("predictions").to_frame().assign(
                LCLid=lcl_id,
                Algorithm=model_config.name,
                energy_consumption=y_test.values,
            )
            metrics.update(LCLid=lcl_id, Algorithm=model_config.name)
            all_preds.append(y_pred)
            all_metrics.append(metrics)

In [None]:
# Stack the per-household, per-model predictions into one long frame.
# Note: this rebinds `pred_df`, which held the sample-household predictions.
pred_df = pd.concat(all_preds)
pred_df.head()

In [None]:
# One row of metrics per (household, algorithm) pair.
metrics_df = pd.DataFrame(all_metrics)
metrics_df.head()

### Evaluation of ML Forecast

In [None]:
from src.utils import ts_utils

In [None]:
# Display the baseline aggregate metrics (defined outside this section) for
# reference; the cells below add the ML models' rows to them.
baseline_aggregate_metrics_df

In [None]:
# Start the aggregate table from the baseline results: move the index into an
# "Algorithm" column and convert each row into a dict record.
baseline_records_df = baseline_aggregate_metrics_df.reset_index()
baseline_records_df = baseline_records_df.rename(columns={"index": "Algorithm"})
metrics = baseline_records_df.to_dict(orient="records")

In [None]:
# Append one aggregate record per ML model to the baseline records in `metrics`.
for model_config in models_to_run:
    algorithm = model_config.name
    pred_mask = pred_df.Algorithm == algorithm
    metric_mask = metrics_df.Algorithm == algorithm

    # Actuals and forecasts of this algorithm, pooled over every LCLid.
    actuals = pred_df.loc[pred_mask, "energy_consumption"]
    forecasts = pred_df.loc[pred_mask, "predictions"]

    metrics.append(
        {
            "Algorithm": algorithm,
            "MAE": ts_utils.mae(actuals, forecasts),
            "MSE": ts_utils.mse(actuals, forecasts),
            # MASE is averaged over the per-LCLid records rather than
            # recomputed on the pooled predictions.
            "meanMASE": metrics_df.loc[metric_mask, "MASE"].mean(),
            "Forecast Bias": ts_utils.forecast_bias_aggregate(actuals, forecasts),
        }
    )

In [None]:
# Aggregate comparison table: baselines plus the ML models.
agg_metrics_df = pd.DataFrame(metrics)

# Display formats per metric column.
metric_formats = {
    "MAE": "{:.4f}",
    "MSE": "{:.4f}",
    "meanMASE": "{:.4f}",
    "Forecast Bias": "{:.2f}%",
}

styled_metrics = agg_metrics_df.style.format(metric_formats)
# Error metrics: highlight the column minimum.
styled_metrics = styled_metrics.highlight_min(
    color="lightgreen", subset=["MAE", "MSE", "meanMASE"]
)
# Forecast Bias is highlighted through `highlight_abs_min` instead of the
# plain column minimum.
styled_metrics.apply(
    highlight_abs_min,
    props="color:black;background-color:lightgreen",
    axis=0,
    subset=["Forecast Bias"],
)

In [None]:
# Overlaid density histograms of the per-LCLid MASE, one per algorithm, with
# box-plot marginals.
fig = format_plot(
    px.histogram(
        metrics_df,
        x="MASE",
        color="Algorithm",
        pattern_shape="Algorithm",
        marginal="box",
        nbins=500,
        barmode="overlay",
        histnorm="probability density",
    ),
    xlabel="MASE",
    ylabel="Probability Density",
    title="Distribution of MASE in the dataset",
)
# Restrict the x-axis before exporting so the saved PNG and the displayed
# figure show the same range.
fig.update_layout(xaxis_range=[0, 2.5])
fig.write_image("imgs/chapter_08/mase_dist.png")
fig.show()

In [None]:
# Overlaid density histograms of the per-LCLid MAE, one per algorithm, with
# box-plot marginals.
fig = px.histogram(
    metrics_df,
    x="MAE",
    color="Algorithm",
    pattern_shape="Algorithm",
    marginal="box",
    nbins=100,
    barmode="overlay",
    histnorm="probability density",
)
fig = format_plot(
    fig,
    xlabel="MAE",
    ylabel="Probability Density",
    title="Distribution of MAE in the dataset",
)
# Restrict the x-axis BEFORE exporting: the image is rendered from the
# figure's state at the time of the call, so exporting first would save a
# PNG without the zoom that the displayed figure has.
fig.update_layout(xaxis_range=[0, 0.4])
fig.write_image("imgs/chapter_08/mae_dist.png")
fig.show()

In [None]:
# Overlaid density histograms of the per-LCLid MSE, one per algorithm, with
# box-plot marginals.
fig = format_plot(
    px.histogram(
        metrics_df,
        x="MSE",
        color="Algorithm",
        pattern_shape="Algorithm",
        marginal="box",
        nbins=500,
        barmode="overlay",
        histnorm="probability density",
    ),
    xlabel="MSE",
    ylabel="Probability Density",
    title="Distribution of MSE in the dataset",
)
# Restrict the x-axis before exporting so the saved PNG and the displayed
# figure show the same range.
fig.update_layout(xaxis_range=[0, 0.3])
fig.write_image("imgs/chapter_08/mse_dist.png")
fig.show()

In [None]:
# Overlaid density histograms of the per-LCLid Forecast Bias, one per
# algorithm, with box-plot marginals.
fig = format_plot(
    px.histogram(
        metrics_df,
        x="Forecast Bias",
        color="Algorithm",
        pattern_shape="Algorithm",
        marginal="box",
        nbins=250,
        barmode="overlay",
        histnorm="probability density",
    ),
    xlabel="Forecast Bias",
    ylabel="Probability Density",
    title="Distribution of Forecast Bias in the dataset",
)
# Restrict the x-axis before exporting so the saved PNG and the displayed
# figure show the same range.
fig.update_layout(xaxis_range=[-50, 30])
fig.write_image("imgs/chapter_08/bias_dist.png")
fig.show()

### Saving `The ML Forecasts and Metrics`

In [None]:
# Directory that receives the pickled predictions and metrics; it is created
# (including missing parents) if it does not exist yet.
output = Path("data/london_smart_meters/output")
output.mkdir(parents=True, exist_ok=True)

In [None]:
# Persist the predictions, the per-LCLid metrics and the aggregate metrics
# as pickles inside the output directory.
for frame_to_save, pickle_name in (
    (pred_df, "ml_single_step_prediction_val_df.pkl"),
    (metrics_df, "ml_single_step_metrics_val_df.pkl"),
    (agg_metrics_df, "ml_single_step_aggregate_metrics_val.pkl"),
):
    frame_to_save.to_pickle(output / pickle_name)

In [None]:
### Bonus: `Using Exogenous Variables`

In [None]:
# Every distinct LCLid in the training data, in sorted order.
lcl_ids = sorted(train_df["LCLid"].unique())

# Only LightGBM is re-run for the exogenous-variable experiment.
models_to_run = [
    ModelConfig(
        model=LGBMRegressor(random_state=42),
        name="LightGBM",
        normalize=False,
        fill_missing=False,
    )
]

In [None]:
from sklearn.exceptions import DataConversionWarning

In [None]:
# Accumulators: one prediction frame and one metrics record per
# (LCLid, model) combination.
all_preds = []
all_metrics = []

# NOTE: each (LCLid, model) iteration is independent of the others, so this
# loop is a candidate for parallelization.
for lcl_id in tqdm(lcl_ids):
    for model_config in models_to_run:
        # Use a clone rather than the shared config object from
        # `models_to_run`.
        model_config = model_config.clone()

        # Rows belonging to the current LCLid in each split.
        train_rows = train_df.loc[train_df.LCLid == lcl_id, :]
        test_rows = test_df.loc[test_df.LCLid == lcl_id, :]

        # Same feature extraction as the earlier run, but with exogenous=True.
        X_train, y_train, _ = feat_config.get_X_y(
            train_rows, categorical=False, exogenous=True
        )
        X_test, y_test, _ = feat_config.get_X_y(
            test_rows, categorical=False, exogenous=True
        )

        # Every warning is silenced here; filtering only
        # DataConversionWarning would be a narrower alternative.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            y_pred, metrics, feat_df = evaluate_model(
                model_config,
                feat_config,
                missing_value_config,
                X_train,
                y_train,
                X_test,
                y_test,
            )

        # The "_w_exog" suffix keeps these results distinguishable from the
        # run without exogenous variables.
        exog_algorithm = model_config.name + "_w_exog"
        y_pred = (
            y_pred.rename("predictions")
            .to_frame()
            .assign(
                LCLid=lcl_id,
                Algorithm=exog_algorithm,
                energy_consumption=y_test.values,
            )
        )
        metrics["LCLid"] = lcl_id
        metrics["Algorithm"] = exog_algorithm

        all_preds.append(y_pred)
        all_metrics.append(metrics)

In [None]:
# Stack the per-(LCLid, model) prediction frames row-wise and preview the result.
pred_w_ex_df = pd.concat(all_preds, axis=0)
pred_w_ex_df.head(5)

In [None]:
# One row per collected metrics record (one per LCLid and algorithm); preview it.
metrics_w_ex_df = pd.DataFrame(data=all_metrics)
metrics_w_ex_df.head(5)

### Evaluation of ML Forecast with Exogenous

In [None]:
from src.utils import ts_utils

In [None]:
# Restart the aggregate table from the baseline results: move the index into
# an "Algorithm" column and convert each row into a dict record.
baseline_records_df = baseline_aggregate_metrics_df.reset_index()
baseline_records_df = baseline_records_df.rename(columns={"index": "Algorithm"})
metrics = baseline_records_df.to_dict(orient="records")

In [None]:
# Carry over the aggregate metrics obtained WITHOUT exogenous variables for
# every model that is re-run with them, so both variants sit side by side in
# the comparison table. Rows are looked up by algorithm name rather than by a
# hard-coded row position, which would silently select a different algorithm
# if the number of baselines or models in `agg_metrics_df` changed.
for model_config in models_to_run:
    reference_rows = agg_metrics_df.loc[
        agg_metrics_df["Algorithm"] == model_config.name
    ]
    if reference_rows.empty:
        # Fail loudly instead of producing a table with a missing reference.
        raise KeyError(
            f"No aggregate metrics found for algorithm {model_config.name!r}"
        )
    metrics.extend(reference_rows.to_dict(orient="records"))

In [None]:
# Append one aggregate record per model run with exogenous variables.
for model_config in models_to_run:
    exog_algorithm = model_config.name + "_w_exog"
    pred_mask = pred_w_ex_df.Algorithm == exog_algorithm
    metric_mask = metrics_w_ex_df.Algorithm == exog_algorithm

    # Actuals and forecasts of this algorithm, pooled over every LCLid.
    actuals = pred_w_ex_df.loc[pred_mask, "energy_consumption"]
    forecasts = pred_w_ex_df.loc[pred_mask, "predictions"]

    metrics.append(
        {
            "Algorithm": exog_algorithm,
            "MAE": ts_utils.mae(actuals, forecasts),
            "MSE": ts_utils.mse(actuals, forecasts),
            # MASE is averaged over the per-LCLid records rather than
            # recomputed on the pooled predictions.
            "meanMASE": metrics_w_ex_df.loc[metric_mask, "MASE"].mean(),
            "Forecast Bias": ts_utils.forecast_bias_aggregate(actuals, forecasts),
        }
    )

In [None]:
# Aggregate comparison table including the run with exogenous variables.
agg_metrics_w_ex_df = pd.DataFrame(metrics)

# Display formats per metric column.
metric_formats = {
    "MAE": "{:.3f}",
    "MSE": "{:.3f}",
    "meanMASE": "{:.3f}",
    "Forecast Bias": "{:.2f}%",
}

styled_metrics = agg_metrics_w_ex_df.style.format(metric_formats)
# Error metrics: highlight the column minimum.
styled_metrics = styled_metrics.highlight_min(
    color="lightgreen", subset=["MAE", "MSE", "meanMASE"]
)
# Forecast Bias is highlighted through `highlight_abs_min` instead of the
# plain column minimum.
styled_metrics.apply(
    highlight_abs_min,
    props="color:black;background-color:lightgreen",
    axis=0,
    subset=["Forecast Bias"],
)