# Feature Engineering: `Machine Learning Forecasting`

### Loading Libraries

In [None]:
# Move the notebook's working directory up two levels before anything is imported.
# NOTE(review): presumably this lands on the project root so that the `src.*`
# imports and the relative `imgs/...` paths used later resolve — confirm.
%cd ../..

In [None]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
from pandas.api.types import is_list_like

# Data Visualization
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

# Warnings
import warnings
import humanize

# IO & Requests
import time
import random
import requests
from io import StringIO

# StatsModels
import statsmodels.api as sm
from statsmodels.tsa.seasonal import MSTL , DecomposeResult

# OS
import os
import sys
import pickleshare
import missingno as msno
from itertools import cycle
from typing import List, Tuple

# PyArrow
import pyarrow as pa

# FuncTools
from functools import partial

# Path & Notebook Optimizer
from pathlib import Path
import missingno as msno
from tqdm.auto import tqdm

# Scikit-Learn
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# IPython
from IPython.display import display, HTML

# NIXTLA
from statsforecast.core import StatsForecast
from utilsforecast.plotting import plot_series
from utilsforecast.evaluation import evaluate

# Forecast
# from datasetsforecast.losses import *
from utilsforecast.evaluation import evaluate

from src.utils.general import LogTime
from src.utils.data_utils import _get_32_bit_dtype 

In [None]:
# Silence user-level and future-deprecation warnings for the rest of the session.
# The registration order (UserWarning first, FutureWarning second) is kept.
for _category in (UserWarning, FutureWarning):
    warnings.filterwarnings(action="ignore", category=_category)

In [None]:
# Make sure the folder that receives this notebook's exported figures exists.
Path("imgs/chapter_06").mkdir(parents=True, exist_ok=True)

# Folder holding the preprocessed parquet files that are read further down.
preprocessed = Path.home().joinpath(
    "Desktop", "data", "london_smart_meters", "preprocessed"
)

In [None]:
# Extend the import search path with a machine-specific folder.
# NOTE(review): presumably the project root that contains `src/` — confirm.
sys.path.append("/Users/joaquinromero/Desktop/MTSF")

# Seed NumPy's global random generator so random draws are repeatable.
np.random.seed(seed=0)

# Hook tqdm into pandas (enables the progress-bar variants of `apply`).
tqdm.pandas()

# Use the white Plotly theme for every figure created afterwards.
pio.templates.default = "plotly_white"

In [None]:
from src.window_ops.rolling import (
    seasonal_rolling_max,
    seasonal_rolling_mean,
    seasonal_rolling_min,
    seasonal_rolling_std,
)

### Reading `The Preprocessed Files`

In [None]:
# Load the missing-value-imputed train / validation / test splits from disk.
try:
    train_df = pd.read_parquet(
        preprocessed.joinpath("selected_blocks_train_missing_imputed.parquet")
    )
    val_df = pd.read_parquet(
        preprocessed.joinpath("selected_blocks_val_missing_imputed.parquet")
    )
    test_df = pd.read_parquet(
        preprocessed.joinpath("selected_blocks_test_missing_imputed.parquet")
    )
except FileNotFoundError:
    # Best effort: show a styled notebook warning instead of a traceback.
    # Any split that failed to load stays undefined for the following cells.
    display(
        HTML(
            """
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 01-Setting up Experiment Harness.ipynb in Chapter04
    </div>
    """
        )
    )

#### Combining The Train, Validation & Test Datasets for Feature Engineering

In [None]:
# Tag every row with the split it came from so the splits can be separated
# again after the features have been computed on the combined frame.
_splits = {"train": train_df, "val": val_df, "test": test_df}
for _label, _frame in _splits.items():
    _frame["type"] = _label

# Stack the splits in train/val/test order, then order rows per meter and time.
full_df = pd.concat(list(_splits.values())).sort_values(by=["LCLid", "timestamp"])

# Drop every remaining reference to the individual splits to free memory.
del train_df, test_df, val_df, _splits, _label, _frame

In [None]:
from mlforecast.lag_transforms import (
    RollingMean,
    RollingStd,
    RollingMin,
    RollingMax,
    SeasonalRollingMean,
    SeasonalRollingMin,
    SeasonalRollingMax,
    SeasonalRollingStd,
    ExponentiallyWeightedMean,
)

from collections import defaultdict

In [None]:
# Maps a lag (offset in timesteps) to the list of transforms applied at that lag.
# Using a defaultdict lets later cells append to a lag key without creating it first.
lag_transforms = defaultdict(list)

#### `Lag` Features

In [None]:
# Three groups of five consecutive lags:
#   1..5      the most recent observations,
#   46..50    a window centred on 48,
#   334..338  a window centred on 48 * 7.
lags = [
    *range(1, 6),
    *range(46, 51),
    *range(48 * 7 - 2, 48 * 7 + 3),
]

lags

In [None]:
# with LogTime():
#     full_df, added_features = add_lags(
#         full_df, lags=lags, column="energy_consumption", ts_id="LCLid", use_32_bit=True
#     )
# print(f"Features Created: {','.join(added_features)}")

#### Rolling

In [None]:
# Rolling mean followed by rolling standard deviation over windows of
# 3, 6, 12 and 48 steps, registered under lag 1 (offset of one timestep).
lag_transforms[1].extend(RollingMean(window_size=size) for size in (3, 6, 12, 48))
lag_transforms[1].extend(RollingStd(window_size=size) for size in (3, 6, 12, 48))

#### Seasonal Rolling

In [None]:
# Seasonal rolling mean and standard deviation over the last 3 seasons,
# each registered at an offset equal to its own season length.

# Season length of 48 steps.
lag_transforms[48].extend(
    [
        SeasonalRollingMean(season_length=48, window_size=3),
        SeasonalRollingStd(season_length=48, window_size=3),
    ]
)

# Season length of 48 * 7 steps.
lag_transforms[48 * 7].extend(
    [
        SeasonalRollingMean(season_length=48 * 7, window_size=3),
        SeasonalRollingStd(season_length=48 * 7, window_size=3),
    ]
)

#### EWMA

In [None]:
# Illustrate how an exponentially weighted mean spreads its weight over past
# observations: the observation i steps behind t gets alpha * (1 - alpha) ** i.
t = np.arange(25).tolist()

plot_df = pd.DataFrame({"Timesteps behind t": t})

for alpha in [0.3, 0.5, 0.8]:
    # Plain float exponentiation: the notebook never imports `math`, so the
    # previous `math.pow` call raised NameError.
    weights = [alpha * (1 - alpha) ** i for i in t]
    # Span that corresponds to this alpha under alpha = 2 / (span + 1).
    span = (2 - alpha) / alpha
    plot_df[f"Alpha={alpha} | Span={span:.2f}"] = weights

# One facet per alpha; melt turns the per-alpha columns into long format.
fig = px.line(
    pd.melt(plot_df, id_vars="Timesteps behind t", var_name="Parameters"),
    x="Timesteps behind t",
    y="value",
    facet_col="Parameters",
)
fig.update_layout(
    autosize=False,
    width=1200,
    height=500,
    yaxis=dict(
        title_text="Weights",
        # `title_font` replaces the deprecated `titlefont` axis attribute.
        title_font=dict(size=15),
        tickfont=dict(size=15),
    ),
    xaxis=dict(
        title_font=dict(size=15),
        tickfont=dict(size=15),
    ),
)

# Enlarge the facet titles, which Plotly stores as annotations.
fig.update_annotations(font=dict(size=16))
fig.write_image("imgs/chapter_06/ewma_weights.png")
fig.show()

In [None]:
# Exponentially weighted means for three smoothing factors, registered under
# lag 1 (offset of one timestep).
lag_transforms[1].extend(
    ExponentiallyWeightedMean(alpha=smoothing) for smoothing in (0.2, 0.5, 0.9)
)

#### Temporal Features

In [None]:
# Date features requested from the forecasting pipeline. Each entry is either
# the name of a pandas datetime attribute (as below) or a callable that takes
# the dates as its argument.
temporal_features = [
    # Position within the year
    "month", "quarter",
    # Period-boundary flags
    "is_quarter_end", "is_quarter_start",
    "is_year_end", "is_year_start",
    "is_month_start", "is_month_end",
    # Week and day counters
    "week", "day", "dayofweek", "dayofyear",
    # Time of day
    "hour", "minute",
]

In [None]:
# with LogTime():
#     full_df, added_features = add_temporal_features(
#         full_df,
#         field_name="timestamp",
#         frequency="30min",
#         add_elapsed=True,
#         drop=False,
#         use_32_bit=True,
#     )
# print(f"Features Created: {','.join(added_features)}")

### Calculating The Features

In [None]:
from mlforecast import MLForecast

In [None]:
# MLForecast is used here only as a feature generator: no models are attached,
# and `preprocess` returns the frame with the configured features added.
fcst = MLForecast(
    models=[],
    # The rest of this notebook treats the series as half-hourly: season
    # lengths of 48 and 48 * 7, a "minute" date feature, and "30min" in the
    # commented-out temporal-feature call. The previous "D" declared a daily
    # series, which contradicts all of that.
    # NOTE(review): confirm the data really is sampled every 30 minutes.
    freq="30min",
    lags=lags,  # Plain lag features
    # Transforms keyed by the lag (offset) they are computed at
    lag_transforms=lag_transforms,
    date_features=temporal_features,  # Calendar features derived from the timestamp
)
with LogTime():
    full_df = fcst.preprocess(
        full_df,
        time_col="timestamp",
        id_col="LCLid",
        target_col="energy_consumption",
    )

In [None]:
# Display the column index of the frame returned by `fcst.preprocess`.
full_df.columns

### Fourier Terms

In [None]:
from src.feature_engineering.temporal_features import (
    add_fourier_features,
    bulk_add_fourier_features,
)

In [None]:
# Display the columns again; nothing in the visible cells has applied the
# imported Fourier helpers to `full_df` yet.
full_df.columns