# Feature Engineering

### Loading Libraries

In [1]:
%cd ../..

/Users/joaquinromero/Desktop


In [2]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

# Warnings
import warnings
import humanize

# IO & Requests
import time
import random
import requests
from io import StringIO

# StatsModels
import statsmodels.api as sm
from statsmodels.tsa.seasonal import MSTL , DecomposeResult

# OS
import os
import sys
import pickleshare
import missingno as msno
from itertools import cycle

# PyArrow
import pyarrow as pa

# FuncTools
from functools import partial

# Path & Notebook Optimizer
from pathlib import Path
import missingno as msno
from tqdm.auto import tqdm

# Scikit-Learn
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# IPython
from IPython.display import display, HTML

# NIXTLA
from statsforecast.core import StatsForecast
from utilsforecast.plotting import plot_series
from utilsforecast.evaluation import evaluate

# Forecast
# from datasetsforecast.losses import *
from utilsforecast.evaluation import evaluate

from src.utils.general import LogTime

In [5]:
tqdm.pandas()

np.random.seed(0)

pio.templates.default = "plotly_white"

sys.path.append('/Users/joaquinromero/Desktop/MTSF') 

In [6]:
from typing import List, Tuple

from pandas.api.types import is_list_like

from src.utils.data_utils import _get_32_bit_dtype 

In [15]:
from src.window_ops.rolling import (
    seasonal_rolling_max,
    seasonal_rolling_mean,
    seasonal_rolling_min,
    seasonal_rolling_std,
)

In [16]:
warnings.filterwarnings("ignore", category=UserWarning)

warnings.filterwarnings("ignore", category=FutureWarning)

In [17]:
os.makedirs("imgs/chapter_06", exist_ok=True)

preprocessed = Path.home() / "Desktop" / "data" / "london_smart_meters" / "preprocessed"

### Reading `The Preprocessed` Files

In [18]:
# Reading The Missing Value Imputed & Train/Test Split Data
try:
    train_df = pd.read_parquet(preprocessed/"selected_blocks_train_missing_imputed.parquet")
    val_df = pd.read_parquet(preprocessed/"selected_blocks_val_missing_imputed.parquet")
    test_df = pd.read_parquet(preprocessed/"selected_blocks_test_missing_imputed.parquet")
except FileNotFoundError:
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 01-Setting up Experiment Harness.ipynb in Chapter04
    </div>
    """))

#### Combining The `Train, Validation and Test` Datasets for Feature Engineering

In [19]:
train_df["type"] = "train"
val_df["type"] = "val"
test_df["type"] = "test"

full_df = pd.concat([train_df, val_df, test_df]).sort_values(["LCLid", "timestamp"])
del train_df, test_df, val_df

#### `Lag Features`

In [20]:
from src.feature_engineering.autoregressive_features import add_lags

ModuleNotFoundError: No module named 'window_ops'

In [None]:
lags = (
    (np.arange(5) + 1).tolist()
    + (np.arange(5) + 46).tolist()
    + (np.arange(5) + (48 * 7) - 2).tolist()
)
lags

In [36]:
touch /Users/joaquinromero/Desktop/MTSF/src/window_ops/__init__.py

NameError: name 'touch' is not defined

In [None]:
with LogTime():
    full_df, added_features = add_lags(
        full_df, lags=lags, column="energy_consumption", ts_id="LCLid", use_32_bit=True
    )
print(f"Features Created: {','.join(added_features)}")

### Rolling

In [None]:
from src.feature_engineering.autoregressive_features import add_rolling_features

In [None]:
with LogTime():
    full_df, added_features = add_rolling_features(
        full_df,
        rolls=[3, 6, 12, 48],
        column="energy_consumption",
        agg_funcs=["mean", "std"],
        ts_id="LCLid",
        use_32_bit=True,
    )
print(f"Features Created: {','.join(added_features)}")

### Seasonal Rolling

In [None]:
from src.feature_engineering.autoregressive_features import (
    add_seasonal_rolling_features,
)

In [None]:
with LogTime():
    full_df, added_features = add_seasonal_rolling_features(
        full_df,
        rolls=[3],
        seasonal_periods=[48, 48 * 7],
        column="energy_consumption",
        agg_funcs=["mean", "std"],
        ts_id="LCLid",
        use_32_bit=True,
    )
print(f"Features Created: {','.join(added_features)}")

### EWMA

In [None]:
from src.feature_engineering.autoregressive_features import add_ewma

In [None]:
t = np.arange(25).tolist()
plot_df = pd.DataFrame({"Timesteps behind t": t})
for alpha in [0.3, 0.5, 0.8]:
    weights = [alpha * math.pow((1 - alpha), i) for i in t]
    span = (2 - alpha) / alpha
    halflife = math.log(1 - alpha) / math.log(0.5)
    plot_df[f"Alpha={alpha} | Span={span:.2f}"] = weights

fig = px.line(
    pd.melt(plot_df, id_vars="Timesteps behind t", var_name="Parameters"),
    x="Timesteps behind t",
    y="value",
    facet_col="Parameters",
)
fig.update_layout(
    autosize=False,
    width=1200,
    height=500,
    yaxis=dict(
        title_text="Weights",
        titlefont=dict(size=15),
        tickfont=dict(size=15),
    ),
    xaxis=dict(
        titlefont=dict(size=15),
        tickfont=dict(size=15),
    ),
)
fig.update_annotations(font=dict(size=16))
fig.write_image(f"imgs/chapter_06/ewma_weights.png")
fig.show()

In [None]:
with LogTime():
    # full_df, added_features = add_ewma(full_df, alphas=[0.2, 0.5, 0.9], column="energy_consumption", ts_id="LCLid", use_32_bit=True)
    full_df, added_features = add_ewma(
        full_df,
        spans=[48 * 60, 48 * 7, 48],
        column="energy_consumption",
        ts_id="LCLid",
        use_32_bit=True,
    )
print(f"Features Created: {','.join(added_features)}")

### Temporal Features

In [None]:
from src.feature_engineering.temporal_features import add_temporal_features

In [None]:
with LogTime():
    full_df, added_features = add_temporal_features(
        full_df,
        field_name="timestamp",
        frequency="30min",
        add_elapsed=True,
        drop=False,
        use_32_bit=True,
    )
print(f"Features Created: {','.join(added_features)}")

### Fourier Terms

In [None]:
from src.feature_engineering.temporal_features import (
    add_fourier_features,
    bulk_add_fourier_features,
)

In [None]:
# train_df, added_features = add_fourier_features(train_df, "timestamp_Month", max_value=12, n_fourier_terms=1)

In [None]:
with LogTime():
    full_df, added_features = bulk_add_fourier_features(
        full_df,
        ["timestamp_Month", "timestamp_Hour", "timestamp_Minute"],
        max_values=[12, 24, 60],
        n_fourier_terms=5,
        use_32_bit=True,
    )
print(f"Features Created: {','.join(added_features)}")

In [None]:
full_df.columns

#### Plotting Fourier Terms

In [None]:
plot_df = (
    full_df[["timestamp_Month", "timestamp_Month_sin_1"]]
    .drop_duplicates()
    .sort_values("timestamp_Month")
)

plot_df.columns = ["calendar", "fourier"]

plot_df = pd.concat([plot_df, plot_df, plot_df]).reset_index(drop=True)
# plot_df.reset_index(drop=True, inplace=True)

plot_df.reset_index(inplace=True)
plot_df["index"] += 1
plot_df = pd.melt(
    plot_df, id_vars="index", var_name="month", value_name="Representation"
)

In [None]:
fig = px.line(plot_df, x="index", y="Representation", facet_row="month")
fig.update_layout(
    autosize=False,
    width=900,
    height=800,
    title_text="Step Function vs Continuous Function",
    title={"x": 0.5, "xanchor": "center", "yanchor": "top"},
    titlefont={"size": 20},
    legend_title=None,
    # yaxis=dict(
    #     # title_text=ylabel,
    #     # titlefont=dict(size=12),
    # ),
    xaxis=dict(
        title_text="Time",
        # titlefont=dict(size=12),
    ),
)
fig.update_yaxes(matches=None)
fig.update_xaxes(
    ticktext=np.arange(1, 13).tolist() * 3,
    tickvals=np.arange(len(plot_df)) + 1,
)
fig.write_image(f"imgs/chapter_6/fourier.png")
fig.show()

## Saving The `Feature Engineered` File

In [None]:
full_df.info(memory_usage="deep", verbose=False)

In [None]:
full_df[full_df["type"] == "train"].drop(columns="type").to_parquet(
    preprocessed / "selected_blocks_train_missing_imputed_feature_engg.parquet"
)
full_df[full_df["type"] == "val"].drop(columns="type").to_parquet(
    preprocessed / "selected_blocks_val_missing_imputed_feature_engg.parquet"
)
full_df[full_df["type"] == "test"].drop(columns="type").to_parquet(
    preprocessed / "selected_blocks_test_missing_imputed_feature_engg.parquet"
)