In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports all
# modules before each cell executes, so edits to local helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from functools import partial
import polars as pl

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsforecast import StatsForecast
from statsforecast.models import MSTL, AutoARIMA
from statsmodels.stats.diagnostic import acorr_ljungbox
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

from utilsforecast.evaluation import evaluate
from utilsforecast.losses import rmse, mae, mape, mase, mse, smape
from plotting_utils import (
    plotly_series as plot_series,
    plot_residuals_diagnostic,
    plot_real_data_vs_insample_forecast,
)
from summary_utils import (
    print_arima_fitted_summary,
    print_regression_summary_from_model,
    get_fitted_residuals,
)

from prophet import Prophet

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from mlforecast import MLForecast
from mlforecast.utils import PredictionIntervals

from utilsforecast.feature_engineering import fourier, pipeline
from scipy import stats

In [None]:
# Make "plotly_white" the default template for Plotly figures created after this point.
pio.templates.default = "plotly_white"

In [None]:
# Accuracy metrics shared by the `evaluate` calls in this notebook.
# MASE is configured with a seasonality of 48 steps.
metrics = [mae, mse, rmse, mape, smape, partial(mase, seasonality=48)]

In [None]:
# Read the preprocessed parquet file; the code below relies on the columns
# "LCLid", "start_timestamp", "series_length" and "energy_consumption".
data = pl.read_parquet(
    "data/london_smart_meters/preprocessed/london_smart_meters_merged_block_0-7.parquet"
)

# For every meter, build the 30-minute timestamp grid running from
# "start_timestamp" to (series_length - 1) * 30 minutes after it.
timestamp = data.group_by("LCLid").agg(
    pl.datetime_range(
        start=pl.col("start_timestamp"),
        end=pl.col("start_timestamp").dt.offset_by(
            pl.format("{}m", (pl.col("series_length") - 1) * 30)
        ),
        interval="30m",
    ).alias("ds"),
)

# Attach the timestamps to the original rows and rename the id/target columns.
data = timestamp.join(data, on="LCLid", how="inner").rename(
    {"LCLid": "unique_id", "energy_consumption": "y"}
)
data.head()

In [None]:
# Column names used throughout the notebook (id, timestamp, target, temperature).
id_, time_, target_, temp_ = "unique_id", "ds", "y", "temperature"
# Matching polars column expressions, for use inside filter / with_columns.
id_col, time_col, target_col, temp_col = map(pl.col, (id_, time_, target_, temp_))

In [None]:
# Keep only rows whose "file" is "block_7", retain the id / target / covariate
# columns, then explode the list-valued columns so each element gets its own row.
# "Acorn" and "Acorn_grouped" are selected but not exploded.
data = (
    data.filter(pl.col("file") == "block_7")
    .select(
        time_,
        id_,
        target_,
        "Acorn",
        "Acorn_grouped",
        "holidays",
        "visibility",
        "windBearing",
        "temperature",
        "dewPoint",
        "pressure",
        "apparentTemperature",
        "windSpeed",
        "precipType",
        "icon",
        "humidity",
        "summary",
    )
    .explode(
        time_,
        target_,
        "holidays",
        "visibility",
        "windBearing",
        "temperature",
        "dewPoint",
        "pressure",
        "apparentTemperature",
        "windSpeed",
        "precipType",
        "icon",
        "humidity",
        "summary",
    )
)
data.head()

In [None]:
# Restrict the analysis to a single meter, fill gaps in the target
# (forward fill first, then backward fill for any leading nulls), and keep
# only the timestamp, id, target and temperature columns.
selected_id = "MAC000193"
data = (
    data.filter(id_col == selected_id)
    .with_columns(target_col.forward_fill().backward_fill())
    .select(time_, id_, target_, temp_)
)
data.head()

In [None]:
# Plot the selected meter's data with the project helper
# (`plotly_series` from plotting_utils, imported here as `plot_series`).
plot_series(data)

In [None]:
# Same helper, pointed at the temperature column through `target_col`.
plot_series(data, target_col=temp_)

In [None]:
# Scatter of the target (y-axis) against temperature (x-axis), to inspect
# their relationship before regressing one on the other.
px.scatter(
    data,
    y=target_,
    x=temp_,
    title="Temperature",
)

In [None]:
# Linear regression wrapped in MLForecast; no lags or date features are
# configured, so the regressors come from the extra columns of `data`.
# NOTE(review): "30min" is a pandas-style frequency alias, while another cell
# in this notebook builds an MLForecast with "30m" for polars input — confirm
# which alias is intended for each use of this object.
mf = MLForecast(models=LinearRegression(), freq="30min")

# Fit model
# fitted=True keeps the in-sample predictions so they can be retrieved later;
# static_features=[] declares that no column is a static feature.
mf.fit(data, fitted=True, static_features=[])

In [None]:
# Grab the fitted estimator and the stored in-sample predictions, then rebuild
# the feature matrix and target as pandas objects for the summary helper.
model = mf.models_["LinearRegression"]
insample_forecasts = mf.forecast_fitted_values()
X = data.select(model.feature_names_in_).to_pandas()
y = data[target_].to_pandas()

In [None]:
# Print a regression summary for the fitted estimator (helper from summary_utils).
print_regression_summary_from_model(model, X, y)

In [None]:
# Plot the data together with the in-sample forecasts; the "y" column is
# dropped from the forecast frame before it is passed to the helper.
plot_series(data, insample_forecasts.drop("y"))

In [None]:
# Actual values (x-axis) against in-sample predictions (y-axis).
px.scatter(
    x=insample_forecasts["y"],
    y=insample_forecasts["LinearRegression"],
).update_traces(marker={"size": 5}).update_layout(
    title="Real vs In-sample forecast",
    xaxis_title="Real",
    yaxis_title="In-sample forecast",
)

In [None]:
# Residuals of the LinearRegression fit, passed with the timestamps to the
# project's residual-diagnostic plotting helper.
residuals = get_fitted_residuals(mf).get_column("LinearRegression")
ds = data.get_column(time_)
plot_residuals_diagnostic(residuals=residuals, time=ds)

In [None]:
# Ljung-Box test for autocorrelation in the residuals, evaluated at lag 10.
acorr_ljungbox(residuals, lags=[10])

In [None]:
# Residuals against temperature; layout is applied in the same chained call
# (update_layout returns the figure).
fig = px.scatter(x=data.get_column(temp_), y=residuals).update_layout(
    title="Scatter Plot of Residuals vs Temperature",
    xaxis_title="Temperature",
    yaxis_title="Residuals",
    template="plotly_white",
    width=800,
    height=600,
    showlegend=False,
)
fig.show()

In [None]:
# Residuals against fitted values; layout is applied in the same chained call
# (update_layout returns the figure).
fig = px.scatter(
    x=insample_forecasts.get_column("LinearRegression"),
    y=residuals,
).update_layout(
    title="Scatter Plot of Residuals vs Fitted Values",
    xaxis_title="Fitted Values",
    yaxis_title="Residuals",
    template="plotly_white",
    width=800,
    height=600,
    showlegend=False,
)
fig.show()

In [None]:
# Score the in-sample forecasts with the shared metric list; the id / time /
# target columns of `data` are passed as the training frame.
evaluate(
    insample_forecasts,
    metrics=metrics,
    train_df=data.select(id_, time_, target_),
)

In [None]:
# Single cross-validation window with a horizon of 48 * 7 steps, run on a
# pandas copy of the data; the "cutoff" column is dropped from the result.
y_hat = mf.cross_validation(
    df=data.select(id_, time_, target_, temp_).to_pandas(),
    h=48 * 7,
    step_size=1,
    n_windows=1,
    fitted=True,
    static_features=[],
).drop(columns="cutoff")

In [None]:
# Score the cross-validation forecasts (converted back to polars) with the
# shared metric list.
evaluate(
    pl.from_pandas(y_hat),
    metrics=metrics,
    train_df=data.select(id_, time_, target_),
)

In [None]:
# Plot the data with the cross-validation forecasts.
# NOTE(review): max_insample_length presumably limits how much history is
# drawn (48 * 7 steps here) — confirm against the helper in plotting_utils.
plot_series(data, pl.from_pandas(y_hat), max_insample_length=48 * 7)

In [None]:
# Fourier terms for the seasonal cycles of the series. Season lengths are
# expressed in 30-minute steps (the series frequency passed to `pipeline`).
features = [
    # Daily cycle: 2 steps/hour * 24 hours = 48 steps.
    partial(fourier, season_length=2 * 24, k=10),
    # Weekly cycle: 48 steps/day * 7 days = 336 steps.
    partial(fourier, season_length=2 * 24 * 7, k=5),
    # Yearly cycle: 48 steps/day * 365 days = 17,520 steps.
    # (Multiplying by an extra factor of 7 here would describe a seven-year
    # period rather than an annual one.)
    partial(fourier, season_length=2 * 24 * 365, k=3),
]
# `pipeline` applies every feature function and returns two frames: the input
# data with the new feature columns, and the features for the next `h` steps.
data_fourier, data_futr_fourier = pipeline(
    data,
    features=features,
    freq="30m",
    h=48 * 7,
)

In [None]:
# Display the training frame returned by the Fourier feature pipeline.
data_fourier

In [None]:
# Refit the existing MLForecast object on the Fourier-augmented frame, again
# keeping the in-sample predictions and declaring no static features.
mf.fit(data_fourier, fitted=True, static_features=[])

In [None]:
# Grab the refitted estimator and its in-sample predictions, then rebuild the
# feature matrix and target from the Fourier-augmented frame as pandas objects.
model = mf.models_["LinearRegression"]
insample_forecasts = mf.forecast_fitted_values()
X = data_fourier.select(model.feature_names_in_).to_pandas()
y = data_fourier[target_].to_pandas()

In [None]:
# Print a regression summary for the refitted estimator (helper from summary_utils).
print_regression_summary_from_model(model, X, y)

In [None]:
# Plot the data together with the new in-sample forecasts; the "y" column is
# dropped from the forecast frame before it is passed to the helper.
plot_series(data, insample_forecasts.drop("y"))

In [None]:
# Actual values (x-axis) against in-sample predictions (y-axis).
px.scatter(
    x=insample_forecasts["y"],
    y=insample_forecasts["LinearRegression"],
).update_traces(marker={"size": 5}).update_layout(
    title="Real vs In-sample forecast",
    xaxis_title="Real",
    yaxis_title="In-sample forecast",
)

In [None]:
# Residuals of the refitted LinearRegression model, passed with the timestamps
# to the project's residual-diagnostic plotting helper.
residuals = get_fitted_residuals(mf).get_column("LinearRegression")
ds = data.get_column(time_)
plot_residuals_diagnostic(residuals=residuals, time=ds)

In [None]:
# Ljung-Box test for autocorrelation in the residuals, evaluated at lag 10.
acorr_ljungbox(residuals, lags=[10])

In [None]:
# Residuals against temperature; layout is applied in the same chained call
# (update_layout returns the figure).
fig = px.scatter(x=data.get_column(temp_), y=residuals).update_layout(
    title="Scatter Plot of Residuals vs Temperature",
    xaxis_title="Temperature",
    yaxis_title="Residuals",
    template="plotly_white",
    width=800,
    height=600,
    showlegend=False,
)
fig.show()

In [None]:
# Residuals against fitted values; layout is applied in the same chained call
# (update_layout returns the figure).
fig = px.scatter(
    x=insample_forecasts.get_column("LinearRegression"),
    y=residuals,
).update_layout(
    title="Scatter Plot of Residuals vs Fitted Values",
    xaxis_title="Fitted Values",
    yaxis_title="Residuals",
    template="plotly_white",
    width=800,
    height=600,
    showlegend=False,
)
fig.show()

In [None]:
# Score the in-sample forecasts with the shared metric list; the id / time /
# target columns of `data` are passed as the training frame.
evaluate(
    insample_forecasts,
    metrics=metrics,
    train_df=data.select(id_, time_, target_),
)

In [None]:
# New MLForecast object with the "30m" frequency string; cross-validation is
# run directly on the Fourier-augmented frame.
mf = MLForecast(models=LinearRegression(), freq="30m")

# Single window with a 48 * 7-step horizon; the "cutoff" column is dropped
# from the returned frame.
y_hat = mf.cross_validation(
    df=data_fourier,
    h=48 * 7,
    step_size=1,
    n_windows=1,
    fitted=True,
    static_features=[],
).drop("cutoff")

In [None]:
# Score the cross-validation forecasts with the shared metric list, using the
# Fourier-augmented frame as the training data.
evaluate(
    y_hat,
    metrics=metrics,
    train_df=data_fourier,
)

In [None]:
# Plot the data with the cross-validation forecasts.
# NOTE(review): max_insample_length presumably limits how much history is
# drawn (48 * 7 steps here) — confirm against the helper in plotting_utils.
plot_series(data, y_hat, max_insample_length=48 * 7)

In [None]:
# AutoARIMA restricted to non-seasonal models with no differencing (max_d=0),
# AR and MA orders capped at 3, and nmodels=20.
sf = StatsForecast(
    models=[AutoARIMA(max_d=0, seasonal=False, nmodels=20, max_p=3, max_q=3)],
    freq="30m",
)

# NOTE(review): data_fourier carries columns beyond id / time / target
# (temperature and the Fourier terms); presumably StatsForecast passes them to
# the model as exogenous regressors — confirm.
sf.fit(data_fourier)

In [None]:
# Print a summary of the fitted model stored at index [0, 0] of `sf.fitted_`
# (helper from summary_utils).
print_arima_fitted_summary(sf.fitted_[0, 0].model_)

In [None]:
# Residuals stored on the fitted ARIMA model, plotted against the timestamps
# of the Fourier-augmented frame with the project's diagnostic helper.
residuals = sf.fitted_[0, 0].model_["residuals"]
time = data_fourier.get_column("ds")
plot_residuals_diagnostic(residuals=residuals, time=time)

In [None]:
# TODO: dynamic regression using temperature and Fourier series as regressors
# TODO: multi-step forecasting strategies with regression: direct / recursive / DirRec