In [1]:
%load_ext autoreload
%autoreload 2

In [23]:
import polars as pl
import plotly.express as px
import seaborn as sns
from statsforecast import StatsForecast
from mlforecast import MLForecast
from mlforecast.lag_transforms import RollingMean
from statsforecast.arima import ndiffs, nsdiffs
from utilsforecast.losses import *
from utilsforecast.evaluation import evaluate
from plotly.subplots import make_subplots
from plotting_utils import (
    plot_acf,
    plot_pacf,
    plot_acf_pacf,
    plotly_series as plot_series,
    plot_series_acf_pacf,
    plot_residuals_diagnostic,
)
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import kpss
from summary_utils import print_arima_fitted_summary, arima_fitted_summary_dataframe

import pandas as pd

import plotly.graph_objects as go


In [3]:
data = pl.read_parquet(
    "data/london_smart_meters/preprocessed/london_smart_meters_merged_block_0-7.parquet"
)
timestamp = data.group_by("LCLid").agg(
    pl.datetime_range(
        start=pl.col("start_timestamp"),
        end=pl.col("start_timestamp").dt.offset_by(
            pl.format("{}m", pl.col("series_length").sub(1).mul(30))
        ),
        interval="30m",
    ).alias("ds"),
)
data = timestamp.join(data, on="LCLid", how="inner").rename(
    {"LCLid": "unique_id", "energy_consumption": "y"}
)
data.head(5)

unique_id,ds,start_timestamp,frequency,y,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,visibility,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary,__index_level_0__
str,list[datetime[ns]],datetime[ns],str,list[f64],i64,str,str,str,str,list[str],list[f64],list[i64],list[f64],list[f64],list[f64],list[f64],list[f64],list[str],list[str],list[f64],list[str],i64
"""MAC000002""","[2012-10-13 00:00:00, 2012-10-13 00:30:00, … 2014-02-27 23:30:00]",2012-10-13 00:00:00,"""30min""","[0.263, 0.269, … 1.2180001]",24144,"""Std""","""ACORN-A""","""Affluent""","""block_0""","[""NO_HOLIDAY"", ""NO_HOLIDAY"", … ""NO_HOLIDAY""]","[13.08, 13.08, … 14.03]","[186, 186, … 200]","[8.78, 8.78, … 3.93]","[6.28, 6.28, … 1.61]","[1007.7, 1007.7, … 1004.62]","[7.55, 7.55, … 1.42]","[2.28, 2.28, … 2.75]","[""rain"", ""rain"", … ""rain""]","[""clear-night"", ""clear-night"", … ""clear-night""]","[0.84, 0.84, … 0.85]","[""Clear"", ""Clear"", … ""Clear""]",0
"""MAC000246""","[2012-01-01 00:00:00, 2012-01-01 00:30:00, … 2014-02-27 23:30:00]",2012-01-01 00:00:00,"""30min""","[0.509, 0.317, … 0.223]",37872,"""Std""","""ACORN-A""","""Affluent""","""block_0""","[""NO_HOLIDAY"", ""NO_HOLIDAY"", … ""NO_HOLIDAY""]","[12.99, 12.99, … 14.03]","[229, 229, … 200]","[12.12, 12.12, … 3.93]","[10.97, 10.97, … 1.61]","[1008.1, 1008.1, … 1004.62]","[12.12, 12.12, … 1.42]","[5.9, 5.9, … 2.75]","[""rain"", ""rain"", … ""rain""]","[""partly-cloudy-night"", ""partly-cloudy-night"", … ""clear-night""]","[0.93, 0.93, … 0.85]","[""Mostly Cloudy"", ""Mostly Cloudy"", … ""Clear""]",1
"""MAC000450""","[2012-03-23 00:00:00, 2012-03-23 00:30:00, … 2014-02-27 23:30:00]",2012-03-23 00:00:00,"""30min""","[1.337, 1.426, … null]",33936,"""Std""","""ACORN-A""","""Affluent""","""block_0""","[""NO_HOLIDAY"", ""NO_HOLIDAY"", … ""NO_HOLIDAY""]","[3.19, 3.19, … 14.03]","[78, 78, … 200]","[8.76, 8.76, … 3.93]","[7.25, 7.25, … 1.61]","[1027.41, 1027.41, … 1004.62]","[7.59, 7.59, … 1.42]","[2.18, 2.18, … 2.75]","[""rain"", ""rain"", … ""rain""]","[""fog"", ""fog"", … ""clear-night""]","[0.9, 0.9, … 0.85]","[""Foggy"", ""Foggy"", … ""Clear""]",2
"""MAC001074""","[2012-05-09 00:00:00, 2012-05-09 00:30:00, … 2014-02-27 23:30:00]",2012-05-09 00:00:00,"""30min""","[0.18, 0.086, … null]",31680,"""ToU""","""ACORN-""","""ACORN-""","""block_0""","[""NO_HOLIDAY"", ""NO_HOLIDAY"", … ""NO_HOLIDAY""]","[10.51, 10.51, … 14.03]","[215, 215, … 200]","[11.46, 11.46, … 3.93]","[10.23, 10.23, … 1.61]","[1007.39, 1007.39, … 1004.62]","[11.46, 11.46, … 1.42]","[2.35, 2.35, … 2.75]","[""rain"", ""rain"", … ""rain""]","[""partly-cloudy-night"", ""partly-cloudy-night"", … ""clear-night""]","[0.92, 0.92, … 0.85]","[""Partly Cloudy"", ""Partly Cloudy"", … ""Clear""]",3
"""MAC003223""","[2012-09-18 00:00:00, 2012-09-18 00:30:00, … 2014-02-27 23:30:00]",2012-09-18 00:00:00,"""30min""","[0.076, 0.079, … 0.38]",25344,"""Std""","""ACORN-A""","""Affluent""","""block_0""","[""NO_HOLIDAY"", ""NO_HOLIDAY"", … ""NO_HOLIDAY""]","[13.44, 13.44, … 14.03]","[236, 236, … 200]","[14.06, 14.06, … 3.93]","[10.82, 10.82, … 1.61]","[1011.09, 1011.09, … 1004.62]","[14.06, 14.06, … 1.42]","[3.86, 3.86, … 2.75]","[""rain"", ""rain"", … ""rain""]","[""clear-night"", ""clear-night"", … ""clear-night""]","[0.81, 0.81, … 0.85]","[""Clear"", ""Clear"", … ""Clear""]",4


In [4]:
id_ = "unique_id"
time_ = "ds"
target_ = "y"
id_col = pl.col(id_)
time_col = pl.col(time_)
target_col = pl.col(target_)

In [6]:
data = (
    data.filter(pl.col("file").eq("block_7"))
    .select([time_, id_, target_])
    .explode([time_, target_])
)
data.head()

ds,unique_id,y
datetime[ns],str,f64
2012-01-01 00:00:00,"""MAC000050""",0.175
2012-01-01 00:30:00,"""MAC000050""",0.212
2012-01-01 01:00:00,"""MAC000050""",0.313
2012-01-01 01:30:00,"""MAC000050""",0.302
2012-01-01 02:00:00,"""MAC000050""",0.257


In [87]:
selected_id = "MAC000193"
data = (
    data.filter(id_col.eq(selected_id)).with_columns(
        target_col.forward_fill().backward_fill()
    )
    # .filter(
    #     time_col.is_between(
    #         pl.date(year=2012, month=1, day=1), pl.date(year=2012, month=12, day=31)
    #     )
    # )
)
data.head()

ds,unique_id,y
datetime[ns],str,f64
2012-01-01 00:00:00,"""MAC000193""",0.368
2012-01-01 00:30:00,"""MAC000193""",0.386
2012-01-01 01:00:00,"""MAC000193""",0.17
2012-01-01 01:30:00,"""MAC000193""",0.021
2012-01-01 02:00:00,"""MAC000193""",0.038


In [88]:
fig = plot_series(data, date_range=["2012-11-4", "2012-12-4"])
fig.show()

In [89]:
original = data.get_column(target_)
day_diff = original.diff(48).drop_nulls()
day_week_diff = day_diff.diff(336).drop_nulls()

In [90]:
fig = plot_acf(original)
ljung_box = acorr_ljungbox(original, lags=[10])
print(ljung_box)
fig.show()

         lb_stat  lb_pvalue
10  20247.689652        0.0


In [91]:
fig = plot_acf(day_week_diff)
ljung_box = acorr_ljungbox(day_week_diff, lags=[10])
print(ljung_box)
fig.show()

        lb_stat  lb_pvalue
10  4870.663822        0.0


In [92]:
fig = plot_acf(day_diff)
ljung_box = acorr_ljungbox(day_diff, lags=[10])
print(ljung_box)
fig.show()

        lb_stat  lb_pvalue
10  5809.752822        0.0


In [93]:
kpss_stat, kpss_pvalue, _, _ = kpss(original, nlags=5)

print(f"kpss_stat: {kpss_stat:.3f}, kpss_pvalue: {kpss_pvalue:.2f}")

kpss_stat: 8.025, kpss_pvalue: 0.01



The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.




In [94]:
kpss_stat, kpss_pvalue, _, _ = kpss(day_diff, nlags=5)

print(f"kpss_stat: {kpss_stat:.3f}, kpss_pvalue: {kpss_pvalue:.2f}")

kpss_stat: 0.010, kpss_pvalue: 0.10



The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.




In [95]:
kpss_stat, kpss_pvalue, _, _ = kpss(day_week_diff, nlags=5)

print(f"kpss_stat: {kpss_stat:.3f}, kpss_pvalue: {kpss_pvalue:.2f}")

kpss_stat: 0.004, kpss_pvalue: 0.10



The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.




In [96]:
plot_series_acf_pacf(
    data=original,
    time=data.get_column(time_),
)

In [97]:
plot_series_acf_pacf(
    data=day_diff,
)

In [98]:
plot_series_acf_pacf(
    data=day_week_diff,
)

In [99]:
plot_series(
    df=data.with_columns(target_col.diff(48).diff(336)).drop_nulls(),
    date_range=["2012-11-4", "2012-12-4"],
)

In [106]:
original.is_null().any()

False

In [107]:
ndiffs(original.to_numpy())

1

In [109]:
nsdiffs(original.to_numpy(), period=48)

0

In [110]:
nsdiffs(original.to_numpy(), period=48 * 7)

0

In [111]:
from statsforecast.models import AutoARIMA, ARIMA

fcst = StatsForecast(
    models=[
        ARIMA(order=(2, 0, 1), seasonal_order=(1, 1, 1), season_length=48),
    ],
    freq="30m",
)

In [112]:
y_hat = fcst.cross_validation(
    df=data.select([id_, time_, target_]).to_pandas(),
    h=48 * 7,
    step_size=1,
    n_windows=1,
    fitted=True,
).drop(columns=["cutoff"])


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul



In [119]:
plot_series(data, pl.from_pandas(y_hat), max_insample_length=48 * 7)

In [114]:
fitted_values = fcst.cross_validation_fitted_values()
insample_forecasts = fitted_values["ARIMA"]
residuals = fitted_values["y"] - insample_forecasts

In [115]:
plot_residuals_diagnostic(
    residuals=residuals,
    time=fitted_values["ds"],
)

In [116]:
ljung_box = acorr_ljungbox(residuals, lags=[10], model_df=5)

ljung_box

Unnamed: 0,lb_stat,lb_pvalue
10,47.537451,4.41423e-09


In [117]:
from functools import partial

metrics = [
    mae,
    mse,
    rmse,
    mape,
    smape,
    partial(mase, seasonality=48 * 7),
]
evaluate(
    pl.from_pandas(y_hat),
    metrics=metrics,
    train_df=data.select([id_, time_, target_]),
)

unique_id,metric,ARIMA
str,str,f64
"""MAC000193""","""mae""",0.209514
"""MAC000193""","""mse""",0.100228
"""MAC000193""","""rmse""",0.316588
"""MAC000193""","""mape""",1.453822
"""MAC000193""","""smape""",0.289458
"""MAC000193""","""mase""",1.186443
