In [None]:
import polars as pl
import plotly.express as px
import seaborn as sns
from utilsforecast.plotting import plot_series
from statsforecast import StatsForecast
from mlforecast import MLForecast
from mlforecast.lag_transforms import RollingMean
from plotly.subplots import make_subplots
import pandas as pd

import plotly.graph_objects as go


In [None]:
# Load the merged smart-meter file; the columns read below are LCLid,
# start_timestamp, series_length and energy_consumption.
data = pl.read_parquet(
    "data/london_smart_meters/preprocessed/london_smart_meters_merged_block_0-7.parquet"
)

# Offset from the first to the last reading: (series_length - 1) steps of 30 minutes.
series_start = pl.col("start_timestamp")
series_span = pl.format("{}m", (pl.col("series_length") - 1) * 30)

# Half-hourly timestamps from start to end of each LCLid's series, aggregated
# into the "ds" column.
timestamp = data.group_by("LCLid").agg(
    pl.datetime_range(
        start=series_start,
        end=series_start.dt.offset_by(series_span),
        interval="30m",
    ).alias("ds")
)

# Attach the timestamps and switch to the unique_id / y column names.
data = timestamp.join(data, on="LCLid", how="inner")
data = data.rename({"LCLid": "unique_id", "energy_consumption": "y"})
data.head(5)

In [None]:
# Column names used throughout the notebook, plus a matching polars
# expression for each of them.
id_, time_, target_ = "unique_id", "ds", "y"
id_col, time_col, target_col = (pl.col(name) for name in (id_, time_, target_))

In [None]:
# Columns that are exploded together with the timestamp and the target,
# giving one row per reading.
per_timestamp_features = [
    "holidays",
    "visibility",
    "windBearing",
    "temperature",
    "dewPoint",
    "pressure",
    "apparentTemperature",
    "windSpeed",
    "precipType",
    "icon",
    "humidity",
    "summary",
]
# Columns kept as-is (not exploded).
household_features = ["Acorn", "Acorn_grouped"]

data = (
    data.filter(pl.col("file") == "block_7")
    .select([time_, id_, target_, *household_features, *per_timestamp_features])
    .explode([time_, target_, *per_timestamp_features])
)
data.head()

In [None]:
# Wide boolean frame: one row per timestamp, one column per series,
# True where the target is missing.
missing_mask = (
    data.select(time_, id_, target_)
    .to_pandas()
    .pivot(index=time_, columns=id_, values=target_)
    .isnull()
)
sns.heatmap(missing_mask, cmap="viridis", cbar=False)


In [None]:
# Keep only series with fewer than 100 missing target values, then
# forward-fill the remaining gaps within each series.
missing_per_series = target_col.is_null().sum().over(id_)
data = data.filter(missing_per_series < 100).with_columns(
    target_col.forward_fill().over(id_col)
)

In [None]:
# Draw six series to plot. Sorting the ids first and fixing the seed makes the
# draw repeatable: unique() gives no ordering guarantee and an unseeded
# sample() changes on every run.
SAMPLE_SEED = 42
selected_ids = (
    data.select(pl.col(id_).unique().sort().sample(6, seed=SAMPLE_SEED))
    .to_numpy()
    .flatten()
)

In [None]:
# Series id used by the single-series cells of this notebook (the filter on
# unique_id and the plot title).
# NOTE(review): presumably an id from block_7 that survives the missing-value
# filter — confirm it is present in `data`.
selected_id = "MAC000193"

In [None]:
# Interactive plot of the sampled series; max_insample_length limits how many
# observations are drawn per series.
plot_series(
    data,
    engine="plotly",
    ids=selected_ids,
    max_insample_length=1000,
)

In [None]:
# 48 half-hourly readings per day x 30 days.
ROLLING_WINDOW = 48 * 30

# Smooth every series with a rolling mean computed per series. Only rows where
# the smoothed target is null (e.g. the warm-up of each window) are dropped; a
# bare drop_nulls() would also discard rows with a null in any other column.
smoothed = data.with_columns(
    target_col.rolling_mean(ROLLING_WINDOW).over(id_)
).drop_nulls(subset=[target_])

plot_series(
    smoothed,
    plot_random=True,
    max_ids=6,
    engine="plotly",
)

In [None]:
def plot_with_exogneous(data, exogenous_name: str, series_id=None):
    """Plot the target and one exogenous column against time on twin y-axes.

    Args:
        data: Frame with "ds", "y" and ``exogenous_name`` columns; the columns
            are read with ``data[...]``.
        exogenous_name: Name of the column drawn on the secondary y-axis.
        series_id: Label used in the title. When omitted, the notebook-level
            ``selected_id`` is used.

    The date range shown in the title is taken from the "ds" values that are
    actually plotted, so it stays correct for any slice or aggregate passed in.
    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    if series_id is None:
        series_id = selected_id

    # Date range of the plotted rows, formatted as day.month.year.
    timestamps = data["ds"]
    first, last = timestamps.min(), timestamps.max()
    if first is None or last is None:
        # No timestamps to plot: leave the range out of the title.
        period = ""
    else:
        period = (
            f" ({first.day}.{first.month}.{first.year}"
            f" - {last.day}.{last.month}.{last.year})"
        )

    # Create a subplot with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Add y column to the primary y-axis
    fig.add_trace(
        go.Scatter(x=data["ds"], y=data["y"], name="y"),
        secondary_y=False,
    )

    # Add the exogenous column to the secondary y-axis
    fig.add_trace(
        go.Scatter(
            x=data["ds"],
            y=data[exogenous_name],
            name=exogenous_name,
        ),
        secondary_y=True,
    )

    # Update layout
    fig.update_layout(
        title_text=f"Energy consumption and {exogenous_name} for {series_id}{period}",
        xaxis_title="Timestamp",
    )

    # Set y-axis titles
    fig.update_yaxes(title_text="Energy Consumption", secondary_y=False)
    fig.update_yaxes(title_text=exogenous_name, secondary_y=True)

    fig.show()

In [None]:
# Filter the data for the selected_id
filtered_data = (
    data.filter(pl.col("unique_id") == selected_id)
    .select(["ds", "y", "temperature"])
    .with_columns(
        time_col.dt.month().alias("month"),
        time_col.dt.year().alias("year"),
        time_col.dt.day().alias("day"),
        time_col.dt.hour().alias("hour"),
        time_col.dt.weekday().alias("weekday"),
    )
)


plot_with_exogneous(filtered_data, "temperature")

In [None]:
# Filter the data for the specified date range
start_date = pd.Timestamp("2013-01-12")
end_date = pd.Timestamp("2013-01-15")
filtered_data_range = filtered_data.filter(
    time_col.is_between(pl.lit(start_date), pl.lit(end_date))
)

plot_with_exogneous(filtered_data_range, exogenous_name="temperature")

In [None]:
# Mean target and temperature per (year, month); the first timestamp of each
# group is kept as the x-axis value.
monthly_means = filtered_data.group_by(["year", "month"]).agg(
    pl.col("y", "temperature").mean(),
    pl.col("ds").first(),
)
year_month = monthly_means.sort(["year", "month"])

plot_with_exogneous(year_month, exogenous_name="temperature")

In [None]:
# Mean target per (year, month), drawn as one line per year.
month_agg = (
    filtered_data.group_by(["year", "month"])
    .agg(pl.mean(["y"]), pl.first("ds"))
    .sort("year", "month")
)

px.line(
    month_agg.to_pandas(),
    x="month",
    y="y",
    color="year",
    title="Monthly Aggregated Mean of y",
    # Keys must be column names of the plotted frame: the aggregated target
    # keeps the name "y" (there is no "y_mean" column).
    labels={"month": "Month", "y": "Mean y", "year": "Year"},
).show()

In [None]:
# Mean target per (day of month, hour).
# NOTE(review): "day" is the day of the month, so each group mixes several
# calendar dates; the "weekday" kept here is only the weekday of the first row
# in the group, not a property of the whole group.
hour_agg = (
    filtered_data.group_by(["day", "hour"])
    .agg(pl.mean(["y"]), pl.first("weekday"))
    .sort(["day", "hour"])
)

px.line(
    hour_agg,
    x="hour",
    y="y",
    color="day",
    title="Hourly Aggregated Mean of y",
    # Keys must be column names of the plotted frame ("ds" and "y_mean" are
    # not columns of hour_agg, so they labelled nothing).
    labels={"hour": "Hour", "y": "Mean y", "day": "Day of month"},
).show()

In [None]:
# Spread of the aggregated hourly means for every hour of the day.
hourly_box = px.box(
    hour_agg,
    x="hour",
    y="y",
    labels={"hour": "Hour", "y": "y"},
    title="Hourly Aggregated Distribution of y",
)
hourly_box.show()

In [None]:
# hour_agg is grouped by day of month, so its "weekday" column is just the
# weekday of the first row in each group and cannot be used for facets.
# Aggregate per calendar date and hour instead, where the weekday is well
# defined; rows are sorted so the facets appear in weekday order.
weekday_hour_agg = (
    filtered_data.group_by(["year", "month", "day", "weekday", "hour"])
    .agg(pl.mean("y"))
    .sort(["weekday", "hour"])
)

fig = px.box(
    weekday_hour_agg,
    x="hour",
    y="y",
    title="Hourly Aggregated Distribution of y",
    facet_col="weekday",
    facet_col_wrap=2,
    labels={"hour": "Hour", "y": "y", "weekday": "Weekday"},
)
fig.update_layout(height=800, width=1200)
fig.show()

In [None]:
# Mean target per weekday (rows) and hour (columns).
weekday_hour_wide = filtered_data.pivot(
    on="hour",
    index="weekday",
    values="y",
    aggregate_function="mean",
).sort("weekday")

# pivot() names the new columns after the "hour" values and emits them in the
# order they are first seen in the data, so order them numerically to get a
# 0..23 x-axis regardless of the hour the series starts at.
hour_columns = sorted(
    (name for name in weekday_hour_wide.columns if name != "weekday"), key=int
)
weekday_hour_hm = (
    weekday_hour_wide.select("weekday", *hour_columns)
    .to_pandas()
    .set_index("weekday")
)
fig = px.imshow(
    weekday_hour_hm, height=600, title="Energy Consumption: Hours vs Week Day"
)
fig.show()

In [None]:
# Mean target per year (rows) and month (columns).
year_month_wide = filtered_data.pivot(
    on="month",
    index="year",
    values="y",
    aggregate_function="mean",
).sort("year")

# pivot() emits the month columns in the order they are first seen in the
# data, so order them numerically to get a calendar-ordered x-axis even when
# the series does not start in January.
month_columns = sorted(
    (name for name in year_month_wide.columns if name != "year"), key=int
)
year_month_hm = (
    year_month_wide.select("year", *month_columns)
    .cast({"year": pl.String})
    .to_pandas()
    .set_index("year")
)
fig = px.imshow(year_month_hm, height=600, title="Energy Consumption: Month vs Year")
fig.show()

In [None]:
# Mean temperature per year (rows) and month (columns).
year_month_temperature_wide = filtered_data.pivot(
    on="month",
    index="year",
    values="temperature",
    aggregate_function="mean",
).sort("year")

# pivot() emits the month columns in the order they are first seen in the
# data, so order them numerically to get a calendar-ordered x-axis even when
# the series does not start in January.
temperature_month_columns = sorted(
    (name for name in year_month_temperature_wide.columns if name != "year"),
    key=int,
)
year_month_hm = (
    year_month_temperature_wide.select("year", *temperature_month_columns)
    .cast({"year": pl.String})
    .to_pandas()
    .set_index("year")
)
fig = px.imshow(year_month_hm, height=600, title="Temperature: Month vs Year")
fig.show()