# Note: This notebook is largely ignorable. Have since deviated from this approach and not bothered to tidy the notebook significantly.

# Daily + weekly seasonality model

Modelling both daily and weekly seasonality with a Gaussian process on $\log(\lambda)$.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import polars as pl
import numpy as np
import scipy

import pymc as pm
import arviz as az

import warnings

# Apply seaborn's theme and set the default figure size to 17x11 inches.
sns.set(rc={'figure.figsize':(17,11)})

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# from the modelling libraries are hidden as well.
warnings.filterwarnings("ignore")

## Load data

In [None]:
# Load the raw counter data and shift `weekday` down by one
# (presumably from 1-based to 0-based; confirm against the parquet schema).
data = (
    pl.read_parquet("../data/counter_data.parquet")
    .with_columns(pl.col("weekday") - 1)
)

In [None]:
# Restrict to one site and one year, and treat missing outgoing counts as zero.
in_2022 = pl.col("year") == 2022
at_thorndon_quay = pl.col("site_name") == "Thorndon Quay"

filtered_data = (
    data
    .filter(in_2022 & at_thorndon_quay)
    .with_columns(pl.col("count_outgoing").fill_null(0))
)

In [None]:
# Hourly outgoing counts for days 8-14 of each month, one panel per month.
# `day_hour` places each observation within the week: weekday * 24 + hour.
second_week = (
    filtered_data
    .filter((pl.col("day") > 7) & (pl.col("day") <= 14))
    .with_columns((pl.col("weekday") * 24 + pl.col("hour")).alias("day_hour"))
)

sns.relplot(
    data=second_week,
    x="day_hour",
    y="count_outgoing",
    kind="line",
    col="month",
    col_wrap=3,
)

In [None]:
# Hourly outgoing counts for the first seven days of October, positioned by
# `day_hour` = weekday * 24 + hour.
october_first_week = (
    filtered_data
    .filter((pl.col("month") == 10) & (pl.col("day") <= 7))
    .with_columns((pl.col("weekday") * 24 + pl.col("hour")).alias("day_hour"))
)

sns.relplot(
    data=october_first_week,
    x="day_hour",
    y="count_outgoing",
    kind="line",
)

# Show only day_hour 0..120 (weekday indices 0-4) with one tick per day boundary.
day_labels = list(range(6))
plt.xlabel("Days since start")
plt.ylabel("Outgoing count")
plt.ylim(-1, 200)
plt.xlim(0, 24 * 5)
plt.xticks([24 * day for day in day_labels], day_labels)
plt.title("Outgoing count per-hour for first five weekdays in October, Thorndon Quay, 2022", pad=20);

## Model

The model describes the bike count $c_i$ as a GammaPoisson distribution conditional on the time of day, for data sampled where:

- location = "Thorndon Quay"
- year = 2022
- day = {monday, tuesday, wednesday, thursday, friday}
- direction = "Outgoing"

$hour[i]$ is the hour-of-day index for sample $i$, with "12am-1am" = 0.

\begin{align}
c_i &\sim GammaPoisson(\phi, \mu_i) \\
\log{(\mu_i)} &= \lambda_{hour[i]} \\
\lambda_{j = 0..23} &\sim MVNormal([0,0,\dots,0]^T, K) \\
K &= Periodic(24, \tau) \\
\phi &\sim Exponential(1) \\
\tau &\sim Exponential(0.1)
\end{align}

In [None]:
# Fixed seed so that sampling is reproducible after a kernel restart.
RANDOM_SEED = 2022

# One latent value per hour of the week (7 days x 24 hours).
N_WEEK_HOURS = 24 * 7

with pm.Model() as m_3:
    hour = pm.MutableData("hour", np.array(filtered_data["hour"]), dims="obs_id")
    weekday = pm.MutableData("weekday", np.array(filtered_data["weekday"]), dims="obs_id")

    # Length scales, in order: daily periodic, daily ExpQuad, weekly periodic.
    ls = pm.Exponential("ls", scale=[2, 1, 24])

    # hourly periodic effect
    cov_daily = pm.gp.cov.Periodic(1, period=24, ls=ls[0])

    # allowing variability to 24-hour period
    cov_daily *= pm.gp.cov.ExpQuad(1, ls=ls[1])

    # weekly periodic effect
    cov_weekly = pm.gp.cov.Periodic(1, period=24*7, ls=ls[2])

    # altogether (white noise adds a small diagonal term for numerical stability)
    cov = cov_daily * cov_weekly + pm.gp.cov.WhiteNoise(1e-4)

    # Keep K symbolic. Calling `.eval()` here would draw `ls` once from its prior
    # and freeze K as a constant array, disconnecting `ls` from the likelihood so
    # that its "posterior" is simply the prior.
    week_hours = np.arange(N_WEEK_HOURS)[:, None]
    K = cov(week_hours)

    # sample hourly * daily alphas
    mu = pm.Normal("mu", 0, 1, shape=N_WEEK_HOURS)
    alpha = pm.MvNormal("alpha", mu=mu, cov=K, shape=N_WEEK_HOURS)

    # exponentiate to get lambda; each observation picks its hour-of-week entry
    lmda = pm.math.exp(alpha[weekday * 24 + hour])

    # sample from Gamma Poisson
    phi = pm.Exponential("phi", scale=3)
    c = pm.NegativeBinomial(
        "c",
        alpha=phi,
        mu=lmda,
        observed=np.array(filtered_data["count_outgoing"]),
        dims="obs_id",
    )

    trace = pm.sample(10000, tune=10000, random_seed=RANDOM_SEED)
    trace.extend(pm.sample_posterior_predictive(trace, random_seed=RANDOM_SEED))
    

## Trace results

In [None]:
# Forest plot of the three kernel length scales in `ls`.
az.plot_forest(data=trace, var_names=["ls"])

In [None]:
# Forest plot of the negative-binomial `alpha` parameter, `phi`.
az.plot_forest(data=trace, var_names=["phi"])

In [None]:
# Trace plots with one panel per element (`compact=False`).
# Only variables defined in model `m_3` are requested: asking ArviZ for a name
# that is not in the trace (the model has no `beta`) raises a KeyError.
az.plot_trace(trace, compact=False, var_names=["ls", "phi", "alpha"]);
plt.tight_layout()

## Posterior

In [None]:
# Average `alpha` over all posterior samples, then lay the 168 values out as a
# 7 x 24 grid (rows follow the weekday * 24 + hour indexing used in the model).
alpha_samples = np.array(az.extract(trace.posterior)["alpha"])
alpha_mean_grid = alpha_samples.mean(axis=1).reshape((7, 24))

m = plt.imshow(alpha_mean_grid, cmap="inferno", interpolation=None)
plt.colorbar(m)

## Posterior predictions

In [None]:
# Stack chains and draws, then pull the predicted counts `c` out as a plain array.
stacked_predictive = az.extract(trace.posterior_predictive)
post_preds = np.array(stacked_predictive["c"])
post_preds.shape

In [None]:
# Row offset at which the seven consecutive 24-row blocks begin.
# NOTE(review): the reason for 23 + 72 + 72 (= 167) is not evident from this
# notebook; presumably it aligns the slice with a day boundary in the data. Confirm.
first_row = 23 + 72 + 72
rows_from_offset = post_preds[first_row:]

# One 24-row block per day; the last axis holds the predictive samples.
day_post_preds = np.array(
    [rows_from_offset[24 * day : 24 * (day + 1)] for day in range(7)]
)
post_means = day_post_preds.mean(axis=2)
post_stds = day_post_preds.std(axis=2)

In [None]:
hours = np.arange(24)

# One panel per day. Block i of `post_means` is drawn on panel (i + 5) % 7, so
# the first block lands on the sixth panel.
fig, axs = plt.subplots(nrows=7, sharex=True)
for i, (mean, std) in enumerate(zip(post_means, post_stds)):
    ax = axs[(i+5) % 7]
    ax.plot(hours, mean, label="Posterior mean", color="darkorange", linestyle="--")
    # Bands are clipped at zero because counts cannot be negative.
    ax.fill_between(hours, (mean - std).clip(min=0), (mean + std), alpha=0.3, label="1 Std", color="darkorange")
    ax.fill_between(hours, (mean - 2*std).clip(min=0), (mean + 2*std), alpha=0.3, label="2 Std", color="darkorange")

plt.xticks(hours)
plt.xlim(0,23)

fig.supxlabel("Hour of day")
fig.supylabel("Bikes counted (outgoing)")

# Every panel carries the same three labels, so build the legend from a single
# panel; a bare `fig.legend()` would repeat each entry once per panel.
handles, labels = axs[0].get_legend_handles_labels()
fig.legend(handles, labels, loc="upper right")

plt.suptitle("Per-hour, per-day posterior predictive distribution of outgoing count to two std. (Thorndon Quay, 2022)")
plt.show()

## Correlation between 8am and 9am in observations vs posterior predictions

Shows strong correlation in observations but none/weak in predictions

In [None]:
# Attach a single posterior-predictive draw (chain 0, draw 0) to the observations,
# keep hours 8 and 9 on weekday indices 0-4, and pivot to one row per date with
# separate columns for each hour's prediction and observed count.
single_draw = trace.posterior_predictive["c"][0,0,:].to_numpy()
on_weekday = pl.col("weekday") < 5
at_8_or_9 = (pl.col("hour") >= 8) & (pl.col("hour") <= 9)

df_preds = (
    filtered_data
    .with_columns(pl.Series(name="predictions", values=single_draw))
    .filter(on_weekday & at_8_or_9)
    .pivot(index=["year", "month", "day"], columns="hour", values=["predictions", "count_outgoing"])
)

In [None]:
# Display the pivoted frame to check the generated per-hour column names.
df_preds

In [None]:
# Scatter hour 8 against hour 9: first for the predictions, then for the observations.
hour_column_pairs = [
    ("predictions_hour_8", "predictions_hour_9"),
    ("count_outgoing_hour_8", "count_outgoing_hour_9"),
]
for x_column, y_column in hour_column_pairs:
    sns.relplot(data=df_preds, x=x_column, y=y_column)

# pyplot acts on the current figure, so these limits apply only to the last plot
# created above (the observed counts).
plt.xlim(0, 250)
plt.ylim(0, 450)

In [None]:
# Compare the distribution of daily morning totals (hours 7-9, weekday indices
# 0-4) between the observations and a single posterior-predictive draw.
#
# The long-format frame is rebuilt from `filtered_data` here because `df_preds`
# has been pivoted to one row per date and no longer has the `weekday`, `hour`,
# `record_time`, `count_outgoing` or `predictions` columns used below.
single_draw = trace.posterior_predictive["c"][0,0,:].to_numpy()

morning_totals = (
    filtered_data
    .with_columns(pl.Series(name="predictions", values=single_draw))
    .filter(
        (pl.col("weekday") < 5) &
        (pl.col("hour") > 6) &
        (pl.col("hour") < 10)
    )
    # NOTE(review): assumes rows are already ordered by `record_time`; confirm.
    .set_sorted(pl.col("record_time"))
    .group_by_dynamic(pl.col("record_time"), every="1d")
    .agg(
        pl.col("count_outgoing").sum(),
        pl.col("predictions").sum(),
    )
    .melt(
        id_vars="record_time",
        value_vars=["count_outgoing", "predictions"],
    )
)

sns.displot(
    morning_totals,
    x="value",
    hue="variable",
)

In [None]:
sns.relplot(df_preds, x=")