#### Setup

In [1]:
import polars as pl
from analysis.characterisation.clustering import cluster_timeseries_usage, usage_probabilities
from analysis.characterisation.notebooks.notebook_config import (
    dl,
    FEATURES,
    N_CLUSTERS,
    DATASET_START,
    DATASET_END,
    TIME_SERIES_MODE,
    WINDOW_MONTHS,
)

usage = cluster_timeseries_usage(
    loader=dl,
    k=N_CLUSTERS,
    features=FEATURES,
    start=DATASET_START,
    end=DATASET_END,
    mode=TIME_SERIES_MODE,
    window_months=WINDOW_MONTHS
)

usage_probs = usage_probabilities(usage).sort(["station", "probability"], descending=True)


Perform Clustering in Interval 2016-01-01 until 2016-01-01
Perform Clustering in Interval 2016-02-01 until 2016-02-01
Perform Clustering in Interval 2016-03-01 until 2016-03-01
Perform Clustering in Interval 2016-04-01 until 2016-04-01
Perform Clustering in Interval 2016-05-01 until 2016-05-01
Perform Clustering in Interval 2016-06-01 until 2016-06-01
Perform Clustering in Interval 2016-07-01 until 2016-07-01
Perform Clustering in Interval 2016-08-01 until 2016-08-01
Perform Clustering in Interval 2016-09-01 until 2016-09-01
Perform Clustering in Interval 2016-10-01 until 2016-10-01
Perform Clustering in Interval 2016-11-01 until 2016-11-01
Perform Clustering in Interval 2016-12-01 until 2016-12-01
Perform Clustering in Interval 2017-01-01 until 2017-01-01
Perform Clustering in Interval 2017-02-01 until 2017-02-01
Perform Clustering in Interval 2017-03-01 until 2017-03-01
Perform Clustering in Interval 2017-04-01 until 2017-04-01
Perform Clustering in Interval 2017-05-01 until 2017-05-

## 7. Impact of Weather on Station Usage Patterns

### Event based effects

We will now investiage the effects of predefined weather effects on different usage types. 

In [2]:
from analysis.characterisation.weather import weather_response_df
from analysis.characterisation.helpers import dominant_usage_per_station

def base_df(loader, usage_probs, sample_rate="1h"):
  df = weather_response_df(loader=loader, sample_rate=sample_rate)

  # add dominant usage type 
  df = df.join(dominant_usage_per_station(usage_probs), on="station", how="left")

  # add weekday / weekend classification
  df = df.with_columns([ 
    pl.when(pl.col("datetime").dt.weekday() <= 4)
      .then(pl.lit("weekday")) 
      .otherwise(pl.lit("weekend")) 
      .alias("day_type") 
  ])

  return df

**Temperature**

We define three different temperature ranges:
$$
\begin{aligned}
\text{L (Low):} \quad & \max(T_{\text{day}}) < 10^\circ \mathrm{C} \\
\text{M (Medium):} \quad & 10 \leq \max(T_{\text{day}}) < 20^\circ \mathrm{C} \\
\text{H (High):} \quad & \max(T_{\text{day}}) \geq 20^\circ \mathrm{C}
\end{aligned}
$$

implemented in `temp_ranges`.

In [3]:
def temp_ranges(col):
    return (
        pl.when(col < 10).then(pl.lit("L"))
        .when(col < 20).then(pl.lit("M"))
        .otherwise(pl.lit("H"))
    )

We will use low temperatures (*L*) as baseline and calculate the relative change to range *M* and *H* per station. 

$$
\text{rel\_diff}_X = \frac{\bar{C}_X - \bar{C}_L}{\bar{C}_L} \times 100 
\quad \text{for } X \in \{M, H\}
$$

Furthermore we will look seperately at weekdays and weekends.

In [4]:
from analysis.characterisation.event import event_effect_table

df = base_df(loader=dl, usage_probs=usage_probs).drop(["precip_sum", "wind_max"])
df = df.with_columns(
    temp_ranges(pl.col("temp_max")).alias("temp_range")
)

temp_table = event_effect_table(
    df,
    range_col="temp_range",
    group_cols=("station", "usage_type", "day_type"),
    agg_cols=("usage_type", "day_type"),
)

temp_table

usage_type,day_type,mean_count_L,mean_count_H,mean_count_M,rel_diff_L,rel_diff_H,rel_diff_M
str,str,f64,f64,f64,f64,f64,f64
"""mixed""","""weekday""",1446.958716,2281.18677,1784.576159,0.0,71.55,31.37
"""mixed""","""weekend""",948.0,1794.375635,1315.237668,0.0,116.09,48.65
"""recreational""","""weekday""",1034.31003,1678.446575,1315.615925,0.0,36.0,17.48
"""recreational""","""weekend""",800.801587,1460.416084,1108.096026,0.0,58.65,25.51
"""utilitarian""","""weekday""",3542.598566,5230.211073,4271.60423,0.0,42.55,18.29
"""utilitarian""","""weekend""",2524.451923,4221.570796,3244.917355,0.0,64.57,26.12


We can see mixed stations react the strongest to temperature change. 
Furthermore weekends are more sensitive to temperature change then weekdays.

**Precipitation**

For precipitation, daily exposure was quantified using the summed daily precipitation.

Again we defined three classes:

$$
\begin{aligned}
\text{L (Low):} \quad & 0~\text{mm} \\
\text{M (Medium):} \quad & 0 < P_{\text{day}} \leq 5~\text{mm} \\
\text{H (High):} \quad & P_{\text{day}} > 5~\text{mm}
\end{aligned}
$$

implemented in `precip_range`.

In [5]:
def precip_ranges(col):
  return (
    pl.when(col == 0)
    .then(pl.lit("L"))
    .when((col > 0) & (col <= 5))
    .then(pl.lit("M"))
    .otherwise(pl.lit("H"))
  )


As before we will calculate

$$
\text{rel\_diff}_X = \frac{\bar{C}_X - \bar{C}_L}{\bar{C}_L} \times 100 
\quad \text{for } X \in \{M, H\}
$$

In [6]:
df = base_df(loader=dl, usage_probs=usage_probs).drop(["temp_max", "wind_max"])
df = df.with_columns(
    precip_ranges(pl.col("precip_sum")).alias("precip_range")
)

precip_table = event_effect_table(
    df,
    range_col="precip_range",
    group_cols=("station", "usage_type", "day_type"),
    agg_cols=("usage_type", "day_type"),
)

precip_table

usage_type,day_type,mean_count_M,mean_count_H,mean_count_L,rel_diff_M,rel_diff_H,rel_diff_L
str,str,f64,f64,f64,f64,f64,f64
"""mixed""","""weekday""",1793.623907,1418.325397,2099.746753,-17.2,-37.09,0.0
"""mixed""","""weekend""",1316.484,955.565217,1586.577869,-19.41,-46.03,0.0
"""recreational""","""weekday""",1318.321429,1051.587302,1509.688596,-9.22,-22.38,0.0
"""recreational""","""weekend""",1129.962428,827.986577,1274.750725,-11.83,-28.94,0.0
"""utilitarian""","""weekday""",4252.88601,3451.756944,4810.272,-8.49,-23.46,0.0
"""utilitarian""","""weekend""",3281.972125,2404.801724,3822.520147,-13.27,-28.83,0.0


**Wind Speed**

Daily wind conditions were represented by the mean daily wind speed, as wind affects cycling continuously throughout the day.

$$
\begin{aligned}
\text{L (low):} \quad & \max(W_{\text{day}}) < 3~\text{m/s} \\
\text{M (medium):} \quad & 3 \leq \max(W_{\text{day}}) < 6~\text{m/s} \\
\text{H (high):} \quad & \max(W_{\text{day}}) \geq 6~\text{m/s}
\end{aligned}
$$

implemented in `wind_ranges`.

In [7]:
def wind_ranges(col):
  return (
    pl.when(col < 5).then(pl.lit("L"))
    .when(col < 8).then(pl.lit("M"))
    .otherwise(pl.lit("H"))
  )

In [8]:
df = base_df(loader=dl, usage_probs=usage_probs).drop(["precip_sum", "temp_max"])
df = df.with_columns(
    wind_ranges(pl.col("wind_max")).alias("wind_range")
)

wind_table = event_effect_table(
    df,
    range_col="wind_range",
    group_cols=("station", "usage_type", "day_type"),
    agg_cols=("usage_type", "day_type"),
)

wind_table

usage_type,day_type,mean_count_M,mean_count_H,mean_count_L,rel_diff_M,rel_diff_H,rel_diff_L
str,str,f64,f64,f64,f64,f64,f64
"""mixed""","""weekday""",1947.74359,1849.07327,1915.0,1.71,-3.44,0.0
"""mixed""","""weekend""",1626.307692,1545.160763,1227.5,18.36,11.53,0.0
"""recreational""","""weekday""",1166.914348,1114.369741,1199.5,-5.85,-5.68,0.0
"""recreational""","""weekend""",1154.622222,1135.453972,1172.0,19.66,10.0,0.0
"""utilitarian""","""weekday""",4343.648649,4353.658537,4602.0,-9.79,-7.72,0.0
"""utilitarian""","""weekend""",3121.807027,2831.576185,2038.75,62.58,53.28,0.0
