#### Setup

In [1]:
import polars as pl
from analysis.visualization.characterisation.clustering import cluster_timeseries_usage, usage_probabilities
from analysis.visualization.characterisation.notebooks.notebook_config import (
    dl,
    FEATURES,
    N_CLUSTERS,
    DATASET_START,
    DATASET_END,
    TIME_SERIES_MODE,
    WINDOW_MONTHS,
)

usage = cluster_timeseries_usage(
    loader=dl,
    k=N_CLUSTERS,
    features=FEATURES,
    start=DATASET_START,
    end=DATASET_END,
    mode=TIME_SERIES_MODE,
    window_months=WINDOW_MONTHS
)

usage_probs = usage_probabilities(usage).sort(["station", "probability"], descending=True)


Perform Clustering in Interval 2016-01-01 until 2016-01-01
Perform Clustering in Interval 2016-02-01 until 2016-02-01
Perform Clustering in Interval 2016-03-01 until 2016-03-01
Perform Clustering in Interval 2016-04-01 until 2016-04-01
Perform Clustering in Interval 2016-05-01 until 2016-05-01
Perform Clustering in Interval 2016-06-01 until 2016-06-01
Perform Clustering in Interval 2016-07-01 until 2016-07-01
Perform Clustering in Interval 2016-08-01 until 2016-08-01
Perform Clustering in Interval 2016-09-01 until 2016-09-01
Perform Clustering in Interval 2016-10-01 until 2016-10-01
Perform Clustering in Interval 2016-11-01 until 2016-11-01
Perform Clustering in Interval 2016-12-01 until 2016-12-01
Perform Clustering in Interval 2017-01-01 until 2017-01-01
Perform Clustering in Interval 2017-02-01 until 2017-02-01
Perform Clustering in Interval 2017-03-01 until 2017-03-01
Perform Clustering in Interval 2017-04-01 until 2017-04-01
Perform Clustering in Interval 2017-05-01 until 2017-05-

## 8. Impact of Weather on Station Usage Patterns

### Event based effects

We will now investiage the effects of predefined weather effects on different usage types. 

In [2]:
from analysis.visualization.characterisation.weather import weather_response_df
from analysis.visualization.characterisation.helpers import dominant_usage_per_station

def base_df(loader, usage_probs, sample_rate="1h"):
  df = weather_response_df(loader=loader, sample_rate=sample_rate)

  # add dominant usage type 
  df = df.join(dominant_usage_per_station(usage_probs), on="station", how="left")

  # add weekday / weekend classification
  df = df.with_columns([ 
    pl.when(pl.col("datetime").dt.weekday() <= 4)
      .then(pl.lit("weekday")) 
      .otherwise(pl.lit("weekend")) 
      .alias("day_type") 
  ])

  return df

**Temperature**

We define three different temperature ranges:
$$
\begin{aligned}
\text{L (Low):} \quad & \max(T_{\text{day}}) < 10^\circ \mathrm{C} \\
\text{M (Medium):} \quad & 10 \leq \max(T_{\text{day}}) < 20^\circ \mathrm{C} \\
\text{H (High):} \quad & \max(T_{\text{day}}) \geq 20^\circ \mathrm{C}
\end{aligned}
$$

implemented in `temp_ranges`.

In [3]:
def temp_ranges(col):
    return (
        pl.when(col < 10).then(pl.lit("L"))
        .when(col < 20).then(pl.lit("M"))
        .otherwise(pl.lit("H"))
    )

We will use low temperatures (*L*) as baseline and calculate the relative change to range *M* and *H* per station. 

$$
\text{rel\_diff}_X = \frac{\bar{C}_X - \bar{C}_L}{\bar{C}_L} \times 100 
\quad \text{for } X \in \{M, H\}
$$

Furthermore we will look seperately at weekdays and weekends.

In [4]:
from analysis.visualization.characterisation.weather import weather_effect_table

df = base_df(loader=dl, usage_probs=usage_probs).drop(["precip_sum", "wind_max"])
df = df.with_columns(
    temp_ranges(pl.col("temp_max")).alias("temp_range")
)

final_temp_table = weather_effect_table(
    df,
    range_col="temp_range"
)

final_temp_table

usage_type,day_type,mean_count_L,mean_count_M,mean_count_H,rel_diff_L,rel_diff_M,rel_diff_H
str,str,f64,f64,f64,f64,f64,f64
"""mixed""","""weekday""",60.289946,74.451642,95.386803,0.0,31.38,71.55
"""mixed""","""weekend""",39.5,55.163816,74.879855,0.0,48.86,116.1
"""recreational""","""weekday""",43.155817,54.836363,70.022032,0.0,17.7,36.72
"""recreational""","""weekend""",33.369048,46.192095,60.857264,0.0,26.12,58.96
"""utilitarian""","""weekday""",147.608274,177.998057,218.015859,0.0,18.3,42.66
"""utilitarian""","""weekend""",105.273228,135.264246,175.944894,0.0,26.16,64.58


We can see mixed stations react the strongest to temperature change. 
Furthermore weekends are more sensitive to temperature change then weekdays.

**Precipitation**

For precipitation, daily exposure was quantified using the summed daily precipitation.

Again we defined three classes:

$$
\begin{aligned}
\text{L (Low):} \quad & 0~\text{mm} \\
\text{M (Medium):} \quad & 0 < P_{\text{day}} \leq 5~\text{mm} \\
\text{H (High):} \quad & P_{\text{day}} > 5~\text{mm}
\end{aligned}
$$

implemented in `precip_range`.

In [5]:
def precip_ranges(col):
  return (
    pl.when(col == 0)
    .then(pl.lit("L"))
    .when((col > 0) & (col <= 5))
    .then(pl.lit("M"))
    .otherwise(pl.lit("H"))
  )


As before we will calculate

$$
\text{rel\_diff}_X = \frac{\bar{C}_X - \bar{C}_L}{\bar{C}_L} \times 100 
\quad \text{for } X \in \{M, H\}
$$

In [6]:
df = base_df(loader=dl, usage_probs=usage_probs)
df = df.with_columns(
    precip_ranges(pl.col("precip_sum")).alias("precip_range")
)

final_precip_table = weather_effect_table(
    df,
    range_col="precip_range"
)

final_precip_table

usage_type,day_type,mean_count_L,mean_count_M,mean_count_H,rel_diff_L,rel_diff_M,rel_diff_H
str,str,f64,f64,f64,f64,f64,f64
"""mixed""","""weekday""",87.489448,74.966387,59.379299,0.0,-17.4,-37.25
"""mixed""","""weekend""",66.331082,55.017815,39.898098,0.0,-19.2,-46.03
"""recreational""","""weekday""",62.927479,54.96131,43.994278,0.0,-9.37,-22.38
"""recreational""","""weekend""",53.12008,47.101455,34.50107,0.0,-11.74,-28.93
"""utilitarian""","""weekday""",200.497667,177.203584,143.823206,0.0,-8.49,-23.45
"""utilitarian""","""weekend""",159.367995,136.798888,100.22053,0.0,-13.23,-28.89


**Wind Speed**

Daily wind conditions were represented by the mean daily wind speed, as wind affects cycling continuously throughout the day.

$$
\begin{aligned}
\text{L (low):} \quad & \max(W_{\text{day}}) < 3~\text{m/s} \\
\text{M (medium):} \quad & 3 \leq \max(W_{\text{day}}) < 6~\text{m/s} \\
\text{H (high):} \quad & \max(W_{\text{day}}) \geq 6~\text{m/s}
\end{aligned}
$$

implemented in `wind_ranges`.

In [9]:
def wind_ranges(col):
  return (
    pl.when(col < 5).then(pl.lit("L"))
    .when(col < 8).then(pl.lit("M"))
    .otherwise(pl.lit("H"))
  )

In [10]:
df = base_df(loader=dl, usage_probs=usage_probs)
df = df.with_columns(
    wind_ranges(pl.col("wind_max")).alias("wind_range")
)

final_wind_table = weather_effect_table(
    df,
    range_col="wind_range"
)

final_wind_table

usage_type,day_type,mean_count_L,mean_count_M,mean_count_H,rel_diff_L,rel_diff_M,rel_diff_H
str,str,f64,f64,f64,f64,f64,f64
"""mixed""","""weekday""",79.791667,81.155983,77.201001,0.0,1.71,-3.25
"""mixed""","""weekend""",51.145833,67.792977,64.417043,0.0,18.36,11.89
"""recreational""","""weekday""",49.979167,48.682681,46.462132,0.0,-5.75,-5.64
"""recreational""","""weekend""",48.833333,48.109259,47.321856,0.0,19.66,10.06
"""utilitarian""","""weekday""",191.75,180.98536,181.438374,0.0,-9.79,-7.72
"""utilitarian""","""weekend""",84.947917,130.152939,118.050704,0.0,62.64,53.35
