# New Features (Time Based)

In [1]:
import pandas as pd

# Load the hourly demand-per-station table; the file name indicates that
# weather columns have already been joined in.
df = pd.read_parquet(
    "data/final/demand_per_station_per_hour_with_weather.parquet"
)

## Ensure we have a row for each station/hour combination

In [None]:
# Parse the hour column so it can serve as a proper datetime index level.
df['hour'] = pd.to_datetime(df['hour'])

# Build the full station x hour grid: every station paired with every hour
# between the first and last observed hour.
stations = df['station_name'].unique()
full_time_index = pd.date_range(df['hour'].min(), df['hour'].max(), freq='h')
full_index = pd.MultiIndex.from_product(
    [stations, full_time_index],
    names=['station_name', 'hour'],
)

# Reindex onto the grid. Combinations that were absent from the data become
# new rows; their departures/arrivals are set to 0, while every other column
# stays NaN for those rows.
df_full = (
    df.set_index(['station_name', 'hour'])
      .reindex(full_index)
      .fillna({'departures': 0, 'arrivals': 0})
      .reset_index()
)

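# TODO: The reindex above leaves the weather columns empty (NaN) for every station/hour row it inserted, so the weather data must be rejoined after this step!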

In [16]:
# Holiday
import holidays

# Build the US holiday calendar for every year that actually occurs in the
# data instead of hard-coding 2023; with a fixed year, rows from any other
# year could never be flagged as holidays.
hol = holidays.UnitedStates(
    years=sorted({int(year) for year in df_full['hour'].dt.year.dropna().unique()})
)
holiday_dates = set(hol.keys())

# Compare on the calendar date so every hour of a holiday is flagged.
df_full['isHoliday'] = df_full['hour'].dt.date.isin(holiday_dates)


In [12]:
# Weekdays
# Day of the week as an integer, Monday=0 ... Sunday=6.
df_full['weekday'] = df_full['hour'].dt.dayofweek

In [13]:
# Night
# Flag hours from 21:00 up to (but not including) 06:00. Uses vectorized
# comparisons instead of calling a Python lambda once per row.
df_full['night'] = (df_full['hour'].dt.hour >= 21) | (df_full['hour'].dt.hour < 6)

In [None]:
# Worktime
# True for non-holiday rows on Monday-Friday (weekday 0-4) whose hour of day
# is 8 through 16 inclusive.
df_full['workhours'] = (
    (~df_full['isHoliday'])
    & df_full['weekday'].between(0, 4)
    & df_full['hour'].dt.hour.between(8, 16)
)

In [None]:
# Commute (2 hours before and after start and end of worktime)
# The two hour windows must be grouped explicitly: `&` binds tighter than `|`,
# so without the inner parentheses the weekday/holiday filter would apply only
# to the evening window and the morning window would also be flagged on
# weekends and holidays.
df_full['commute'] = (
    (
        df_full['hour'].dt.hour.between(6, 10) |
        df_full['hour'].dt.hour.between(15, 19)
    ) &
    df_full['weekday'].between(0, 4) &
    (~df_full['isHoliday'])
    )

In [None]:
# Free time: rows flagged as holiday, or falling on weekday 5 or 6
# (Saturday/Sunday in the Monday=0 numbering).
df_full['free'] = df_full['isHoliday'] | df_full['weekday'].isin([5, 6])

In [19]:
# Split the timestamp into separate calendar columns: day of month, month, year.
for _field in ('day', 'month', 'year'):
    df_full[_field] = getattr(df_full['hour'].dt, _field)

In [20]:
# Number of rows per station, as a two-column frame (station_name, row_count).
counts = (
    df_full
      .groupby('station_name')
      .size()
      .rename('row_count')
      .reset_index()
)

# New Features (Station Based)

- Per station: average delta / average arrivals / average departures, plus the variance/SD of each


In [None]:
# Net hourly flow per row: positive when arrivals exceed departures.
df_full["delta"] = df_full["arrivals"].sub(df_full["departures"])

# Variance of delta over all rows of a station (pandas' default sample
# variance), broadcast back onto every row of that station.
df_full['var_delta_station_total'] = df_full.groupby('station_name')['delta'].transform('var')


In [26]:
# Mean of delta over all rows of a station, broadcast back onto every row of
# that station.
df_full['avg_delta_station_total'] = df_full.groupby('station_name')['delta'].transform('mean')

In [None]:
# Per-station mean and variance of arrivals and departures over all rows,
# broadcast back onto every row of the station. The loop order yields the
# columns avg_arrivals, avg_departures, var_arrivals, var_departures.
for _prefix, _func in (('avg', 'mean'), ('var', 'var')):
    for _col in ('arrivals', 'departures'):
        df_full[f'{_prefix}_{_col}_station_total'] = (
            df_full.groupby('station_name')[_col].transform(_func)
        )

In [None]:
# Night
# Per-station mean and variance of delta, arrivals and departures, computed
# over night rows only. All six statistics come from one named aggregation and
# are attached with a single merge, instead of three separate groupby/merge
# passes over the full frame.
night_stats = (
    df_full[df_full['night']]
      .groupby('station_name')
      .agg(**{
          f'{prefix}_{col}_station_night': (col, func)
          for col in ('delta', 'arrivals', 'departures')
          for prefix, func in (('avg', 'mean'), ('var', 'var'))
      })
      .reset_index()
)

# Drop copies left behind by an earlier run of this cell so that re-running it
# does not produce duplicated *_x / *_y columns. Stations without any night
# rows receive NaN from the left merge.
df_full = (
    df_full
      .drop(columns=night_stats.columns.drop('station_name'), errors='ignore')
      .merge(night_stats, on='station_name', how='left')
)

: 

In [None]:
# Not night
# Per-station mean and variance of delta, arrivals and departures, computed
# over rows that are NOT flagged as night. All six statistics come from one
# named aggregation and are attached with a single merge, instead of three
# separate groupby/merge passes over the full frame.
# The variable keeps its original name `night_stats` although it holds the
# non-night statistics here.
night_stats = (
    df_full[~df_full['night']]
      .groupby('station_name')
      .agg(**{
          f'{prefix}_{col}_station_nonnight': (col, func)
          for col in ('delta', 'arrivals', 'departures')
          for prefix, func in (('avg', 'mean'), ('var', 'var'))
      })
      .reset_index()
)

# Drop copies left behind by an earlier run of this cell so that re-running it
# does not produce duplicated *_x / *_y columns.
df_full = (
    df_full
      .drop(columns=night_stats.columns.drop('station_name'), errors='ignore')
      .merge(night_stats, on='station_name', how='left')
)

In [None]:
# Holidays
# Per-station mean and variance of delta, arrivals and departures, computed
# over rows flagged as holiday only. All six statistics come from one named
# aggregation and are attached with a single merge, instead of three separate
# groupby/merge passes over the full frame.
stats = (
    df_full[df_full['isHoliday']]
      .groupby('station_name')
      .agg(**{
          f'{prefix}_{col}_station_holiday': (col, func)
          for col in ('delta', 'arrivals', 'departures')
          for prefix, func in (('avg', 'mean'), ('var', 'var'))
      })
      .reset_index()
)

# Drop copies left behind by an earlier run of this cell so that re-running it
# does not produce duplicated *_x / *_y columns.
df_full = (
    df_full
      .drop(columns=stats.columns.drop('station_name'), errors='ignore')
      .merge(stats, on='station_name', how='left')
)

In [None]:
# commute
# Per-station mean and variance of delta, arrivals and departures, computed
# over rows flagged as commute only. All six statistics come from one named
# aggregation and are attached with a single merge, instead of three separate
# groupby/merge passes over the full frame.
stats = (
    df_full[df_full['commute']]
      .groupby('station_name')
      .agg(**{
          f'{prefix}_{col}_station_commute': (col, func)
          for col in ('delta', 'arrivals', 'departures')
          for prefix, func in (('avg', 'mean'), ('var', 'var'))
      })
      .reset_index()
)

# Drop copies left behind by an earlier run of this cell so that re-running it
# does not produce duplicated *_x / *_y columns.
df_full = (
    df_full
      .drop(columns=stats.columns.drop('station_name'), errors='ignore')
      .merge(stats, on='station_name', how='left')
)

In [None]:
# free
# Per-station mean and variance of delta, arrivals and departures, computed
# over rows flagged as free (weekend or holiday) only. All six statistics come
# from one named aggregation and are attached with a single merge, instead of
# three separate groupby/merge passes over the full frame.
stats = (
    df_full[df_full['free']]
      .groupby('station_name')
      .agg(**{
          f'{prefix}_{col}_station_free': (col, func)
          for col in ('delta', 'arrivals', 'departures')
          for prefix, func in (('avg', 'mean'), ('var', 'var'))
      })
      .reset_index()
)

# Drop copies left behind by an earlier run of this cell so that re-running it
# does not produce duplicated *_x / *_y columns.
df_full = (
    df_full
      .drop(columns=stats.columns.drop('station_name'), errors='ignore')
      .merge(stats, on='station_name', how='left')
)

In [None]:
# Average delta for hours with unfriendly weather
# (precipitation > 0 or wind speed above 10)?
# The unit of the wind-speed threshold is not visible here - TODO confirm.
# NOTE(review): rows inserted by the station/hour reindex have NaN weather
# values unless the weather data has been rejoined; comparisons against NaN
# are False, so such rows are silently excluded from this mask.
mask = (df_full['precipitation'] > 0) | (df_full['windspeed'] > 10)

# Per-station mean and variance of delta, arrivals and departures over the
# masked rows. All six statistics come from one named aggregation and are
# attached with a single merge, instead of three separate groupby/merge passes
# over the full frame.
stats = (
    df_full[mask]
      .groupby('station_name')
      .agg(**{
          f'{prefix}_{col}_station_unfriendly_weather': (col, func)
          for col in ('delta', 'arrivals', 'departures')
          for prefix, func in (('avg', 'mean'), ('var', 'var'))
      })
      .reset_index()
)

# Drop copies left behind by an earlier run of this cell so that re-running it
# does not produce duplicated *_x / *_y columns.
df_full = (
    df_full
      .drop(columns=stats.columns.drop('station_name'), errors='ignore')
      .merge(stats, on='station_name', how='left')
)