In [20]:
# Install the extra packages with the interpreter that runs this kernel
# (sys.executable), so they land in the kernel's own environment.
# No versions are pinned here.
import sys
!{sys.executable} -m pip install geopandas osmnx fastparquet

Collecting fastparquet
  Downloading fastparquet-0.7.1-cp38-cp38-manylinux2010_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 5.5 MB/s eta 0:00:01
Collecting cramjam>=2.3.0
  Downloading cramjam-2.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 95.1 MB/s eta 0:00:01
[?25hCollecting thrift>=0.11.0
  Downloading thrift-0.15.0.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 25.3 MB/s  eta 0:00:01
Building wheels for collected packages: thrift
  Building wheel for thrift (setup.py) ... [?25ldone
[?25h  Created wheel for thrift: filename=thrift-0.15.0-cp38-cp38-linux_x86_64.whl size=413761 sha256=b817eb09c748c357e3ff90da89d9bc729aa390053b20e36b5c72a43bcb059da9
  Stored in directory: /home/jovyan/.cache/pip/wheels/4c/b5/5b/10af165d7e0895afdfe25ad487422ae8ada6ea422b0dc444ab
Successfully built thrift
Installing collected packages: thrift, cramjam, fastparquet
Successfully installed cra

# Process speed data

* Merge 1 year's worth of data into 1 dataframe per day of week.
* And possibly merge those again into a single dataframe holding a week's data
* This notebook generates `overall_means.pkl` which is needed by almost all the subsequent notebooks
* This notebook takes the longest time because of the loading and reading of data
* Requires:
    * `speed_data`
* Generates:
    * `overall_means.pkl`

In [21]:
%matplotlib inline

In [22]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
%autoreload 2

In [24]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import random
import os
import sys

import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import osmnx as ox
import networkx as nx

from pprint import pprint
from shapely.geometry import Point, LineString
from scipy.stats import hmean

In [25]:
sys.path.append("..")
from src import data_processing as data_proc
from src.utils import Read_DF, Call_Back

In [26]:
# Confirm directories are in place.
# Every path is resolved relative to the current working directory.

# Root data directory: it must already exist, nothing is created for it here.
data_dir = os.path.join(os.getcwd(), '../data')
if not os.path.exists(data_dir):
    raise OSError("Must first download data, see README.md")

# Sub-directory for the speed files: created on demand.
# exist_ok=True replaces the separate exists()/mkdir() pair.
speed_dir = os.path.join(data_dir, 'speed_data')
os.makedirs(speed_dir, exist_ok=True)

# Image directory: it must already exist as well.
img_dir = os.path.join(os.getcwd(), '../img')
if not os.path.exists(img_dir):
    raise OSError("Must first download data, see README.md")

In [27]:
pd.reset_option('^display.', silent=True)

In [28]:
# # adjusting the editor
# pd.set_option('display.max_columns', 15)
# pd.set_option('display.width', 400)

# Putting it all together
* This is the final setup
* Using arithmetic means only
* Added sum of incidents just in case it is needed

## I. Load all the mobility dataframes together into memory
* This step requires a lot of ram even if we are using fastparquet
* This is done with the intention of processing the entire thing as a group/vector(?)

In [29]:
from copy import deepcopy  # no longer used in this cell; kept because other cells may rely on the name

# How each value column is combined, and the index levels of every aggregated frame.
_AGG_SPEC = {'congestion_mean': 'mean', 'speed_mean': 'mean', 'Total_Number_Incidents': 'sum'}
_INDEX_COLS = ['dow', 'window', 'XDSegID']

# Only January is processed as written; use range(1, 13) for the whole year.
MONTHS_TO_PROCESS = range(1, 2)


def _aggregate_by_dayofweek(speed_df):
    """Collapse one raw speed frame to a row per (dow, window, XDSegID).

    Rows with a null `speed_mean` are dropped. For each day of week (0-6, taken
    from `time_local.dt.dayofweek`) the remaining rows are grouped by `window`
    and `XDSegID`; `congestion_mean` and `speed_mean` are averaged and
    `Total_Number_Incidents` is summed.

    Parameters
    ----------
    speed_df : pandas.DataFrame
        Frame with a datetime `time_local` column plus `window`, `XDSegID`,
        `congestion_mean`, `speed_mean` and `Total_Number_Incidents`.

    Returns
    -------
    pandas.DataFrame
        The seven per-day results concatenated, indexed by (dow, window, XDSegID).
    """
    per_day = []
    for dayofweek in range(7):
        day_df = speed_df[speed_df['time_local'].dt.dayofweek == dayofweek]
        day_df = (day_df[day_df['speed_mean'].notnull()][['window', 'XDSegID', *_AGG_SPEC]]
                  .groupby(['window', 'XDSegID'])
                  .agg(_AGG_SPEC))
        day_df['dow'] = dayofweek
        per_day.append(day_df.reset_index().set_index(_INDEX_COLS))
    # pd.concat already builds a new frame, so no extra deepcopy is needed.
    return pd.concat(per_day)


df_overall = []

for month in MONTHS_TO_PROCESS:
    # Each month is stored as two files, starting on day 1 and day 15.
    fp = os.path.join(speed_dir, f'ALL_5m_DF_2019_{month}_1.gzip')
    DF_All = Read_DF(DF_All = fp, Reading_Tag = 'DF_All', MetaData = {})
    df_1 = _aggregate_by_dayofweek(DF_All)

    fp = os.path.join(speed_dir, f'ALL_5m_DF_2019_{month}_15.gzip')
    DF_All = Read_DF(DF_All = fp, Reading_Tag = 'DF_All', MetaData = {})
    df_15 = _aggregate_by_dayofweek(DF_All)

    # NOTE(review): this averages the two per-file means, so both half-months get
    # equal weight regardless of how many rows each contributed. Confirm that an
    # unweighted mean of means is what is wanted.
    df_month_all = (pd.concat([df_1.reset_index(), df_15.reset_index()])
                    .groupby(_INDEX_COLS, as_index=False)
                    .agg(_AGG_SPEC)
                    .set_index(_INDEX_COLS))

    df_overall.append(df_month_all)

## II. Groupby and aggregate one year's worth of data into a smaller dataframe
* The idea is to group the data by day of week (`dow`), then by time window (`window`) and `XDSegID`
* `congestion_mean` and `speed_mean` are averaged while the `Total_Number_Incidents` is summed.
* This leaves us with a multi-index dataframe that allows us to easily filter out the needed data

In [30]:
# Stack the per-month frames (index levels turned back into columns) and collapse
# them to one row per (dow, window, XDSegID): the two mean columns are averaged
# across months and the incident counts are summed.
group_keys = ['dow', 'window', 'XDSegID']
df_overall_all = (
    pd.concat([monthly.reset_index() for monthly in df_overall])
    .groupby(group_keys, as_index=False)
    .agg({'congestion_mean': 'mean', 'speed_mean': 'mean', 'Total_Number_Incidents': 'sum'})
    .set_index(group_keys)
)
df_overall_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,congestion_mean,speed_mean,Total_Number_Incidents
dow,window,XDSegID,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,155791217,0.204545,17.7500,0
0,0,155795795,0.000000,10.7500,0
0,0,155796708,0.000000,17.5000,0
0,0,155802488,0.014706,20.5000,0
0,0,155826562,0.016667,14.7500,0
...,...,...,...,...,...
6,287,1524644585,0.000806,62.5050,0
6,287,1524645372,0.000000,26.7500,0
6,287,1524646416,0.000000,58.2300,0
6,287,1524646417,0.000000,58.1225,0


### II.b: Saving the file
* This will be used in the next notebook: `003b_Creating_Clusters_correct.ipynb`

In [None]:
# Write the aggregated multi-index frame to <data_dir>/overall_means.pkl.
fp = os.path.join(data_dir, 'overall_means.pkl')
df_overall_all.to_pickle(fp)

# All other cells below are just helpers or debugging

# III. Accessing the multi-index dataframe `df_overall_all`

In [None]:
## Accessing a day of week using .loc
# Select day of week 0 through a one-element list of labels.
df_overall_all.loc[[0]]

In [None]:
## Accessing specific day of week.
dayofweek = 5
# drop_level defaults to True; passing False keeps the 'dow' level in the result
_temp = df_overall_all.xs(key=(dayofweek,), level=['dow'], drop_level=False)
display(_temp)

In [None]:
## Accessing specific time window
# Cross-section for day of week 0, window 0; both selected levels are dropped.
_temp = df_overall_all.xs(key=(0, 0), level=['dow', 'window'], drop_level=True)
display(_temp)

In [None]:
## Accessing a segment for a day of week
# Every time window of segment 1524646899 on day of week 0, then its mean speed.
speeds = df_overall_all.xs(key=(0, 1524646899), level=['dow', 'XDSegID'], drop_level=True)
fig, ax = plt.subplots()
ax.plot(speeds['speed_mean'])

### III.b Accessing based on specific times (time start, time end)

In [None]:
def get_data_slice_from_overall(overall_df, dayofweek, time_start, time_end, date='2019-01-01', segment=None, granularity=5):
    """Return one segment's rows for a day of week between two times of day.

    Parameters
    ----------
    overall_df : pandas.DataFrame
        Frame indexed by (`dow`, `window`, `XDSegID`), such as `df_overall_all`.
        NOTE(review): the label slice below presumably needs a sorted index.
    dayofweek : int
        Value of the `dow` level to select.
    time_start, time_end : str or timestamp-like
        Anything `pd.Timestamp` accepts; each is converted to a time window with
        `data_proc.time_window_from_time`. Both ends are included in the slice.
    date : str
        Forwarded to `data_proc.generate_datetime_arr` as the date of the
        generated timestamps.
    segment : int, optional
        `XDSegID` to extract. Segment id 0 is valid; only `None` means
        "not given".
    granularity : int
        Passed to `data_proc.get_number_of_time_windows` and
        `data_proc.generate_datetime_arr`. The notebook always uses 5.

    Returns
    -------
    pandas.DataFrame or bool
        The slice indexed by a generated `time_local` column, with the time
        window kept as a regular `window` column. `False` when `segment` is
        `None` (kept for backward compatibility).
    """
    time_windows = data_proc.get_number_of_time_windows(granularity)

    start_time = pd.Timestamp(time_start)
    end_time = pd.Timestamp(time_end)
    start_window = data_proc.time_window_from_time(start_time, time_windows)
    end_window = data_proc.time_window_from_time(end_time, time_windows)

    # Compare against None so that a falsy but valid id (0) is still accepted.
    if segment is None:
        return False

    # .copy() so the new column is added to an independent frame, not to a
    # slice of the caller's frame.
    _slice = (overall_df
              .loc[(dayofweek, start_window):(dayofweek, end_window)]
              .xs((segment,), level=['XDSegID'])
              .droplevel(0)
              .copy())

    # Format with an explicit pattern. The previous code used the caller's
    # input as the format string, which only worked for plain 'HH:MM:SS' text.
    start_time_str = start_time.strftime('%H:%M:%S')
    # NOTE(review): one timestamp is generated per returned row, so this
    # presumably assumes the segment has no missing windows in the range.
    datetime_arr = data_proc.generate_datetime_arr(start_time_str, len(_slice), granularity, date=date)
    _slice['time_local'] = datetime_arr
    return _slice.reset_index().set_index('time_local')

In [None]:
# Time-of-day range, window size in minutes, and day of week to inspect.
start_time, end_time = '06:00:00', '21:00:00'
granularity, dayofweek = 5, 0

speeds = get_data_slice_from_overall(
    df_overall_all, dayofweek, start_time, end_time,
    segment=1524646899, granularity=granularity,
)
speeds.plot(y='congestion_mean', figsize=(10, 3), marker='o', markevery=2, markersize=4)

In [None]:
# Draw one congestion curve per day of week on a shared axis, after coarsening
# the 5-minute slice to `new_granularity`-minute bins.
# Uses start_time, end_time and granularity set in the previous cell.
new_granularity = 15
fig, ax = plt.subplots(figsize=(20, 5))
for dayofweek in range(7):
    speeds = get_data_slice_from_overall(df_overall_all, dayofweek, start_time, end_time, segment=1524646899, granularity=granularity)

    # 'min' is the minute alias; the one-letter 'T' alias is deprecated in recent pandas.
    speeds = speeds.groupby(pd.Grouper(freq=f'{new_granularity}min')).agg({'speed_mean':'mean', 'congestion_mean':'mean', 'Total_Number_Incidents': 'sum'})
#     speeds.plot(y='speed_mean', ax=ax)
    speeds.plot(y='congestion_mean', marker='o', markevery=5, markersize=4, ax=ax, lw=0.5, legend=False)

## Accessing and adding time aspect column

In [None]:
# The data has one time window per 5 minutes, which fixes how many timestamps
# are needed to label a full day.
granularity = 5
total_count = (24 * 60) // granularity
print(total_count)
datetime_arr = data_proc.generate_datetime_arr('00:00:00', total_count, granularity, date='2019-01-01')

# Every time window of one segment on one day of week, re-indexed by timestamp
speeds = (
    df_overall_all
    .xs((0, 1524646899), level=['dow', 'XDSegID'], drop_level=True)
    .assign(datetime=datetime_arr)
    .set_index('datetime')
)
display(speeds)

speeds.plot(y='speed_mean')

In [None]:
# Resample to 15-minute bins (a bit finicky with edges).
# NOTE: the original intent was to try hmean, but the aggregation below uses the
# arithmetic mean for both speed and congestion.
# This overwrites `speeds`, so re-running the cell aggregates the already
# aggregated frame; re-run the previous cell first.
# 'min' is the minute alias; the one-letter 'T' alias is deprecated in recent pandas.
speeds = speeds.groupby(pd.Grouper(freq='15min')).agg({'speed_mean':'mean', 'congestion_mean':'mean', 'Total_Number_Incidents': 'sum'})
speeds.plot(y='speed_mean')

# Plotting

In [None]:
# Plotting both in 1 graph: raw 5-minute speed, resampled speed, and resampled
# congestion on a secondary y-axis.
granularity = 5
total_count = int(24 * 60 / granularity)
print(total_count)
datetime_arr = data_proc.generate_datetime_arr('00:00:00', total_count, granularity, date='2019-01-01')

# Get every time window of one segment for a day of week
speeds = df_overall_all.xs((0, 1524646899), level=['dow', 'XDSegID'], drop_level=True)
speeds['datetime'] = datetime_arr

speeds = speeds.set_index('datetime')
display(speeds)

new_granularity = 30
# Faint curve: the original 5-minute series.
ax = speeds.plot(y='speed_mean', figsize=(10, 3), alpha=0.4)
# 'min' is the minute alias; the one-letter 'T' alias is deprecated in recent pandas.
speeds = speeds.groupby(pd.Grouper(freq=f'{new_granularity}min')).agg({'speed_mean':'mean', 'congestion_mean':'mean', 'Total_Number_Incidents': 'sum'})
speeds.plot(y='speed_mean', ax=ax)

# Creating twinx so congestion gets its own y-axis
ax1 = ax.twinx()
speeds.plot(y='congestion_mean', ax=ax1)

# Fixing the x-axis label for better visibility
# Maybe no need now

## Putting it all together (all months)

In [None]:
# For every month: keep the rows between str_start_time and str_end_time that
# recorded at least one incident, tag each with its cluster head, and pickle the
# rows whose cluster head is not -1.
# NOTE(review): `locate_segment_cluster`, `clusters`, `cluster_dir` and
# `clustering_version` are not defined in the visible part of this notebook;
# they must exist in the kernel before this cell is run.
str_start_time = '06:00:00'
str_end_time = '21:00:00'

for month in range(1, 13):

    # The monthly files are read from speed_dir, the same place the first
    # loading cell reads this file pattern from (this cell used data_dir).
    fp = os.path.join(speed_dir, f'ALL_5m_DF_2019_{month}_1.gzip')
    DF_1 = Read_DF(DF_All = fp, Reading_Tag = 'DF_All', MetaData = {})

    fp = os.path.join(speed_dir, f'ALL_5m_DF_2019_{month}_15.gzip')
    DF_15 = Read_DF(DF_All = fp, Reading_Tag = 'DF_All', MetaData = {})

    DF_All = pd.concat([DF_1, DF_15])
    # .copy() so the new column is added to an independent frame, not to a slice.
    DF_All = DF_All[['time_local', 'XDSegID', 'Total_Number_Incidents']].copy()
    # Keep the timestamp as a regular column before it becomes the index.
    DF_All['time'] = DF_All['time_local'].tolist()
    DF_All = DF_All.set_index('time_local')
    DF_All = DF_All.between_time(str_start_time, str_end_time)

    DF_Incidents = DF_All[DF_All['Total_Number_Incidents'] > 0].copy()
    DF_Incidents['cluster_head'] = [
        locate_segment_cluster(s, clusters) for s in DF_Incidents['XDSegID'].tolist()
    ]
    fp = os.path.join(cluster_dir, f"{clustering_version}_incidents_{str(month).zfill(2)}.pkl")
    DF_Incidents[DF_Incidents['cluster_head'] != -1].to_pickle(fp)