In [None]:
import geopandas  as gpd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import h3 as h3

import re

import os
from os.path import isfile, join
from pathlib import Path
from os import listdir
import os
os.getcwd()

#from commons import download_data, find_vcs_root

# Directory the notebook is executed from, and its parent. `root` is only used by the
# commented-out local-file alternatives to the S3 reads/writes further down.
path =  Path(os.getcwd())
root = path.parent.absolute()

# import libraries needed for upload / download to AWS
import boto3
import awswrangler
from fiona.session import AWSSession
import fiona
# set name of S3 bucket
s3_bucket = 'traffic-data-bucket'

root

# 1. Import Data

#### 1.1 Base Table
##### LA County shape file transposed to Uber H3 hexagons at level 8 (~0.75 square km each).
##### This process takes a shape file and maps it to hexagons at a given level. The output of the mapping is a unique hex_id for each hexagon and its shape geometry.
##### https://h3geo.org/docs/core-library/restable/

In [None]:
# Credentials are imported from a local `aws_secrets` module instead of being written in the notebook.
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

# One boto3 session, passed to every S3 read/write below (fiona via AWSSession, awswrangler via boto3_session).
my_session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token = aws_session_token

)

In [None]:
# Base hex grid: read the shapefile straight from S3 using the shared boto3 session.
base_map_uri = f"s3://{s3_bucket}/h3_processed_data/base_map_hex_all/base_map_hex_all.shp"
with fiona.Env(session=AWSSession(my_session)):
    gdf_all = gpd.read_file(base_map_uri)

# Local alternative:
# gdf_all = gpd.read_file(root / 'X.data' / 'h3_processed_data' / 'base_map_hex_all' /'base_map_hex_all.shp')
print(gdf_all.shape)
gdf_all.sample(2)

In [None]:
# Keep every row except those whose hex_id is the literal string '0'.
gdf_all = gdf_all[gdf_all['hex_id'] != '0']

#### 1.2 City and District shape files

In [None]:
# City labels per hex, read from S3; left-joined onto gdf_all on hex_id in section 3.
city_label = awswrangler.s3.read_csv(path=f's3://{s3_bucket}/h3_processed_data/city_labels_hex.csv', boto3_session=my_session)

In [None]:
# District labels per hex, read from S3; left-joined onto gdf_all on hex_id in section 3.
district_labels = awswrangler.s3.read_csv(path=f's3://{s3_bucket}/h3_processed_data/district_labels_hex.csv', boto3_session=my_session)

#### 1.3 Nodes
##### LA County nodes - pulled from Python OSMnx. All street intersections.
#####   The lat and lon of each node were mapped to a hex id for joining onto the county hex file.
##### https://github.com/gboeing/osmnx

In [None]:
# Node counts keyed by hex_id and `highway` value; section 2.1 pivots this into one column per `highway` value.
highway_cnts = awswrangler.s3.read_csv(path=f's3://{s3_bucket}/nodes_and_edges/nodes_highway_cnts.csv', boto3_session=my_session)


# Peek at one random row, then show how many rows each `highway` value has.
display(highway_cnts.sample())
highway_cnts.highway.value_counts()

In [None]:
street_cnts = awswrangler.s3.read_csv(path=f's3://{s3_bucket}/nodes_and_edges/nodes_street_count_cnts.csv', boto3_session=my_session)

# One row per hex: the largest street_count seen in that hex, excluding hex_id '0'.
street_cnts_grps = (
    street_cnts
    .groupby('hex_id')['street_count']
    .max()
    .reset_index()
    .loc[lambda frame: frame['hex_id'] != '0']
    .rename(columns={'street_count': 'node_street_count'})
)
street_cnts_grps.sample()

In [None]:
# Left join keeps every hex; hexes with no street-count row get NaN in node_street_count
# (zero-filled later together with the other columns in counts_col_list).
gdf_all = gdf_all.merge(street_cnts_grps, on = 'hex_id', how = 'left')
gdf_all.shape

#### 1.4 Edges
#### LA County edges (streets) - pulled from Python OSMnx.
##### These are the line-geometry shape files. They will be joined to the hex shape file using the GeoPandas sjoin.
##### https://github.com/gboeing/osmnx

In [None]:
# Street edges (line geometries): read the shapefile straight from S3 using the shared boto3 session.
edges_uri = f"s3://{s3_bucket}/nodes_and_edges/la_county_edges/la_county_edges.shp"
with fiona.Env(session=AWSSession(my_session)):
    edges = gpd.read_file(edges_uri)

# Local alternative:
# edges = gpd.read_file(root / 'X.data' /  'nodes_and_edges' / 'la_county_edges' / 'la_county_edges.shp')
print(edges.shape)

In [None]:
edges.sample(2)

#### 1.5 Collision data

In [None]:
# Collision records keyed by hex_id; section 5 aggregates accident_count per hex_id and collision_year.
collision_hex = awswrangler.s3.read_csv(path=f's3://{s3_bucket}/h3_processed_data/collisions_hex.csv', boto3_session=my_session)


collision_hex.head(2)

# 2. Nodes (intersections) 

### 2.1 Prep node files by making a wide table.  One unique row per hex id

In [None]:
# Wide table: one row per hex_id, one 'node_<highway>' column per highway value, missing combinations as 0.
highway_pivot = (
    highway_cnts
    .pivot(index="hex_id", columns="highway", values="count")
    .fillna(0)
    .add_prefix('node_')
    .reset_index()
)
highway_pivot.sample(2)

In [None]:
highway_pivot.shape

In [None]:
# Row count before the join; later cells compare against it to confirm joins did not add rows.
orign_row_count = gdf_all.shape[0]
gdf_all = gdf_all.merge(highway_pivot, on = 'hex_id', how = 'left')
updated_row_count = gdf_all.shape[0]
# Displays True when the left join left the row count unchanged.
orign_row_count == updated_row_count

In [None]:
gdf_all.columns

In [None]:
# Node-count columns that should read 0 (not NaN) for hexes the left joins found no match for.
# An improvement could be to create this list dynamically.
counts_col_list = [
    'node_street_count',
    'node_crossing',
    'node_give_way',
    'node_milestone',
    'node_mini_roundabout',
    'node_motorway_junction',
    'node_stop',
    'node_traffic_signals',
    'node_trailhead',
    'node_turning_circle',
    'node_turning_loop',
]

gdf_all[counts_col_list] = gdf_all[counts_col_list].fillna(0)

In [None]:
gdf_all.sample(3)

### 2.3 Attach the neighboring nodes hex ids to the general table

In [None]:
# h3's k-ring (named grid_disk in h3-py 4.x) returns the hexagons around a given h3 cell.
# Set the level with `skins`:
#   skins = 1 is the first ring plus the hex itself.
#   skins = 2 is the second ring out plus ring 1 plus the hex itself, etc.
def rking_neighbors(row, skins):
    """Return the h3 ids within `skins` rings of `row.hex_id` as a list.

    Parameters
    ----------
    row : any object exposing a ``hex_id`` attribute (e.g. a DataFrame row).
    skins : int
        Number of rings to include around the hex.

    Returns
    -------
    list
        Whatever ids the h3 library yields for that hex and ring count.
    """
    # h3-py 4.x renamed k_ring to grid_disk; use whichever the installed version provides.
    ring_lookup = getattr(h3, 'grid_disk', None) or h3.k_ring
    return list(ring_lookup(row.hex_id, skins))

In [None]:
# One list-valued column per ring size: hex_neighbors_<k>_ids holds the ids returned for k rings.
for ring in (0, 1, 2):
    gdf_all[f'hex_neighbors_{ring}_ids'] = gdf_all.apply(
        lambda hex_row, ring=ring: rking_neighbors(hex_row, skins = ring),
        axis=1,
    )

neighbor_id_cols = ['hex_id', 'hex_neighbors_0_ids', 'hex_neighbors_1_ids', 'hex_neighbors_2_ids']
gdf_all[neighbor_id_cols].sample(2)

# hex_id should be the same as hex_neighbors_0_ids

### 2.4 Neighboring Hex Counts
#### For all the node columns, attach the counts summed over the hex and its ring 1 and ring 2 neighbors.

In [None]:
def _ring_node_totals(hex_frame, ring, value_cols):
    """Sum `value_cols` over each hex's ring-`ring` neighbor list.

    Parameters
    ----------
    hex_frame : DataFrame with 'hex_id', 'hex_neighbors_<ring>_ids' and `value_cols`.
    ring : int, selects the 'hex_neighbors_<ring>_ids' column.
    value_cols : list of column names to total.

    Returns
    -------
    (sums, counts)
        sums   - one row per hex_id with a 'neighbor_<ring>_<col>' total per value column.
        counts - one row per hex_id with 'neighbor_<ring>_count', the number of non-null
                 values of the first value column among the matched neighbors.
    """
    ids_col = f'hex_neighbors_{ring}_ids'
    # One row per (hex, neighbor id) pair.
    tall = hex_frame[['hex_id', ids_col]].explode(ids_col)
    # Inner join: neighbor ids that are not rows of hex_frame drop out here.
    # Both sides carry 'hex_id', so the originating hex ends up in 'hex_id_x'.
    tall = tall.merge(hex_frame[value_cols + ['hex_id']],
                      left_on = ids_col,
                      right_on = 'hex_id',
                      how = 'inner')
    by_hex = tall.groupby('hex_id_x')

    sums = by_hex[value_cols].agg('sum')
    sums.columns = f'neighbor_{ring}_' + sums.columns
    sums.index.names = ['hex_id']
    sums = sums.reset_index()

    counts = by_hex[value_cols[0]].agg('count').reset_index()
    counts.columns = ['hex_id', f'neighbor_{ring}_count']
    return sums, counts


# Attach the ring-1 totals, then the ring-2 totals, to the general table.
for ring in (1, 2):
    ring_sums, ring_counts = _ring_node_totals(gdf_all, ring, counts_col_list)
    gdf_all = gdf_all.merge(ring_sums, on = 'hex_id', how = 'left')
    gdf_all = gdf_all.merge(ring_counts, on = 'hex_id', how = 'left')
gdf_all.sample(2)

In [None]:
#check to make sure all joins didn't add rows
print(gdf_all.shape[0])
updated_row_count = gdf_all.shape[0]
orign_row_count == updated_row_count

## 2.5 Nearest Hex Neighbor Average
Some hexes do not border other streets (the coast, or the middle of the mountains). Take the average over the neighboring hexes with streets.

In [None]:
# For each ring, divide every neighbor_<ring>_<col> total by neighbor_<ring>_count and store
# the result as neighbor_<ring>_<col>_ave.
for ring in (1, 2):
    prefix = f'neighbor_{ring}_'
    count_col = f'{prefix}count'
    # Plain prefix test instead of a glob-style regex. The count column itself is excluded,
    # and so are existing *_ave columns, so re-running this cell does not create *_ave_ave columns.
    total_cols = [
        col for col in gdf_all.columns
        if col.startswith(prefix) and col != count_col and not col.endswith('_ave')
    ]
    for nc in total_cols:
        gdf_all[nc + '_ave'] = gdf_all[nc] / gdf_all[count_col]

gdf_all.sample(2)

# 3. City and District Labels

In [None]:
# Left join keeps every hex; unmatched hexes get NaN labels.
# NOTE(review): assumes city_label has at most one row per hex_id - duplicates would add rows (see the row-count check below).
gdf_all = gdf_all.merge(city_label, on = 'hex_id', how = 'left')

In [None]:
city_label.sample(2)

In [None]:
# Left join keeps every hex; unmatched hexes get NaN labels.
# NOTE(review): assumes district_labels has at most one row per hex_id - duplicates would add rows (see the row-count check below).
gdf_all = gdf_all.merge(district_labels, on = 'hex_id', how = 'left')

In [None]:
#check to make sure all joins didn't add rows
print(gdf_all.shape[0])
updated_row_count = gdf_all.shape[0]
orign_row_count == updated_row_count

In [None]:
gdf_all.sample(2)

In [None]:
gdf_all.columns

# 4 Edges (streets)

In [None]:
# One row per (hex, intersecting edge) pair. Edges are reprojected to EPSG:3857 before the join.
# NOTE(review): this presumes gdf_all is already in EPSG:3857 - confirm.
_edges_projected = edges.to_crs(epsg=3857)
try:
    # Newer geopandas names the spatial-test argument `predicate`.
    gdf_all_edge_exploded = gpd.sjoin(gdf_all[['hex_id', 'geometry']], _edges_projected, how='inner', predicate='intersects')
except TypeError:
    # Older geopandas only knows the same argument as `op`.
    gdf_all_edge_exploded = gpd.sjoin(gdf_all[['hex_id', 'geometry']], _edges_projected, how='inner', op='intersects')

gdf_all_edge_exploded.sample(4)

In [None]:
gdf_all_edge_exploded.highway.unique()

In [None]:
gdf_all_edge_exploded.bridge.unique()

In [None]:
gdf_all_edge_exploded.tunnel.unique()

### 4.1 Create road type indicator (highway)

In [None]:
# (new boolean column, source column, value that turns the flag on)
edge_indicator_specs = [
    ('motorway_id', 'highway', 'motorway'),
    ('motorway_link_id', 'highway', 'motorway_link'),
    ('living_street_id', 'highway', 'living_street'),
    ('bridge_id', 'bridge', 'yes'),
    ('tunnel_id', 'tunnel', 'yes'),
]
for flag_col, source_col, flag_value in edge_indicator_specs:
    gdf_all_edge_exploded[flag_col] = gdf_all_edge_exploded[source_col].isin([flag_value])


In [None]:
# Max and min speed_kph over the edges intersecting each hex; one row per hex_id.
edge_grp_speed = gdf_all_edge_exploded.groupby('hex_id')[['speed_kph']].agg(['max','min']).reset_index()
# NOTE(review): 'edge_speek_kph_min' looks like a typo for 'edge_speed_kph_min'. The name ends up in the
# exported CSV, so rename it only together with whatever reads that file.
edge_grp_speed.columns = ['hex_id', 'edge_speed_kph_max', 'edge_speek_kph_min']
edge_grp_speed.sample()

In [None]:
# Max and min `lanes` over the edges intersecting each hex; one row per hex_id.
# NOTE(review): if `lanes` was read as text, max/min compare as strings ('10' < '2') - confirm the dtype.
edge_grp_lanes = gdf_all_edge_exploded.groupby('hex_id')[['lanes']].agg(['max','min']).reset_index()
edge_grp_lanes.columns = ['hex_id', 'edge_lanes_max', 'edge_lanes_min']
edge_grp_lanes.sample()

In [None]:
def _edge_flag_by_hex(source_col, out_col):
    """Return one row per hex_id holding the max of `source_col` over its intersecting edges.

    The result has exactly two columns, 'hex_id' and `out_col`. For a boolean indicator the
    max is True when at least one edge in the hex has the flag set.
    """
    per_hex = gdf_all_edge_exploded.groupby('hex_id')[[source_col]].agg(['max']).reset_index()
    per_hex.columns = ['hex_id', out_col]
    return per_hex


# Same aggregation for every indicator; each frame is merged into gdf_all further down.
edge_grp_motorway_id = _edge_flag_by_hex('motorway_id', 'edge_motorway_id')
edge_grp_motorway_link_id = _edge_flag_by_hex('motorway_link_id', 'edge_motorway_link_id')
edge_grp_living_street_id = _edge_flag_by_hex('living_street_id', 'edge_living_street_id')
edge_grp_bridge_id = _edge_flag_by_hex('bridge_id', 'edge_bridge_id')
edge_grp_oneway_id = _edge_flag_by_hex('oneway', 'edge_oneway_id')
edge_grp_tunnel_id = _edge_flag_by_hex('tunnel_id', 'edge_tunnel_id')
edge_grp_tunnel_id.sample(1)

In [None]:
# Left-join every per-hex edge aggregate onto the general table, in the original order,
# printing the row count after each join so an unexpected row increase is visible.
edge_feature_frames = [
    edge_grp_speed,
    edge_grp_lanes,
    edge_grp_motorway_id,
    edge_grp_motorway_link_id,
    edge_grp_living_street_id,
    edge_grp_bridge_id,
    edge_grp_oneway_id,
    edge_grp_tunnel_id,
]
for edge_features in edge_feature_frames:
    gdf_all = gdf_all.merge(edge_features, how = 'left', on = 'hex_id')
    print(gdf_all.shape[0])

# Displays True when none of the joins changed the row count.
updated_row_count = gdf_all.shape[0]
orign_row_count == updated_row_count

### 4.3 Create a filter for the hex file to only include hexes with an intersection or street (a node or an edge)

In [None]:
# Get a list of all hexes that have an edge or a node.
# Hexes with an edge: the hex/edge intersects join already exists as gdf_all_edge_exploded
# (built in section 4 from the same hexes and edges), so its hex ids are reused here
# instead of running the full spatial join a second time.
gdf_edge_filter = gdf_all_edge_exploded[['hex_id']].drop_duplicates()
# Hexes with a node: every hex_id that appears in the node counts.
gdf_node_filter = highway_cnts[['hex_id']].drop_duplicates()

# Row-bind the two frames.
gdf_filtered = pd.concat([gdf_edge_filter, gdf_node_filter], axis = 0)

# Vector of valid hex ids.
valid_array = gdf_filtered['hex_id'].values
print(gdf_all.shape)

# Flag rather than drop: True for hexes that have an edge or a node.
mask = gdf_all['hex_id'].isin(valid_array)

gdf_all['valid_accident_location_filter'] = mask

# To actually drop the other hexes:
#gdf_all = gdf_all[gdf_all['valid_accident_location_filter'] == True]

#gdf_all.shape

# 5. Collisions by Year

### 5.1 Neighbor collisions count.  

In [None]:
collision_hex.sample(3)

In [None]:
# Total accident_count per hex_id and collision_year, in a column named 'collisions'.
# (The former `collision_year = collision_year` self-assignment did nothing and was removed.)
collision_hex_grp = (
    collision_hex
    .groupby(['hex_id', 'collision_year'])
    .accident_count.agg('sum')
    .to_frame('collisions')
    .reset_index()
)
collision_hex_grp.head(2)

In [None]:
def _ring_collision_pivot(hex_frame, yearly_collisions, ring):
    """Yearly collision totals over each hex's ring-`ring` neighbor list, one row per hex.

    Parameters
    ----------
    hex_frame : DataFrame with 'hex_id' and 'hex_neighbors_<ring>_ids'.
    yearly_collisions : DataFrame with 'hex_id', 'collision_year' and 'collisions'.
    ring : int, selects the neighbor-list column and names the output columns.

    Returns
    -------
    DataFrame with 'hex_id' plus one 'collisions_neighbor_<ring>_<year>' column per
    collision_year; year/hex combinations with no collisions are 0.
    """
    ids_col = f'hex_neighbors_{ring}_ids'
    total_col = f'neighbor_{ring}_collision'

    # One row per (hex, neighbor id), then attach that neighbor's yearly collisions.
    # Both sides carry 'hex_id', so the originating hex ends up in 'hex_id_x'.
    tall = hex_frame[['hex_id', ids_col]].explode(ids_col)
    tall = tall.merge(yearly_collisions,
                      left_on = ids_col,
                      right_on = 'hex_id',
                      how = 'inner')
    tall = (
        tall[['hex_id_x', 'collision_year', 'collisions']]
        .groupby(['hex_id_x', 'collision_year'])
        .collisions.agg('sum')
        .to_frame(total_col)
        .reset_index()
    )

    # Years become columns.
    wide = tall.pivot_table(index = 'hex_id_x', columns = 'collision_year', values = total_col)
    wide.columns = ["_".join((f'collisions_neighbor_{ring}', str(j))) for j in wide.columns]
    wide.index.names = ['hex_id']
    return wide.reset_index().fillna(0)


pivot_neighbor_0 = _ring_collision_pivot(gdf_all, collision_hex_grp, 0)
pivot_neighbor_1 = _ring_collision_pivot(gdf_all, collision_hex_grp, 1)
pivot_neighbor_2 = _ring_collision_pivot(gdf_all, collision_hex_grp, 2)
pivot_neighbor_2.dtypes

In [None]:
pd.DataFrame(pivot_neighbor_2)

In [None]:
gdf_all.dtypes

In [None]:
# Attach the ring 0, 1 and 2 yearly collision columns, in that order.
for ring_pivot in (pivot_neighbor_0, pivot_neighbor_1, pivot_neighbor_2):
    gdf_all = gdf_all.merge(ring_pivot, on = 'hex_id', how = 'left')

In [None]:
gdf_all.sample(1)

In [None]:
gdf_all.columns

### Write to CSV and upload to S3

In [None]:
# Upload the joined table to S3 as a single CSV, without the index column.
# NOTE(review): gdf_all still carries `geometry` and the list-valued hex_neighbors_*_ids columns here;
# confirm how readers of this CSV expect those to be serialized.
awswrangler.s3.to_csv(df=gdf_all, path = f's3://{s3_bucket}/joined_data/base_location_data.csv', index=False,
                       boto3_session=my_session, use_threads=True
                       )

# Local alternative:
# gdf_all.to_csv(root / 'X.data' / 'joined_data' / 'base_location_data.csv', index = False )