In [None]:
import geopandas  as gpd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import h3 as h3

import re

import os
from os.path import isfile, join
from pathlib import Path
from os import listdir
import os
# Project root = parent of the directory the notebook is executed from.
path = Path.cwd()
root = path.parent.absolute()

root

# 1. Import Data

#### 1.1 Base Table
##### LA County shapefile converted to Uber H3 hexagons at level 8 (~0.75 square km each).
##### This process takes a shapefile and maps it to hexagons at a given level. The output of the mapping is a unique hex_id for each hexagon and its shape geometry.
##### https://h3geo.org/docs/core-library/restable/

In [None]:
# Load the base hex map (one geometry per hex_id) and preview it.
gdf_all = gpd.read_file(
    root.joinpath('X.data', 'h3_processed_data', 'base_map_hex_all', 'base_map_hex_all.shp')
)
print(gdf_all.shape)
gdf_all.sample(2)

In [None]:
# Drop rows carrying the placeholder hex_id '0'.
gdf_all = gdf_all[gdf_all['hex_id'] != '0']

#### 1.2 City and District shape files

In [None]:
# City label lookup, keyed by hex_id (it is merged on hex_id later in the notebook).
city_label = pd.read_csv(root.joinpath('X.data', 'h3_processed_data', 'city_labels_hex.csv'))

In [None]:
# District label lookup, keyed by hex_id (it is merged on hex_id later in the notebook).
district_labels = pd.read_csv(root.joinpath('X.data', 'h3_processed_data', 'district_labels_hex.csv'))

#### 1.3 Nodes
##### LA County nodes - pulled from Python OSMnx. All street intersections.
#####   The lat and lon of each node were mapped to a hex id for joining onto the county hex file.
##### https://github.com/gboeing/osmnx

In [None]:
# Node counts in long form; the pivot further down uses its hex_id / highway / count columns.
highway_cnts = pd.read_csv(root.joinpath('X.data', 'nodes_and_edges', 'nodes_highway_cnts.csv'))
display(highway_cnts.sample())
highway_cnts['highway'].value_counts()

In [None]:
# Reduce the node street counts to one row per hex: the maximum street_count seen in that hex.
street_cnts = pd.read_csv(root.joinpath('X.data', 'nodes_and_edges', 'nodes_street_count_cnts.csv'))
street_cnts_grps = street_cnts.groupby('hex_id')['street_count'].max().reset_index()
# Discard the placeholder hex_id '0' before renaming for the merge onto the base table.
street_cnts_grps = street_cnts_grps[street_cnts_grps['hex_id'] != '0']
street_cnts_grps.columns = ['hex_id', 'node_street_count']
street_cnts_grps.sample()

In [None]:
# Attach node_street_count to every hex; hexes with no entry in street_cnts_grps get NaN.
gdf_all = gdf_all.merge(street_cnts_grps, how='left', on='hex_id')
gdf_all.shape

#### 1.4 Edges
#### LA County edges (streets) - pulled from Python OSMnx.
##### These are the line-geometry shapefiles. They will be joined to the hex shapefile using GeoPandas sjoin.
##### https://github.com/gboeing/osmnx

In [None]:
# Street-edge geometries with their attribute columns.
edges = gpd.read_file(
    root.joinpath('X.data', 'nodes_and_edges', 'la_county_edges', 'la_county_edges.shp')
)
print(edges.shape)

#### 1.5 Collision data

In [None]:
# Collision records already mapped to hex ids.
collision_hex = pd.read_csv(root.joinpath('X.data', 'h3_processed_data', 'collisions_hex.csv'))
collision_hex.head(2)

# 2. Nodes (intersections) 

### 2.1 Prep node files by making a wide table.  One unique row per hex id

In [None]:
# Long -> wide: one row per hex_id, one 'node_<highway>' column per highway value,
# with missing combinations filled with 0.
highway_pivot = (
    highway_cnts
    .pivot(index="hex_id", columns="highway", values="count")
    .fillna(0)
    .add_prefix('node_')
    .reset_index()
)
highway_pivot.sample(2)

In [None]:
# Attach the wide node counts to the base table and verify that the left
# merge did not duplicate any hex rows.
orign_row_count = gdf_all.shape[0]
gdf_all = gdf_all.merge(highway_pivot, on = 'hex_id', how = 'left')
updated_row_count = gdf_all.shape[0]
# The original cell assigned one count to the other here, so nothing was ever checked.
assert orign_row_count == updated_row_count, (
    f"merge changed the row count: {orign_row_count} -> {updated_row_count}"
)

In [None]:
# Inspect the table after the node merges. Only the last expression of a
# cell is rendered, so the shape on the first line is evaluated but not shown.
gdf_all.shape
gdf_all.columns

In [None]:
# Node count columns used throughout the notebook.
# TODO: build this list dynamically instead of hard-coding it.
counts_col_list = [
    'node_street_count',
    'node_crossing',
    'node_give_way',
    'node_milestone',
    'node_mini_roundabout',
    'node_motorway_junction',
    'node_stop',
    'node_traffic_signals',
    'node_trailhead',
    'node_turning_circle',
    'node_turning_loop',
]

# Hexes with no match in the left merges have NaN counts; treat those as zero.
gdf_all[counts_col_list] = gdf_all[counts_col_list].fillna(0)

In [None]:
# Spot-check one random row now that the count columns have been zero-filled.
gdf_all.sample()

### 2.3 Attach the neighboring nodes hex ids to the general table

In [None]:
def rking_neighbors(row, skins):
    """Return the H3 cells within `skins` rings of the row's hex, as a list.

    skins = 0 is just the hex itself.
    skins = 1 is the first ring plus the hex itself.
    skins = 2 is the second ring plus ring 1 plus the hex itself, etc.

    Parameters
    ----------
    row : object exposing a `hex_id` attribute (e.g. a DataFrame row)
    skins : int
        Number of rings to include around the hex.

    Returns
    -------
    list
        The hex ids returned by h3, in the order h3 yields them.
    """
    # h3-py v4 renamed k_ring to grid_disk; use whichever the installed version offers.
    disk = getattr(h3, 'grid_disk', None) or h3.k_ring
    return list(disk(row.hex_id, skins))

In [None]:
# For rings 0, 1 and 2, store each hex's neighbour ids as a list column.
for _ring in (0, 1, 2):
    gdf_all[f'hex_neighbors_{_ring}_ids'] = gdf_all.apply(
        lambda hex_row, k=_ring: rking_neighbors(hex_row, skins = k),
        axis=1,
    )
gdf_all.sample(2)

### 2.4 Neighboring Hex Counts
#### For all the node columns, attach the counts summed over the hex and its ring 1 and ring 2 neighbors.

In [None]:
# Ring 1: one row per (hex, neighbour) pair, carrying the neighbour's node counts.
# The inner merge drops neighbour ids that are not present in gdf_all.
gd_all_1_tall = (
    gdf_all[['hex_id', 'hex_neighbors_1_ids']]
    .explode('hex_neighbors_1_ids')
    .merge(
        gdf_all[counts_col_list + ['hex_id']],
        how = 'inner',
        left_on = 'hex_neighbors_1_ids',
        right_on = 'hex_id',
    )
)
# After the merge, hex_id_x is the centre hex and hex_id_y the neighbour.
gd_all_1_tall = gd_all_1_tall[['hex_id_x'] + counts_col_list]

# Sum every count column per centre hex, and count the neighbour rows that matched.
gd_all_1_grp_sum = (
    gd_all_1_tall.groupby('hex_id_x')[counts_col_list].sum()
    .add_prefix('neighbor_1_')
    .rename_axis('hex_id')
    .reset_index()
)
gd_all_1_grp_cnt = gd_all_1_tall.groupby('hex_id_x')[counts_col_list[0]].count().reset_index()
gd_all_1_grp_cnt.columns = ['hex_id', 'neighbor_1_count']

gdf_all = gdf_all.merge(gd_all_1_grp_sum, how = 'left', on = 'hex_id')
gdf_all = gdf_all.merge(gd_all_1_grp_cnt, how = 'left', on = 'hex_id')
gdf_all.sample(2)

In [None]:
# Ring 2: one row per (hex, neighbour) pair, carrying the neighbour's node counts.
# The inner merge drops neighbour ids that are not present in gdf_all.
gd_all_2_tall = (
    gdf_all[['hex_id', 'hex_neighbors_2_ids']]
    .explode('hex_neighbors_2_ids')
    .merge(
        gdf_all[counts_col_list + ['hex_id']],
        how = 'inner',
        left_on = 'hex_neighbors_2_ids',
        right_on = 'hex_id',
    )
)
# After the merge, hex_id_x is the centre hex and hex_id_y the neighbour.
gd_all_2_tall = gd_all_2_tall[['hex_id_x'] + counts_col_list]

# Sum every count column per centre hex, and count the neighbour rows that matched.
gd_all_2_grp_sum = (
    gd_all_2_tall.groupby('hex_id_x')[counts_col_list].sum()
    .add_prefix('neighbor_2_')
    .rename_axis('hex_id')
    .reset_index()
)
gd_all_2_grp_cnt = gd_all_2_tall.groupby('hex_id_x')[counts_col_list[0]].count().reset_index()
gd_all_2_grp_cnt.columns = ['hex_id', 'neighbor_2_count']

gdf_all = gdf_all.merge(gd_all_2_grp_sum, how = 'left', on = 'hex_id')
gdf_all = gdf_all.merge(gd_all_2_grp_cnt, how = 'left', on = 'hex_id')
gdf_all.sample(2)

## 2.5 Nearest Hex Neighbor Average

In [None]:
# For each ring, divide every neighbour sum column by that ring's neighbour
# count to get a per-neighbour average, stored as '<column>_ave'.
for _ring in (1, 2):
    _prefix = f'neighbor_{_ring}_'
    _count_col = f'neighbor_{_ring}_count'
    # Plain prefix test instead of the regex "neighbor_N_*", where `_*` meant
    # "zero or more underscores" rather than a wildcard. Columns already ending
    # in '_ave' are skipped so re-running the cell does not average its own output.
    neighbor_col_list = [
        col for col in gdf_all.columns
        if col.startswith(_prefix) and col != _count_col and not col.endswith('_ave')
    ]
    for nc in neighbor_col_list:
        gdf_all[nc + '_ave'] = gdf_all[nc] / gdf_all[_count_col]

gdf_all.sample(2)

# 3 City and District Labels

In [None]:
# Attach city labels; hexes without a match keep NaN in the label columns.
gdf_all = gdf_all.merge(city_label, how = 'left', on = 'hex_id')

In [None]:
# Preview two random rows of the city label lookup.
city_label.sample(2)

In [None]:
# Dimensions after the city-label merge; a higher row count than before
# would mean city_label holds several rows for the same hex_id.
gdf_all.shape

In [None]:
# Attach district labels; hexes without a match keep NaN in the label columns.
gdf_all = gdf_all.merge(district_labels, how = 'left', on = 'hex_id')

In [None]:
# Dimensions after the district-label merge; a higher row count than before
# would mean district_labels holds several rows for the same hex_id.
gdf_all.shape

In [None]:
# Spot-check two random rows with the label columns attached.
gdf_all.sample(2)

In [None]:
# List every column currently on the table.
gdf_all.columns

# 4 Edges (streets)

In [None]:
# Left spatial join: a hex row is repeated once per intersecting edge and
# carries that edge's `highway` value; hexes with no edge keep a single NaN row.
# NOTE(review): edges are reprojected to EPSG:3857 here, which assumes gdf_all
# is already in that CRS - confirm.
gdf_all = gpd.sjoin(
    gdf_all,
    edges[['geometry', 'highway']].to_crs(epsg=3857),
    predicate='intersects',
    how='left',
)
gdf_all.sample(4)

In [None]:
# Priority order for picking one `highway` value per hex: earlier entries win.
sorter = [
    'motorway_link',
    'motorway',
    'primary',
    'secondary',
    'residential',
    'primary_link',
    'secondary_link',
    'tertiary',
    'trunk',
    'unclassified',
    'other',
]

# Keep highway values listed in `sorter`; everything else (including NaN)
# becomes 'other'. Series.where replaces the chained-indexing assignment,
# which raises SettingWithCopyWarning and has no effect under copy-on-write.
gdf_all['highway_updated'] = gdf_all['highway'].where(gdf_all['highway'].isin(sorter), 'other')
sorterIndex = dict(zip(sorter, range(len(sorter))))

gdf_all['highway_rank'] = gdf_all.highway_updated.map(sorterIndex)
# Sort by rank inside each hex so that first() really selects the
# highest-priority highway; the original computed the rank but never sorted on it.
gdf_all = gdf_all.sort_values(by=['hex_id', 'highway_rank'])
gdf_all = gdf_all.groupby('hex_id').first()
gdf_all.reset_index(inplace = True)
gdf_all.sample(3)

In [None]:
# Promote the cleaned value to `highway` and drop the helper columns,
# including the index_right column left behind by the spatial join.
gdf_all = (
    gdf_all
    .assign(highway=gdf_all['highway_updated'])
    .drop(columns = ['highway_updated', 'highway_rank', 'index_right'])
)
gdf_all.sample(2)

In [None]:
# Table dimensions, then the distribution of `oneway` values across all edges.
print(gdf_all.shape)
edges['oneway'].value_counts()

In [None]:
# Left spatial join: a hex row is repeated once per intersecting edge and
# carries that edge's `oneway` value.
gdf_all = gpd.sjoin(
    gdf_all,
    edges[['geometry', 'oneway']].to_crs(epsg=3857),
    predicate='intersects',
    how='left',
)
gdf_all.sample(2)

In [None]:
# Priority order for picking one `oneway` value per hex: 1 wins over 0.
sorter = [1,0]
# Keep values listed in `sorter`; anything else (including NaN) becomes 'other'.
# Casting to object first lets the column hold both numbers and the string.
# Series.where replaces the chained-indexing assignment, which raises
# SettingWithCopyWarning and has no effect under copy-on-write.
gdf_all['oneway_updated'] = gdf_all['oneway'].astype(object).where(gdf_all['oneway'].isin(sorter), 'other')
sorterIndex = dict(zip(sorter, range(len(sorter))))

# 'other' is not a key of sorterIndex, so it maps to NaN and sorts last.
gdf_all['oneway_rank'] = gdf_all.oneway_updated.map(sorterIndex)
# Sort by rank inside each hex so that first() really selects the
# highest-priority value; the original computed the rank but never sorted on it.
gdf_all = gdf_all.sort_values(by=['hex_id', 'oneway_rank'])
gdf_all = gdf_all.groupby('hex_id').first()
gdf_all.reset_index(inplace = True)
gdf_all.sample(3)

In [None]:
# Promote the cleaned value to `oneway` and drop the helper columns,
# including the index_right column left behind by the spatial join.
gdf_all = (
    gdf_all
    .assign(oneway=gdf_all['oneway_updated'])
    .drop(columns = ['oneway_updated', 'oneway_rank', 'index_right'])
)
gdf_all.sample(2)

In [None]:
# Dimensions after collapsing the oneway join back to one row per hex_id.
gdf_all.shape

In [None]:
# Left spatial join: a hex row is repeated once per intersecting edge and
# carries that edge's `lanes` value.
gdf_all = gpd.sjoin(
    gdf_all,
    edges[['geometry', 'lanes']].to_crs(epsg=3857),
    predicate='intersects',
    how='left',
)
gdf_all.sample(4)

In [None]:
# Collapse back to one row per hex, keeping the largest `lanes` value under
# the column's sort order.
# NOTE(review): if `lanes` was read as text the ordering is lexicographic
# ('6' sorts above '10') - confirm the dtype.
gdf_all = (
    gdf_all
    .sort_values(by='lanes', ascending = False)
    .groupby('hex_id').first()
    .reset_index()
    .drop(columns = ['index_right'])
)
gdf_all.sample(3)

In [None]:
# Dimensions after collapsing the lanes join back to one row per hex_id.
gdf_all.shape

In [None]:
# Join each hex to its intersecting edges' `maxspeed`, then collapse back to
# one row per hex keeping the largest value under the column's sort order.
# NOTE(review): if `maxspeed` was read as text the ordering is lexicographic - confirm the dtype.
gdf_all = (
    gpd.sjoin(
        gdf_all,
        edges[['geometry', 'maxspeed']].to_crs(epsg=3857),
        predicate='intersects',
        how='left',
    )
    .sort_values(by='maxspeed', ascending = False)
    .groupby('hex_id').first()
    .reset_index()
    .drop(columns = ['index_right'])
)
gdf_all.shape

In [None]:
# Join each hex to its intersecting edges' `width`, then collapse back to
# one row per hex keeping the largest value under the column's sort order.
# NOTE(review): if `width` was read as text the ordering is lexicographic - confirm the dtype.
gdf_all = (
    gpd.sjoin(
        gdf_all,
        edges[['geometry', 'width']].to_crs(epsg=3857),
        predicate='intersects',
        how='left',
    )
    .sort_values(by='width', ascending = False)
    .groupby('hex_id').first()
    .reset_index()
    .drop(columns = ['index_right'])
)
gdf_all.sample(3)

In [None]:
# Normalise `bridge` to yes/no: every value other than 'yes' (NaN included) becomes 'no'.
edges.loc[edges['bridge'].ne('yes'), 'bridge'] = 'no'
# Descending sort places 'yes' ahead of 'no', so a hex that intersects any
# bridge edge ends up flagged 'yes' after first().
gdf_all = (
    gpd.sjoin(
        gdf_all,
        edges[['geometry', 'bridge']].to_crs(epsg=3857),
        predicate='intersects',
        how='left',
    )
    .sort_values(by='bridge', ascending = False)
    .groupby('hex_id').first()
    .reset_index()
    .drop(columns = ['index_right'])
)
gdf_all.shape

In [None]:
# List every column now that the edge attributes have been attached.
gdf_all.columns

In [None]:
# Flag hexes where a collision can plausibly occur: those intersecting at
# least one edge, or containing at least one node listed in highway_cnts.
# Only the edge geometry is needed to decide whether a hex touches an edge,
# so the other edge columns are left out of the join.
gdf_edge_filter = gpd.sjoin(gdf_all, edges[['geometry']].to_crs(epsg=3857), how = 'inner')

gdf_edge_filter = gdf_edge_filter[['hex_id']].drop_duplicates()
gdf_node_filter = highway_cnts[['hex_id']].drop_duplicates()

gdf_filtered = pd.concat([gdf_edge_filter, gdf_node_filter], axis = 0)
valid_array = gdf_filtered['hex_id'].values
print(valid_array)

mask = gdf_all['hex_id'].isin(valid_array)

gdf_all['valid_accident_location_filter'] = mask

# Number of hexes flagged as valid.
print(mask.sum())

gdf_all.shape

# 5. Collisions by Year

### 5.1 Neighbor collisions count.  

In [None]:
# Total accident_count per (hex_id, collision_year), stored as `collisions`.
# The original followed this with a self-assignment of collision_year that
# changed nothing; it has been removed.
collision_hex_grp = (
    collision_hex
    .groupby(['hex_id', 'collision_year'])['accident_count']
    .sum()
    .reset_index(name='collisions')
)
collision_hex_grp.head(2)

In [None]:
# Ring 0: pair each hex with its ring-0 ids and pull in their yearly
# collisions. The inner merge drops ids that have no collision rows.
gd_all_0_tall = (
    gdf_all[['hex_id', 'hex_neighbors_0_ids']]
    .explode('hex_neighbors_0_ids')
    .merge(
        collision_hex_grp,
        how = 'inner',
        left_on = 'hex_neighbors_0_ids',
        right_on = 'hex_id',
    )
)
# hex_id_x is the centre hex; total the collisions per centre hex and year.
gd_all_0_tall = (
    gd_all_0_tall[['hex_id_x', 'collision_year', 'collisions']]
    .groupby(['hex_id_x', 'collision_year'])['collisions']
    .sum()
    .to_frame('neighbor0_collision')
    .reset_index()
)

# Years become columns named collisions_neighbor0_<year>; missing years are filled with 0.
pivot_neighbor_0 = gd_all_0_tall.pivot_table(
    values = 'neighbor0_collision', index = 'hex_id_x', columns = 'collision_year'
)
pivot_neighbor_0.columns = ['collisions_neighbor0_' + str(year) for year in pivot_neighbor_0.columns]
pivot_neighbor_0 = pivot_neighbor_0.rename_axis('hex_id').reset_index().fillna(0)

In [None]:
# Ring 1: pair each hex with its ring-1 ids and pull in their yearly
# collisions. The inner merge drops ids that have no collision rows.
gd_all_1_tall = (
    gdf_all[['hex_id', 'hex_neighbors_1_ids']]
    .explode('hex_neighbors_1_ids')
    .merge(
        collision_hex_grp,
        how = 'inner',
        left_on = 'hex_neighbors_1_ids',
        right_on = 'hex_id',
    )
)
# hex_id_x is the centre hex; total the collisions per centre hex and year.
gd_all_1_tall = (
    gd_all_1_tall[['hex_id_x', 'collision_year', 'collisions']]
    .groupby(['hex_id_x', 'collision_year'])['collisions']
    .sum()
    .to_frame('neighbor1_collision')
    .reset_index()
)

# Years become columns named collisions_neighbor1_<year>; missing years are filled with 0.
pivot_neighbor_1 = gd_all_1_tall.pivot_table(
    values = 'neighbor1_collision', index = 'hex_id_x', columns = 'collision_year'
)
pivot_neighbor_1.columns = ['collisions_neighbor1_' + str(year) for year in pivot_neighbor_1.columns]
pivot_neighbor_1 = pivot_neighbor_1.rename_axis('hex_id').reset_index().fillna(0)


In [None]:
# Ring 2: pair each hex with its ring-2 ids and pull in their yearly
# collisions. The inner merge drops ids that have no collision rows.
gd_all_2_tall = gdf_all[['hex_id', 'hex_neighbors_2_ids']].explode('hex_neighbors_2_ids')
gd_all_2_tall = gd_all_2_tall.merge(collision_hex_grp,
                                    left_on = 'hex_neighbors_2_ids',
                                    right_on = 'hex_id',
                                    how = 'inner')

# hex_id_x is the centre hex after the merge.
gd_all_2_tall = gd_all_2_tall[['hex_id_x', 'collision_year', 'collisions']]

# The intermediate column is named for ring 2; the original reused the
# copy-pasted 'neighbor1_collision' label here.
gd_all_2_tall = gd_all_2_tall.groupby(['hex_id_x', 'collision_year']).collisions.agg('sum').to_frame('neighbor2_collision').reset_index()

# Years become columns named collisions_neighbor2_<year>; missing years are filled with 0.
pivot_neighbor_2 = gd_all_2_tall.pivot_table(index = 'hex_id_x', columns = 'collision_year', values = 'neighbor2_collision')
pivot_neighbor_2.columns = ["_".join(('collisions_neighbor2',str(j))) for j in pivot_neighbor_2.columns]
pivot_neighbor_2.index.names = ['hex_id']
pivot_neighbor_2 = pivot_neighbor_2.reset_index()
pivot_neighbor_2 = pivot_neighbor_2.fillna(0)
pivot_neighbor_2.dtypes

In [None]:
# Preview the ring-2 pivot. It is already a DataFrame, so no re-wrapping is
# needed, and head() keeps the rendered output small.
pivot_neighbor_2.head()

In [None]:
# Column dtypes of the table before the collision pivots are merged in.
gdf_all.dtypes

In [None]:
# Attach the ring 0, 1 and 2 yearly collision columns, in that order.
# Hexes missing from a pivot get NaN for that ring's columns.
for _pivot in (pivot_neighbor_0, pivot_neighbor_1, pivot_neighbor_2):
    gdf_all = gdf_all.merge(_pivot, how = 'left', on = 'hex_id')

In [None]:
# Spot-check one random row of the fully joined table.
gdf_all.sample(1)

In [None]:
# Write the joined table to CSV without the index column.
gdf_all.to_csv(
    root.joinpath('X.data', 'joined_data', 'base_location_data.csv'),
    index = False,
)