# 3) Calculate BG Distances

## Env

In [1]:
import os
import pickle

import geopandas as gpd
import networkx as nx
import numpy as np
import osmnx as ox
import pandas as pd


## Process

### Load Data

In [2]:
# Pickled block-group centroids.
with open('../data/temp/bg_centroids', 'rb') as centroid_file:
    BG_centroids = pickle.load(centroid_file)

# Pickled projections; the keys of PROJ decide which graph files are attempted below.
with open('../data/temp/projections', 'rb') as proj_file:
    PROJ = pickle.load(proj_file)

# Projected walk graphs, keyed like PROJ. Loading is best-effort: a key whose
# graph file cannot be read is reported and simply left out of Gp.
Gp = {}
for metro in PROJ.keys():
    print(f'Trying to open {metro}..')
    try:
        with open(f'../data/temp/{metro}_walk_graph_proj', 'rb') as graph_file:
            Gp[metro] = pickle.load(graph_file)
    except Exception as err:
        print(f'\tFailed with exception: {err}')
    else:
        print('\tSuccess!')

Trying to open phi..
	Success!
Trying to open chi..
	Success!
Trying to open sfba..
	Success!
Trying to open sf..
	Failed with exception: [Errno 2] No such file or directory: '../data/temp/sf_walk_graph_proj'
Trying to open eastbay..
	Failed with exception: [Errno 2] No such file or directory: '../data/temp/eastbay_walk_graph_proj'
Trying to open nyc..
	Success!


### Project each BG into coordinate system

#### First Clean the centroids

In [3]:
# NOTE: this cell renames columns in place, so it only runs once per freshly
# loaded BG_centroids ('O_bg_fips' no longer exists after the rename below).

# Length distribution of the raw FIPS strings. This is not the cell's last
# expression, so the result is computed but not displayed.
BG_centroids['O_bg_fips'].str.len().value_counts()

# Rename the three raw columns positionally, then derive the 5-character prefix.
BG_centroids.columns = ['fips12', 'lng', 'lat']
BG_centroids['fips5'] = BG_centroids['fips12'].str[:5]

# Row count per 2-character prefix (likewise computed but not displayed).
BG_centroids['fips5'].str[:2].value_counts()

# 2-character FIPS prefix -> metro key.
convert_dict = {'36': 'nyc', '06': 'sfba', '17': 'chi', '42': 'phi'}

# Plain dict indexing on purpose: an unexpected prefix raises KeyError
# instead of silently producing a missing metro.
state_prefix = BG_centroids['fips5'].str[:2]
BG_centroids['metro'] = [convert_dict[code] for code in state_prefix]

column_order = ['metro', 'fips5', 'fips12', 'lng', 'lat']
BG_centroids = BG_centroids[column_order]

BG_centroids


Unnamed: 0,metro,fips5,fips12,lng,lat
0,phi,42101,421010001011,-75.1502,39.9518
1,phi,42101,421010001012,-75.1500,39.9500
2,phi,42101,421010001013,-75.1479,39.9515
3,phi,42101,421010001014,-75.1456,39.9503
4,phi,42101,421010001015,-75.1427,39.9500
...,...,...,...,...,...
4370,sfba,06085,060855130003,-122.1588,37.4251
4371,sfba,06085,060855130004,-122.1579,37.4218
4372,sfba,06085,060855130005,-122.1547,37.4243
4373,sfba,06085,060855130006,-122.1615,37.4315


#### Then project the centroids

In [4]:
# Wrap the cleaned centroids as a GeoDataFrame of lng/lat points in EPSG:4326.
# No reprojection happens here: PROJ holds a separate projection per metro, so
# the points are converted metro by metro where they are matched to graph nodes.
# (geopandas is imported as `gpd` in the notebook's top import cell.)
centroid_points = gpd.points_from_xy(BG_centroids['lng'], BG_centroids['lat'])
gdf_BG_centroids = gpd.GeoDataFrame(
    BG_centroids,
    geometry=centroid_points,
    crs='EPSG:4326',
)

In [5]:
# Display the metro label carried by each centroid row.
gdf_BG_centroids.loc[:, 'metro']

0        phi
1        phi
2        phi
3        phi
4        phi
        ... 
4370    sfba
4371    sfba
4372    sfba
4373    sfba
4374    sfba
Name: metro, Length: 16067, dtype: object

### Load the Projected Simplified Graphs


In [6]:
# Load the simplified projected walk graphs, one per key of PROJ.
# Best-effort: a key whose file cannot be read is reported and left out of Gps.
Gps = {}
for metro in PROJ.keys():
    print(f'Trying to open {metro}..')
    try:
        with open(f'../data/temp/{metro}_walk_graph_proj_simpl', 'rb') as graph_file:
            Gps[metro] = pickle.load(graph_file)
    except Exception as err:
        print(f'\tFailed with exception: {err}')
    else:
        print('\tSuccess!')

Trying to open phi..
	Success!
Trying to open chi..
	Success!
Trying to open sfba..
	Success!
Trying to open sf..
	Failed with exception: [Errno 2] No such file or directory: '../data/temp/sf_walk_graph_proj_simpl'
Trying to open eastbay..
	Failed with exception: [Errno 2] No such file or directory: '../data/temp/eastbay_walk_graph_proj_simpl'
Trying to open nyc..
	Success!



### Assign each centroid to its closest graph node

In [7]:
# Match every centroid to the nearest node of its metro's simplified graph.
#   nn_idxs[m]  -> nearest node for each centroid of metro m, in centroid row order
#   nn_dists[m] -> the matching distances, as returned with return_dist=True
nn_idxs = {}
nn_dists = {}

for i in Gps.keys():
    print(f'{i}..')
    # Select and reproject this metro's centroids once, so the subset is not
    # filtered and projected separately for the X and the Y coordinates.
    metro_points = gdf_BG_centroids[gdf_BG_centroids['metro'] == i].to_crs(PROJ[i])['geometry']
    nn_idxs[i], nn_dists[i] = ox.distance.nearest_nodes(
        Gps[i],
        X=metro_points.x,
        Y=metro_points.y,
        return_dist=True,
    )

phi..
chi..
sfba..
nyc..


### For each node in "nearest_nodes", run Dijkstra to find the shortest-path length to every node within the cutoff distance

In [8]:
# Upper bound on the path length explored from each source, expressed in the
# units of the edges' 'length' attribute. Pre-filtering node pairs by distance
# did not work out, so the search itself is bounded with Dijkstra's `cutoff`.
# NOTE(review): an earlier note here spoke of "4mi". If 'length' is in metres
# (presumably, as is usual for OSMnx graphs - TODO confirm), 4000 is roughly
# 2.5 mi, not 4 mi. The value is left unchanged.
MAX_PATH_LENGTH = 4000

# shortest_paths[m][j] -> {node: path length} for every node reachable within
# MAX_PATH_LENGTH from the nearest node of the j-th centroid of metro m.
shortest_paths = {}
for m in Gps.keys():
    print(f'\n{m}...')
    shortest_paths[m] = {}
    # Several centroids can share a nearest node. Dijkstra is run once per
    # distinct source node and the resulting dict is reused (never mutated).
    lengths_by_source = {}
    n_centroids = len(nn_idxs[m])
    for j in range(n_centroids):
        if j % 100 == 0:
            print(f'{j}/{n_centroids}')
        source = nn_idxs[m][j]
        if source not in lengths_by_source:
            lengths_by_source[source] = nx.single_source_dijkstra_path_length(
                Gps[m], source, weight='length', cutoff=MAX_PATH_LENGTH
            )
        shortest_paths[m][j] = lengths_by_source[source]



phi...
0/1338
100/1338
200/1338
300/1338
400/1338
500/1338
600/1338
700/1338
800/1338
900/1338
1000/1338
1100/1338
1200/1338
1300/1338

chi...
0/4002
100/4002
200/4002
300/4002
400/4002
500/4002
600/4002
700/4002
800/4002
900/4002
1000/4002
1100/4002
1200/4002
1300/4002
1400/4002
1500/4002
1600/4002
1700/4002
1800/4002
1900/4002
2000/4002
2100/4002
2200/4002
2300/4002
2400/4002
2500/4002
2600/4002
2700/4002
2800/4002
2900/4002
3000/4002
3100/4002
3200/4002
3300/4002
3400/4002
3500/4002
3600/4002
3700/4002
3800/4002
3900/4002
4000/4002

sfba...
0/4375
100/4375
200/4375
300/4375
400/4375
500/4375
600/4375
700/4375
800/4375
900/4375
1000/4375
1100/4375
1200/4375
1300/4375
1400/4375
1500/4375
1600/4375
1700/4375
1800/4375
1900/4375
2000/4375
2100/4375
2200/4375
2300/4375
2400/4375
2500/4375
2600/4375
2700/4375
2800/4375
2900/4375
3000/4375
3100/4375
3200/4375
3300/4375
3400/4375
3500/4375
3600/4375
3700/4375
3800/4375
3900/4375
4000/4375
4100/4375
4200/4375
4300/4375

nyc...
0/6352
100/63

### Limit to just the nodes in "nearest_nodes", giving a table of at most O(NN × NN) rows

In [10]:
def shortest_paths_to_od_frame(paths_by_source, nearest_nodes):
    """Reshape per-source Dijkstra results into a long centroid-pair table.

    Parameters
    ----------
    paths_by_source : dict
        Maps a centroid position j to the ``{node: path length}`` dict computed
        from that centroid's nearest node.
    nearest_nodes : sequence
        Nearest graph node of every centroid; position k belongs to centroid k.

    Returns
    -------
    pandas.DataFrame
        A single 'value' column of path lengths, indexed by ('o', 'd'):
        'd' is the position j of the centroid whose nearest node was the
        Dijkstra source, and 'o' is the position k of the centroid whose
        nearest node was reached. Pairs with no path length are dropped.
    """
    # Rows are reached nodes, columns are source centroid positions.
    lengths = pd.DataFrame(paths_by_source)
    # Keep one row per centroid (its nearest node) and relabel rows 0..N-1 so
    # each row is identified by centroid position rather than by node label.
    lengths = lengths.loc[list(nearest_nodes)].reset_index(drop=True)
    return (
        lengths
        # melt() drops the row index by default, which previously left 'o'
        # holding the melted row number instead of the centroid position.
        .melt(ignore_index=False)
        .dropna()
        .reset_index()
        .rename(columns={'index': 'o', 'variable': 'd'})
        .set_index(['o', 'd'])
    )


# One long table per metro (chi: 243,416 rows in the saved run).
dfs_shortest_paths = {
    metro: shortest_paths_to_od_frame(shortest_paths[metro], nn_idxs[metro])
    for metro in ('chi', 'phi', 'nyc', 'sfba')
}

### Check the resulting table sizes

In [11]:
# Shape of each metro's table, printed in the order chi, phi, nyc, sfba.
for metro_key in ('chi', 'phi', 'nyc', 'sfba'):
    print(dfs_shortest_paths[metro_key].shape)

(243416, 1)
(164656, 1)
(1500060, 1)
(252183, 1)


### Save the distance tables

In [13]:
# Write each metro's table to its own pickle file under ../data/temp/.
for m, od_table in dfs_shortest_paths.items():
    out_path = f'../data/temp/{m}_shortest_paths'
    with open(out_path, 'wb') as fp:
        pickle.dump(od_table, fp)