### Calculating distance between existing residential locations and commercial locations

Process:
- For each residential building, check the distance to each commerical building 
- If the distance is less than a mile, create a binary variable that takes the value 1, 0 otherwise

In [4]:
# Import libraries
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import haversine as hs


In [5]:
# Get the residential and commercial building location data
buildings_df = gpd.read_file('../processed_data/relevant_buildings.shp')
buildings_df


Unnamed: 0,CLASS,class_reco,hood,geoid10,tractce10,geometry
0,C,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01230 40.38309, -80.01255 40.383..."
1,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01382 40.38638, -80.01380 40.386..."
2,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01309 40.38253, -80.01307 40.382..."
3,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01399 40.38554, -80.01399 40.385..."
4,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.00736 40.38204, -80.00729 40.381..."
...,...,...,...,...,...,...
116273,C,commercial,Point Breeze,420039811001,981100,"POLYGON ((-79.90934 40.44247, -79.90928 40.442..."
116274,R,2-Unit Residential,Squirrel Hill South,420039803001,980300,"POLYGON ((-79.91447 40.43197, -79.91443 40.431..."
116275,C,commercial,Squirrel Hill South,420039803001,980300,"POLYGON ((-79.90925 40.42686, -79.90910 40.426..."
116276,R,1-Unit Residential,Squirrel Hill South,420039803001,980300,"POLYGON ((-79.91190 40.43303, -79.91190 40.433..."


In [6]:
# Remove grocery store buildings
buildings_df = buildings_df[buildings_df['class_reco'] != 'Grocery Store']
buildings_df

Unnamed: 0,CLASS,class_reco,hood,geoid10,tractce10,geometry
0,C,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01230 40.38309, -80.01255 40.383..."
1,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01382 40.38638, -80.01380 40.386..."
2,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01309 40.38253, -80.01307 40.382..."
3,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01399 40.38554, -80.01399 40.385..."
4,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.00736 40.38204, -80.00729 40.381..."
...,...,...,...,...,...,...
116273,C,commercial,Point Breeze,420039811001,981100,"POLYGON ((-79.90934 40.44247, -79.90928 40.442..."
116274,R,2-Unit Residential,Squirrel Hill South,420039803001,980300,"POLYGON ((-79.91447 40.43197, -79.91443 40.431..."
116275,C,commercial,Squirrel Hill South,420039803001,980300,"POLYGON ((-79.90925 40.42686, -79.90910 40.426..."
116276,R,1-Unit Residential,Squirrel Hill South,420039803001,980300,"POLYGON ((-79.91190 40.43303, -79.91190 40.433..."


In [7]:
# Do the CRS converstion (converting polygons to centroid)
df_points = buildings_df.copy()

# Project to CRS
df_points['geometry_crs'] = df_points['geometry'].to_crs(epsg=3035)

# Convert to centroids
df_points['centroids'] = df_points['geometry_crs'].centroid.to_crs(4326)
df_points

Unnamed: 0,CLASS,class_reco,hood,geoid10,tractce10,geometry,geometry_crs,centroids
0,C,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01230 40.38309, -80.01255 40.383...","POLYGON ((-1280260.228 6136598.189, -1280260.0...",POINT (-80.01241 40.38317)
1,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01382 40.38638, -80.01380 40.386...","POLYGON ((-1279946.994 6136891.874, -1279945.5...",POINT (-80.01375 40.38637)
2,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01309 40.38253, -80.01307 40.382...","POLYGON ((-1280329.396 6136636.826, -1280318.8...",POINT (-80.01300 40.38258)
3,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01399 40.38554, -80.01399 40.385...","POLYGON ((-1280035.495 6136864.334, -1280039.0...",POINT (-80.01401 40.38548)
4,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.00736 40.38204, -80.00729 40.381...","POLYGON ((-1280292.699 6136126.260, -1280296.6...",POINT (-80.00738 40.38199)
...,...,...,...,...,...,...,...,...
116273,C,commercial,Point Breeze,420039811001,981100,"POLYGON ((-79.90934 40.44247, -79.90928 40.442...","POLYGON ((-1272618.173 6130848.222, -1272613.6...",POINT (-79.90929 40.44247)
116274,R,2-Unit Residential,Squirrel Hill South,420039803001,980300,"POLYGON ((-79.91447 40.43197, -79.91443 40.431...","POLYGON ((-1273769.610 6130756.263, -1273768.9...",POINT (-79.91442 40.43189)
116275,C,commercial,Squirrel Hill South,420039803001,980300,"POLYGON ((-79.90925 40.42686, -79.90910 40.426...","POLYGON ((-1274210.890 6130057.717, -1274201.8...",POINT (-79.90929 40.42676)
116276,R,1-Unit Residential,Squirrel Hill South,420039803001,980300,"POLYGON ((-79.91190 40.43303, -79.91190 40.433...","POLYGON ((-1273621.056 6130591.820, -1273610.6...",POINT (-79.91181 40.43308)


In [8]:
# Check matrix size for distance calculations
print(f"# of commercial buildings = {len(df_points[df_points['class_reco'] == 'commercial'])}")
print(f"# of residential buildings = {len(df_points) - len(df_points[df_points['class_reco'] == 'commercial'])}")

matrix_size = len(df_points[df_points['class_reco'] == 'commercial']) * len(df_points) - len(df_points[df_points['class_reco'] == 'commercial'])
print(matrix_size)


# of commercial buildings = 6895
# of residential buildings = 109324
801323110


In [9]:
# Converting point objects to lat long

def convert_point_to_latlong(point):
    lat = point.x
    long = point.y

    return (lat, long)

# apply the function
df_points['coordinates'] = df_points.apply(lambda row: convert_point_to_latlong(row['centroids']), axis = 1)

df_points



Unnamed: 0,CLASS,class_reco,hood,geoid10,tractce10,geometry,geometry_crs,centroids,coordinates
0,C,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01230 40.38309, -80.01255 40.383...","POLYGON ((-1280260.228 6136598.189, -1280260.0...",POINT (-80.01241 40.38317),"(-80.01241198145034, 40.383174075305014)"
1,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01382 40.38638, -80.01380 40.386...","POLYGON ((-1279946.994 6136891.874, -1279945.5...",POINT (-80.01375 40.38637),"(-80.01375293703208, 40.38637193448172)"
2,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01309 40.38253, -80.01307 40.382...","POLYGON ((-1280329.396 6136636.826, -1280318.8...",POINT (-80.01300 40.38258),"(-80.01299982129609, 40.382580778371924)"
3,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.01399 40.38554, -80.01399 40.385...","POLYGON ((-1280035.495 6136864.334, -1280039.0...",POINT (-80.01401 40.38548),"(-80.0140061375439, 40.385478011132015)"
4,R,1-Unit Residential,Brookline,420031918003,191800,"POLYGON ((-80.00736 40.38204, -80.00729 40.381...","POLYGON ((-1280292.699 6136126.260, -1280296.6...",POINT (-80.00738 40.38199),"(-80.00738482859445, 40.38199146881134)"
...,...,...,...,...,...,...,...,...,...
116273,C,commercial,Point Breeze,420039811001,981100,"POLYGON ((-79.90934 40.44247, -79.90928 40.442...","POLYGON ((-1272618.173 6130848.222, -1272613.6...",POINT (-79.90929 40.44247),"(-79.90928821722198, 40.442466005506255)"
116274,R,2-Unit Residential,Squirrel Hill South,420039803001,980300,"POLYGON ((-79.91447 40.43197, -79.91443 40.431...","POLYGON ((-1273769.610 6130756.263, -1273768.9...",POINT (-79.91442 40.43189),"(-79.91442414504114, 40.431892538017415)"
116275,C,commercial,Squirrel Hill South,420039803001,980300,"POLYGON ((-79.90925 40.42686, -79.90910 40.426...","POLYGON ((-1274210.890 6130057.717, -1274201.8...",POINT (-79.90929 40.42676),"(-79.90929182071689, 40.42676068310648)"
116276,R,1-Unit Residential,Squirrel Hill South,420039803001,980300,"POLYGON ((-79.91190 40.43303, -79.91190 40.433...","POLYGON ((-1273621.056 6130591.820, -1273610.6...",POINT (-79.91181 40.43308),"(-79.91181037643665, 40.43308374924944)"


In [14]:
# Test time (shorter dataset)
df_points_small = df_points.sample(n=20000)

df_points_small_res = df_points_small[df_points_small['class_reco'] != 'commercial']
df_points_small_res = df_points_small_res[['geoid10', 'tractce10', 'coordinates']]
df_points_small_res.rename(columns={"coordinates": "res_coordinates", "geoid10":"geoid_res", "tractce10":"tract_id_res"}, inplace=True)


df_points_small_comm = df_points_small[df_points_small['class_reco'] == 'commercial']
df_points_small_comm = df_points_small_comm[['geoid10', 'tractce10', 'coordinates']]
df_points_small_comm.rename(columns={"coordinates": "comm_coordinates", "geoid10":"geoid_comm", "tractce10":"tract_id_comm"}, inplace=True)



In [15]:
# Cross join the 2 files
df_points_small_comm['key'] = 1
df_points_small_res['key'] = 1
  
df_cross_joined = pd.merge(df_points_small_comm, df_points_small_res, on ='key').drop("key", 1)

df_cross_joined


  df_cross_joined = pd.merge(df_points_small_comm, df_points_small_res, on ='key').drop("key", 1)


Unnamed: 0,geoid_comm,tract_id_comm,comm_coordinates,geoid_res,tract_id_res,res_coordinates
0,420031414003,141400,"(-79.91051112906024, 40.42451981830797)",420031917001,191700,"(-80.01679979358475, 40.40180752733724)"
1,420031414003,141400,"(-79.91051112906024, 40.42451981830797)",420035625002,562500,"(-80.05317306479834, 40.4621034926783)"
2,420031414003,141400,"(-79.91051112906024, 40.42451981830797)",420032609001,260900,"(-80.00410684294155, 40.47621859467781)"
3,420031414003,141400,"(-79.91051112906024, 40.42451981830797)",420031106002,110600,"(-79.91746952311716, 40.473623550810245)"
4,420031414003,141400,"(-79.91051112906024, 40.42451981830797)",420030603001,060300,"(-79.962954221161, 40.4649821222481)"
...,...,...,...,...,...,...
21907426,420030903002,090300,"(-79.95100448580257, 40.463191692452654)",420030603001,060300,"(-79.96518242724426, 40.46320519362837)"
21907427,420030903002,090300,"(-79.95100448580257, 40.463191692452654)",420031918003,191800,"(-80.02926150732895, 40.394593262558615)"
21907428,420030903002,090300,"(-79.95100448580257, 40.463191692452654)",420031917001,191700,"(-80.01386307510762, 40.401832766250884)"
21907429,420030903002,090300,"(-79.95100448580257, 40.463191692452654)",420030409003,040900,"(-79.95955909078104, 40.435248981871595)"


In [16]:
# Calculate haversine distance
df_cross_joined['distance'] = df_cross_joined.apply(lambda row: hs.haversine(row['comm_coordinates'], row['res_coordinates'], unit=hs.Unit.MILES), axis = 1)
df_cross_joined


Unnamed: 0,geoid_comm,tract_id_comm,comm_coordinates,geoid_res,tract_id_res,res_coordinates,distance
0,420031414003,141400,"(-79.91051112906024, 40.42451981830797)",420031917001,191700,"(-80.01679979358475, 40.40180752733724)",7.348938
1,420031414003,141400,"(-79.91051112906024, 40.42451981830797)",420035625002,562500,"(-80.05317306479834, 40.4621034926783)",9.867346
2,420031414003,141400,"(-79.91051112906024, 40.42451981830797)",420032609001,260900,"(-80.00410684294155, 40.47621859467781)",6.496778
3,420031414003,141400,"(-79.91051112906024, 40.42451981830797)",420031106002,110600,"(-79.91746952311716, 40.473623550810245)",0.764312
4,420031414003,141400,"(-79.91051112906024, 40.42451981830797)",420030603001,060300,"(-79.962954221161, 40.4649821222481)",3.656253
...,...,...,...,...,...,...,...
21907426,420030903002,090300,"(-79.95100448580257, 40.463191692452654)",420030603001,060300,"(-79.96518242724426, 40.46320519362837)",0.979602
21907427,420030903002,090300,"(-79.95100448580257, 40.463191692452654)",420031918003,191800,"(-80.02926150732895, 40.394593262558615)",5.469447
21907428,420030903002,090300,"(-79.95100448580257, 40.463191692452654)",420031917001,191700,"(-80.01386307510762, 40.401832766250884)",4.405280
21907429,420030903002,090300,"(-79.95100448580257, 40.463191692452654)",420030409003,040900,"(-79.95955909078104, 40.435248981871595)",0.680260


In [22]:
# Calculate distance using geopandas distance method to check time
# Test time (shorter dataset)
df_points_small = df_points.sample(n=20000)

df_points_small_res = df_points_small[df_points_small['class_reco'] != 'commercial']
df_points_small_res = df_points_small_res[['geoid10', 'tractce10', 'centroids']]
df_points_small_res.rename(columns={"centroids": "res_centroids", "geoid10":"geoid_res", "tractce10":"tract_id_res"}, inplace=True)


df_points_small_comm = df_points_small[df_points_small['class_reco'] == 'commercial']
df_points_small_comm = df_points_small_comm[['geoid10', 'tractce10', 'centroids']]
df_points_small_comm.rename(columns={"centroids": "comm_centroids", "geoid10":"geoid_comm", "tractce10":"tract_id_comm"}, inplace=True)

# Cross join the 2 files
df_points_small_comm['key'] = 1
df_points_small_res['key'] = 1
  
df_cross_joined = pd.merge(df_points_small_comm, df_points_small_res, on ='key').drop("key", 1)

# Calcuate distance
df_cross_joined['distance_prelim'] = df_cross_joined.apply(lambda row: row['res_centroids'].distance(row['comm_centroids']), axis = 1)

## Longer time

  df_cross_joined = pd.merge(df_points_small_comm, df_points_small_res, on ='key').drop("key", 1)


In [None]:
# Do the full dataset based on Haversine distance (NOT RUN YET)

df_points_res = df_points[df_points['class_reco'] != 'commercial']
df_points_res = df_points_res[['geoid10', 'tractce10', 'coordinates']]
df_points_res.rename(columns={"coordinates": "res_coordinates", "geoid10":"geoid_res", "tractce10":"tract_id_res"}, inplace=True)


df_points_comm = df_points[df_points['class_reco'] == 'commercial']
df_points_comm = df_points_comm[['geoid10', 'tractce10', 'coordinates']]
df_points_comm.rename(columns={"coordinates": "comm_coordinates", "geoid10":"geoid_comm", "tractce10":"tract_id_comm"}, inplace=True)

# Cross join the 2 files
df_points_comm['key'] = 1
df_points_res['key'] = 1
  
df_cross_joined = pd.merge(df_points_comm, df_points_res, on ='key').drop("key", 1)
 
df_cross_joined['distance'] = df_cross_joined.apply(lambda row: hs.haversine(row['comm_coordinates'], row['res_coordinates'], unit=hs.Unit.MILES), axis = 1)
df_cross_joined


