In [1]:
from pathlib import Path
import os
from dotenv import load_dotenv

load_dotenv()

DB_PATH = Path(str(os.getenv('DB_PATH')))

import geopandas as gpd
import pandas as pd

document_files = pd.read_parquet(DB_PATH / "document_files.parquet")
locations = gpd.read_parquet(DB_PATH / "locations.parquet")
project_total_scopes = pd.read_parquet(DB_PATH / "project_total_scopes.parquet")
project_safety_scopes = pd.read_parquet(DB_PATH / "project_safety_scopes.parquet")
project_geos = gpd.read_parquet(DB_PATH / "project_geos.parquet")
projects = pd.read_parquet(DB_PATH / "projects.parquet")
location_year_files = pd.read_parquet(DB_PATH / "location_year_files.parquet")
document_collections = pd.read_parquet(DB_PATH / "document_collections.parquet")
location_to_project = pd.read_parquet(DB_PATH / "_location_to_project.parquet")
imgs_to_filesAPI = pd.read_parquet(DB_PATH / '_imgs_to_filesAPI.parquet')
docs_to_filesAPI = pd.read_parquet(DB_PATH / '_docs_to_filesAPI.parquet')
image_pairs = pd.read_parquet(DB_PATH / 'processed' / 'image_pairs.parquet')
document_collection_geocodes = gpd.read_parquet(DB_PATH / 'document_collection_geocodes.parquet')

# Location to Project

In [2]:
# location_to_project
# locations.merge(
#     projects
# )
locations_p = locations.copy().to_crs('2263')

project_geos_p = project_geos.copy().to_crs('2263') # TODO: store crs in config somewhere
    
# Buffer
locations_buffer = locations_p.copy()
locations_buffer['location_centroid'] = locations_p.geometry
locations_buffer = locations_buffer.set_geometry(locations_buffer.geometry.buffer(100))

hits = gpd.sjoin(locations_buffer, project_geos_p, how="inner", predicate="intersects") # gets all projects within 100 feet of location
hits

joined = hits.merge(
    project_geos_p[['citydata_proj_id', 'geometry']],
    left_on='index_right', right_index=True,
    suffixes = ['', '_project'],
    how='inner', indicator=True # Basically all works. Only 1 is right_only, 0 left_only
)
distance_to_project = joined['location_centroid'].distance(joined['geometry_project']) # pairwise distances

# # summarize
# location_projects_gdf = hits.rename(columns = {'index_right': 'city_project_id',})
joined['distance'] = distance_to_project
location_to_project = joined[['location_id','citydata_proj_id','index_right', 'distance']]

location_to_project = location_to_project.drop_duplicates()
#location_to_project.to_parquet(DB_PATH / '_location_to_project.parquet')
location_to_project


Unnamed: 0,location_id,citydata_proj_id,index_right,distance
0,43609,2836,1935,0.179325
1,41675,315,199,177.099527
1,41675,315,199,0.249511
2,37511,2860,1959,0.292503
3,54810,1398,973,0.267454
...,...,...,...,...
1676,36332,2005,1432,0.280786
1677,9061704,2785,1884,28.946423
1677,9061704,2785,1884,256.647483
1677,9061704,86,26,28.946423


# Location to Document 

In [26]:
locations_buffer = locations_p.copy()
locations_buffer['location_centroid'] = locations_p.geometry
locations_buffer = locations_buffer.set_geometry(locations_buffer.geometry.buffer(50))
locations_buffer.explore()

dc_geos_p = document_collection_geocodes.to_crs('2263')

hits = gpd.sjoin(locations_buffer, dc_geos_p, how="inner", predicate="intersects") # gets all projects within 50 feet of location

#hits[hits['location_id'] == 9046811]
locations_buffer[locations_buffer['location_id'] == 9046811]
dc_geos_p[dc_geos_p['raw_cross_streets'].str.contains('Tillary Street; Bridge St')]
document_collections[document_collections['document_collection_id'] == 823].merge(
    document_files,
    on = 'document_collection_id'
)[['relative_path', 'document_file_id']]
#POINT (988385.857 192867.124)
#POINT (987369.249 192973.571)
#locations_buffer.crs

# joined = hits.merge(
#     dc_geos_p[['document_collection_id', 'geometry']],
#     left_on='index_right', right_index=True,
#     suffixes = ['', '_document_collection'],
#     how='inner', indicator=True # Basically all works. Only 1 is right_only, 0 left_only
# )

# distance_to_document = joined['location_centroid'].distance(joined['geometry_document_collection']) # pairwise distances

# joined['distance'] = distance_to_document
# location_to_document_collection = joined[['location_id','document_collection_id','index_right', 'distance']]

# location_to_document_collection = location_to_document_collection.drop_duplicates()
# location_to_document_collection.to_parquet(DB_PATH / '_location_to_document_collection2.parquet')


Unnamed: 0,relative_path,document_file_id
0,raw/documents/823--Brooklyn Bridge Gateway: Ti...,1487
1,raw/documents/823--Brooklyn Bridge Gateway: Ti...,1488
2,raw/documents/823--Brooklyn Bridge Gateway: Ti...,1489
3,raw/documents/823--Brooklyn Bridge Gateway: Ti...,1490
4,raw/documents/823--Brooklyn Bridge Gateway: Ti...,1491
5,raw/documents/823--Brooklyn Bridge Gateway: Ti...,1492
6,raw/documents/823--Brooklyn Bridge Gateway: Ti...,1493
7,raw/documents/823--Brooklyn Bridge Gateway: Ti...,1494
8,raw/documents/823--Brooklyn Bridge Gateway: Ti...,1495
9,raw/documents/823--Brooklyn Bridge Gateway: Ti...,1496
