# Matching up project document pdfs with the geotagged files in the database
## env

In [1]:
import os
from pathlib import Path

import pandas as pd
import geopandas as gpd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Gameplan

- pull all document pdf titles from the folder
- for each project title in the database (n=~1000):
    - create a similarity score for all the documents (n=~1800)
    - pull the top 5 candidates
- validate first by confirming borough.
- Then write an LLM script that extracts addresses and intersections from the documents themselves and stores those in a database (n=~1800). Also extract the years
- for each project (n=~1000)
    - for each candidate (5):
        - loop through the addresses and intersections listed:
            - geocode them
            - calcualte distance from the 
            - find total distance and order by it


## Load files
### Project Documents

In [2]:
BASEPATH_PROJECT_DOCUMENT = Path('../../../proj_data/project_documents')

# Get all project documents - note each of these is a folder
project_document_titles = os.listdir(BASEPATH_PROJECT_DOCUMENT)
project_document_split_titles = {int(x.split('--')[0]): '--'.join(x.split('--')[1:]) for x in project_document_titles if Path(BASEPATH_PROJECT_DOCUMENT / x).is_dir()}
project_document_titles_series =  pd.Series(project_document_split_titles)
project_document_titles_series.name = 'pdf_dir'


project_document_all_document_paths = None

all_files = []
for idx, title in project_document_split_titles.items():

    folder_path = f'{idx}--{title}'
    #print(folder_path)
    files = os.listdir(BASEPATH_PROJECT_DOCUMENT / folder_path)
    #print('\n'.join([f'\t{x}' for x in files]))
    all_files.extend(files)


df_project_files = pd.DataFrame([{'proj_id': int(y[0]), 'doc_id': int(y[1]), 'path': '--'.join(y)} for y in [x.split('--') for x in all_files]])
df_project_files = df_project_files.set_index('proj_id').sort_index()

df_project_folders = pd.read_csv(BASEPATH_PROJECT_DOCUMENT /'projects_df.csv', index_col=0)
df_project_documents_merged = df_project_folders.merge(df_project_files, left_index=True, right_index=True, how='outer', indicator=True)
df_project_documents_merged['_merge'].value_counts()
df_project_documents_merged[df_project_documents_merged['_merge'] != 'both']

df_project_documents_merged#['doc_id'] = df_project_documents_merged['doc_id'].astype(int)
df_project_documents_merged['document_path'] = df_project_documents_merged.index.values.astype('str') + '--' + df_project_documents_merged['name'] + '/' + df_project_documents_merged['path']
df_project_documents_merged[df_project_documents_merged['_merge'] != 'both']


# So I have a bunch of these documents. My goal is to get location data out of them
os.makedirs('../../data/project_documents', exist_ok=True)
df_project_documents_merged.to_csv('../../data/project_documents/projects_docs_merged.csv') # Need to Save


In [3]:
#BASEPATH_PROJECT_DOCUMENT / df_project_documents_merged.iloc[0]['document_path']
df_project_documents_merged

Unnamed: 0,year,borough,name,document_links,source_url,doc_id,path,_merge,document_path
0,2007,the Bronx,Bruckner-Sheridan Expressway Interchange Enhan...,['http://www.nyc.gov/html/dot/downloads/pdf/br...,https://www.nyc.gov/html/dot/html/about/projec...,0.0,0--0--brucknersher.pdf,both,0--Bruckner-Sheridan Expressway Interchange En...
1,2007,the Bronx,Edgewater Road Traffic Calming,['http://www.nyc.gov/html/dot/downloads/pdf/ed...,https://www.nyc.gov/html/dot/html/about/projec...,0.0,1--0--edgewaterrd.pdf,both,1--Edgewater Road Traffic Calming/1--0--edgewa...
2,2007,the Bronx,Lafayette Avenue Reengineering and Safety Impr...,['http://www.nyc.gov/html/dot/downloads/pdf/la...,https://www.nyc.gov/html/dot/html/about/projec...,0.0,2--0--lafayetteave.pdf,both,2--Lafayette Avenue Reengineering and Safety I...
3,2007,Brooklyn,9th Street Bicycle and Street Safety Project,['http://www.nyc.gov/html/dot/downloads/pdf/9t...,https://www.nyc.gov/html/dot/html/about/projec...,0.0,3--0--9thstreet.pdf,both,3--9th Street Bicycle and Street Safety Projec...
4,2007,Brooklyn,Carlton Avenue Green Median and Bike Lanes,['http://www.nyc.gov/html/dot/downloads/pdf/ca...,https://www.nyc.gov/html/dot/html/about/projec...,1.0,4--1--carltonave_gallery.pdf,both,4--Carlton Avenue Green Median and Bike Lanes/...
...,...,...,...,...,...,...,...,...,...
938,2025,Staten Island,"Lincoln Avenue, Father Capodanno Boulevard to ...",['https://www.nyc.gov/html/dot/downloads/pdf/l...,https://www.nyc.gov/html/dot/html/about/curren...,0.0,938--0--lincoln-ave-father-capodanno-blvd-boun...,both,"938--Lincoln Avenue, Father Capodanno Boulevar..."
939,2025,Staten Island,"Victory Boulevard, Bay Street to Wild Avenue",['https://www.nyc.gov/html/dot/downloads/pdf/v...,https://www.nyc.gov/html/dot/html/about/curren...,2.0,939--2--victory-blvd-bay-st-wild-ave-winter202...,both,"939--Victory Boulevard, Bay Street to Wild Ave..."
939,2025,Staten Island,"Victory Boulevard, Bay Street to Wild Avenue",['https://www.nyc.gov/html/dot/downloads/pdf/v...,https://www.nyc.gov/html/dot/html/about/curren...,0.0,939--0--victory-blvd-wild-ave-willowbrook-ave-...,both,"939--Victory Boulevard, Bay Street to Wild Ave..."
939,2025,Staten Island,"Victory Boulevard, Bay Street to Wild Avenue",['https://www.nyc.gov/html/dot/downloads/pdf/v...,https://www.nyc.gov/html/dot/html/about/curren...,1.0,939--1--victory-blvd-bay-st-little-clove-rd-ma...,both,"939--Victory Boulevard, Bay Street to Wild Ave..."


### Project Data

In [4]:
BASEPATH_OPENNYC = Path('../../data/openNYC') # only 445 distinct projectNames

PATH_STR_N_HWY_PROJECTS = BASEPATH_OPENNYC / 'Street_and_Highway_Capital_Reconstruction_Projects_-_Intersection_20250721.csv' # a mishmash of project types
PATH_CAPITAL_PROJECTS = BASEPATH_OPENNYC / 'Capital_Projects_20250720.csv' # Less useful, because its a mishmash

gdf_strhwy_projs = gpd.read_file(PATH_STR_N_HWY_PROJECTS) # 7000 but
#gdf_strhwy_projs['ProjTitle'].value_counts()  but only 445 distinct projectNames
gdf_strhwy_projs['LeadAgency'].value_counts() # largely DEP,
gdf_strhwy_projs_DOT = gdf_strhwy_projs[gdf_strhwy_projs['LeadAgency'] == 'DOT']
gdf_strhwy_projs_DOT['ProjTitle'].value_counts()

gdf_strhwy_projs_DOT.groupby('ConstructionFY')['ProjTitle'].nunique() # really not that many each year unfortunately


gdf_capital_projects = gpd.read_file(PATH_CAPITAL_PROJECTS)
gdf_capital_projects['Category'].value_counts()
RELEVANT_CAPITAL_PROJECT_CATEGORIES = ['Streets and Roadways', 'Bridges','Bridges, Streets and Roadways','Parks, Streets and Roadways','Industrial Development, Streets and']

gdf_capital_projects_transportation = gdf_capital_projects[gdf_capital_projects['Category'].isin(RELEVANT_CAPITAL_PROJECT_CATEGORIES)]
gdf_capital_projects_transportation

Unnamed: 0,Date Reported As Of,PID,Project Name,Description,Category,Borough,Managing Agency,Client Agency,Current Phase,Design Start,Budget Forecast,Latest Budget Changes,Total Budget Changes,Forecast Completion,Latest Schedule Changes,Total Schedule Changes
21,09/2013,97,Bruckner Expressway Westchester Creek Bridge R...,Replace existing bridge with a wider double-le...,"Bridges, Streets and Roadways",Bronx,DOT,"DEP, DOT",Design,07/23/2012,151775497,,142277136,11/20/2019,,2086
22,01/2014,97,Bruckner Expressway Westchester Creek Bridge R...,This project will replace the existing bridge ...,"Bridges, Streets and Roadways",Bronx,DOT,"DEP, DOT",Design,07/23/2012,152871246,1095749,142277136,11/20/2019,0,2086
23,05/2014,97,Bruckner Expressway Westchester Creek Bridge R...,This project will replace the existing bridge ...,"Bridges, Streets and Roadways",Bronx,DOT,"DEP, DOT",Design,07/23/2012,195217224,42345978,142277136,07/21/2020,244,2086
24,02/2015,97,Bruckner Expressway Westchester Creek Bridge R...,This project will replace the existing bridge ...,"Bridges, Streets and Roadways",Bronx,DOT,"DEP, DOT",Design,07/23/2012,198569961,3352737,142277136,10/14/2020,85,2086
25,08/2015,97,Bruckner Expressway Westchester Creek Bridge R...,This project will replace the existing bridge ...,"Bridges, Streets and Roadways",Bronx,DOT,"DEP, DOT",Design,07/23/2012,198843972.62,274011.62,142277136,08/17/2020,-58,2086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3314,01/2023,1368,Construction of ROW Green Infrastructure in CI...,Construction of ROW Green Infrastructure in CI...,Streets and Roadways,,DDC,Environmental Protection,Construction Procurement,06/27/2022,36486000,,,08/27/2025,,
3315,01/2023,1369,Bellevue Men's Shelter: Bathroom Upgrade,Renovation of all bathroom facilities through ...,Streets and Roadways,,DDC,Homeless Services,Design,01/29/2018,25019000,,,10/28/2026,,
3316,01/2023,1370,"STORM & SANITARY SEWERS IN ALVERSON AVENUE, SI","STORM & SANITARY SEWERS IN WEST CASTOR PLACE, SI",Streets and Roadways,,DDC,Environmental Protection,Construction,06/16/2014,25486000,,,07/16/2024,,
3318,01/2023,1372,Construction of ROW Green Infrastructure in CI...,Construction of Right of Way Green Infrastruct...,Streets and Roadways,,DDC,Environmental Protection,Construction Procurement,06/08/2022,29363000,,,06/01/2025,,


In [5]:
pd.concat([pd.to_datetime(gdf_capital_projects_transportation['Date Reported As Of']), gdf_capital_projects_transportation['Date Reported As Of']], axis=1)

  pd.concat([pd.to_datetime(gdf_capital_projects_transportation['Date Reported As Of']), gdf_capital_projects_transportation['Date Reported As Of']], axis=1)


Unnamed: 0,Date Reported As Of,Date Reported As Of.1
21,2013-09-01,09/2013
22,2014-01-01,01/2014
23,2014-05-01,05/2014
24,2015-02-01,02/2015
25,2015-08-01,08/2015
...,...,...
3314,2023-01-01,01/2023
3315,2023-01-01,01/2023
3316,2023-01-01,01/2023
3318,2023-01-01,01/2023


In [17]:
temp = pd.read_csv('../../data/project_documents/projects_docs_merged.csv', index_col=0)
temp_path = BASEPATH_PROJECT_DOCUMENT / temp.loc[0, 'document_path']
print(temp_path)
temp_path.exists()

from IPython.display import IFrame, display
IFrame(temp_path, width=20, height=20)


../../../proj_data/project_documents/0--Bruckner-Sheridan Expressway Interchange Enhancements/0--0--brucknersher.pdf


# Tile2net Segmentation

In [161]:
import osmnx as ox
nyc_gdfs = {
    'brooklyn': ox.geocode_to_gdf('Brooklyn, New York, USA'),
    'manhattan': ox.geocode_to_gdf('Manhattan , New York, USA'),
    'bronx': ox.geocode_to_gdf('Bronx County, New York, USA'),
    'queens': ox.geocode_to_gdf('Queens County, New York, USA'),
    'statenisland': ox.geocode_to_gdf('Staten Island, New York, USA')
}

In [169]:
borough_boundaries = pd.concat(nyc_gdfs).reset_index().rename(columns={'level_0': 'borough'}).set_index('borough').drop('level_1', axis=1)

nyc_bounds = ox.geocode_to_gdf('New York City, New York, USA')

In [208]:
import numpy as np
GRID_STEP = 4
total_width = (nyc_bounds.loc[0]['bbox_east'] - nyc_bounds.loc[0]['bbox_west'])
total_height = (nyc_bounds.loc[0]['bbox_north'] - nyc_bounds.loc[0]['bbox_south'])

(total_height, total_width)

# step_x = total_width / GRID_STEP
# step_y = total_height / GRID_STEP

#bboxes = range(nyc_bounds.loc[0]['bbox_west'], nyc_bounds.loc[0]['bbox_east'], step_x)

x_cuts = np.linspace(nyc_bounds['bbox_west'][0], nyc_bounds['bbox_east'][0], GRID_STEP+1)
y_cuts = np.linspace(nyc_bounds['bbox_north'][0], nyc_bounds['bbox_south'][0], GRID_STEP+1)

bbox_pairs = np.array(np.meshgrid(x_cuts, y_cuts)).T.reshape(-1,2)
bbox_pairs

array([[-74.258843 ,  40.91763  ],
       [-74.258843 ,  40.807367 ],
       [-74.258843 ,  40.697104 ],
       [-74.258843 ,  40.586841 ],
       [-74.258843 ,  40.476578 ],
       [-74.1191905,  40.91763  ],
       [-74.1191905,  40.807367 ],
       [-74.1191905,  40.697104 ],
       [-74.1191905,  40.586841 ],
       [-74.1191905,  40.476578 ],
       [-73.979538 ,  40.91763  ],
       [-73.979538 ,  40.807367 ],
       [-73.979538 ,  40.697104 ],
       [-73.979538 ,  40.586841 ],
       [-73.979538 ,  40.476578 ],
       [-73.8398855,  40.91763  ],
       [-73.8398855,  40.807367 ],
       [-73.8398855,  40.697104 ],
       [-73.8398855,  40.586841 ],
       [-73.8398855,  40.476578 ],
       [-73.700233 ,  40.91763  ],
       [-73.700233 ,  40.807367 ],
       [-73.700233 ,  40.697104 ],
       [-73.700233 ,  40.586841 ],
       [-73.700233 ,  40.476578 ]])

In [216]:
import numpy as np
from typing import List

def split_bbox(bbox:np.ndarray, n=4):
    if bbox.shape != (4,):
        raise TypeError('Error: `bbox` must be an array of 4 floats.')
    
    min_x, min_y, max_x, max_y = bbox
    x_edges = np.linspace(min_x, max_x, n+1)
    y_edges = np.linspace(min_y, max_y, n+1)
    
    X, Y = np.meshgrid(x_edges[:-1], y_edges[:-1])
    width_step = x_edges[1] - x_edges[0]
    height_step = y_edges[1] - y_edges[0]
    
    boxes = np.stack([
        X.ravel(),                  # min_x
        Y.ravel(),                  # min_y
        X.ravel() + width_step,     # max_x
        Y.ravel() + height_step     # max_y
    ], axis=1)
    
    return boxes

# Example
# result = split_bbox(0, 0, 400, 400, n=4)
# print(result.shape)  # (16, 4)
# print(result)

bboxes = split_bbox(nyc_bounds.total_bounds, 4)


In [221]:
import numpy as np
import geopandas as gpd
from shapely.geometry import box
import matplotlib.pyplot as plt
import contextily as ctx

# Example bounding box (EPSG:4326)
# min_x, min_y, max_x, max_y = -74.05, 40.68, -73.85, 40.85  # NYC area
# n = 4  # 4x4 grid

# # Compute sub-boxes
# x_edges = np.linspace(min_x, max_x, n+1)
# y_edges = np.linspace(min_y, max_y, n+1)

# X, Y = np.meshgrid(x_edges[:-1], y_edges[:-1])
# width_step = x_edges[1] - x_edges[0]
# height_step = y_edges[1] - y_edges[0]

# boxes = [
#     box(x, y, x + width_step, y + height_step)
#     for x, y in zip(X.ravel(), Y.ravel())
# ]


boxes = [box(x1, y1, x2, y2) for x1, y1, x2, y2 in bboxes]
# Create GeoDataFrame
gdf = gpd.GeoDataFrame(geometry=boxes, crs="4326")
gdf.explore()
# # Plot
# fig, ax = plt.subplots(figsize=(8, 8))
# gdf.boundary.plot(ax=ax, color="red", linewidth=1)

# # Add basemap
# ctx.add_basemap(ax, crs=gdf.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)

# plt.title("4x4 Grid on Map", fontsize=14)
# plt.show()
