# Matching up project document pdfs with the geotagged files in the database
## env

In [1]:
import os
from pathlib import Path

import pandas as pd
import geopandas as gpd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Gameplan

- pull all document pdf titles from the folder
- for each project title in the database (n=~1000):
    - create a similarity score for all the documents (n=~1800)
    - pull the top 5 candidates
- validate first by confirming borough.
- Then write an LLM script that extracts addresses and intersections from the documents themselves and stores those in a database (n=~1800). Also extract the years
- for each project (n=~1000)
    - for each candidate (5):
        - loop through the addresses and intersections listed:
            - geocode them
            - calcualte distance from the 
            - find total distance and order by it


## Load files
### Project Documents

In [2]:
BASEPATH_PROJECT_DOCUMENT = Path('../../../proj_data/project_documents')

# Get all project documents - note each of these is a folder
project_document_titles = os.listdir(BASEPATH_PROJECT_DOCUMENT)
project_document_split_titles = {int(x.split('--')[0]): '--'.join(x.split('--')[1:]) for x in project_document_titles if Path(BASEPATH_PROJECT_DOCUMENT / x).is_dir()}
project_document_titles_series =  pd.Series(project_document_split_titles)
project_document_titles_series.name = 'pdf_dir'


project_document_all_document_paths = None

all_files = []
for idx, title in project_document_split_titles.items():

    folder_path = f'{idx}--{title}'
    #print(folder_path)
    files = os.listdir(BASEPATH_PROJECT_DOCUMENT / folder_path)
    #print('\n'.join([f'\t{x}' for x in files]))
    all_files.extend(files)


df_project_files = pd.DataFrame([{'proj_id': int(y[0]), 'doc_id': int(y[1]), 'path': '--'.join(y)} for y in [x.split('--') for x in all_files]])
df_project_files = df_project_files.set_index('proj_id').sort_index()

df_project_folders = pd.read_csv(BASEPATH_PROJECT_DOCUMENT /'projects_df.csv', index_col=0)
df_project_documents_merged = df_project_folders.merge(df_project_files, left_index=True, right_index=True, how='outer', indicator=True)
df_project_documents_merged['_merge'].value_counts()
df_project_documents_merged[df_project_documents_merged['_merge'] != 'both']

df_project_documents_merged#['doc_id'] = df_project_documents_merged['doc_id'].astype(int)
df_project_documents_merged['document_path'] = df_project_documents_merged.index.values.astype('str') + '--' + df_project_documents_merged['name'] + '/' + df_project_documents_merged['path']
df_project_documents_merged[df_project_documents_merged['_merge'] != 'both']


# So I have a bunch of these documents. My goal is to get location data out of them
os.makedirs('../../data/project_documents', exist_ok=True)
df_project_documents_merged.to_csv('../../data/project_documents/projects_docs_merged.csv') # Need to Save


In [3]:
#df_project_documents_merged#

In [4]:
#BASEPATH_PROJECT_DOCUMENT / df_project_documents_merged.iloc[0]['document_path']
#df_project_documents_merged

### Project Data

In [5]:
BASEPATH_OPENNYC = Path('../../data/openNYC') # only 445 distinct projectNames

PATH_STR_N_HWY_PROJECTS = BASEPATH_OPENNYC / 'Street_and_Highway_Capital_Reconstruction_Projects_-_Intersection_20250721.csv' # a mishmash of project types
PATH_CAPITAL_PROJECTS = BASEPATH_OPENNYC / 'Capital_Projects_20250720.csv' # Less useful, because its a mishmash

gdf_strhwy_projs = gpd.read_file(PATH_STR_N_HWY_PROJECTS) # 7000 but
#gdf_strhwy_projs['ProjTitle'].value_counts()  but only 445 distinct projectNames
gdf_strhwy_projs['LeadAgency'].value_counts() # largely DEP,
gdf_strhwy_projs_DOT = gdf_strhwy_projs[gdf_strhwy_projs['LeadAgency'] == 'DOT']
gdf_strhwy_projs_DOT['ProjTitle'].value_counts()

gdf_strhwy_projs_DOT.groupby('ConstructionFY')['ProjTitle'].nunique() # really not that many each year unfortunately


gdf_capital_projects = gpd.read_file(PATH_CAPITAL_PROJECTS)
gdf_capital_projects['Category'].value_counts()
RELEVANT_CAPITAL_PROJECT_CATEGORIES = ['Streets and Roadways', 'Bridges','Bridges, Streets and Roadways','Parks, Streets and Roadways','Industrial Development, Streets and']

gdf_capital_projects_transportation = gdf_capital_projects[gdf_capital_projects['Category'].isin(RELEVANT_CAPITAL_PROJECT_CATEGORIES)]
gdf_capital_projects_transportation

Unnamed: 0,Date Reported As Of,PID,Project Name,Description,Category,Borough,Managing Agency,Client Agency,Current Phase,Design Start,Budget Forecast,Latest Budget Changes,Total Budget Changes,Forecast Completion,Latest Schedule Changes,Total Schedule Changes
21,09/2013,97,Bruckner Expressway Westchester Creek Bridge R...,Replace existing bridge with a wider double-le...,"Bridges, Streets and Roadways",Bronx,DOT,"DEP, DOT",Design,07/23/2012,151775497,,142277136,11/20/2019,,2086
22,01/2014,97,Bruckner Expressway Westchester Creek Bridge R...,This project will replace the existing bridge ...,"Bridges, Streets and Roadways",Bronx,DOT,"DEP, DOT",Design,07/23/2012,152871246,1095749,142277136,11/20/2019,0,2086
23,05/2014,97,Bruckner Expressway Westchester Creek Bridge R...,This project will replace the existing bridge ...,"Bridges, Streets and Roadways",Bronx,DOT,"DEP, DOT",Design,07/23/2012,195217224,42345978,142277136,07/21/2020,244,2086
24,02/2015,97,Bruckner Expressway Westchester Creek Bridge R...,This project will replace the existing bridge ...,"Bridges, Streets and Roadways",Bronx,DOT,"DEP, DOT",Design,07/23/2012,198569961,3352737,142277136,10/14/2020,85,2086
25,08/2015,97,Bruckner Expressway Westchester Creek Bridge R...,This project will replace the existing bridge ...,"Bridges, Streets and Roadways",Bronx,DOT,"DEP, DOT",Design,07/23/2012,198843972.62,274011.62,142277136,08/17/2020,-58,2086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3314,01/2023,1368,Construction of ROW Green Infrastructure in CI...,Construction of ROW Green Infrastructure in CI...,Streets and Roadways,,DDC,Environmental Protection,Construction Procurement,06/27/2022,36486000,,,08/27/2025,,
3315,01/2023,1369,Bellevue Men's Shelter: Bathroom Upgrade,Renovation of all bathroom facilities through ...,Streets and Roadways,,DDC,Homeless Services,Design,01/29/2018,25019000,,,10/28/2026,,
3316,01/2023,1370,"STORM & SANITARY SEWERS IN ALVERSON AVENUE, SI","STORM & SANITARY SEWERS IN WEST CASTOR PLACE, SI",Streets and Roadways,,DDC,Environmental Protection,Construction,06/16/2014,25486000,,,07/16/2024,,
3318,01/2023,1372,Construction of ROW Green Infrastructure in CI...,Construction of Right of Way Green Infrastruct...,Streets and Roadways,,DDC,Environmental Protection,Construction Procurement,06/08/2022,29363000,,,06/01/2025,,


In [6]:
#pd.concat([pd.to_datetime(gdf_capital_projects_transportation['Date Reported As Of']), gdf_capital_projects_transportation['Date Reported As Of']], axis=1)

In [7]:
# temp = pd.read_csv('../../data/project_documents/projects_docs_merged.csv', index_col=0)
# temp_path = BASEPATH_PROJECT_DOCUMENT / temp.loc[0, 'document_path']
# print(temp_path)
# temp_path.exists()

# from IPython.display import IFrame, display
# IFrame(temp_path, width=20, height=20)


# Tokenize and cosine similarity


In [41]:
from title_match import find_most_similar_titles

find_most_similar_titles(
    df_project_documents_merged['name'][0],
    gdf_capital_projects_transportation['Project Name'].drop_duplicates(),
    top_n=5
).to_dict(orient='records')

#gdf_capital_projects_transportation['Project Name'].drop_duplicates()
test_titles = gdf_capital_projects_transportation['Project Name'].drop_duplicates()
similar_titles = df_project_documents_merged.apply(
    lambda x: find_most_similar_titles(x['name'], test_titles).to_dict(orient='records'),
    axis=1
)

# df_project_documents_merged['similar_titles'] = similar_titles
# df_project_documents_merged

#gdf_capital_projects_transportation['Project Name'].drop_duplicates()

In [40]:
similar_titles

0      [{'document': 'Bruckner Expressway Westchester...
1      [{'document': 'Reconstruction Of Arthur Kill R...
2      [{'document': 'Lexington Avenue Pedestrian Saf...
3      [{'document': 'DELANCEY STREET SAFETY IMPROVEM...
4      [{'document': 'UNION TURNPIKE CENTER MEDIAN RE...
                             ...                        
938    [{'document': 'West Tremont Avenue over Metro ...
939    [{'document': 'Bergen Avenue Street Reconstruc...
939    [{'document': 'Bergen Avenue Street Reconstruc...
939    [{'document': 'Bergen Avenue Street Reconstruc...
940    [{'document': 'RECONSTRUCTION OF FRONT STREET,...
Length: 1802, dtype: object

In [89]:
#matches = pd.DataFrame([pd.Series(x) for x in similar_titles]).rename(lambda x: f'Match_{x}', axis=1)
df_project_documents_merged.merge(
    similar_titles.explode().apply(pd.Series),
    left_index=True, right_index=True
).sort_values('similarity', ascending=False).drop_duplicates(subset=['document_path', 'document','similarity'])

Unnamed: 0,year,borough,name,document_links,source_url,doc_id,path,_merge,document_path,document,similarity
139,2012,Manhattan,Delancey Street Safety Improvements,['http://nyc.gov/html/dot/downloads/pdf/2014-0...,https://www.nyc.gov/html/dot/html/about/projec...,0.0,139--0--2014-09-04_grand_st_and_clinton_st_mit...,both,139--Delancey Street Safety Improvements/139--...,DELANCEY STREET SAFETY IMPROVEMENTS,1.000000
139,2012,Manhattan,Delancey Street Safety Improvements,['http://nyc.gov/html/dot/downloads/pdf/2014-0...,https://www.nyc.gov/html/dot/html/about/projec...,1.0,139--1--2012-02-delancey-slides.pdf,both,139--Delancey Street Safety Improvements/139--...,DELANCEY STREET SAFETY IMPROVEMENTS,1.000000
164,2013,the Bronx,Grand Concourse Reconstruction,['http://nyc.gov/html/dot/downloads/pdf/2013-0...,https://www.nyc.gov/html/dot/html/about/projec...,0.0,164--0--2013-02-grand-concourse.pdf,both,164--Grand Concourse Reconstruction/164--0--20...,"GRAND CONCOURSE, PHASE 5",0.822255
874,2025,Manhattan,East Side Coastal Resiliency (ESCR) Project,['https://www.nyc.gov/site/escr/index.page'],https://www.nyc.gov/html/dot/html/about/curren...,0.0,874--0--index.page,both,874--East Side Coastal Resiliency (ESCR) Proje...,Coastal Resiliency – Project Area 2,0.694242
52,2009,Manhattan,Park Avenue Tunnel,['http://www.nyc.gov/html/dot/downloads/pdf/pa...,https://www.nyc.gov/html/dot/html/about/projec...,0.0,52--0--parkavetunnel.pdf,both,52--Park Avenue Tunnel/52--0--parkavetunnel.pdf,Park Avenue Tunnel (Murray Hill),0.672703
...,...,...,...,...,...,...,...,...,...,...,...
873,2025,Manhattan,Chinatown Connections,['https://www.nyc.gov/html/dot/downloads/pdf/f...,https://www.nyc.gov/html/dot/html/about/curren...,9.0,873--9--chinatown-connections-jun2023.pdf,both,873--Chinatown Connections/873--9--chinatown-c...,12th Avenue Accessibility Improvements,0.000000
873,2025,Manhattan,Chinatown Connections,['https://www.nyc.gov/html/dot/downloads/pdf/f...,https://www.nyc.gov/html/dot/html/about/curren...,9.0,873--9--chinatown-connections-jun2023.pdf,both,873--Chinatown Connections/873--9--chinatown-c...,RECONSTRUCTION OF QUEENS BLVD - PHASE 3,0.000000
873,2025,Manhattan,Chinatown Connections,['https://www.nyc.gov/html/dot/downloads/pdf/f...,https://www.nyc.gov/html/dot/html/about/curren...,9.0,873--9--chinatown-connections-jun2023.pdf,both,873--Chinatown Connections/873--9--chinatown-c...,RECONSTRUCTION OF SOUTH JAMAICA AREA (SEQ200562),0.000000
873,2025,Manhattan,Chinatown Connections,['https://www.nyc.gov/html/dot/downloads/pdf/f...,https://www.nyc.gov/html/dot/html/about/curren...,9.0,873--9--chinatown-connections-jun2023.pdf,both,873--Chinatown Connections/873--9--chinatown-c...,HEMPSTEAD AVE BR/CIP 2231780,0.000000


In [9]:
#df_project_documents_merged[['borough','name','year', 'similar_titles']].

# Tile2net Segmentation

In [10]:
import osmnx as ox
nyc_gdfs = {
    'brooklyn': ox.geocode_to_gdf('Brooklyn, New York, USA'),
    'manhattan': ox.geocode_to_gdf('Manhattan , New York, USA'),
    'bronx': ox.geocode_to_gdf('Bronx County, New York, USA'),
    'queens': ox.geocode_to_gdf('Queens County, New York, USA'),
    'statenisland': ox.geocode_to_gdf('Staten Island, New York, USA')
}

In [11]:
borough_boundaries = pd.concat(nyc_gdfs).reset_index().rename(columns={'level_0': 'borough'}).set_index('borough').drop('level_1', axis=1)

nyc_bounds = ox.geocode_to_gdf('New York City, New York, USA')

In [12]:
import numpy as np
GRID_STEP = 4
total_width = (nyc_bounds.loc[0]['bbox_east'] - nyc_bounds.loc[0]['bbox_west'])
total_height = (nyc_bounds.loc[0]['bbox_north'] - nyc_bounds.loc[0]['bbox_south'])

(total_height, total_width)

# step_x = total_width / GRID_STEP
# step_y = total_height / GRID_STEP

#bboxes = range(nyc_bounds.loc[0]['bbox_west'], nyc_bounds.loc[0]['bbox_east'], step_x)

x_cuts = np.linspace(nyc_bounds['bbox_west'][0], nyc_bounds['bbox_east'][0], GRID_STEP+1)
y_cuts = np.linspace(nyc_bounds['bbox_north'][0], nyc_bounds['bbox_south'][0], GRID_STEP+1)

bbox_pairs = np.array(np.meshgrid(x_cuts, y_cuts)).T.reshape(-1,2)
bbox_pairs

array([[-74.258843 ,  40.91763  ],
       [-74.258843 ,  40.807367 ],
       [-74.258843 ,  40.697104 ],
       [-74.258843 ,  40.586841 ],
       [-74.258843 ,  40.476578 ],
       [-74.1191905,  40.91763  ],
       [-74.1191905,  40.807367 ],
       [-74.1191905,  40.697104 ],
       [-74.1191905,  40.586841 ],
       [-74.1191905,  40.476578 ],
       [-73.979538 ,  40.91763  ],
       [-73.979538 ,  40.807367 ],
       [-73.979538 ,  40.697104 ],
       [-73.979538 ,  40.586841 ],
       [-73.979538 ,  40.476578 ],
       [-73.8398855,  40.91763  ],
       [-73.8398855,  40.807367 ],
       [-73.8398855,  40.697104 ],
       [-73.8398855,  40.586841 ],
       [-73.8398855,  40.476578 ],
       [-73.700233 ,  40.91763  ],
       [-73.700233 ,  40.807367 ],
       [-73.700233 ,  40.697104 ],
       [-73.700233 ,  40.586841 ],
       [-73.700233 ,  40.476578 ]])

In [13]:
import numpy as np
from typing import List

def split_bbox(bbox:np.ndarray, n=4):
    if bbox.shape != (4,):
        raise TypeError('Error: `bbox` must be an array of 4 floats.')
    
    min_x, min_y, max_x, max_y = bbox
    x_edges = np.linspace(min_x, max_x, n+1)
    y_edges = np.linspace(min_y, max_y, n+1)
    
    X, Y = np.meshgrid(x_edges[:-1], y_edges[:-1])
    width_step = x_edges[1] - x_edges[0]
    height_step = y_edges[1] - y_edges[0]
    
    boxes = np.stack([
        X.ravel(),                  # min_x
        Y.ravel(),                  # min_y
        X.ravel() + width_step,     # max_x
        Y.ravel() + height_step     # max_y
    ], axis=1)
    
    return boxes

# Example
# result = split_bbox(0, 0, 400, 400, n=4)
# print(result.shape)  # (16, 4)
# print(result)

bboxes = split_bbox(nyc_bounds.total_bounds, 4)


In [91]:
gdf_capital_projects_transportation.drop_duplicates(subset=['Project Name'])

Unnamed: 0,Date Reported As Of,PID,Project Name,Description,Category,Borough,Managing Agency,Client Agency,Current Phase,Design Start,Budget Forecast,Latest Budget Changes,Total Budget Changes,Forecast Completion,Latest Schedule Changes,Total Schedule Changes
21,09/2013,97,Bruckner Expressway Westchester Creek Bridge R...,Replace existing bridge with a wider double-le...,"Bridges, Streets and Roadways",Bronx,DOT,"DEP, DOT",Design,07/23/2012,151775497,,142277136,11/20/2019,,2086
41,09/2013,101,Roosevelt Avenue Bridge Superstructure and Sub...,Repair/replacement of bridge superstructure an...,Bridges,Queens,DOT,DOT,Design,03/10/2003,123384000,,23798213,02/11/2018,,2003
65,09/2013,106,Broadway Bridge over Harlem River Replacement,Replace deteriorated members of lift span supe...,Bridges,"Bronx, Manhattan",DOT,DOT,Design,04/01/2005,109236000,,39724388,08/05/2019,,2797
89,09/2013,112,Westchester Avenue Bridge over Hutch River Par...,"Rehabilitation of abutments, piers, approaches...","Bridges, Streets and Roadways",Bronx,DOT,DOT,Design,01/17/1995,58259000,,20945057,03/22/2019,,1592
113,02/2015,113,Riverside Drive/West 158th Street Bridge Rehab...,Deck replacement of the Riverside Drive Viaduc...,"Bridges, Streets and Roadways",Manhattan,DOT,DOT,Design,10/22/2014,61507424,6801424,96922794,12/17/2021,901,2152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3314,01/2023,1368,Construction of ROW Green Infrastructure in CI...,Construction of ROW Green Infrastructure in CI...,Streets and Roadways,,DDC,Environmental Protection,Construction Procurement,06/27/2022,36486000,,,08/27/2025,,
3315,01/2023,1369,Bellevue Men's Shelter: Bathroom Upgrade,Renovation of all bathroom facilities through ...,Streets and Roadways,,DDC,Homeless Services,Design,01/29/2018,25019000,,,10/28/2026,,
3316,01/2023,1370,"STORM & SANITARY SEWERS IN ALVERSON AVENUE, SI","STORM & SANITARY SEWERS IN WEST CASTOR PLACE, SI",Streets and Roadways,,DDC,Environmental Protection,Construction,06/16/2014,25486000,,,07/16/2024,,
3318,01/2023,1372,Construction of ROW Green Infrastructure in CI...,Construction of Right of Way Green Infrastruct...,Streets and Roadways,,DDC,Environmental Protection,Construction Procurement,06/08/2022,29363000,,,06/01/2025,,


In [14]:
import numpy as np
import geopandas as gpd
from shapely.geometry import box
import matplotlib.pyplot as plt
import contextily as ctx

# Example bounding box (EPSG:4326)
# min_x, min_y, max_x, max_y = -74.05, 40.68, -73.85, 40.85  # NYC area
# n = 4  # 4x4 grid

# # Compute sub-boxes
# x_edges = np.linspace(min_x, max_x, n+1)
# y_edges = np.linspace(min_y, max_y, n+1)

# X, Y = np.meshgrid(x_edges[:-1], y_edges[:-1])
# width_step = x_edges[1] - x_edges[0]
# height_step = y_edges[1] - y_edges[0]

# boxes = [
#     box(x, y, x + width_step, y + height_step)
#     for x, y in zip(X.ravel(), Y.ravel())
# ]


boxes = [box(x1, y1, x2, y2) for x1, y1, x2, y2 in bboxes]
# Create GeoDataFrame
gdf = gpd.GeoDataFrame(geometry=boxes, crs="4326")
gdf.explore()
# # Plot
# fig, ax = plt.subplots(figsize=(8, 8))
# gdf.boundary.plot(ax=ax, color="red", linewidth=1)

# # Add basemap
# ctx.add_basemap(ax, crs=gdf.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)

# plt.title("4x4 Grid on Map", fontsize=14)
# plt.show()


ModuleNotFoundError: No module named 'contextily'