In [6]:
from dataclasses import dataclass
from shapely.geometry import Point
from typing import Tuple

# Locations
@dataclass
class Location: # Geographic
    """One location record.

    The field names match the columns of the `locations` frame that this
    notebook loads from `locations_raw.parquet`.
    """
    location_id: int  # identifier; the notebook de-duplicates the locations frame on this column
    crossstreets: list[str]  # street names, e.g. ['W 182 ST', 'WADSWORTH AVE']
    geometry: Point  # shapely point; displayed values look like lon/lat — TODO confirm CRS
    sample: str  # sample label; values seen in the notebook output include 'Safety' and 'postIntervention'

@dataclass
class LocationGeos: # Geographic
    """Geographic/tiling attributes attached to a location.

    Only the field layout is defined here. The meaning of individual fields is
    not shown in this notebook, so the notes below are assumptions to confirm.
    Coordinate pairs use the builtin `tuple[...]` generic, consistent with the
    `list[...]` generics used by the other records in this file.
    """
    location_id: int  # presumably the same identifier as Location.location_id — TODO confirm
    centroid: tuple[float, float]  # coordinate pair; axis order not shown here — TODO confirm
    tile_width: int
    zlevel: int  # presumably a map-tile zoom level — TODO confirm
    crs: str
    bounds_gcs: list[float]  # presumably bounds in geographic coordinates — TODO confirm
    bounds_proj: list[float]  # presumably the same bounds in the projected `crs` — TODO confirm
    center_tile: str
    centroid_proj: tuple[float, float]  # presumably `centroid` in the projected `crs` — TODO confirm
    tile_grid: list[str]

@dataclass
class LocationYearFiles:
    """File paths recorded for one (location, year) pair.

    The notebook builds one instance per location and per entry of `YEARS`,
    passing the fields positionally in the order declared here.
    """
    location_id: int
    year: str  # year as a string, e.g. '2006'
    imagery_path: str  # stored as a string path relative to DATA_PATH when built in this notebook
    segmentation_path: str  # stored as a string path relative to DATA_PATH when built in this notebook

# Documents
@dataclass
class DocumentCollections:
    """One document collection record (id, year, borough)."""
    document_collection_id: int  # in the notebook this column is created from the index of projects_df.csv
    year: str
    borough: str  # the notebook normalises 'the Bronx' to 'Bronx' in this column

@dataclass
class DocumentFiles:
    """One file belonging to a document collection."""
    document_file_id: int
    document_collection_id: int  # the collection this file belongs to
    # NOTE(review): the document_files frame built in this notebook names this
    # column `relative_path`, not `file_path` — confirm which name is intended.
    file_path: str

@dataclass
class DocumentGeocodes: # Geographic
    """A geocoded cross-street mention found in a document."""
    # NOTE(review): the geocode frame built in this notebook is keyed by
    # `document_collection_id` and exports `raw_cross_streets`; it has no
    # `document_file_id` or `crossstreets` column — confirm the intended schema.
    document_file_id: int
    crossstreets: list[str]
    page_found: int  # presumably the page on which the mention was found — TODO confirm
    geometry: Point  # shapely point

# CityDataProjects
@dataclass
class CityDataProjects:
    """One city-data project record (id and year)."""
    # NOTE(review): the project frames in this notebook use `citydata_proj_id`
    # and `proj_year` as column names — confirm how they map to these fields.
    cd_project_id: int
    year: str


# Relations
@dataclass
class LocationToDocumentFile:
    """Relation linking a location to a document file, with a distance."""
    location_id: int
    document_file_id: int
    distance: float  # units are not shown in this file — TODO confirm

@dataclass
class LocationToCityDataProject:
    """Relation linking a location to a city-data project, with a distance."""
    location_id: int
    cd_project_id: int
    distance: float  # units are not shown in this file — TODO confirm


# Locations
## Locations

In [41]:
from streettransformer.config.constants import UNIVERSES_PATH, DATA_PATH
import geopandas as gpd
import pandas as pd
from dotenv import load_dotenv
import os
from pathlib import Path

load_dotenv()  # load variables from a local .env file into the environment before reading DB_PATH

UNIVERSE_NAME = 'neurips'
YEARS = ['2006', '2012', '2014', '2018', '2024']

# Fail fast when DB_PATH is missing: os.getenv returns None in that case, and
# wrapping it in str() would silently yield Path('None'), so later writes would
# land in a directory literally named "None".
_db_path_env = os.getenv('DB_PATH')
if not _db_path_env:
    raise RuntimeError("DB_PATH is not set; define it in the environment or in a .env file")
DB_PATH = Path(_db_path_env)

# Locations
# Read the raw locations table and keep the first row for each location_id.
locations_gdf = gpd.read_parquet(UNIVERSES_PATH / UNIVERSE_NAME / 'locations' / 'locations_raw.parquet')
locations = locations_gdf.drop_duplicates('location_id').reset_index(drop=True)#.rename(columns={'sample':'sample_source'})
locations
#locations.to_parquet(DB_PATH / 'locations.parquet')


# Location Geos - skip for now


Unnamed: 0,location_id,crossstreets,geometry,sample
0,43609,"[W 182 ST, WADSWORTH AVE]",POINT (-73.93454 40.85042),Safety
1,41675,"[30 AVE, 29 ST]",POINT (-73.92298 40.76764),Safety
2,37511,"[182 PL, 89 AVE]",POINT (-73.77928 40.71225),Safety
3,54810,"[71 RD, 136 ST]",POINT (-73.82715 40.72605),Safety
4,41012,"[40 ST, 47 AVE]",POINT (-73.92466 40.7418),Safety
...,...,...,...,...
1673,9042176,"[ROSS ST, KENT AVE]",POINT (-73.96516 40.70265),postIntervention
1674,28480,"[AVE V, BERGEN AVE]",POINT (-73.90199 40.62056),postIntervention
1675,20522,"[W HOUSTON ST, WOOSTER ST, PEDESTRIAN PATH, HO...",POINT (-73.99912 40.72654),postIntervention
1676,36332,"[ROCKAWAY BLVD, 149 ST]",POINT (-73.78902 40.67316),postIntervention


## Location Geos

In [42]:
# Load the compiled location geos for the configured universe and drop rows that
# repeat the same (location_id, geometry) pair. The universe comes from
# UNIVERSE_NAME (set in the configuration cell) instead of a repeated literal.
location_geos = (
    gpd.read_parquet(UNIVERSES_PATH / UNIVERSE_NAME / 'locations' / 'locationgeos_compiled.parquet')
    .drop_duplicates(subset=['location_id', 'geometry'])
)
#location_geos.to_parquet(DB_PATH / 'location_geos.parquet')

## Location Year Files

In [44]:

# Location
# Build one LocationYearFiles record per (location, year) and persist the table.
lyf_collection = []
for loc in locations.itertuples():
    for year in YEARS:
        # NOTE(review): the imagery path has no year component, so every year of a
        # location points at the same file, while the segmentation path below is
        # per-year. Confirm whether this should be 'imagery' / year / f'{id}.png'.
        image_path = UNIVERSES_PATH / UNIVERSE_NAME / 'imagery' / f'{loc.location_id}.png'
        image_path_rel = str(image_path.relative_to(DATA_PATH))
        segmentation_path = UNIVERSES_PATH / UNIVERSE_NAME / 'imagery' / year / f'sidebside_{loc.location_id}.png'
        # Fix: derive the relative segmentation path from segmentation_path. It was
        # previously derived from image_path, so both stored columns were identical.
        segmentation_path_rel = str(segmentation_path.relative_to(DATA_PATH))
        temp_lyf = LocationYearFiles(loc.location_id, year, image_path_rel, segmentation_path_rel)

        lyf_collection.append(temp_lyf)

# pandas expands the list of dataclass instances into one column per field.
pd.DataFrame(lyf_collection).to_parquet(DB_PATH / 'location_year_files.parquet')

# Documents
## Document Collections

In [None]:

from st_preprocessing.config import DOCUMENTS_PATH
# Read the projects CSV, turn its index into `document_collection_id`, drop the
# `document_links` column, and normalise the borough label for the Bronx.
document_collections = (
    pd.read_csv(DOCUMENTS_PATH / 'projects_df.csv', index_col=0)
    .reset_index(names='document_collection_id')
    .drop(columns='document_links')
    .assign(borough=lambda frame: frame['borough'].str.replace('the Bronx', 'Bronx'))
)
#document_collections.to_parquet(DB_PATH / 'document_collections.parquet')

## Document Files

In [None]:
# Alternative sources tried previously:
# document_files = gpd.read_feather(UNIVERSES_PATH / 'caprecon_plus_control_downsampled/documents.feather')
# document_files = pd.read_csv(DATA_PATH / 'processing' / 'documents' / 'crossstreets_to_census_geocoded' / 'projects_docs_merged.csv', index_col=0)

# One row per (collection, relative path): explode the list-valued
# `relative_paths` column, then use the fresh 0..n-1 index as `document_file_id`.
document_files = (
    gpd.read_parquet(UNIVERSES_PATH / 'caprecon_control5k' / 'documents.parquet')
    .drop(columns=['geometry', 'coords'])
    .rename(columns={'project_id': 'document_collection_id'})
    .explode('relative_paths')
    .reset_index(drop=True)
    [['document_collection_id', 'relative_paths']]
    .rename(columns={'relative_paths': 'relative_path'})
    .rename_axis('document_file_id')
    .reset_index()
)

#document_files.to_parquet(DB_PATH / 'document_files.parquet')


In [None]:
# TODO: Load from Maryam
document_geocodes_raw = pd.read_csv('../data/processing/documents/NYC_geocoded_results.csv')
# Range check on project_id. This is not the cell's last expression, so nothing is displayed.
document_geocodes_raw['project_id'].min(), document_geocodes_raw['project_id'].max() # Ok so I think it MUST be document_collection_id.
# Unfortunately it then doesn't seem like we have a document id, which is *probably* fine.

COLS_TO_EXPORT = ['project_id', 'page_found','raw_cross_streets','geometry']

# Keep only rows that have both coordinates, then build point geometries from them.
document_geocodes_raw_noNA = document_geocodes_raw.dropna(subset=['lon','lat'])
geocode_points = gpd.points_from_xy(
    document_geocodes_raw_noNA['lon'],
    document_geocodes_raw_noNA['lat'],
)
document_geocodes_raw_gdf = gpd.GeoDataFrame(
    document_geocodes_raw_noNA,
    crs='4326',
    geometry=geocode_points,
)

# Export only the selected columns, keyed by document_collection_id.
document_geocodes = (
    document_geocodes_raw_gdf
    .dropna(subset=['geometry'])
    [COLS_TO_EXPORT]
    .rename(columns={'project_id':'document_collection_id'})
)

#document_geocodes.to_parquet(DB_PATH / 'document_collection_geocodes.parquet')




# City Data Projects

## City Data Projects - Something off here

In [None]:
# Load the city-data projects for the configured universe. The universe comes from
# UNIVERSE_NAME (set in the configuration cell) instead of a repeated literal.
citydata_projects = pd.read_parquet(UNIVERSES_PATH / UNIVERSE_NAME / 'projects' / 'citydata_projects.parquet') # This should just be projects
# Display-only: the de-duplicated frame is shown as the cell output but not
# assigned, so `citydata_projects` itself still contains the duplicate rows.
citydata_projects.drop_duplicates(subset=['ProjectID', 'ProjTitle', 'LeadAgency', 'ProjectType'])


Unnamed: 0,ProjectID,ProjTitle,LeadAgency,ProjectType,ProjectStatus,proj_year,SafetyScope,total_scope,location_id,crossstreets,sample
0,3082,Safe Routes to Schools,DOT,CAPITAL RECONSTRUCTION,Completed Project,2015.0,"[Raised Median, Bus Bulb, Curb Extensions]","[Partial Reconstruction, Raised Median, Bus Bu...",43609,"[W 182 ST, WADSWORTH AVE]",Safety
1,868,School Safety,DOT,CAPITAL RECONSTRUCTION,Completed Project,2016.0,"[Median Tip Extension, Bus Bulb, Curb Extensions]","[Partial Reconstruction, Median Tip Extension,...",41675,"[30 AVE, 29 ST]",Safety
2,3083,Safe Routes to Schools,DOT,CAPITAL RECONSTRUCTION,Completed Project,2013.0,"[Median Tip Extension, Curb Extensions]","[Partial Reconstruction, Median Tip Extension,...",37511,"[182 PL, 89 AVE]",Safety
3,1233,Pedestrian Safety Non-Federal,DOT,CAPITAL RECONSTRUCTION,Completed Project,2013.0,"[Bus Bulb, Curb Extensions]","[Bus Bulb, Curb Extensions]",54810,"[71 RD, 136 ST]",Safety
4,3144,PS 199- Greenpoint/48th Ave Safety Improvements,DOT,CAPITAL RECONSTRUCTION,Completed Project,2017.0,"[Lane Removal or Road Narrowing, Curb Extensions]","[Partial Reconstruction, Resurfacing, Lane Rem...",41012,"[40 ST, 47 AVE]",Safety
...,...,...,...,...,...,...,...,...,...,...,...
2644,3122,Columbus Circle Reconstruction,DOT,CAPITAL RECONSTRUCTION,Completed Project,2003.0,[Lane Removal or Road Narrowing],"[Curb to Curb Reconstruction, Sidewalks, Parti...",21548,"[COLUMBUS CIR, CENTRAL PARK S]",postIntervention
2717,575,"RECON OF 50TH ST. BRIDGE OVER LIRR, BAYRIDGE 2...",DOT,CAPITAL RECONSTRUCTION,Completed Project,2003.0,,[Partial Reconstruction],13766,"[18 AVE, 50 ST]",postIntervention
2904,60803,W-230TH & W-232ND FR NETHERLAND TO JOHNSON,DOT,CAPITAL RECONSTRUCTION,Completed Project,2001.0,,"[Curb to Curb Reconstruction, Sidewalks]",9065979,[W 230 ST],postIntervention
2932,3068,Steinway ST BR 2-23061 GCP 2-23061-0,DOT,CAPITAL RECONSTRUCTION,Completed Project,2002.0,,"[Curb to Curb Reconstruction, Sidewalks]",9061704,[STEINWAY ST],postIntervention


## Location to Project

In [None]:
# Load the location -> city-data-project mapping for the configured universe.
location_to_project = pd.read_parquet(UNIVERSES_PATH / UNIVERSE_NAME / 'projects' / 'locationid_2_citydataprojects.parquet')
# Fix: DataFrame.rename with a bare mapping targets index labels, so the column was
# never renamed. Pass `columns=` so 'ProjectID' really becomes 'projid_dontuse'.
# NOTE(review): the projects cell spells this marker 'projID_dontuse' — confirm which casing is intended.
location_to_project = location_to_project.rename(columns={'ProjectID':'projid_dontuse'})
location_to_project.to_parquet(DB_PATH / '_location_to_project.parquet')


## City Data Projects

In [None]:
from st_preprocessing.citydata.cap_recon_pipeline import load_caprecon_file, COLUMNS_TO_KEEP
# Load the 'nyc' cap-recon file, expose its index as `citydata_proj_id`, keep that
# id plus COLUMNS_TO_KEEP, and rename the source ProjectID column.
projects = (
    load_caprecon_file('nyc')
    .reset_index(names='citydata_proj_id')
    [['citydata_proj_id'] + COLUMNS_TO_KEEP]
    .rename(columns={'ProjectID': 'projID_dontuse'})
)
# Display-only view without the FMS/agency code columns; `projects` is left unchanged.
projects.drop(columns=['FMSID', 'FMSAgencyID','Managing Agency', 'ProjectTypeCode'])

Unnamed: 0,citydata_proj_id,projID_dontuse,ProjTitle,LeadAgency,ProjectDescription,ProjectType,ProjectStatus,ConstructionFY,DesignStartDate,ConstructionEndDate,...,OversallScope,SafetyScope,OtherScope,ProjectJustification,OnStreetName,FromStreetName,ToStreetName,OFTCode,DesignFY,geometry
0,48,439,Downtown Brooklyn redevelopment - Fox Square,EDC,Fox Square lies at the convergence of Fulton S...,CAPITAL RECONSTRUCTION,Completed Project,2014,0000/00/00,0000/00/00,...,Partial Reconstruction,Lane Removal or Road Narrowing,Plaza/Ped Space Enhancement,,FLATBUSH AVENUE,NEVINS STREET,,340930364330000000,2012,"MULTIPOINT ((-73.98084 40.68876), (-73.98089 4..."
1,49,439,Downtown Brooklyn redevelopment - Fox Square,EDC,Fox Square lies at the convergence of Fulton S...,CAPITAL RECONSTRUCTION,Completed Project,2014,0000/00/00,0000/00/00,...,Partial Reconstruction,Lane Removal or Road Narrowing,Plaza/Ped Space Enhancement,,FULTON STREET,HUDSON AVENUE,,342730351330000000,2012,MULTIPOINT ((-73.98026 40.68864))
2,50,440,Downtown Brooklyn Redevelopment - Fulton Mall,EDC,Fulton St from Adams St to Flatbush Ave Extens...,CAPITAL RECONSTRUCTION,Completed Project,2009,2011/02/08,2012/04/10,...,Partial Reconstruction,,,,DUFFIELD STREET,FULTON STREET,,334230342730000000,2009,MULTIPOINT ((-73.98443 40.69029))
3,51,440,Downtown Brooklyn Redevelopment - Fulton Mall,EDC,Fulton St from Adams St to Flatbush Ave Extens...,CAPITAL RECONSTRUCTION,Completed Project,2009,2011/02/08,2012/04/10,...,Partial Reconstruction,,,,FULTON STREET,HANOVER PLACE,,342730347100000000,2009,MULTIPOINT ((-73.9821 40.68936))
4,52,440,Downtown Brooklyn Redevelopment - Fulton Mall,EDC,Fulton St from Adams St to Flatbush Ave Extens...,CAPITAL RECONSTRUCTION,Completed Project,2009,2011/02/08,2012/04/10,...,Partial Reconstruction,,,,FLATBUSH AVENUE EXTENSION,FLATBUSH AVENUE,,340830340930000000,2009,MULTIPOINT ((-73.98093 40.6889))
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2794,7628,65354,WATER MAIN IN SOUTHERN BLVD - WESTCHESTER AV &...,DEP,REPLACEMENT OF TRUNK WATER MAIN & SEWER,CAPITAL RECONSTRUCTION,Completed Project,2015,0000/00/00,0000/00/00,...,,,DEP Project,,BOSTON ROAD,WEST FARMS ROAD,,213820276450000000,0,MULTIPOINT ((-73.88016 40.84019))
2795,7629,65354,WATER MAIN IN SOUTHERN BLVD - WESTCHESTER AV &...,DEP,REPLACEMENT OF TRUNK WATER MAIN & SEWER,CAPITAL RECONSTRUCTION,Completed Project,2015,0000/00/00,0000/00/00,...,,,DEP Project,,BOSTON ROAD,EAST 174 STREET,,213820226880000000,0,MULTIPOINT ((-73.88777 40.83737))
2796,7630,65354,WATER MAIN IN SOUTHERN BLVD - WESTCHESTER AV &...,DEP,REPLACEMENT OF TRUNK WATER MAIN & SEWER,CAPITAL RECONSTRUCTION,Completed Project,2015,0000/00/00,0000/00/00,...,,,DEP Project,,BEND,BRONX STREET,,212550215025000000,0,MULTIPOINT ((-73.87807 40.84106))
2797,7631,65907,Howland Hook - Forest Avenue access Improvements,DOT,Improvements to turning lanes access to the Ho...,CAPITAL RECONSTRUCTION,Substantial Completion,2022,0000/00/00,0000/00/00,...,Partial Reconstruction,Bike/GW Enhancement,,Improvements to turning lanes,WESTERN AVENUE,GOETHALS BRIDGE APPROACH,,554940597040000000,0,"MULTIPOINT ((-74.18649 40.63026), (-74.18651 4..."


# Try out things

In [216]:
# Reload the tables stored under DB_PATH.
# Helper that prints one read statement per file found in DB_PATH:
#[print(f'{x.stem} = gpd.read_parquet(DB_PATH / "{x.name}")') for x in DB_PATH.iterdir()]

# Tables read with geopandas.
locations = gpd.read_parquet(DB_PATH / "locations.parquet")
project_geos = gpd.read_parquet(DB_PATH / "project_geos.parquet")

# Tables read with pandas.
projects = pd.read_parquet(DB_PATH / "projects.parquet")
project_total_scopes = pd.read_parquet(DB_PATH / "citydata_project_total_scopes.parquet")
project_safety_scopes = pd.read_parquet(DB_PATH / "citydata_project_safety_scopes.parquet")
document_files = pd.read_parquet(DB_PATH / "document_files.parquet")
document_collections = pd.read_parquet(DB_PATH / "document_collections.parquet") # De-duped
location_year_files = pd.read_parquet(DB_PATH / "location_year_files.parquet")
location_to_project = pd.read_parquet(DB_PATH / "_location_to_project.parquet") # De-duped

In [234]:
# Scratch cell: the commented-out lines below are earlier de-duplication
# experiments kept for reference; only the last statement has an effect.
# # location_to_project.shape
# location_to_project#.drop_duplicates(subset = ['location_id', 'citydata_project_id']).reset_index(drop=True).to_parquet(DB_PATH / "location_to_project.parquet")
# #document_collections.drop_duplicates(['document_collection_id'])
# location_year_files.drop_duplicates(subset=['location_id', 'year'])
# #location_year_files#.drop_duplicates(subset=['location_id', 'year'])
# #locations#.drop_duplicates(subset=['location_id'])
# location_to_project.shape # 7236
# location_to_project#.drop_duplicates(subset=['location_id','citydata_project_id']).to_parquet(DB_PATH / "location_to_project.parquet")

# citydata_projects.drop_duplicates(subset=['citydata_proj_id'])
# citydata_project_geos.drop_duplicates(subset=['citydata_proj_id', 'geometry'])

#project_safety_scopes.drop_duplicates().reset_index(drop=True).to_parquet(DB_PATH / 'project_safety_scopes.parquet')
#project_total_scopes.drop_duplicates().reset_index(drop=True).to_parquet(DB_PATH / 'project_total_scopes.parquet')
# The three bare names below are not the cell's last expression, so they display nothing.
projects
project_total_scopes
project_safety_scopes
#location_to_project = location_to_project.drop('ProjectID', axis=1).rename(columns={'citydata_project_id': 'citydata_proj_id'})
# Writes location_to_project to '_location_to_project.parquet', the same file name
# that the table-loading cell reads it from.
location_to_project.to_parquet(DB_PATH / '_location_to_project.parquet')

In [5]:
from streettransformer.llms.queries import QUERIES
# Print the prompt text registered under the 'image_change_identifier' key.
image_change_query = QUERIES['image_change_identifier']
print(image_change_query.text())


        Role: You are a Transportation Engineer employed by the city tasked with analyzing changes in intersection streetscape over time.

        Goal: Your goal in this task is to look at two satellite images taken of the same location at different times and identify if there are any changes in the structural street design which may have taken place between the snapshots. Do NOT hesitate to say there is not significant change if you do not see them. The first image is the before and and the second is the after. Limit this analysis to only capital reconstruction features including: 'Curb Extensions', 'New or Expanded Median/Pedestrian Refuge Island', 'Bike Enhancement', 'Median Tip Extension', 'Raised Median', 'Lane Removal or Road Narrowing', 'Bus Bulb', 'Shared Street')

        Respond: Please respond in a well formatted json exclusively with 3 tags:
	- change_detected
	- features_detected
	- confidence
        
