In [31]:
import pandas as pd
import datetime as dt
import geopandas as gpd
from shapely.geometry import Point

In [22]:
# Load the raw affordable-housing CSV into a DataFrame.
raw_csv_path = 'Data/Raw/Affordable_Housing.csv'
df = pd.read_csv(raw_csv_path)

In [23]:
# Columns kept for the rest of the notebook: project/building dates, the
# dataset's own census tract field, and the coordinates.
keep_cols = [
    'Project Start Date',
    'Project Completion Date',
    'Census Tract',
    'Latitude',
    'Longitude',
    'Building Completion Date',
]

# Rows missing either coordinate are removed here, in the same expression
# that narrows the columns.
df = df[keep_cols].dropna(subset=['Latitude', 'Longitude'])

In [27]:
# Parse the project start dates, then keep only the calendar year as a new
# 'year' column.
start_dates = pd.to_datetime(df['Project Start Date'])
df['year'] = start_dates.dt.year
df

Unnamed: 0,Project Start Date,Project Completion Date,Census Tract,Latitude,Longitude,Building Completion Date,year
1,06/28/2024,,1.0,40.699994,-73.990980,,2024
2,06/28/2024,,523.0,40.712440,-73.960722,,2024
3,06/28/2024,,523.0,40.713406,-73.958611,,2024
4,06/28/2024,,523.0,40.712486,-73.957789,,2024
5,06/28/2024,,523.0,40.712586,-73.960235,,2024
...,...,...,...,...,...,...,...
7621,01/21/2014,01/21/2014,189.0,40.798269,-73.962676,01/21/2014,2014
7626,01/15/2014,04/25/2016,1126.0,40.664986,-73.895100,10/21/2015,2014
7627,01/15/2014,04/25/2016,1126.0,40.664945,-73.895111,04/22/2016,2014
7628,01/15/2014,04/25/2016,1130.0,40.664166,-73.895855,04/25/2016,2014


### Convert coordinates to Census Tracts

In [38]:
# Build a (latitude, longitude) -> census tract GEOID lookup with a spatial
# join, then attach the GEOID to every housing record.

# Load the census tract shapefile and reproject it to EPSG:4326 so the
# polygons share a CRS with the latitude/longitude points built below.
tracts = gpd.read_file('Data/Raw/tracts2020_shapefile/nyct2020.shp')
tracts = tracts.to_crs(epsg=4326)

# Join each distinct coordinate pair once instead of once per record.
coords_df = df[["Latitude", "Longitude"]].drop_duplicates().reset_index(drop=True)

# Create a GeoDataFrame of points straight from the coordinate columns.
# points_from_xy takes x (longitude) first, then y (latitude).
geo_df = gpd.GeoDataFrame(
    {"latitude": coords_df["Latitude"], "longitude": coords_df["Longitude"]},
    geometry=gpd.points_from_xy(coords_df["Longitude"], coords_df["Latitude"]),
    crs="EPSG:4326",
)

# Perform a spatial join to match points to census tracts.
# how='left' keeps points that fall inside no tract; those rows get a
# missing GEOID.
ct_lookup = gpd.sjoin(geo_df, tracts, how='left', predicate="within")[["latitude", "longitude", "GEOID"]]

# Merge our lats/longs with the lookup table we created.
# validate="many_to_one" makes the merge raise if any coordinate pair appears
# more than once in the lookup (i.e. a point matched several tract polygons);
# without it such duplicates would silently multiply housing records.
df_ct = df.merge(
    ct_lookup,
    left_on=["Latitude", "Longitude"],
    right_on=["latitude", "longitude"],
    validate="many_to_one",
).drop(
    columns=['latitude', 'longitude', 'Project Completion Date', 'Census Tract',
             'Latitude', 'Longitude', 'Building Completion Date']
)
df_ct

Unnamed: 0,Project Start Date,year,GEOID
0,06/28/2024,2024,36047000100
1,06/28/2024,2024,36047052300
2,06/28/2024,2024,36047052300
3,06/28/2024,2024,36047052300
4,06/28/2024,2024,36047052300
...,...,...,...
6101,01/21/2014,2014,36061018900
6102,01/15/2014,2014,36047112600
6103,01/15/2014,2014,36047112600
6104,01/15/2014,2014,36047113000


In [44]:
# Count records per (GEOID, year) pair. The count is taken over the
# non-null 'Project Start Date' values and stored under its final column name.
grouped = (
    df_ct
    .groupby(['GEOID', 'year'], as_index=False)
    .agg(num_affordable_hous_built=('Project Start Date', 'count'))
)

In [45]:
# Display the per-GEOID, per-year counts.
grouped

Unnamed: 0,GEOID,year,num_affordable_hous_built
0,36005001600,2017,9
1,36005001600,2019,1
2,36005001600,2022,1
3,36005001600,2023,3
4,36005001901,2020,2
...,...,...,...
2854,36085020702,2018,1
2855,36085020702,2019,1
2856,36085022300,2015,1
2857,36085024800,2023,1


In [46]:
# Export is currently disabled; uncomment the line below to write the aggregated table to disk.
# grouped.to_parquet('Data/Cleaned/affordable_housing.parquet')