In [1]:
import pandas as pd
import datetime as dt
import geopandas as gpd
from shapely.geometry import Point

In [2]:
# Load the raw affordable-housing CSV; the path is relative to the notebook's
# working directory.
raw_csv_path = 'Data/Raw/Affordable_Housing.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Keep only the date, tract and coordinate columns, then drop rows that lack
# a latitude or longitude.
# dropna is chained (not inplace) so `df` is rebound to a fresh frame rather
# than mutating a column subset in place, which is the pattern that triggers
# pandas' SettingWithCopyWarning.
df = (
    df[['Project Start Date',
        'Project Completion Date', 'Census Tract', 'Latitude',
        'Longitude', 'Building Completion Date']]
    .dropna(subset=['Latitude', 'Longitude'])
)

In [4]:
# Parse the project start dates and store the calendar year of each one in a
# new `year` column.
start_dates = pd.to_datetime(df['Project Start Date'])
df['year'] = start_dates.dt.year
df

Unnamed: 0,Project Start Date,Project Completion Date,Census Tract,Latitude,Longitude,Building Completion Date,year
1,06/28/2024,,1.0,40.699994,-73.990980,,2024
2,06/28/2024,,523.0,40.712440,-73.960722,,2024
3,06/28/2024,,523.0,40.713406,-73.958611,,2024
4,06/28/2024,,523.0,40.712486,-73.957789,,2024
5,06/28/2024,,523.0,40.712586,-73.960235,,2024
...,...,...,...,...,...,...,...
7621,01/21/2014,01/21/2014,189.0,40.798269,-73.962676,01/21/2014,2014
7626,01/15/2014,04/25/2016,1126.0,40.664986,-73.895100,10/21/2015,2014
7627,01/15/2014,04/25/2016,1126.0,40.664945,-73.895111,04/22/2016,2014
7628,01/15/2014,04/25/2016,1130.0,40.664166,-73.895855,04/25/2016,2014


### Convert coordinates to Census Tracts

In [5]:
# Build a (latitude, longitude) -> census tract GEOID lookup and attach the
# GEOID to every housing record.

# Load the census tract shapefile and reproject it to EPSG:4326, the CRS
# assigned to the point layer built below.
tracts = gpd.read_file('Data/Raw/tracts2020_shapefile/nyct2020.shp')
tracts = tracts.to_crs(epsg=4326)

# Spatially join each distinct coordinate pair once rather than once per row.
coords_df = df[["Latitude", "Longitude"]].drop_duplicates().reset_index(drop=True)

# Create a GeoDataFrame from the coordinates (x = longitude, y = latitude).
# points_from_xy builds the geometries in one vectorized call.
geometry = gpd.points_from_xy(coords_df["Longitude"], coords_df["Latitude"])
geo_df = gpd.GeoDataFrame(
    {"latitude": coords_df["Latitude"], "longitude": coords_df["Longitude"]},
    geometry=geometry,
    crs="EPSG:4326",
)

# Perform a spatial join to match points to census tracts. how='left' keeps
# points that fall inside no tract; their GEOID is missing.
ct_lookup = gpd.sjoin(geo_df, tracts, how='left', predicate="within")[["latitude", "longitude", "GEOID"]]

# If a point were matched by more than one tract polygon, the lookup would
# hold several rows for it and the merge below would duplicate the housing
# records. Keep a single tract per coordinate pair.
ct_lookup = ct_lookup.drop_duplicates(subset=["latitude", "longitude"])

# Merge our lats/longs with the lookup table we created. validate asserts
# that each coordinate pair maps to at most one lookup row.
df_ct = df.merge(
    ct_lookup,
    left_on=["Latitude", "Longitude"],
    right_on=["latitude", "longitude"],
    validate="many_to_one",
).drop(columns=['latitude', 'longitude', 'Project Completion Date', 'Census Tract',
                'Latitude', 'Longitude', 'Building Completion Date'])
df_ct

Unnamed: 0,Project Start Date,year,GEOID
0,06/28/2024,2024,36047000100
1,06/28/2024,2024,36047052300
2,06/28/2024,2024,36047052300
3,06/28/2024,2024,36047052300
4,06/28/2024,2024,36047052300
...,...,...,...
6101,01/21/2014,2014,36061018900
6102,01/15/2014,2014,36047112600
6103,01/15/2014,2014,36047112600
6104,01/15/2014,2014,36047113000


In [31]:
# Count the records per census tract (GEOID) and start year; the count of
# 'Project Start Date' becomes the number of affordable housing records.
grouped = (
    df_ct.groupby(['GEOID', 'year'])
    .count()
    .reset_index()
)
grouped.rename(columns={'Project Start Date': 'num_affordable_hous_built'}, inplace=True)

# Parse the year into a timestamp so the frame can be resampled on it.
grouped['year'] = pd.to_datetime(grouped['year'].astype(str))

# Resample each tract to year-end frequency. Years between a tract's first
# and last observed year that have no records show up as NaN.
yearly = (
    grouped.groupby('GEOID')[['year', 'num_affordable_hous_built']]
    .apply(lambda tract: tract.resample("YE", on='year').mean())
    .reset_index()
)

Unnamed: 0,GEOID,year,num_affordable_hous_built
0,36005001600,2017-12-31,9.0
1,36005001600,2018-12-31,
2,36005001600,2019-12-31,1.0
3,36005001600,2020-12-31,
4,36005001600,2021-12-31,
...,...,...,...
4779,36085020702,2018-12-31,1.0
4780,36085020702,2019-12-31,1.0
4781,36085022300,2015-12-31,1.0
4782,36085024800,2023-12-31,1.0


In [39]:
# Give every tract one row per year-end from 2014 through 2023. Years missing
# from `yearly` come back as NaN, and years outside the range are dropped by
# the reindex. The reindexed date index is unnamed, so reset_index() exposes
# it as the column `level_1`.
full_date_range = pd.date_range(start='2014-12-31', end='2023-12-31', freq='YE')
more_years = (
    yearly.groupby('GEOID')[['year', 'num_affordable_hous_built']]
    .apply(lambda tract: tract.set_index('year').reindex(full_date_range))
    .reset_index()
)
more_years

Unnamed: 0,GEOID,level_1,num_affordable_hous_built
0,36005001600,2014-12-31,
1,36005001600,2015-12-31,
2,36005001600,2016-12-31,
3,36005001600,2017-12-31,9.0
4,36005001600,2018-12-31,
...,...,...,...
10685,36085031902,2019-12-31,
10686,36085031902,2020-12-31,
10687,36085031902,2021-12-31,
10688,36085031902,2022-12-31,


In [76]:
# Tract-years without a value are set to zero.
more_years = more_years.fillna(0)

# Extend every tract's date range back to 2010; the newly added 2010-2013
# rows are NaN at this point. The date column is renamed from `level_1`
# to `year`.
full_date_range = pd.date_range(start='2010-12-31', end='2023-12-31', freq='YE')
all_years = (
    more_years.groupby('GEOID')[['level_1', 'num_affordable_hous_built']]
    .apply(lambda tract: tract.set_index('level_1').reindex(full_date_range))
    .reset_index()
    .rename(columns={'level_1': 'year'})
)


In [77]:
# Interpolate missing years separately for each tract.
#
# The interpolation must run inside the per-GEOID group. Applying it to the
# concatenated result interpolates down the whole column, so a tract's NaN
# rows at the start of its range would be filled with values blended from the
# previous tract's last year and this tract's first non-missing year.
all_years = (
    all_years.groupby('GEOID')[['year', 'num_affordable_hous_built']]
    .apply(
        lambda tract: tract.resample("YE", on='year')
        .mean()
        .interpolate(method='linear')
    )
    .reset_index()
)

# Linear interpolation leaves NaNs that precede a tract's first value
# untouched, so the remaining empty years are filled with zero.
all_years = all_years.fillna(0)

# Replace the year-end timestamps with plain calendar years.
all_years['year'] = all_years['year'].dt.year

In [78]:
# Spot-check ten randomly drawn tract-year rows.
all_years.sample(n=10)

Unnamed: 0,GEOID,year,num_affordable_hous_built
2948,36005039901,2018,0.0
11106,36061023000,2014,11.0
9390,36061003002,2020,0.0
8080,36047083400,2012,0.0
8564,36047105804,2020,0.0
9797,36061009000,2021,0.0
9530,36061004400,2020,0.0
12024,36081009700,2022,0.0
1811,36005023900,2015,0.0
8355,36047089800,2021,0.0


In [79]:
# all_years.to_parquet('Data/Cleaned/affordable_housing.parquet')