In [1]:
import pandas as pd
import numpy as np
import os
import geopandas as gpd
from shapely.geometry import Point
import datetime as dt

In [2]:
df = pd.read_csv('Data/Raw/Evictions.csv')
df = df[['Executed Date','Residential/Commercial','Latitude', 'Longitude']]

In [3]:
# change to datetime

df['year'] = pd.to_datetime(df['Executed Date']).dt.year
df

Unnamed: 0,Executed Date,Residential/Commercial,Latitude,Longitude,year
0,10/02/2024,Residential,40.852537,-73.907293,2024
1,10/21/2024,Residential,40.868402,-73.916505,2024
2,01/02/2024,Residential,40.876181,-73.909426,2024
3,10/22/2024,Residential,40.828490,-73.880472,2024
4,01/09/2024,Residential,40.781716,-73.954878,2024
...,...,...,...,...,...
103800,11/30/2023,Commercial,,,2023
103801,02/11/2020,Residential,,,2020
103802,06/22/2017,Residential,40.878476,-73.855940,2017
103803,10/16/2023,Residential,40.854460,-73.930418,2023


In [4]:
# map coordinates to census tracts

# Load the census tract shapefile
tracts = gpd.read_file('Data/Raw/tracts2020_shapefile/nyct2020.shp')
tracts = tracts.to_crs(epsg = 4326)

coords_df = df[["Latitude","Longitude"]].drop_duplicates().reset_index(drop=True)
coords = [tuple(record) for record in coords_df.to_records(index=False)]

# Create a GeoDataFrame from the coordinates
geometry = [Point(lon, lat) for lat, lon in coords]
geo_df = gpd.GeoDataFrame(geometry=geometry, crs="EPSG:4326")
geo_df["latitude"] = coords_df["Latitude"]
geo_df["longitude"] = coords_df["Longitude"]

# Perform a spatial join to match points to census tracts
ct_lookup = gpd.sjoin(geo_df, tracts, how='left', predicate="within")[["latitude", "longitude", "GEOID"]]

# merge our lats/longs with the lookup table we created
df_ct = df.merge(ct_lookup, left_on=["Latitude", "Longitude"], right_on=["latitude", "longitude"])
df_ct.drop(['latitude','longitude'], axis=1, inplace=True)
df_ct

Unnamed: 0,Executed Date,Residential/Commercial,Latitude,Longitude,year,GEOID
0,10/02/2024,Residential,40.852537,-73.907293,2024,36005024100
1,10/21/2024,Residential,40.868402,-73.916505,2024,36061030300
2,01/02/2024,Residential,40.876181,-73.909426,2024,36061030900
3,10/22/2024,Residential,40.828490,-73.880472,2024,36005005400
4,01/09/2024,Residential,40.781716,-73.954878,2024,36061014802
...,...,...,...,...,...,...
103800,11/30/2023,Commercial,,,2023,
103801,02/11/2020,Residential,,,2020,
103802,06/22/2017,Residential,40.878476,-73.855940,2017,36005038800
103803,10/16/2023,Residential,40.854460,-73.930418,2023,36061027900


In [5]:
# group by type of eviction (residential vs commercial)
grouped = df_ct.groupby(['year', 'GEOID', 'Residential/Commercial'])['Executed Date'].count().reset_index()

# pivot to separate counts for each eviction type
df = grouped.pivot_table(index=['year','GEOID'],columns='Residential/Commercial', values='Executed Date').reset_index().rename_axis(None, axis=1)
df.fillna(0, inplace=True)
df.rename(columns={'Commercial':'num_evictions_commercial', 'Residential':'num_evictions_residential'}, inplace=True)

In [14]:
# add more years in between
df['year'] = pd.to_datetime(df['year'].astype(str))
yearly = df.groupby('GEOID')[['year','num_evictions_commercial', 'num_evictions_residential']].apply(lambda x: x.resample("YE", on='year').mean()).reset_index()
yearly

Unnamed: 0,GEOID,year,num_evictions_commercial,num_evictions_residential
0,36005000200,2017-12-31,0.0,7.0
1,36005000200,2018-12-31,0.0,9.0
2,36005000200,2019-12-31,0.0,10.0
3,36005000200,2020-12-31,,
4,36005000200,2021-12-31,,
...,...,...,...,...
17658,36085032300,2020-12-31,,
17659,36085032300,2021-12-31,0.0,1.0
17660,36085032300,2022-12-31,0.0,1.0
17661,36085032300,2023-12-31,1.0,1.0


In [17]:
# fill more years in between
full_date_range = pd.date_range(start='2017-12-31', end='2023-12-31', freq='YE')
more_years = yearly.groupby('GEOID')[['year','num_evictions_commercial', 'num_evictions_residential']].apply(lambda x: x.set_index('year').reindex(full_date_range)).reset_index()
more_years

Unnamed: 0,GEOID,level_1,num_evictions_commercial,num_evictions_residential
0,36005000200,2017-12-31,0.0,7.0
1,36005000200,2018-12-31,0.0,9.0
2,36005000200,2019-12-31,0.0,10.0
3,36005000200,2020-12-31,,
4,36005000200,2021-12-31,,
...,...,...,...,...
15612,36085032300,2019-12-31,0.0,4.0
15613,36085032300,2020-12-31,,
15614,36085032300,2021-12-31,0.0,1.0
15615,36085032300,2022-12-31,0.0,1.0


In [18]:
# fill empty years with zero
more_years.fillna(0,inplace=True)

# add more years at the beginning
full_date_range = pd.date_range(start='2010-12-31', end='2023-12-31', freq='YE')
all_years = more_years.groupby('GEOID')[['level_1','num_evictions_commercial', 'num_evictions_residential']].apply(lambda x: x.set_index('level_1').reindex(full_date_range)).reset_index()
all_years.rename(columns={'level_1':'year'}, inplace=True)
all_years

Unnamed: 0,GEOID,year,num_evictions_commercial,num_evictions_residential
0,36005000200,2010-12-31,,
1,36005000200,2011-12-31,,
2,36005000200,2012-12-31,,
3,36005000200,2013-12-31,,
4,36005000200,2014-12-31,,
...,...,...,...,...
31229,36085032300,2019-12-31,0.0,4.0
31230,36085032300,2020-12-31,0.0,0.0
31231,36085032300,2021-12-31,0.0,1.0
31232,36085032300,2022-12-31,0.0,1.0


In [19]:
# interpolate the empty years at the beginning

all_years = all_years.groupby('GEOID')[['year','num_evictions_commercial', 'num_evictions_residential']].apply(lambda x: x.resample("YE", on='year').mean()).apply(lambda x: x.interpolate(method='linear')).reset_index()

# fill empty years with zero
all_years.fillna(0,inplace=True)
all_years['year'] = all_years['year'].dt.year
all_years

Unnamed: 0,GEOID,year,num_evictions_commercial,num_evictions_residential
0,36005000200,2010,0.0,0.0
1,36005000200,2011,0.0,0.0
2,36005000200,2012,0.0,0.0
3,36005000200,2013,0.0,0.0
4,36005000200,2014,0.0,0.0
...,...,...,...,...
31229,36085032300,2019,0.0,4.0
31230,36085032300,2020,0.0,0.0
31231,36085032300,2021,0.0,1.0
31232,36085032300,2022,0.0,1.0


In [20]:
all_years.sample(5)

Unnamed: 0,GEOID,year,num_evictions_commercial,num_evictions_residential
1686,36005019500,2016,0.0,37.125
14887,36047103401,2015,0.0,2.5
30344,36085014608,2016,0.125,0.375
23377,36081032800,2021,0.0,0.0
13067,36047074600,2015,0.0,1.0


In [21]:
# all_years.to_parquet('Data/Cleaned/evictions.parquet')