In [24]:
# !pip install libpysal

In [28]:
import pandas as pd
import numpy as np
import geopandas as gpd
import libpysal
import matplotlib.pyplot as plt
import pickle

In [6]:
df = pd.read_parquet('Data/Cleaned/after_kriging.parquet')
labels = pd.read_parquet('Data/Cleaned/labels.parquet')

## Generate Features for X-Year Percent Change
**We train on a 6 year period, and predict 4 years into the future**
- train on features 2010-2016 to predict gentrification status in 2020 (labels from 2020, eligible tracts from 2010)
- train on features 2013-2019 to predict gentrification status in 2023 (labels from 2023, eligible tracts from 2013)
- then use features 2017-2023 to predict gentrification status in 2027 (no labels, eligible tracts are taken from 2017)

In [42]:
df_10_16 = df[df.year.isin(['2010','2011','2012','2013','2014','2015','2016'])]  # training set
df_13_19 = df[df.year.isin(['2013','2014','2015','2016','2017','2018','2019'])]  # training set
df_17_23 = df[df.year.isin(['2017','2018','2019','2020','2021','2022','2023'])]  # pred set

In [48]:
# TODO we have missing years which will mess up calculation of pct change. need to interpolate missing years

df_10_16 = df_10_16.drop(['NAME','year', 'full_tract','random_lat', 'random_lon'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().replace([np.inf, -np.inf], np.nan).mean())
df_13_19 = df_13_19.drop(['NAME','year', 'full_tract','random_lat', 'random_lon'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().replace([np.inf, -np.inf], np.nan).mean())
df_17_23 = df_17_23.drop(['NAME','year', 'full_tract','random_lat', 'random_lon'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().replace([np.inf, -np.inf], np.nan).mean())

df_10_16['pred_year']='2020'
df_13_19['pred_year']='2023'
df_17_23['pred_year']='2027'


  df_10_16 = df_10_16.drop(['NAME','year', 'full_tract','random_lat', 'random_lon'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().replace([np.inf, -np.inf], np.nan).mean())


In [55]:
df_train = pd.concat([df_10_16, df_13_19, df_17_23])

# drop rows that became entirely null bc of the % change
df_train.dropna(how='all', subset=['percent_hh_income_assist', 'percent_hh_snap',
       'percent_hh_english', 'percent_out_of_county',
       'percent_moved_within_county', 'percent_public_transit',
       'percent_drive_commute', 'percent_bike_commute', 'percent_walk_commute',
       'percent_wfh', 'percent_white', 'percent_black', 'percent_native',
       'percent_asian', 'percent_pacific', 'percent_latino',
       'percent_work_agriculture', 'percent_work_construction',
       'percent_work_retail', 'percent_work_finance', 'percent_work_stem',
       'percent_work_edu_health', 'percent_work_art', 'percent_bachelors',
       'percent_grad', 'percent_born_citizen', 'percent_naturalized_citizen',
       'percent_not_citizen', 'percent_same_house_1yr', 'percent_out_of_state',
       'med_income', 'med_rent_acs', 'med_age', 'percent_hh_rented',
       'med_value', 'percent_hh_electric', 'percent_hh_solar',
       'med_hh_age', 'med_hh_tenure'], inplace=True)

df_train.fillna(0, inplace=True)
df_train.reset_index(inplace=True)
df_train

Unnamed: 0,GEOID,total_pop,percent_hh_income_assist,percent_hh_snap,percent_hh_english,percent_out_of_county,percent_moved_within_county,percent_public_transit,percent_drive_commute,percent_bike_commute,...,num_citibike_rides,num_trees,med_rent,num_affordable_hous_built,num_bikelanes,num_evictions_commercial,num_evictions_residential,park_acres,num_parks,pred_year
0,36005000100,-0.068661,0.003004,0.037381,-0.002021,-0.014333,0.065448,0.000000,0.000000,0.000000,...,0.0,0.058478,0.196342,0.0,0.000000,0.000000,0.000000,0.0,0.0,2020
1,36005000200,0.034962,0.218237,0.201764,-0.026817,-0.077578,-0.105136,0.033405,-0.021183,0.000000,...,0.0,0.039684,0.251055,0.0,0.000000,0.000000,0.000000,0.0,0.0,2020
2,36005000400,0.034124,0.786830,0.168444,-0.006987,-0.107590,-0.004790,0.028070,-0.028604,-0.187822,...,0.0,0.065020,0.255564,0.0,0.070962,0.000000,0.000000,0.0,0.0,2020
3,36005001600,0.028238,0.105534,0.119549,-0.037455,0.232766,-0.104326,-0.015358,-0.022066,0.000000,...,0.0,0.063187,0.283414,0.0,0.069444,0.000000,0.000000,0.0,0.0,2020
4,36005001901,0.003967,-0.026514,0.012163,-0.003241,-0.056242,-0.024531,0.015635,-0.039269,4.740955,...,0.0,0.055470,0.070139,0.0,0.000000,0.000000,0.000000,0.0,0.0,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6970,36085030302,-0.001154,0.215473,0.011413,-0.021104,0.698988,-0.034859,0.041904,-0.004503,0.000000,...,0.0,0.002710,-0.009754,0.0,0.000000,0.019099,1.119028,0.0,0.0,2027
6971,36085031901,0.061577,-0.040771,-0.050215,-0.006222,-0.330398,-0.189565,0.168862,0.013573,-0.225023,...,0.0,0.038060,-0.008647,0.0,0.000000,-1.000000,0.116167,0.0,0.0,2027
6972,36085031902,-0.019885,0.048796,0.054946,-0.015290,0.198931,0.014783,0.004287,0.073021,0.000000,...,0.0,0.039666,-0.002506,0.0,0.000000,-1.000000,1.545939,0.0,0.0,2027
6973,36085032300,-0.006561,-0.142393,0.053304,-0.043870,0.000000,1.860428,0.091209,-0.054103,0.000000,...,0.0,-0.019825,-0.009754,0.0,0.105556,-1.000000,-0.137463,0.0,0.0,2027


## Spatial Lag Feature
Feature to measure proximity to already-gentrified / super-gentrified tracts

In [33]:
# Load the census tract shapefile
gdf = gpd.read_file('Data/Raw/tracts2020_shapefile/nyct2020.shp')
gdf = gdf.to_crs(epsg = 4326)

gdf = gdf.merge(labels, on='GEOID')


In [40]:
# Create a weights matrix based on contiguity (e.g., Queen's case)
w = libpysal.weights.contiguity.Queen.from_dataframe(gdf)
    
# Row-standardize the weights (optional, but often recommended)
w.transform = 'R'

# Calculate the spatial lag
spatial_lag = libpysal.weights.spatial_lag.lag_spatial(w, gdf['vulnerable'])

# Add the spatial lag as a new column in the GeoDataFrame
gdf['lag_' + 'vulnerable'] = spatial_lag


# Identify observations with no neighbors
islands = w.islands
if islands:
    # Assign the original value to the spatial lag for islands
    gdf.loc[gdf.index.isin(islands), 'lag_' + 'vulnerable'] = gdf.loc[gdf.index.isin(islands), 'vulnerable']

lag_df = gdf[['NTAName','GEOID','vulnerable','pred_year','lag_vulnerable']]

In [41]:
lag_df.sample(10)

Unnamed: 0,NTAName,GEOID,vulnerable,pred_year,lag_vulnerable
503,Soundview-Bruckner-Bronx River,36005004800,1,2027,1.0
2422,Fresh Meadows-Utopia,36081134701,0,2023,0.0
2390,East Flushing,36081120702,1,2027,0.176471
5392,Canarsie,36047095600,0,2023,0.230769
4239,Crown Heights (North),36047030700,1,2020,1.0
5952,Bedford-Stuyvesant (East),36047038700,1,2020,0.95
4700,Kensington,36047049200,0,2027,0.538462
2921,Breezy Point-Belle Harbor-Rockaway Park-Broad ...,36081107201,0,2027,0.2
4528,Gravesend (East)-Homecrest,36047041800,0,2023,0.0
1906,Queens Village,36081055400,0,2023,0.2


In [58]:
# add spatial lag feature to the main df
df_train = df_train.merge(lag_df[['lag_vulnerable','GEOID','pred_year']], on=['GEOID','pred_year'])

In [59]:
# save features
# df_train.to_parquet('Data/Cleaned/post_feat_engineering.parquet')