In [24]:
# !pip install libpysal

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import libpysal
import matplotlib.pyplot as plt
import pickle

In [2]:
df = pd.read_parquet('Data/Cleaned/after_kriging.parquet')
labels = pd.read_parquet('Data/Cleaned/labels.parquet')

## Generate Features for X-Year Percent Change
**We train on a 6 year period, and predict 4 years into the future**
- train on features 2010-2016 to predict gentrification status in 2020 (labels from 2020, eligible tracts from 2010)
- train on features 2013-2019 to predict gentrification status in 2023 (labels from 2023, eligible tracts from 2013)
- then use features 2017-2023 to predict gentrification status in 2027 (no labels, eligible tracts are taken from 2017)

In [3]:
df_10_16 = df[df.year.isin(['2010','2011','2012','2013','2014','2015','2016'])]  # training set
df_13_19 = df[df.year.isin(['2013','2014','2015','2016','2017','2018','2019'])]  # training set
df_17_23 = df[df.year.isin(['2017','2018','2019','2020','2021','2022','2023'])]  # pred set

In [4]:
# TODO we have missing years which will mess up calculation of pct change. need to interpolate missing years

df_10_16 = df_10_16.drop(['NAME','year', 'full_tract','random_lat', 'random_lon'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().replace([np.inf, -np.inf], np.nan).mean())
df_13_19 = df_13_19.drop(['NAME','year', 'full_tract','random_lat', 'random_lon'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().replace([np.inf, -np.inf], np.nan).mean())
df_17_23 = df_17_23.drop(['NAME','year', 'full_tract','random_lat', 'random_lon'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().replace([np.inf, -np.inf], np.nan).mean())

df_10_16['pred_year']='2020'
df_13_19['pred_year']='2023'
df_17_23['pred_year']='2027'


In [6]:
df_train = pd.concat([df_10_16, df_13_19, df_17_23])
print(df_train.shape)

# drop rows that became entirely null bc of the % change
df = df_train.dropna(how='all', subset=['percent_hh_income_assist', 'percent_hh_snap',
       'percent_hh_english', 'percent_out_of_county',
       'percent_moved_within_county', 'percent_public_transit',
       'percent_drive_commute', 'percent_bike_commute', 'percent_walk_commute',
       'percent_wfh', 'percent_white', 'percent_black', 'percent_native',
       'percent_asian', 'percent_pacific', 'percent_latino',
       'percent_work_agriculture', 'percent_work_construction',
       'percent_work_retail', 'percent_work_finance', 'percent_work_stem',
       'percent_work_edu_health', 'percent_work_art', 'percent_bachelors',
       'percent_grad', 'percent_born_citizen', 'percent_naturalized_citizen',
       'percent_not_citizen', 'percent_same_house_1yr', 'percent_out_of_state',
       'med_income', 'med_rent_acs', 'med_age', 'percent_hh_rented',
       'med_value', 'percent_hh_electric', 'percent_hh_solar',
       'med_hh_age', 'med_hh_tenure'])

df.fillna(0, inplace=True)
df.reset_index(inplace=True)
df.sample(10)

(6975, 50)


Unnamed: 0,GEOID,total_pop,percent_hh_income_assist,percent_hh_snap,percent_hh_english,percent_work_agriculture,percent_work_construction,percent_work_retail,percent_work_finance,percent_work_stem,...,num_citibike_rides,num_trees,med_rent,num_affordable_hous_built,num_bikelanes,num_evictions_commercial,num_evictions_residential,park_acres,num_parks,pred_year
3265,36047068200,0.004142,0.217278,0.145217,-0.003976,0.0,0.083586,0.02318,0.242947,-0.035113,...,0.0,0.033036,-0.009694,0.0,0.0,0.0,-0.104289,0.0,0.0,2023
6503,36081051700,0.005346,-0.310471,0.160635,-0.01283,-0.283404,0.034084,-0.029437,0.040793,0.011684,...,0.0,-0.009034,5.5e-05,0.0,0.0,0.0,0.0625,0.0,0.0,2027
3872,36081009700,0.012293,0.014952,0.024576,0.008159,0.0,-0.013499,0.134832,-0.059209,0.075562,...,3.172917,0.010856,0.028305,-1.0,0.0,0.0,0.182143,0.0,0.0,2023
4772,36005018501,-0.046434,0.197547,0.001405,0.075034,-0.264769,0.286298,-0.053719,-0.204867,0.115759,...,3.960526,0.05365,0.017392,-0.333333,0.305556,-0.625,1.246581,0.0,0.0,2027
3291,36047074400,0.013957,-0.103737,0.047804,0.01507,0.0,0.030239,0.431969,0.060634,-0.039432,...,0.0,0.007784,0.045224,0.0,0.0,0.0,-0.616667,0.0,0.0,2023
3009,36047035900,0.012688,0.061727,0.000772,-0.011591,0.0,0.100311,0.240934,0.460218,0.022973,...,0.0,0.045001,-0.006921,-0.083333,0.071429,-0.012302,0.102753,0.0,0.0,2023
2416,36005014701,0.005007,0.215091,0.025962,0.002686,0.0,1.049721,0.11573,-0.524708,0.053522,...,0.0,0.072,-0.001972,0.0,0.833333,-0.520833,0.09743,0.0,0.0,2023
2710,36047003400,-0.007445,0.560291,0.104588,0.009289,0.022379,0.09102,0.04112,-0.160258,0.023845,...,0.0,-0.012556,0.052872,0.0,0.0,-0.520833,0.009303,0.0,0.0,2023
179,36005024000,-0.018809,0.079655,0.002185,-0.033626,0.343564,-0.009947,0.066106,-0.130133,0.060578,...,0.0,0.021178,-0.01832,0.0,0.0,-0.081543,-0.03831,0.0,0.0,2020
327,36005042600,-0.025808,0.252186,0.037468,-0.022393,0.0,-0.089724,0.263415,0.10478,0.098596,...,0.0,-0.011503,0.036434,0.0,0.222222,0.0,-0.011839,0.0,0.0,2020


## Spatial Lag Feature
Feature to measure proximity to already-gentrified / super-gentrified tracts

In [7]:
# Load the census tract shapefile
gdf = gpd.read_file('Data/Raw/tracts2020_shapefile/nyct2020.shp')
gdf = gdf.to_crs(epsg = 4326)

gdf = gdf.merge(labels, on='GEOID')


In [8]:
# Create a weights matrix based on contiguity (e.g., Queen's case)
w = libpysal.weights.contiguity.Queen.from_dataframe(gdf)
    
# Row-standardize the weights (optional, but often recommended)
w.transform = 'R'

# Calculate the spatial lag
spatial_lag = libpysal.weights.spatial_lag.lag_spatial(w, gdf['vulnerable'])

# Add the spatial lag as a new column in the GeoDataFrame
gdf['lag_' + 'vulnerable'] = spatial_lag


# Identify observations with no neighbors
islands = w.islands
if islands:
    # Assign the original value to the spatial lag for islands
    gdf.loc[gdf.index.isin(islands), 'lag_' + 'vulnerable'] = gdf.loc[gdf.index.isin(islands), 'vulnerable']

lag_df = gdf[['NTAName','GEOID','vulnerable','pred_year','lag_vulnerable']]

  w = libpysal.weights.contiguity.Queen.from_dataframe(gdf)
 There are 9 disconnected components.
  W.__init__(self, neighbors, ids=ids, **kw)


In [9]:
lag_df.sample(10)

Unnamed: 0,NTAName,GEOID,vulnerable,pred_year,lag_vulnerable
2998,Upper East Side-Lenox Hill-Roosevelt Island,36061011000,0,2023,0.0
3079,Morningside Heights,36061020901,1,2023,0.75
2118,Forest Hills,36081074500,0,2020,0.304348
5666,Woodside,36081024700,1,2027,0.73913
4901,Madison,36047057600,0,2027,0.05
1319,South Ozone Park,36081016800,0,2027,0.615385
3295,Rosebank-Shore Acres-Park Hill,36085004004,0,2023,0.0
492,Soundview-Clason Point,36005004200,1,2020,1.0
2513,Lower East Side,36061001001,0,2027,0.545455
5213,Prospect Lefferts Gardens-Wingate,36047080400,1,2027,0.965517


In [10]:
# add spatial lag feature to the main df
df = df.merge(lag_df[['lag_vulnerable','GEOID','pred_year']], on=['GEOID','pred_year'])
df

Unnamed: 0,GEOID,total_pop,percent_hh_income_assist,percent_hh_snap,percent_hh_english,percent_work_agriculture,percent_work_construction,percent_work_retail,percent_work_finance,percent_work_stem,...,num_trees,med_rent,num_affordable_hous_built,num_bikelanes,num_evictions_commercial,num_evictions_residential,park_acres,num_parks,pred_year,lag_vulnerable
0,36005000100,-0.068661,0.017417,0.031980,-0.000732,0.230625,0.042694,-0.021997,0.038276,0.063432,...,0.064587,0.028240,0.000000,0.000000,0.149624,0.063280,0.0,0.0,2020,1.000000
1,36005000200,0.034962,0.218237,0.201764,-0.026817,-0.211255,-0.143559,-0.051058,0.005439,0.075584,...,0.039684,0.037160,0.000000,0.000000,0.000000,0.000000,0.0,0.0,2020,0.705882
2,36005000400,0.034124,0.786830,0.168444,-0.006987,0.000000,-0.086546,0.085892,-0.119376,0.065526,...,0.065020,0.030647,0.000000,0.070962,0.000000,0.051681,0.0,0.0,2020,0.588235
3,36005001600,0.028238,0.105534,0.119549,-0.037455,0.000000,-0.068067,0.070625,-0.016419,0.270917,...,0.063187,0.049607,0.000000,0.069444,0.000000,0.106212,0.0,0.0,2020,0.739130
4,36005001901,0.017769,-0.025892,0.010398,-0.003196,-0.058565,-0.033177,-0.010920,-0.021321,0.048977,...,0.055470,0.032116,-0.520833,0.000000,0.000000,0.048778,0.0,0.0,2020,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6970,36085030302,-0.001154,0.215473,0.011413,-0.021104,0.000000,-0.036738,-0.134240,-0.067538,0.016564,...,0.002710,0.018248,0.000000,0.000000,-1.000000,0.547619,0.0,0.0,2027,0.411765
6971,36085031901,0.061577,-0.040771,-0.050215,-0.006222,0.000000,1.556189,0.042698,0.185330,0.093050,...,0.038060,0.002050,0.000000,0.000000,0.000000,-0.101082,0.0,0.0,2027,0.588235
6972,36085031902,-0.019885,0.048796,0.054946,-0.015290,0.000000,0.013640,0.195240,0.040898,0.096007,...,0.039666,0.001056,0.000000,0.000000,-1.000000,1.545939,0.0,0.0,2027,0.818182
6973,36085032300,-0.006561,-0.142393,0.053304,-0.043870,0.000000,0.097549,-0.031988,0.371914,0.182109,...,-0.019825,0.018248,0.000000,0.105556,0.000000,-0.166667,0.0,0.0,2027,0.400000


In [11]:
# save features
# df_train.to_parquet('Data/Cleaned/post_feat_engineering.parquet')