In [7]:
# !pip install libpysal

In [62]:
import pandas as pd
import numpy as np
import geopandas as gpd
import libpysal
import matplotlib.pyplot as plt
pd.set_option("display.max_columns", None)

In [103]:
df = pd.read_parquet('Data/Cleaned/after_kriging.parquet')
labels = pd.read_parquet('Data/Cleaned/more_labels.parquet')
labels

Unnamed: 0,GEOID,label,vulnerable,gentrified,pred_year
0,36005000100,2,1,1,2020
1,36005000200,0,0,1,2020
2,36005000400,0,0,0,2020
3,36005001600,1,1,0,2020
4,36005001901,2,1,1,2020
...,...,...,...,...,...
2320,36085030302,0,0,0,2027
2321,36085031901,1,1,0,2027
2322,36085031902,1,1,0,2027
2323,36085032300,0,0,0,2027


## Generate Features for X-Year Percent Change
**We train on a 6 year period, and predict 4 years into the future**
- train on features 2010-2016 to predict gentrification status in 2020 (labels from 2020, eligible tracts from 2010)
- 2011-2017
- 2012-2018
- train on features 2013-2019 to predict gentrification status in 2023 (labels from 2023, eligible tracts from 2013)
- then use features 2017-2023 to predict gentrification status in 2027 (no labels, eligible tracts are taken from 2017)

In [93]:
df_10_16 = df[df.year.isin(['2010','2011','2012','2013','2014','2015','2016'])]  # training set
df_11_17 = df[df.year.isin(['2013','2014','2015','2016','2017','2018','2019'])]  # training set
df_12_18 = df[df.year.isin(['2013','2014','2015','2016','2017','2018','2019'])]  # training set
df_13_19 = df[df.year.isin(['2013','2014','2015','2016','2017','2018','2019'])]  # training set
df_17_23 = df[df.year.isin(['2017','2018','2019','2020','2021','2022','2023'])]  # pred set

df_10_16 = df_10_16.drop(['NAME','year', 'full_tract','random_lat', 'random_lon'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().replace([np.inf, -np.inf], np.nan).mean())
df_11_17 = df_11_17.drop(['NAME','year', 'full_tract','random_lat', 'random_lon'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().replace([np.inf, -np.inf], np.nan).mean())
df_12_18 = df_12_18.drop(['NAME','year', 'full_tract','random_lat', 'random_lon'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().replace([np.inf, -np.inf], np.nan).mean())
df_13_19 = df_13_19.drop(['NAME','year', 'full_tract','random_lat', 'random_lon'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().replace([np.inf, -np.inf], np.nan).mean())
df_17_23 = df_17_23.drop(['NAME','year', 'full_tract','random_lat', 'random_lon'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().replace([np.inf, -np.inf], np.nan).mean())

df_10_16['pred_year']='2020'
df_11_17['pred_year']='2021'
df_12_18['pred_year']='2022'
df_13_19['pred_year']='2023'
df_17_23['pred_year']='2027'


In [104]:
df_train = pd.concat([df_10_16,df_11_17,df_12_18, df_13_19, df_17_23])

# drop rows that became entirely null bc of the % change
df = df_train.dropna(how='all', subset=['percent_hh_income_assist', 'percent_hh_snap',
       'percent_hh_english', 'percent_out_of_county',
       'percent_moved_within_county', 'percent_public_transit',
       'percent_drive_commute', 'percent_bike_commute', 'percent_walk_commute',
       'percent_wfh', 'percent_white', 'percent_black', 'percent_native',
       'percent_asian', 'percent_pacific', 'percent_latino',
       'percent_work_agriculture', 'percent_work_construction',
       'percent_work_retail', 'percent_work_finance', 'percent_work_stem',
       'percent_work_edu_health', 'percent_work_art', 'percent_bachelors',
       'percent_grad', 'percent_born_citizen', 'percent_naturalized_citizen',
       'percent_not_citizen', 'percent_same_house_1yr', 'percent_out_of_state',
       'med_income', 'med_rent_acs', 'med_age', 'percent_hh_rented',
       'med_value', 'percent_hh_electric', 'percent_hh_solar',
       'med_hh_age', 'med_hh_tenure'])

df.fillna(0, inplace=True)
df.reset_index(inplace=True)
df

Unnamed: 0,GEOID,total_pop,percent_hh_income_assist,percent_hh_snap,percent_hh_english,percent_work_agriculture,percent_work_construction,percent_work_retail,percent_work_finance,percent_work_stem,percent_work_edu_health,percent_work_art,percent_out_of_county,percent_moved_within_county,percent_public_transit,percent_drive_commute,percent_bike_commute,percent_walk_commute,percent_wfh,percent_white,percent_black,percent_native,percent_asian,percent_pacific,percent_latino,percent_bachelors,percent_grad,percent_born_citizen,percent_naturalized_citizen,percent_not_citizen,percent_same_house_1yr,percent_out_of_state,med_income,med_rent_acs,med_age,percent_hh_rented,med_value,percent_hh_electric,percent_hh_solar,med_hh_age,med_hh_tenure,num_citibike_rides,num_trees,med_rent,num_affordable_hous_built,num_bikelanes,num_evictions_commercial,num_evictions_residential,park_acres,num_parks,pred_year
0,36005000100,-0.068661,0.035685,0.028985,0.001017,0.199477,0.045892,-0.017763,0.035731,0.022690,0.021284,0.054678,-0.014333,0.065448,0.000000,0.000000,0.000000,0.000000,0.000000,-0.006464,0.018035,0.277300,-0.000033,0.000000,-0.024075,-0.124830,0.449510,0.002733,0.025448,0.010323,-0.014150,-0.050057,0.005722,0.049193,0.004397,1.089373,0.008277,-0.005696,12.906525,-0.007878,-0.034566,0.0,0.040565,0.013164,0.000000,0.000000,0.144316,0.082033,0.0,0.0,2020
1,36005000200,0.034962,0.218237,0.201764,-0.026817,-0.211255,-0.143559,-0.051058,0.005439,0.075584,0.050176,0.071833,-0.077578,-0.105136,0.033405,-0.021183,0.000000,0.138279,-0.377954,0.214509,-0.026615,-0.298942,-0.017225,0.000000,0.009071,0.057625,0.148837,-0.027422,0.023382,0.149468,0.009212,-0.077517,0.030404,-0.003322,0.049306,1.106573,0.001691,0.311270,0.000000,-0.021834,-0.021378,0.0,0.039684,0.000384,0.000000,0.000000,0.000000,0.000000,0.0,0.0,2020
2,36005000400,0.034124,0.786830,0.168444,-0.006987,0.000000,-0.086546,0.085892,-0.119376,0.065526,-0.033501,0.236460,-0.107590,-0.004790,0.028070,-0.028604,-0.187822,-0.041648,2.037930,0.255629,0.070771,0.083326,-0.038641,0.000000,-0.016291,0.000209,0.032225,0.000973,0.054771,0.012176,0.007979,-0.229897,0.011434,0.009918,0.015270,-0.394797,-0.017437,1.049936,0.000000,-0.060346,-0.014822,0.0,0.065020,-0.002948,0.000000,0.070962,0.000000,0.051681,0.0,0.0,2020
3,36005001600,0.028238,0.105534,0.119549,-0.037455,0.000000,-0.068067,0.070625,-0.016419,0.270917,-0.013170,0.057567,0.232766,-0.104326,-0.015358,-0.022066,0.000000,0.312918,-0.095215,0.168040,0.005470,0.000000,0.000000,0.000000,0.013476,0.015757,-0.076782,-0.031302,0.045230,0.142282,0.004117,-0.403649,0.008068,0.023547,-0.013855,7.142181,0.003578,0.013124,0.000000,-0.017877,-0.040687,0.0,0.063187,0.029170,0.000000,0.069444,0.000000,0.106212,0.0,0.0,2020
4,36005001901,0.000644,-0.034693,0.006081,-0.001591,-0.081312,-0.047796,-0.014341,-0.055569,0.035664,-0.007433,0.090601,-0.102033,-0.018727,0.016370,-0.032029,0.232876,0.007768,0.041536,-0.064106,-0.018095,0.106253,-0.061774,0.422531,0.003428,0.049573,0.224663,0.004055,0.049740,-0.004173,0.002864,-0.171264,-0.018268,0.078083,0.001804,7.661378,-0.023169,-0.001365,-0.010289,0.026425,-0.039251,0.0,0.055470,0.016927,-0.520833,0.000000,0.000000,0.048778,0.0,0.0,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11620,36085030302,-0.001154,0.215473,0.011413,-0.021104,0.000000,-0.036738,-0.134240,-0.067538,0.016564,0.077345,0.148286,0.698988,-0.034859,0.041904,-0.004503,0.000000,0.209163,0.946175,-0.045150,0.086675,0.000000,0.011771,0.000000,0.008026,0.055342,0.008809,-0.009526,0.082499,-0.051934,0.002615,-0.173023,0.033478,0.025459,0.025387,-0.050505,0.064139,-0.006120,-0.066851,-0.011909,-0.032434,0.0,0.002710,0.018248,0.000000,0.000000,-1.000000,0.547619,0.0,0.0,2027
11621,36085031901,0.061577,-0.040771,-0.050215,-0.006222,0.000000,1.556189,0.042698,0.185330,0.093050,0.046607,0.215565,-0.330398,-0.189565,0.168862,0.013573,-0.225023,-0.275365,-0.003081,0.011597,-0.048561,0.000000,0.183167,0.000000,0.075288,-0.061371,0.433025,0.009850,0.088852,-0.103608,0.004566,-0.163036,0.465666,-0.063389,0.011333,0.009512,0.050791,1.679250,0.000000,0.010760,-0.012765,0.0,0.038060,0.054117,0.000000,0.000000,0.000000,-0.101082,0.0,0.0,2027
11622,36085031902,-0.019885,0.048796,0.054946,-0.015290,0.000000,0.013640,0.195240,0.040898,0.096007,-0.049353,-0.125466,0.198931,0.014783,0.004287,0.073021,0.000000,0.161195,0.009269,-0.115204,0.011083,-0.083033,0.477168,0.000000,-0.003335,0.095097,0.006309,-0.007570,0.107293,0.013515,0.005042,-1.000000,0.120181,0.058746,0.047016,-0.021044,0.090066,-0.035160,-0.247456,-0.000359,-0.016329,0.0,0.039666,0.040176,0.000000,0.000000,-1.000000,1.545939,0.0,0.0,2027
11623,36085032300,-0.006561,-0.142393,0.053304,-0.043870,0.000000,0.097549,-0.031988,0.371914,0.182109,-0.041365,-0.224979,0.000000,1.860428,0.091209,-0.054103,0.000000,-0.014034,0.432146,-0.110069,-0.082349,0.000000,0.570565,0.000000,0.076573,0.122195,0.421512,-0.010318,0.160294,0.083382,-0.001104,-0.376098,0.043610,0.121217,0.016846,0.057573,0.010175,0.176348,0.000000,0.020666,-0.008333,0.0,-0.019825,0.018248,0.000000,0.105556,0.000000,-0.166667,0.0,0.0,2027


## Spatial Lag Feature
Feature to measure proximity to already-gentrified / super-gentrified tracts

In [105]:
# Load the census tract shapefile
gdf = gpd.read_file('Data/Raw/tracts2020_shapefile/nyct2020.shp')
gdf = gdf.to_crs(epsg = 4326)
gdf = gdf.merge(labels, on='GEOID')


In [106]:
# Create a weights matrix based on contiguity (e.g., Queen's case)
w = libpysal.weights.contiguity.Queen.from_dataframe(gdf)
    
# Row-standardize the weights (optional, but often recommended)
w.transform = 'R'

# Calculate the spatial lag
spatial_lag = libpysal.weights.spatial_lag.lag_spatial(w, gdf['vulnerable'])

# Add the spatial lag as a new column in the GeoDataFrame
gdf['lag_' + 'vulnerable'] = spatial_lag


# Identify observations with no neighbors
islands = w.islands
if islands:
    # Assign the original value to the spatial lag for islands
    gdf.loc[gdf.index.isin(islands), 'lag_' + 'vulnerable'] = gdf.loc[gdf.index.isin(islands), 'vulnerable']

lag_df = gdf[['NTAName','GEOID','vulnerable','pred_year','lag_vulnerable']]

  w = libpysal.weights.contiguity.Queen.from_dataframe(gdf)
 There are 9 disconnected components.
  W.__init__(self, neighbors, ids=ids, **kw)


In [108]:
# add spatial lag feature to the main df
df = df.merge(lag_df[['lag_vulnerable','GEOID','pred_year']], on=['GEOID','pred_year'])
df

Unnamed: 0,GEOID,total_pop,percent_hh_income_assist,percent_hh_snap,percent_hh_english,percent_work_agriculture,percent_work_construction,percent_work_retail,percent_work_finance,percent_work_stem,percent_work_edu_health,percent_work_art,percent_out_of_county,percent_moved_within_county,percent_public_transit,percent_drive_commute,percent_bike_commute,percent_walk_commute,percent_wfh,percent_white,percent_black,percent_native,percent_asian,percent_pacific,percent_latino,percent_bachelors,percent_grad,percent_born_citizen,percent_naturalized_citizen,percent_not_citizen,percent_same_house_1yr,percent_out_of_state,med_income,med_rent_acs,med_age,percent_hh_rented,med_value,percent_hh_electric,percent_hh_solar,med_hh_age,med_hh_tenure,num_citibike_rides,num_trees,med_rent,num_affordable_hous_built,num_bikelanes,num_evictions_commercial,num_evictions_residential,park_acres,num_parks,pred_year,lag_vulnerable
0,36005000100,-0.068661,0.035685,0.028985,0.001017,0.199477,0.045892,-0.017763,0.035731,0.022690,0.021284,0.054678,-0.014333,0.065448,0.000000,0.000000,0.000000,0.000000,0.000000,-0.006464,0.018035,0.277300,-0.000033,0.000000,-0.024075,-0.124830,0.449510,0.002733,0.025448,0.010323,-0.014150,-0.050057,0.005722,0.049193,0.004397,1.089373,0.008277,-0.005696,12.906525,-0.007878,-0.034566,0.0,0.040565,0.013164,0.000000,0.000000,0.144316,0.082033,0.0,0.0,2020,1.000000
1,36005000200,0.034962,0.218237,0.201764,-0.026817,-0.211255,-0.143559,-0.051058,0.005439,0.075584,0.050176,0.071833,-0.077578,-0.105136,0.033405,-0.021183,0.000000,0.138279,-0.377954,0.214509,-0.026615,-0.298942,-0.017225,0.000000,0.009071,0.057625,0.148837,-0.027422,0.023382,0.149468,0.009212,-0.077517,0.030404,-0.003322,0.049306,1.106573,0.001691,0.311270,0.000000,-0.021834,-0.021378,0.0,0.039684,0.000384,0.000000,0.000000,0.000000,0.000000,0.0,0.0,2020,0.689655
2,36005000400,0.034124,0.786830,0.168444,-0.006987,0.000000,-0.086546,0.085892,-0.119376,0.065526,-0.033501,0.236460,-0.107590,-0.004790,0.028070,-0.028604,-0.187822,-0.041648,2.037930,0.255629,0.070771,0.083326,-0.038641,0.000000,-0.016291,0.000209,0.032225,0.000973,0.054771,0.012176,0.007979,-0.229897,0.011434,0.009918,0.015270,-0.394797,-0.017437,1.049936,0.000000,-0.060346,-0.014822,0.0,0.065020,-0.002948,0.000000,0.070962,0.000000,0.051681,0.0,0.0,2020,0.620690
3,36005001600,0.028238,0.105534,0.119549,-0.037455,0.000000,-0.068067,0.070625,-0.016419,0.270917,-0.013170,0.057567,0.232766,-0.104326,-0.015358,-0.022066,0.000000,0.312918,-0.095215,0.168040,0.005470,0.000000,0.000000,0.000000,0.013476,0.015757,-0.076782,-0.031302,0.045230,0.142282,0.004117,-0.403649,0.008068,0.023547,-0.013855,7.142181,0.003578,0.013124,0.000000,-0.017877,-0.040687,0.0,0.063187,0.029170,0.000000,0.069444,0.000000,0.106212,0.0,0.0,2020,0.743590
4,36005001901,0.000644,-0.034693,0.006081,-0.001591,-0.081312,-0.047796,-0.014341,-0.055569,0.035664,-0.007433,0.090601,-0.102033,-0.018727,0.016370,-0.032029,0.232876,0.007768,0.041536,-0.064106,-0.018095,0.106253,-0.061774,0.422531,0.003428,0.049573,0.224663,0.004055,0.049740,-0.004173,0.002864,-0.171264,-0.018268,0.078083,0.001804,7.661378,-0.023169,-0.001365,-0.010289,0.026425,-0.039251,0.0,0.055470,0.016927,-0.520833,0.000000,0.000000,0.048778,0.0,0.0,2020,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11620,36085030302,-0.001154,0.215473,0.011413,-0.021104,0.000000,-0.036738,-0.134240,-0.067538,0.016564,0.077345,0.148286,0.698988,-0.034859,0.041904,-0.004503,0.000000,0.209163,0.946175,-0.045150,0.086675,0.000000,0.011771,0.000000,0.008026,0.055342,0.008809,-0.009526,0.082499,-0.051934,0.002615,-0.173023,0.033478,0.025459,0.025387,-0.050505,0.064139,-0.006120,-0.066851,-0.011909,-0.032434,0.0,0.002710,0.018248,0.000000,0.000000,-1.000000,0.547619,0.0,0.0,2027,0.517241
11621,36085031901,0.061577,-0.040771,-0.050215,-0.006222,0.000000,1.556189,0.042698,0.185330,0.093050,0.046607,0.215565,-0.330398,-0.189565,0.168862,0.013573,-0.225023,-0.275365,-0.003081,0.011597,-0.048561,0.000000,0.183167,0.000000,0.075288,-0.061371,0.433025,0.009850,0.088852,-0.103608,0.004566,-0.163036,0.465666,-0.063389,0.011333,0.009512,0.050791,1.679250,0.000000,0.010760,-0.012765,0.0,0.038060,0.054117,0.000000,0.000000,0.000000,-0.101082,0.0,0.0,2027,0.620690
11622,36085031902,-0.019885,0.048796,0.054946,-0.015290,0.000000,0.013640,0.195240,0.040898,0.096007,-0.049353,-0.125466,0.198931,0.014783,0.004287,0.073021,0.000000,0.161195,0.009269,-0.115204,0.011083,-0.083033,0.477168,0.000000,-0.003335,0.095097,0.006309,-0.007570,0.107293,0.013515,0.005042,-1.000000,0.120181,0.058746,0.047016,-0.021044,0.090066,-0.035160,-0.247456,-0.000359,-0.016329,0.0,0.039666,0.040176,0.000000,0.000000,-1.000000,1.545939,0.0,0.0,2027,0.789474
11623,36085032300,-0.006561,-0.142393,0.053304,-0.043870,0.000000,0.097549,-0.031988,0.371914,0.182109,-0.041365,-0.224979,0.000000,1.860428,0.091209,-0.054103,0.000000,-0.014034,0.432146,-0.110069,-0.082349,0.000000,0.570565,0.000000,0.076573,0.122195,0.421512,-0.010318,0.160294,0.083382,-0.001104,-0.376098,0.043610,0.121217,0.016846,0.057573,0.010175,0.176348,0.000000,0.020666,-0.008333,0.0,-0.019825,0.018248,0.000000,0.105556,0.000000,-0.166667,0.0,0.0,2027,0.411765


In [109]:
# save features
# df.to_parquet('Data/Cleaned/more_data_post_feat_engineering.parquet')