In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [17]:
filename = 'Data/Cleaned/postprocessed.pkl'
with open(filename, mode='rb') as f:
    df = pickle.load(f)
df.sample()

Unnamed: 0,NAME,percent_white,percent_black,percent_native,percent_asian,percent_pacific,percent_latino,percent_bachelors,percent_grad,percent_instate,...,percent_rented,med_value,year,med_hh_age,med_hh_residence,full_tract,Proportion_citibike_rides,num_trees,med_rent,GEOID
7742,"Census Tract 645, Queens County, New York",0.579151,0.040058,0.002413,0.271718,0.0,0.236486,0.30432,0.208253,0.344112,...,0.410014,684100.0,2018,0.0,18.0,81064500,0.0,394.0,2087.729817,36081064500


## Generate Features for X-Year Percent Change
- train on features 2010-2016 to predict gentrification status in 2020 (labels from 2020, eligible tracts from 2010)
- train on features 2012-2018 to predict gentrification status in 2022 (labels from 2022, eligible tracts from 2012)

- then use features 2016-2022 to predict gentrification status in 2026 (no labels, eligible tracts are taken from 2016)

In [4]:
df_10_16 = df[df.year.isin(['2010','2011','2012','2013','2014','2015','2016'])]  # training set
df_12_18 = df[df.year.isin(['2012','2013','2014','2015','2016','2017','2018'])]  # training set
df_16_22 = df[df.year.isin(['2016','2017','2018','2019','2020','2021','2022'])]  # pred set

In [13]:
# TODO we have missing years which will mess up calculation of pct change. need to interpolate missing years

df_10_16 = df_10_16.drop(['NAME','year', 'full_tract'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().mean())
df_12_18 = df_12_18.drop(['NAME','year', 'full_tract'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().mean())
df_16_22 = df_16_22.drop(['NAME','year', 'full_tract'], axis=1).groupby('GEOID').agg(lambda x: x.pct_change().mean())

df_10_16['pred_year']='2020'
df_12_18['pred_year']='2022'
df_16_22['pred_year']='2026'

In [16]:
df_train = pd.concat([df_10_16, df_12_18, df_16_22])
df_train

Unnamed: 0_level_0,percent_white,percent_black,percent_native,percent_asian,percent_pacific,percent_latino,percent_bachelors,percent_grad,percent_instate,med_income,percent_owned,percent_rented,med_value,med_hh_age,med_hh_residence,Proportion_citibike_rides,num_trees,med_rent,pred_year
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
36005000200,0.253498,-0.069683,-0.158731,0.045475,,0.013086,0.054308,-0.001092,0.004986,0.039653,-0.097196,inf,0.011581,-0.026547,-0.016797,,0.040706,0.000000,2020
36005000400,0.401586,0.111454,inf,-0.076004,,-0.024307,-0.021735,0.022121,0.007430,0.016922,-0.039718,inf,-0.025644,-0.091655,-0.023268,,0.099676,0.000000,2020
36005001600,0.206153,0.006892,,,,0.016500,0.028395,-0.077970,-0.066064,0.009668,-0.185730,1.482936,0.004031,-0.021844,-0.050903,,0.076535,0.000000,2020
36005001900,-0.231540,0.134766,,0.140154,,-0.040426,0.062270,0.037248,0.195301,0.049870,-0.275184,0.090376,0.025174,-0.107038,-0.062557,,,0.000000,2020
36005002000,0.266932,0.031160,inf,inf,,0.007742,0.063012,-0.179134,-0.056130,-0.088320,-0.196227,inf,-0.004516,0.003948,-0.067870,,,0.000000,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36085030301,-0.000015,-0.372741,inf,0.097769,,-0.002331,0.085906,-0.037404,0.024439,0.132738,0.008724,0.001863,0.082799,0.000000,-0.041210,,0.021161,0.016526,2026
36085030302,-0.058056,0.192520,,0.072049,,-0.111086,0.045027,-0.021830,0.051280,0.090315,0.036488,-0.107441,0.113577,-0.024206,-0.048465,,0.004528,0.022034,2026
36085031901,0.093828,-0.057249,,0.079193,,0.034899,-0.012391,0.576102,0.118195,0.488474,0.071542,-0.022689,-0.014634,0.011404,-0.025731,,0.054600,0.000000,2026
36085031902,-0.109198,-0.011178,inf,0.609157,,0.016932,0.054347,-0.031721,0.007832,0.168562,0.043006,-0.038730,0.090462,-0.022576,-0.033905,,0.040412,0.000000,2026


In [20]:
# drop rows that became entirely null bc of the pairwise % change
df_train.dropna(how='all',subset=['percent_white', 'percent_black',
       'percent_native', 'percent_asian', 'percent_pacific', 'percent_latino',
       'percent_bachelors', 'percent_grad', 'percent_instate', 'med_income',
       'percent_owned', 'percent_rented', 'med_value', 'med_hh_age',
       'med_hh_residence', 'Proportion_citibike_rides', 'num_trees'], inplace=True)

df_train.fillna(0, inplace=True)
df_train

Unnamed: 0_level_0,percent_white,percent_black,percent_native,percent_asian,percent_pacific,percent_latino,percent_bachelors,percent_grad,percent_instate,med_income,percent_owned,percent_rented,med_value,med_hh_age,med_hh_residence,Proportion_citibike_rides,num_trees,med_rent,pred_year
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
36005000200,0.253498,-0.069683,-0.158731,0.045475,0.0,0.013086,0.054308,-0.001092,0.004986,0.039653,-0.097196,inf,0.011581,-0.026547,-0.016797,0.0,0.040706,0.000000,2020
36005000400,0.401586,0.111454,inf,-0.076004,0.0,-0.024307,-0.021735,0.022121,0.007430,0.016922,-0.039718,inf,-0.025644,-0.091655,-0.023268,0.0,0.099676,0.000000,2020
36005001600,0.206153,0.006892,0.000000,0.000000,0.0,0.016500,0.028395,-0.077970,-0.066064,0.009668,-0.185730,1.482936,0.004031,-0.021844,-0.050903,0.0,0.076535,0.000000,2020
36005001900,-0.231540,0.134766,0.000000,0.140154,0.0,-0.040426,0.062270,0.037248,0.195301,0.049870,-0.275184,0.090376,0.025174,-0.107038,-0.062557,0.0,0.000000,0.000000,2020
36005002000,0.266932,0.031160,inf,inf,0.0,0.007742,0.063012,-0.179134,-0.056130,-0.088320,-0.196227,inf,-0.004516,0.003948,-0.067870,0.0,0.000000,0.000000,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36085030301,-0.000015,-0.372741,inf,0.097769,0.0,-0.002331,0.085906,-0.037404,0.024439,0.132738,0.008724,0.001863,0.082799,0.000000,-0.041210,0.0,0.021161,0.016526,2026
36085030302,-0.058056,0.192520,0.000000,0.072049,0.0,-0.111086,0.045027,-0.021830,0.051280,0.090315,0.036488,-0.107441,0.113577,-0.024206,-0.048465,0.0,0.004528,0.022034,2026
36085031901,0.093828,-0.057249,0.000000,0.079193,0.0,0.034899,-0.012391,0.576102,0.118195,0.488474,0.071542,-0.022689,-0.014634,0.011404,-0.025731,0.0,0.054600,0.000000,2026
36085031902,-0.109198,-0.011178,inf,0.609157,0.0,0.016932,0.054347,-0.031721,0.007832,0.168562,0.043006,-0.038730,0.090462,-0.022576,-0.033905,0.0,0.040412,0.000000,2026


## More Feature Engineering Yada Yada

In [21]:
# save features
# df_train.to_pickle('Data/Cleaned/post_feat_engineering.pkl')