In [419]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score

## Import Cleaned Data

In [389]:
acs = pd.read_parquet("Data/Cleaned/ACS.parquet")
citi =  pd.read_parquet("Data/Cleaned/Citibike_Clean/citibike_all_processed.parquet")

# merge df's together on hard tract name (with decimal points)

df = acs.merge(citi, left_on=['full_tract','year'], right_on=['Tract','Year'], how='left')
df.drop(['Tract','Year'],axis=1,inplace=True)

# do a fuzzy join on the remaining counties by removing decimal points from tract

no_bike = df[df.Proportion_citibike_rides.isna()].drop('Proportion_citibike_rides', axis=1)
no_bike['fuzzy_tract'] = no_bike['full_tract'].apply(lambda x: x[:-2])

citi_fuzzy = citi.copy()
citi_fuzzy['fuzzy_tract'] = citi_fuzzy['Tract'].apply(lambda x: x[:-2])
citi_fuzzy.drop('Tract',axis=1,inplace=True)

df_fuzzy = no_bike.merge(citi_fuzzy, left_on=['fuzzy_tract','year'], right_on=['fuzzy_tract','Year'], how='left').drop_duplicates()
df_fuzzy.drop(['fuzzy_tract','Year'], axis=1, inplace=True)


# add the fuzzy matches back into main df
df_w_bike = df[~df.Proportion_citibike_rides.isna()]
df = pd.concat([df_fuzzy, df_w_bike]).drop_duplicates(['full_tract','year'])
df['Proportion_citibike_rides'] = df['Proportion_citibike_rides'].fillna(0)


In [390]:
## Add Tree Census

tree = pd.read_parquet("Data/Cleaned/tree_census.parquet")
tree['year'] = tree['year'].astype(str)
df = df.merge(tree, on=['full_tract', 'year'], how='left')
df['num_trees'] = df['num_trees'].fillna(0)  #### TODO: fill tree nans with the average of that area


In [392]:
## Zillow Data

zillow = pd.read_parquet("Data/Cleaned/zillow_clean.parquet")
df = df.merge(zillow, on=['full_tract','year'], how='left')
df.fillna(2000, inplace=True)  ###### TODO fillna with region/zipcode average
df

Unnamed: 0,NAME,percent_white,percent_black,percent_native,percent_asian,percent_pacific,percent_latino,percent_bachelors,percent_grad,percent_instate,...,percent_owned,percent_rented,med_value,year,med_hh_age,med_hh_residence,full_tract,Proportion_citibike_rides,num_trees,med_rent
0,"Census Tract 19, Bronx County, New York",0.388368,0.255269,0.0,0.013271,0.000000,0.611632,0.114817,0.062861,0.202186,...,0.294118,0.705882,348600.0,2013,57.0,17.0,005001900,0.000000,0.0,2000.000000
1,"Census Tract 19, Bronx County, New York",0.322304,0.316855,0.0,0.015181,0.000000,0.587388,0.132156,0.052984,0.283379,...,0.237548,0.762452,360000.0,2014,57.0,13.0,005001900,0.000000,0.0,2000.000000
2,"Census Tract 19, Bronx County, New York",0.190660,0.343497,0.0,0.021227,0.000000,0.554226,0.137911,0.061620,0.317252,...,0.094340,0.905660,347100.0,2015,56.0,666668690.0,005001900,0.000000,0.0,2000.000000
3,"Census Tract 19, Bronx County, New York",0.168531,0.370619,0.0,0.018643,0.000000,0.540268,0.136842,0.068144,0.337808,...,0.091476,0.908524,374400.0,2016,39.0,13.0,005001900,0.000000,0.0,2000.000000
4,"Census Tract 19, Bronx County, New York",0.184239,0.374157,0.0,0.027689,0.000000,0.533901,0.155144,0.073489,0.312389,...,0.103199,0.896801,375000.0,2017,70.0,12.0,005001900,0.000000,0.0,2000.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19281,"Census Tract 81, Richmond County, New York",0.545651,0.237992,0.0,0.069018,0.000865,0.316097,0.124294,0.112994,0.424708,...,0.513605,0.486395,486600.0,2021,2024.0,15.0,085008100,0.000000,688.0,2252.205989
19282,Census Tract 81; Richmond County; New York,0.489004,0.248167,0.0,0.105865,0.001078,0.303148,0.119789,0.120963,0.376240,...,0.523392,0.476608,569000.0,2022,83.0,13.0,085008100,0.000007,705.0,2252.205989
19283,"Census Tract 207.02, Richmond County, New York",0.553680,0.203000,0.0,0.037037,0.008439,0.693858,0.117066,0.048660,0.175809,...,0.407534,0.592466,370500.0,2020,2024.0,13.0,085020702,0.000000,380.0,2000.000000
19284,"Census Tract 207.02, Richmond County, New York",0.353959,0.212029,0.0,0.020770,0.008222,0.686716,0.187831,0.027116,0.205106,...,0.359763,0.640237,362500.0,2021,71.0,8.0,085020702,0.000000,389.0,2000.000000


## Outliers

In [393]:
# replace all outliers with a common value

df['med_hh_residence'] = np.where(df['med_hh_residence']>666668689, -999, df['med_hh_residence'])
df['med_income'] = df['med_income'].astype(float)
for col in ['med_income','med_value','med_hh_age']:
    df[col] = np.where(df[col]<0, -999, df[col])


# change house age of 2024 to zero

df.loc[df['med_hh_age']>1000, 'med_hh_age'] = 2024-df.loc[df['med_hh_age']>1000, 'med_hh_age']


# replace missing values with the mean of that year. TODO: interpolate missing vals with padded years

for yr in df.year.unique():
    for col in ['med_value','med_hh_age','med_income','med_hh_residence']:
        mean = df[df.year==yr][col].mean()
        df.loc[df.year==yr, col] = df.loc[df.year==yr, col].replace(to_replace= -999, value=mean)

## Normalize

In [394]:
# # normalize certain columns

# for yr in df.year.unique():
#     yr_df = df[df.year == yr]
#     for col in ['med_income','med_value','med_hh_age','med_hh_residence','num_trees']:
#         yr_df[col] = yr_df[col].astype(float)
#         yr_df[col] = (yr_df[col] - yr_df[col].min()) / (yr_df[col].max() - yr_df[col].min())
#     dropped = df.drop(df.iloc[yr_df.index].index)  # drop that year from main df
#     df = pd.concat([dropped, yr_df]).reset_index(drop=True)  # add that year back
# df.sort_values(['full_tract','year'], inplace=True)

### Limit Dataset to CT's that have not gentrified anytime in past decade 
- The tract had a population of at least 500 residents at the beginning and end of a decade and was located within a central city. 
- The tract’s median household income was in the bottom 40th percentile when compared to all tracts within its metro area at the beginning of the decade.
- The tract’s median home value was in the bottom 40th percentile when compared to all tracts within its metro area at the beginning of the decade.

https://www.governing.com/archive/gentrification-report-methodology.html


In [395]:
# TODO: figure out if dollar amounts in ACS are inflation adjusted or not

bottom_income = df.med_income.quantile(.5)


# define eligible column if tract meets all criteria
df['eligible'] = np.where((df.med_income<=bottom_income)|
                                 (df.percent_black>.4) |
                                 (df.percent_latino>.4), 1, 0)

eligible_tracts = df[df['eligible']==1]['full_tract'].unique()
df = df[df.full_tract.isin(eligible_tracts)].drop('eligible',axis=1)
df.shape

(14897, 21)

In [396]:
# df[df.full_tract=='047042300'] # my bushwick neighborhood
# df[df.full_tract=='047026500'] # my bedstuy neighborhood

## Generate Label
### Determine if a tract is gentrified in present
- An increase in a tract's percent bachelor’s degrees was in the top third percentile of all tracts within a metro area.
- Increase in percent white is in top 3rd percentile
- The percentage increase in a tract’s inflation-adjusted median income was in the top third percentile of all tracts within a metro area.

In [397]:
# fill zeros with nan so we can calculate percent change
df.replace(0, np.nan, inplace=True)

# get average percent change over ALL years
pct_chg = df.drop(['NAME','year'], axis=1).groupby('full_tract').agg(lambda x: x.pct_change().mean())
pct_chg.dropna(how='all', inplace=True)
pct_chg.fillna(0, inplace=True)

  pct_chg = df.drop(['NAME','year'], axis=1).groupby('full_tract').agg(lambda x: x.pct_change().mean())


In [398]:
# Set thresholds. If overall %change is greater than thresholds, we consider tract gentrified

# top_third_white
pos = pct_chg[pct_chg.percent_white>=0]
top_third_white = pos.percent_white.quantile(.666)

# top_third_bach
pos = pct_chg[pct_chg.percent_bachelors>=0]
top_third_bach = pos.percent_bachelors.quantile(.666)

# top_third_income
pos = pct_chg[pct_chg.med_income>=0]
top_third_income = pos.med_income.quantile(.666)

# define gentrified column if tract meets all criteria
pct_chg['gentrified'] = np.where((pct_chg.percent_white>=top_third_white)|
                                 (pct_chg.percent_bachelors>=top_third_bach)|
                                 (pct_chg.med_income>=top_third_income), 1, 0)
pct_chg.gentrified.value_counts()

gentrified
1    918
0    820
Name: count, dtype: int64

## Generate Features for X-Year Percent Change

In [399]:
# create era buckets # TODO we have missing years which will mess up binning. need to interpolate missing years
df['year_bins'] = pd.cut(df['year'].astype(int), bins=2)

df = df.drop(['NAME','year'], axis=1).groupby(['full_tract','year_bins']).agg(lambda x: x.pct_change().mean())


  df = df.drop(['NAME','year'], axis=1).groupby(['full_tract','year_bins']).agg(lambda x: x.pct_change().mean())
  df = df.drop(['NAME','year'], axis=1).groupby(['full_tract','year_bins']).agg(lambda x: x.pct_change().mean())


In [400]:
# drop rows that became entirely null bc of the pairwise % change
df.dropna(how='all',subset=['percent_white', 'percent_black',
       'percent_native', 'percent_asian', 'percent_pacific', 'percent_latino',
       'percent_bachelors', 'percent_grad', 'percent_instate', 'med_income',
       'percent_owned', 'percent_rented', 'med_value', 'med_hh_age',
       'med_hh_residence', 'Proportion_citibike_rides', 'num_trees'], inplace=True)

# df = df.drop('year_bins', axis=1)
df.fillna(0, inplace=True)


## Feature for proximity to other gentrified CT's

### Add labels back into dataset

In [401]:
df_model = df.reset_index().merge(pct_chg.reset_index()[['gentrified','full_tract']], on='full_tract', how='left')
df_model

Unnamed: 0,full_tract,year_bins,percent_white,percent_black,percent_native,percent_asian,percent_pacific,percent_latino,percent_bachelors,percent_grad,...,med_income,percent_owned,percent_rented,med_value,med_hh_age,med_hh_residence,Proportion_citibike_rides,num_trees,med_rent,gentrified
0,005000200,"(2009.988, 2016.0]",0.253498,-0.069683,-0.158731,0.045475,0.0,0.013086,0.054308,-0.001092,...,0.039653,-0.097196,3.020267,0.011581,-0.026547,-0.016797,0.0,0.040706,0.00000,1
1,005000200,"(2016.0, 2022.0]",-0.252901,0.065308,0.000000,0.302025,0.0,-0.046746,0.089295,0.570415,...,0.202648,0.056192,-0.058296,0.092322,0.017935,0.022222,0.0,0.039867,0.00000,1
2,005000400,"(2009.988, 2016.0]",0.401586,0.111454,0.106480,-0.076004,0.0,-0.024307,-0.021735,0.022121,...,0.016922,-0.039718,0.446377,-0.025644,-0.091655,-0.023268,0.0,0.099676,0.00000,0
3,005000400,"(2016.0, 2022.0]",-0.228346,-0.007426,-0.078144,0.795912,0.0,0.004719,0.081381,0.001586,...,0.051392,-0.054427,0.139137,0.066512,0.068320,-0.101504,0.0,0.057602,0.00000,0
4,005001600,"(2009.988, 2016.0]",0.206153,0.006892,0.000000,0.000000,0.0,0.016500,0.028395,-0.077970,...,0.009668,-0.185730,1.482936,0.004031,-0.021844,-0.050903,0.0,0.076535,0.00000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3296,085031901,"(2009.988, 2016.0]",0.042575,0.040157,0.009394,0.169419,0.0,0.017621,0.020415,-0.013871,...,-0.044067,-0.072335,3.880733,-0.023403,-0.001047,-0.023516,0.0,0.051320,0.00000,1
3297,085031901,"(2016.0, 2022.0]",0.039764,-0.034771,0.000000,0.201792,0.0,-0.021121,0.133070,1.111323,...,1.017562,0.069650,-0.020859,-0.036373,0.047410,0.066667,0.0,0.038259,0.00000,1
3298,085031902,"(2016.0, 2022.0]",-0.109198,-0.011178,0.054922,0.609157,0.0,0.016932,0.054347,-0.031721,...,0.168562,0.043006,-0.038730,0.090462,-0.022576,-0.033905,0.0,0.040412,0.00000,1
3299,085032300,"(2009.988, 2016.0]",0.194134,0.218746,-0.924330,0.000000,0.0,-0.069739,-0.154394,0.330311,...,0.611596,-0.154309,0.000000,0.079859,-0.073109,-0.094498,0.0,-0.047127,0.00000,1


# Modeling

In [449]:
# define feature set and split data

feat = ['percent_grad', 'percent_instate', 'percent_owned', 'percent_rented', 'med_value','med_hh_age',
       'med_hh_residence','Proportion_citibike_rides', 'num_trees', 'med_rent']

# split the data  ## TODO use cross validation
test = df_model.sample(600)
train = df_model[df_model.index.isin(test.index)==False]

y_train = train['gentrified']
y_test = test['gentrified']
X_train = train[feat]
X_test = test[feat]

X_train.shape

(2701, 10)

In [454]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_train_pred = logreg.predict(X_train)
y_test_pred = logreg.predict(X_test)

train_score = accuracy_score(y_train, y_train_pred)
test_score =  accuracy_score(y_test, y_test_pred)
print("Train Precision:", train_score)
print("Test Precision:", test_score)




Train Precision: 0.5790447982228805
Test Precision: 0.5983333333333334


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [460]:
# Random Forest

forest = RandomForestClassifier(n_estimators=100)
forest.fit(X_train, y_train)

y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

train_score = accuracy_score(y_train, y_train_pred)
test_score =  accuracy_score(y_test, y_test_pred)
print("Train Precision:", train_score)
print("Test Precision:", test_score)

Train Precision: 1.0
Test Precision: 0.6083333333333333
