In [1]:
### This file checks the labels for gentrification against Oliver's data

In [3]:
import os
import pickle
import geopandas as gp
import pandas as pd

from os import sep

In [2]:
cwd = f"C:{sep}Users{sep}ltswe{sep}Dropbox{sep}Oxford{sep}Thesis"
data_dir = f"D:"

In [4]:
# Recreate lauren labels but without dropping tracts with NA
# Begin by importing data from IPUMS NGHIS (from the ACS)
# Import 2006-2010 (base year estimates) into pandas
est_0610_raw = pd.read_csv(f"{cwd}{sep}data{sep}2006_2010_estimates.csv", encoding = 'latin-1')

# Import 2014-2018 data estimates 
est_1418_raw = pd.read_csv(f"{cwd}{sep}data{sep}2014_2018_estimates.csv", encoding = 'latin-1')


In [5]:

# Get rid of data outside NYC 
est_0610 = est_0610_raw[(est_0610_raw['STATE'] == 'New York') & ((est_0610_raw['COUNTY'] == 'Bronx County') | 
                                              (est_0610_raw['COUNTY'] == 'New York County') |
                                              (est_0610_raw['COUNTY'] == 'Kings County') |
                                              (est_0610_raw['COUNTY'] == 'Queens County') |
                                              (est_0610_raw['COUNTY'] == 'Richmond County'))].reset_index(drop=True)

est_1418 = est_1418_raw[(est_1418_raw['STATE'] == 'New York') & ((est_1418_raw['COUNTY'] == 'Bronx County') | 
                                              (est_1418_raw['COUNTY'] == 'New York County') |
                                              (est_1418_raw['COUNTY'] == 'Kings County') |
                                              (est_1418_raw['COUNTY'] == 'Queens County') |
                                              (est_1418_raw['COUNTY'] == 'Richmond County'))].reset_index(drop=True)

# Calculate % of people with at least a bachelor's degree (of adults over 25 years of age)
est_0610["%_bachelors_0610"] = (est_0610["JN9E015"]+est_0610["JN9E016"]+est_0610["JN9E017"] \
                                 +est_0610["JN9E018"] +est_0610["JN9E032"]+est_0610["JN9E033"]+est_0610["JN9E034"] \
                                 + est_0610["JN9E035"])/est_0610["JN9E001"]
est_1418["%_bachelors_1418"] = (est_1418["AJYPE022"] + est_1418["AJYPE023"]+ est_1418["AJYPE024"]+ est_1418["AJYPE025"])/est_1418["AJYPE001"]

# # Calc % of nonwhite (including Hispanic/Latino whites) for 06-10 
# # = (total- (not hisp/latino & white alone))/total pop
# est_0610["%_nonwhite_0610"] = (est_0610["JMJE001"] - est_0610["JMJE003"])/est_0610["JMJE001"] 

# Oliver non-white (DOES NOT include hisp/latino whites)
est_0610["%_nonwhite_0610"] = (est_0610["JMJE001"] - (est_0610['JMJE013'] + est_0610["JMJE003"]))/est_0610["JMJE001"] 


# Calc % that are renter households for 06-10 = # occupied by renters/total # of housing units
est_0610['%_hhrent_0610'] = est_0610["JRKE003"]/est_0610["JRKE001"]

# # % of low-income households: % of households with income under $45k (see thesis notes for explanation of number)
est_0610['%_li_0610'] = (est_0610['JOHE002'] + est_0610['JOHE003'] + est_0610['JOHE004'] + est_0610['JOHE005'] \
                       + est_0610['JOHE006'] + est_0610['JOHE007'] + est_0610['JOHE008'] + est_0610['JOHE009'])/est_0610['JOHE001'] 


# Rename other relevant vars in both 2006-2010 and 2014-2018
est_0610 = est_0610.rename(columns={'JN9E001': '25plus_pop_0610','JOIE001':'med_hh_inc_0610_nom', 'JS5E001': 'med_rent_0610_nom', 'JTIE001': 'med_home_value_0610_nom'})
est_1418 = est_1418.rename(columns={'AJYPE001': '25plus_pop_1418', 'AJZAE001': 'med_hh_income_1418', 'AJ3EE001': 'med_rent_1418', 'AJ3QE001': 'med_home_value_1418'})

# Rent, home value, and median household income are both in 2010/2018 adjusted $$ -- so adjust 2010 to be in 2018 $$
est_0610["med_hh_income_0610"] = est_0610["med_hh_inc_0610_nom"].apply(lambda x: 1.15*x)
est_0610["med_rent_0610"] = est_0610["med_rent_0610_nom"].apply(lambda x: 1.15*x)
est_0610["med_home_value_0610"] = est_0610["med_home_value_0610_nom"].apply(lambda x: 1.15*x)

# Keep only vars of interest (above constructed terms plus REAL med hh income, median gross rent, median home value)
est_0610 = est_0610[["GISJOIN", "STATE", "COUNTY", '25plus_pop_0610', "%_bachelors_0610", '%_nonwhite_0610', '%_hhrent_0610',
                     '%_li_0610', "med_hh_income_0610", "med_rent_0610", 'med_home_value_0610']]
est_1418 = est_1418[["GISJOIN", "STATE", "COUNTY", '25plus_pop_1418',"%_bachelors_1418",
                     "med_hh_income_1418", "med_rent_1418", 'med_home_value_1418']]


In [6]:
# # Merge!
est_all = est_0610.merge(est_1418, on = ["GISJOIN", "STATE", "COUNTY"])   

# Drop only census tracts with no population -- fill in the rest
est_all = est_all[(est_all['25plus_pop_1418'] != 0) & (est_all['25plus_pop_0610'] != 0)]
est_all = est_all.fillna(0).reset_index(drop=True)


# Find % change in median real rent & median home value change
est_all['change_rent'] = (est_all['med_rent_1418'] - est_all['med_rent_0610'])/est_all['med_rent_0610']
est_all['change_home_value'] = (est_all['med_home_value_1418'] - est_all['med_home_value_0610'])/est_all['med_home_value_0610']




In [7]:
# Find medians over all census tracts of variables at base period (2006/2010) to determine which neighborhoods are 
# eligible to gentrify
med_bach_start = est_all["%_bachelors_0610"].median()
med_nw_start = est_all['%_nonwhite_0610'].median()
med_hhrent_start = est_all['%_hhrent_0610'].median()
med_rent_start = est_all["med_rent_0610"].median()
med_home_value_start = est_all['med_home_value_0610'].median()
med_li_start = est_all['%_li_0610'].median()

# Determine which neighborhoods are eligible to gentrify
# Firstly, must meet 3 of the following 4 non-housing (price/rent) criteria: % li-hh > med, % college-educated < med, % hh's rent > med, % nonwhite > med.
# Create Series from list comprehensions that correspond to 1 if criteria of census tract is met 0 if not and None if either var is NaN
li_0610_col = est_all.columns.get_loc('%_li_0610')
bachelors_0610_col = est_all.columns.get_loc('%_bachelors_0610')
hhrent_0610_col = est_all.columns.get_loc('%_hhrent_0610')
nonwhite_0610_col = est_all.columns.get_loc('%_nonwhite_0610')

# Note that est_all.shape[0] = number of tracts in est_all (all the tracts in NYC minus those with 0 pop)
# LI
li_0610 = pd.Series([1 if est_all.iloc[tract, li_0610_col] > med_li_start 
                else 0 if est_all.iloc[tract, li_0610_col] <= med_li_start 
                else None for tract in range(est_all.shape[0])])
# college-educated
bachelors_0610 = pd.Series([1 if est_all.iloc[tract, bachelors_0610_col] < med_bach_start 
                else 0 if est_all.iloc[tract, bachelors_0610_col] >= med_bach_start 
                else None for tract in range(est_all.shape[0])])

# % of households that rent
hhrent_0610 = pd.Series([1 if est_all.iloc[tract, hhrent_0610_col] > med_hhrent_start 
                else 0 if est_all.iloc[tract, hhrent_0610_col] <= med_hhrent_start 
                else None for tract in range(est_all.shape[0])])

# Non-white
nonwhite_0610 = pd.Series([1 if est_all.iloc[tract, nonwhite_0610_col] > med_nw_start 
                else 0 if est_all.iloc[tract, nonwhite_0610_col] <= med_nw_start 
                else None for tract in range(est_all.shape[0])])

# Now, create series = sum of each element in the above 
nonhousing_crit_0610 = pd.Series([li_0610[tract] + bachelors_0610[tract] + hhrent_0610[tract] + nonwhite_0610[tract]
                              for tract in range(est_all.shape[0])])

# Next, median rent or median home value must be < 80% of the NYC median (med_rent_start & med_home_value_start)
# Don't have to worry about NAs cuz we got rid of all of them already :D -- so everything is either 0 or 1!
rent_0610_col = est_all.columns.get_loc('med_rent_0610')
hv_0610_col = est_all.columns.get_loc('med_home_value_0610')

housing_crit_0610 = pd.Series([1 if est_all.iloc[tract, rent_0610_col] < 0.8*med_rent_start 
                               or est_all.iloc[tract, hv_0610_col] < 0.8*med_home_value_start
                              else 0 for tract in range(est_all.shape[0])])

# Finally, determine whether a neighbourhood is eligible to gentrify -- must have 1 on housing_crit and 3+ on nonhousing
est_all["eligible_gentrify"] = pd.Series([1 if housing_crit_0610[tract] == 1 and nonhousing_crit_0610[tract] >= 3 
                               else 0 for tract in range(est_all.shape[0])])

# Next up, determine whether a neighbourhood had a 'hot market' -- meaning % change in median real rent > nyc median change
# or % change in median home value > nyc median change 
# Have already calculated changes in rent, housing value above

# Calculate median changes
med_change_rent, med_change_home_value = est_all['change_rent'].median(), est_all['change_home_value'].median()

# Col numbers for change rent and change home value
change_rent_col, change_hv_col = est_all.columns.get_loc('change_rent'), est_all.columns.get_loc('change_home_value')

# Determine 'hot market'
est_all['hot_market'] = [1 if est_all.iloc[tract, change_rent_col] > med_change_rent 
                        or est_all.iloc[tract, change_hv_col] > med_change_home_value
                        else 0 for tract in range(est_all.shape[0])]


In [8]:
est_all['eligible_gentrify'].value_counts()


0    1685
1     429
Name: eligible_gentrify, dtype: int64

In [160]:
# Finally, determine whether a neighbourhood has been gentrified
# Find change in % of college-educated residents & % change in real median hh income
est_all['change_college'] = est_all['%_bachelors_1418'] - est_all['%_bachelors_0610']
est_all['change_hh_inc'] = (est_all['med_hh_income_1418'] - est_all['med_hh_income_0610'])/est_all['med_hh_income_0610']

# Medians
med_change_college, med_change_hh_income = est_all['change_college'].median(), est_all['change_hh_inc'].median()

# Col numbers for eligible to gentrify, hot market, change in college-edu residents and change in hh income
elig_gentrify_col, hot_market_col = est_all.columns.get_loc('eligible_gentrify'), est_all.columns.get_loc('hot_market')
change_coll_col, change_hh_inc_col = est_all.columns.get_loc('change_college'), est_all.columns.get_loc('change_hh_inc')

# Neighbourhood gentrifies IF it is eligible to, it has a hot market, change_college > median and change_hh_inc > med
est_all['gentrification'] = [1 if est_all.iloc[tract, change_coll_col] > med_change_college 
                            and est_all.iloc[tract, change_hh_inc_col] > med_change_hh_income
                            and est_all.iloc[tract, elig_gentrify_col] == 1 
                            and est_all.iloc[tract, hot_market_col] == 1
                            else 0 for tract in range(est_all.shape[0])]

In [162]:
# Make a more understandable fips code (11 digits instead of 13)
def gen_fips_from_gisjoin(x):
    ''' x = GISJOIN code '''
    # x[4:7] is county code, x[8:14] = census tract code ('36' is always state code)
    return "36" + x[4:7] + x[8:14]

est_all['fips_code'] = est_all['GISJOIN'].apply(lambda x: gen_fips_from_gisjoin(x))

In [93]:
lauren_labels_wNAs = est_all
lauren_labels_woutNAs = pickle.load(open(f'{cwd}{sep}from_Oliver{sep}to_check_labels.pickle', 'rb'))

In [94]:
# Unpickle gentrification labels
oliver_labels = pd.read_excel(f'{cwd}{sep}from_Oliver{sep}gentrified_elig.xlsx').rename(columns={'Tract ID':'fips_code'})

# FIPS code should be string
oliver_labels['fips_code'] = oliver_labels['fips_code'].apply(lambda x: str(x))

In [177]:
oliver_labels.columns

Index(['fips_code', 'Gentrification_Eligibility', 'Gentrified', '%_Black',
       '%_White', 'Median_Income', 'Distance_to_downtown', 'Gross_Rent',
       '%_College_Educated', 'House_value', '%_vacant_homes', '%_Unemployed',
       '%_renters', 'tst_eligible_gentrify'],
      dtype='object')

In [171]:
# Note that est_all.shape[0] = number of tracts in est_all (all the tracts in NYC minus those with 0 pop)
# LI
li_0610 = pd.Series([1 if oliver_labels.iloc[tract, 5] < 50000 
                else 0 if oliver_labels.iloc[tract, 5] >= 50000 
                else None for tract in range(oliver_labels.shape[0])])
# college-educated
bachelors_0610 = pd.Series([1 if oliver_labels.iloc[tract, 8] < oliver_labels['%_College_Educated'].median() 
                else 0 if oliver_labels.iloc[tract, 8] >= oliver_labels['%_College_Educated'].median() 
                else None for tract in range(oliver_labels.shape[0])])

# % of households that rent
hhrent_0610 = pd.Series([1 if oliver_labels.iloc[tract, 12] > oliver_labels['%_renters'].median() 
                else 0 if oliver_labels.iloc[tract, 12] <= oliver_labels['%_renters'].median() 
                else None for tract in range(oliver_labels.shape[0])])

# Non-white
nonwhite_0610 = pd.Series([1 if oliver_labels.iloc[tract, 4] < oliver_labels['%_White'].median() 
                else 0 if oliver_labels.iloc[tract, 4] >= oliver_labels['%_White'].median() 
                else None for tract in range(oliver_labels.shape[0])])

# Now, create series = sum of each element in the above 
nonhousing_crit_0610 = pd.Series([li_0610[tract] + bachelors_0610[tract] + hhrent_0610[tract] + nonwhite_0610[tract]
                              for tract in range(oliver_labels.shape[0])])


# Next, median rent or median home value must be < 80% of the NYC median (med_rent_start & med_home_value_start)
# Don't have to worry about NAs cuz we got rid of all of them already :D -- so everything is either 0 or 1!
rent_0610_col = oliver_labels.columns.get_loc('Gross_Rent')
hv_0610_col = oliver_labels.columns.get_loc('House_value')

housing_crit_0610 = pd.Series([1 if oliver_labels.iloc[tract, rent_0610_col] < 0.8*oliver_labels['Gross_Rent'].median() 
                               or oliver_labels.iloc[tract, hv_0610_col] < 0.8*oliver_labels['House_value'].median()
                              else 0 for tract in range(oliver_labels.shape[0])])

# Finally, determine whether a neighbourhood is eligible to gentrify -- must have 1 on housing_crit and 3+ on nonhousing
oliver_labels["tst_eligible_gentrify"] = pd.Series([1 if housing_crit_0610[tract] == 1 and nonhousing_crit_0610[tract] >= 3 
                               else 0 for tract in range(oliver_labels.shape[0])])

In [182]:
oliver_labels[['tst_eligible_gentrify', 'Gentrification_Eligibility']]
oliver_labels["check"] = oliver_labels['tst_eligible_gentrify'] - oliver_labels['Gentrification_Eligibility']
oliver_labels[oliver_labels['check'] != 0]

Unnamed: 0,fips_code,Gentrification_Eligibility,Gentrified,%_Black,%_White,Median_Income,Distance_to_downtown,Gross_Rent,%_College_Educated,House_value,%_vacant_homes,%_Unemployed,%_renters,tst_eligible_gentrify,check
17,36005004001,1,0,0.178289,0.268152,44432,15025.40369,963,25.2,361400,7.6,8.4,0.364842,0,-1
129,36005021200,1,0,0.388219,0.107848,50578,15587.89783,1067,28.8,124100,2.6,4.8,0.737065,0,-1
197,36005027900,1,0,0.185874,0.305671,51667,17941.86510,1064,28.6,260500,4.2,10.1,0.694285,0,-1
213,36005030200,1,1,0.616290,0.161359,51544,19946.02053,868,21.8,80300,4.0,8.1,0.352660,0,-1
257,36005037200,1,0,0.567220,0.111618,52107,18560.70783,1096,27.4,376100,5.0,14.1,0.624390,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1889,36081094202,1,0,0.515601,0.389236,43565,22246.59765,598,26.1,368800,20.8,15.4,0.560896,0,-1
1894,36081096400,1,0,0.749099,0.123423,66103,23201.75337,1348,24.5,390600,18.1,10.0,0.321274,0,-1
1908,36081099802,1,1,0.491934,0.461353,19425,25523.07900,720,26.4,425900,14.4,4.5,0.800355,0,-1
1998,36081156700,1,0,0.258883,0.235195,83068,21970.33219,668,40.5,588400,12.0,11.4,0.628763,0,-1


In [163]:
labels_compare1 = lauren_labels_wNAs.merge(oliver_labels, how='right', on='fips_code')
labels_compare2 = lauren_labels_woutNAs.merge(oliver_labels, how='right', on='fips_code')
labels_compare3 = est_all.merge(oliver_labels, how='right', on = 'fips_code')

In [165]:
eligible_diff1 = labels_compare1[labels_compare1['eligible_gentrify'] != labels_compare1['Gentrification_Eligibility']]
eligible_diff3 = labels_compare3[labels_compare3['eligible_gentrify'] != labels_compare3['Gentrification_Eligibility']]


In [180]:
eligible_diff3['fips_code']

17      36005004001
129     36005021200
213     36005030200
257     36005037200
296     36005040900
           ...     
1894    36081096400
1908    36081099802
1952    36081120500
2011    36085001100
2089    36085022300
Name: fips_code, Length: 61, dtype: object

In [119]:
tsting = labels_compare2[(labels_compare2['GISJOIN'].isna()==True) 
                & (labels_compare2['Median_Income'] != 0)
               & (labels_compare2['%_College_Educated'] != 0)]
tsting.fips_code

200     36005028400
229     36005033400
1172    36061009400
1727    36081056100
2091    36085022800
Name: fips_code, dtype: object

In [127]:
tsting.columns

Index(['GISJOIN', 'STATE', 'COUNTY', '25plus_pop_0610', '%_bachelors_0610',
       '%_nonwhite_0610', '%_hhrent_0610', '%_li_0610', 'med_hh_income_0610',
       'med_rent_0610', 'med_home_value_0610', '25plus_pop_1418',
       '%_bachelors_1418', 'med_hh_income_1418', 'med_rent_1418',
       'med_home_value_1418', 'change_rent', 'change_home_value',
       'eligible_gentrify', 'hot_market', 'change_college', 'change_hh_inc',
       'gentrification', 'fips_code', 'Gentrification_Eligibility',
       'Gentrified', '%_Black', '%_White', 'Median_Income',
       'Distance_to_downtown', 'Gross_Rent', '%_College_Educated',
       'House_value', '%_vacant_homes', '%_Unemployed', '%_renters'],
      dtype='object')

In [151]:
labels_compare1[labels_compare1['fips_code']=='36085022800'][['med_rent_0610', 'med_home_value_0610', 
                                                             'med_rent_1418', 'med_home_value_1418', 'Gross_Rent',
                                                             'House_value']]

Unnamed: 0,med_rent_0610,med_home_value_0610,med_rent_1418,med_home_value_1418,Gross_Rent,House_value
2091,0.0,674705.0,0.0,0.0,0,586700


In [145]:
labels_compare1['nom_value'] = labels_compare1['med_home_value_0610'].apply(lambda x: x/1.15)

In [147]:
labels_compare1[['GISJOIN','%_renters', '%_hhrent_0610']]

Unnamed: 0,GISJOIN,%_renters,%_hhrent_0610
0,G3600050000100,,0.000000
1,G3600050000200,0.353793,0.395528
2,G3600050000400,0.364757,0.386675
3,G3600050001600,0.760254,0.788704
4,G3600050001900,0.814346,0.935878
...,...,...,...
2109,G3600850030301,0.411063,0.417631
2110,G3600850030302,0.234647,0.253821
2111,G3600850031901,0.746305,0.787013
2112,G3600850031902,0.378646,0.446727


In [90]:
labels_compare.eligible_gentrify.value_counts()

0    1684
1     430
Name: eligible_gentrify, dtype: int64