This notebook builds visuals that highlight trends in the Boston area from 2000 to 2010 using census data.

In [1]:
import pandas as pd
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
import numpy as np
import sys
sys.path.append('../')
import plotly.graph_objects as go
from ipywidgets import widgets

In [2]:
## Join up all the data so we can play around with visuals.
# Read the four extracts first; the first CSV column of each becomes the index.
df_2000_1 = pd.read_csv('../Final_2000_data.csv', index_col=0)
df_2000_2 = pd.read_csv('../Final_2000_data_sample.csv', index_col=0)
df_2010_1 = pd.read_csv('../Final_2010_data.csv', index_col=0)
df_2010_2 = pd.read_csv('../Final_2010_data_sample.csv', index_col=0)

# One frame per census year: inner join of the two extracts on the tract id,
# plus a `year` column (added as a string, then parsed to datetime).
df_2000 = pd.merge(df_2000_1, df_2000_2, how='inner', on='tractid').assign(year='2000-01-01')
df_2000['year'] = pd.to_datetime(df_2000['year'])

df_2010 = pd.merge(df_2010_1, df_2010_2, how='inner', on='tractid').assign(year='2010-01-01')
df_2010['year'] = pd.to_datetime(df_2010['year'])

In [3]:
# Also load the two change tables (full-count and sample-count files);
# the first CSV column of each file becomes the index.

df_changes_full = pd.read_csv('../fullcount_final.csv',index_col=0)
df_changes_sample =  pd.read_csv('../samplecount_final.csv',index_col=0)

In [4]:
# Before visualizing, list the tract ids that were excluded during the EDA process;
# the next cell filters these ids out of every frame.
tracts_to_remove = [25025981201,25025980300,25025981000,25025981202,25025981300,25025981501,25025981502,
                    25025981600,25025981700,25025981800,25025060700,25025980700,25025980101]

In [5]:
# Keep only rows whose tract id is NOT in the exclusion list (`~` negates the membership mask).
df_2010 = df_2010.loc[~df_2010['tractid'].isin(tracts_to_remove)]
df_2000 = df_2000.loc[~df_2000['tractid'].isin(tracts_to_remove)]
df_changes_full = df_changes_full.loc[~df_changes_full['tractid'].isin(tracts_to_remove)]
df_changes_sample = df_changes_sample.loc[~df_changes_sample['tractid'].isin(tracts_to_remove)]

In [6]:
# Also drop tract 25025990101 because it did not exist in 2000.
# Filtering on the tract id (rather than dropping row label 203 in place) does not
# depend on the row order produced by the merge, and the cell can be re-run safely:
# a second `drop(203)` would raise KeyError once the row is gone.
df_2010 = df_2010[df_2010['tractid'] != 25025990101]

In [7]:
# Write the filtered yearly frames to CSV (paths are relative to the working directory).
df_2010.to_csv('plotly_2010.csv')
df_2000.to_csv('plotly_2000.csv')

In [8]:
# Load the per-tract cluster table; the first CSV column becomes the index.
cluster_df=pd.read_csv('../clusters.csv',index_col=0)

In [9]:
# Preview the first rows of the cluster table.
cluster_df.head()

Unnamed: 0,tractid,cluster,HINC00_PC,MHMVAL00_PC,MRENT00_PC,percent_owneroccupied_00_PC,percent_non-white00_PC,percent_4yrcollege_degree_ormore_PC,cluster_name
0,25025000100,1,-5.21,56.78,50.11,-12.52,27.33,-24.91,Becoming more affordable
1,25025000201,1,-16.97,100.3,41.78,-6.08,22.63,12.61,Becoming more affordable
2,25025000202,1,-24.08,56.41,58.41,-7.61,2.15,11.1,Becoming more affordable
3,25025000301,2,25.76,91.09,51.29,4.77,45.58,37.54,Remaining costly
4,25025000302,1,-15.31,24.16,15.71,-3.6,32.15,-1.16,Becoming more affordable


In [10]:
# Attach the cluster columns to each tract's 2000 row (inner join on tractid).
cluster_and_2000=df_2000.merge(cluster_df,how='inner',on='tractid')

In [11]:
# Reader-friendly display names for the six percent-change columns.
pc_display_names = {
    'HINC00_PC': 'Percent Change in Median Income',
    'MHMVAL00_PC': 'Percent Change in Median Home Value',
    'MRENT00_PC': 'Percent Change in Median Rent',
    'percent_owneroccupied_00_PC': 'Percent Change in Owner Occupied Homes',
    'percent_non-white00_PC': 'Percent Change in Non-White Population',
    'percent_4yrcollege_degree_ormore_PC': 'Percent Change in Population with College Degree',
}
cluster_and_2000 = cluster_and_2000.rename(columns=pc_display_names)

In [12]:
# Save the merged, renamed frame one directory up.
cluster_and_2000.to_csv('../clusters_and_2000.csv')

In [13]:
# Show the six renamed percent-change columns; the slice stops before the last column.
cluster_and_2000.columns[-7:-1]

Index(['Percent Change in Median Income',
       'Percent Change in Median Home Value', 'Percent Change in Median Rent',
       'Percent Change in Owner Occupied Homes',
       'Percent Change in Non-White Population',
       'Percent Change in Population with College Degree'],
      dtype='object')

In [14]:
# 2000 data joined with the cluster table (inner join on tractid); unlike
# cluster_and_2000, the *_PC columns keep their original names here.
df1=df_2000.merge(cluster_df,how='inner',on='tractid')

In [16]:
# Mean of six 2000 indicators across all tracts in df1.
df1[['HINC00','percent_owneroccupied_00','MHMVAL00','MRENT00','percent_non-white00','percent_4yrcollege_degree_ormore']].mean()

HINC00                               51003.701258
percent_owneroccupied_00                31.490385
MHMVAL00                            277074.143915
MRENT00                                922.301268
percent_non-white00                     57.858739
percent_4yrcollege_degree_ormore        33.356281
dtype: float64

In [17]:
# Per-cluster mean of the six 2000 indicators.
# Columns are selected with a list: selecting several GroupBy columns with a bare
# tuple of keys is deprecated and rejected by pandas 2.x.
df1.groupby('cluster_name')[['HINC00','percent_owneroccupied_00','MHMVAL00','MRENT00','percent_non-white00','percent_4yrcollege_degree_ormore']].mean()

Unnamed: 0_level_0,HINC00,percent_owneroccupied_00,MHMVAL00,MRENT00,percent_non-white00,percent_4yrcollege_degree_ormore
cluster_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Becoming more affordable,50071.892877,28.337898,289275.967114,976.248698,62.758259,38.401296
Gentrifying,38815.01557,24.615605,236787.053914,711.065697,69.162572,16.95961
Remaining costly,56486.609816,37.049649,280846.819569,951.500665,48.984569,34.817631


In [18]:
# 2010 data joined with the cluster table (inner join on tractid).
df2=df_2010.merge(cluster_df,how='inner',on='tractid')

In [19]:
# Per-cluster mean of the six 2010 indicators.
# Columns are selected with a list: selecting several GroupBy columns with a bare
# tuple of keys is deprecated and rejected by pandas 2.x.
df2.groupby('cluster_name')[['hinc12','percent_owneroccupied_10','mhmval12','mrent12','non-white10','pcol12']].mean()

Unnamed: 0_level_0,hinc12,percent_owneroccupied_10,mhmval12,mrent12,non-white10,pcol12
cluster_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Becoming more affordable,44872.090909,26.779719,372684.480519,1085.038961,69.633559,39.67013
Gentrifying,50964.0,28.798752,342251.612903,963.935484,69.784084,36.856129
Remaining costly,68137.439024,38.093474,397913.414634,1204.634146,55.784213,44.502439


In [20]:
# Mean of six 2010 indicators across all tracts in df2.
df2[['hinc12','percent_owneroccupied_10','mhmval12','mrent12','non-white10','pcol12']].mean()

hinc12                       55906.868421
percent_owneroccupied_10        31.991918
mhmval12                    378607.394737
mrent12                       1116.894737
non-white10                     63.681032
pcol12                          41.296526
dtype: float64

In [21]:
# Inspect the merged row for tract 25025020302.
cluster_and_2000[cluster_and_2000['tractid']==25025020302]

Unnamed: 0,tractid,state,county,tract,POP00,percent_white00,percent_black00,percent_asian00,percent_hispanic00,percent_indian00,percent_chinese00,percent_filip00,percent_japan00,percent_korean00,percent_viet00,percent_mex00,percent_pr00,percent_cuban00,percent_vacant_housing00,percent_occupied_housing00,percent_under18_00,percent_60andup_00,percent_75andup_00,percent_owneroccupied_00,percent_renteroccupied_00,percent_non-white00,INCPC00,HINC00,MHMVAL00,MRENT00,percent_foreign_born,percent_naturalized,percent_recent_immigrants(10),percent_other_languages,percent_hs_degree_orless,percent_4yrcollege_degree_ormore,percent_married,percent_unemployed,percent_employed,percent_professional,percent_manufacturing,percent_self_employed,percent_poverty,percent_houses_30yrsago,percent_multiunit_houses,year,cluster,Percent Change in Median Income,Percent Change in Median Home Value,Percent Change in Median Rent,Percent Change in Owner Occupied Homes,Percent Change in Non-White Population,Percent Change in Population with College Degree,cluster_name
35,25025020302,MA,Suffolk County,Census Tract 203.02,1258.999958,71.824521,9.624214,11.324605,5.951369,2.448563,3.179731,0.714164,2.941676,1.156266,0.340078,1.020235,1.360313,0.136031,7.049529,92.950471,7.430709,18.092161,7.039619,28.167731,64.782744,40.741371,58649.948825,66050.205569,354817.26114,1518.293665,19.91158,7.107634,10.491413,23.450703,23.215007,61.234369,41.835061,4.272699,95.727306,76.895306,3.249097,10.731867,11.066072,67.395929,99.48344,2000-01-01,2,34.37,98.61,43.12,9.8,-45.34,41.72,Remaining costly


In [22]:
# Inspect the merged row for tract 25025020303.
cluster_and_2000[cluster_and_2000['tractid']==25025020303]

Unnamed: 0,tractid,state,county,tract,POP00,percent_white00,percent_black00,percent_asian00,percent_hispanic00,percent_indian00,percent_chinese00,percent_filip00,percent_japan00,percent_korean00,percent_viet00,percent_mex00,percent_pr00,percent_cuban00,percent_vacant_housing00,percent_occupied_housing00,percent_under18_00,percent_60andup_00,percent_75andup_00,percent_owneroccupied_00,percent_renteroccupied_00,percent_non-white00,INCPC00,HINC00,MHMVAL00,MRENT00,percent_foreign_born,percent_naturalized,percent_recent_immigrants(10),percent_other_languages,percent_hs_degree_orless,percent_4yrcollege_degree_ormore,percent_married,percent_unemployed,percent_employed,percent_professional,percent_manufacturing,percent_self_employed,percent_poverty,percent_houses_30yrsago,percent_multiunit_houses,year,cluster,Percent Change in Median Income,Percent Change in Median Home Value,Percent Change in Median Rent,Percent Change in Owner Occupied Homes,Percent Change in Non-White Population,Percent Change in Population with College Degree,cluster_name
36,25025020303,MA,Suffolk County,Census Tract 203.03,2806.675935,71.824518,9.624213,11.324604,5.951369,2.448563,3.179731,0.714164,2.941677,1.156266,0.340078,1.020235,1.360313,0.136031,7.049529,92.950475,7.430709,18.092161,7.039619,28.167732,64.782743,40.74137,58649.948901,66050.20881,354817.252909,1518.293686,19.91158,7.107635,10.491413,23.450703,23.215004,61.234364,41.835064,4.272698,95.727302,76.895309,3.249097,10.731867,11.066072,67.39593,99.48344,2000-01-01,2,26.72,52.89,66.89,-14.66,39.47,12.99,Remaining costly


## EDA

### Racial and ethnic changes

In [23]:
# 3% decline in white population
# NOTE(review): the recorded run of this cell raised AttributeError because df_2010
# has no 'nhwht10' column, so the figure above is not reproduced by the frames
# loaded in this notebook — TODO confirm which file holds the raw counts.
(df_2010.nhwht10.sum()-df_2000.NHWHT00.sum())/df_2000.NHWHT00.sum()

AttributeError: 'DataFrame' object has no attribute 'nhwht10'

In [None]:
# Preview df_2010 (shows which columns are actually present).
df_2010.head()

In [None]:
# 16% increase in non-white population
# Relative change in the summed group counts between 2000 and 2010.
# NOTE(review): the list mixes broad groups (asian, hisp) with what look like their
# sub-groups (india, china, ..., mex, pr, cuban); if so, people are counted more than
# once — TODO confirm against the data dictionary.
nonwhite_cols_2010 = ['nhblk10', 'ntv10', 'asian10', 'hisp10',
                      'haw10', 'india10', 'china10', 'filip10',
                      'japan10', 'korea10', 'viet10', 'mex10',
                      'pr10', 'cuban10']
nonwhite_cols_2000 = ['NHBLK00', 'NTV00', 'ASIAN00', 'HISP00',
                      'HAW00', 'INDIA00', 'CHINA00', 'FILIP00',
                      'JAPAN00', 'KOREA00', 'VIET00', 'MEX00',
                      'PR00', 'CUBAN00']
nonwhite_total_2010 = df_2010[nonwhite_cols_2010].sum().sum()
nonwhite_total_2000 = df_2000[nonwhite_cols_2000].sum().sum()
(nonwhite_total_2010 - nonwhite_total_2000) / nonwhite_total_2000

In [None]:
import plotly.graph_objects as go

# Stand-alone stacked-bar demo with hard-coded zoo counts (no census data involved).
animals=['giraffes', 'orangutans', 'monkeys']
zoo_counts = {
    'SF Zoo': [20, 14, 23],
    'LA Zoo': [12, 18, 29],
}

fig = go.Figure(data=[
    go.Bar(name=zoo_name, x=animals, y=counts)
    for zoo_name, counts in zoo_counts.items()
])
# Stack the two zoos on top of each other instead of grouping them side by side.
fig.update_layout(barmode='stack')
fig.show()

In [None]:
# Stacked bar chart: white vs. non-white population in Boston, 2000 vs. 2010.
# NOTE(review): the non-white lists mix broad groups (asian, hisp) with what look like
# their sub-groups, which would count people more than once — TODO confirm.
import plotly.graph_objects as go

nonwhite_cols_2000 = ['NHBLK00', 'NTV00', 'ASIAN00', 'HISP00',
                      'HAW00', 'INDIA00', 'CHINA00', 'FILIP00',
                      'JAPAN00', 'KOREA00', 'VIET00', 'MEX00',
                      'PR00', 'CUBAN00']
nonwhite_cols_2010 = ['nhblk10', 'ntv10', 'asian10', 'hisp10',
                      'haw10', 'india10', 'china10', 'filip10',
                      'japan10', 'korea10', 'viet10', 'mex10',
                      'pr10', 'cuban10']
census_years = ['2000', '2010']

# One total per census year for each bar series.
white_totals = [df_2000.NHWHT00.sum(), df_2010.nhwht10.sum()]
nonwhite_totals = [df_2000[nonwhite_cols_2000].sum().sum(),
                   df_2010[nonwhite_cols_2010].sum().sum()]

fig = go.Figure(data=[
    go.Bar(name='White', x=census_years, y=white_totals),
    go.Bar(name='Non-white', x=census_years, y=nonwhite_totals),
])
# Stack the series and pin the x ticks to the two census years.
fig.update_layout(
    title=dict(text='Change in Racial Composition of Population 2000-2010',
               xref='paper',
               x=0),
    barmode='stack',
    xaxis=dict(title=dict(text='Year'),
               tickmode='array',
               tickvals=[2000, 2010],
               ticktext=[2000, 2010]),
    yaxis=dict(title=dict(text='Population')),
)
fig.show()

In [None]:
def _population_bucket(pop):
    """Return the size label for a tract population value."""
    if pop < 2500:
        return 'less than 2500'
    if pop < 3500:
        return '2500-3500'
    if pop < 4500:
        return '3500-4500'
    # Everything else lands here, including NaN (every comparison above is False for NaN).
    return '4500-8000'


# Label each tract by its POP00 value.
df_2000['population_bucket'] = df_2000['POP00'].apply(_population_bucket)

## Income

In [None]:
# Summary statistics for HINC00 in the sample-count change table.
df_changes_sample['HINC00'].describe()

In [None]:
# Summary statistics for hinc12 in the sample-count change table.
df_changes_sample['hinc12'].describe()

In [None]:
# Relative change between two hard-coded values; presumably copied from the
# hinc12 / HINC00 describe() outputs — TODO confirm which statistic.
(52184-47486)/47486

In [None]:
def func(x):
    """Return the income-bracket label for the income value ``x``.

    The notebook applies this to the ``HINC00`` column.

    Returns
    -------
    str
        'Top 25% (> 63k)' when x >= 63371,
        'Middle 50% (<63k and >37k)' when 37189 < x < 63371,
        'Bottom 25% (<37k)' otherwise; a value of exactly 37189 and NaN
        both fall into this last bracket.
    """
    upper_cut, lower_cut = 63371, 37189
    if x >= upper_cut:
        return 'Top 25% (> 63k)'
    if x > lower_cut:
        # x < upper_cut is already implied because the first guard did not fire.
        return 'Middle 50% (<63k and >37k)'
    return 'Bottom 25% (<37k)'

In [None]:
# Tag every tract with the bracket label that func returns for its HINC00 value.
df_changes_sample['income_bracket']=df_changes_sample['HINC00'].apply(func)

In [None]:
# Re-save to CSV so the file includes the new income_bracket column.
df_changes_sample.to_csv('plotly_changes.csv')

In [None]:
# Number of tracts in each income bracket.
df_changes_sample['income_bracket'].value_counts()

In [None]:
# Summary statistics for top-bracket tracts whose HINC00_PC is negative.
df_changes_sample[(df_changes_sample['income_bracket']=='Top 25% (> 63k)')&(df_changes_sample['HINC00_PC']<0)].describe()

In [None]:
# Relative change between two hard-coded values; presumably copied from the
# describe() output of the top-bracket cell — TODO confirm.
(71902-80728)/80728

In [None]:
# Summary statistics for bottom-bracket tracts whose HINC00_PC is negative.
df_changes_sample[(df_changes_sample['income_bracket']=='Bottom 25% (<37k)')&(df_changes_sample['HINC00_PC']<0)].describe()

In [None]:
# Relative change between two hard-coded values; presumably copied from the
# describe() output of the bottom-bracket cell — TODO confirm.
(21508-27815)/27815

In [None]:
# Bracket counts among tracts whose HINC00_PC is negative.
income_declined = df_changes_sample['HINC00_PC'] < 0
df_changes_sample.loc[income_declined, 'income_bracket'].value_counts()

In [None]:
# Number of tracts with HINC00 above 47000 and a positive HINC00_PC.
len(df_changes_sample[(df_changes_sample['HINC00']>47000) &(df_changes_sample['HINC00_PC']>0)])

In [None]:
# Show HINC00 next to its assigned bracket for every tract.
# NOTE: display.max_rows is set to None in the first cell, so this prints the whole frame.
df_changes_sample[['HINC00','income_bracket']]

In [None]:
# Scatter of each tract's 2000 median income against its percent change,
# with one marker color per income bracket.
fig=go.Figure()
colorsIdx={'Middle 50% (<63k and >37k)':'red','Top 25% (> 63k)':'blue','Bottom 25% (<37k)':'yellow'}
cols=df_changes_sample.income_bracket.map(colorsIdx)
fig.add_trace(go.Scatter(x=df_changes_sample.HINC00,
                         y=df_changes_sample.HINC00_PC,
                        mode='markers',
                        marker=dict(size=10,color=cols)))
# A title and axis labels so the figure can be read on its own.
fig.update_layout(title='Percent Change in Median Income vs. 2000 Median Income',
                  xaxis_title='Median Income, 2000 (HINC00)',
                  yaxis_title='Percent Change in Median Income (HINC00_PC)')
fig.show()

In [None]:
# Summary statistics for HINC00 (same call as at the start of the income section).
df_changes_sample['HINC00'].describe()

In [None]:
# Relative change between two hard-coded values.
# NOTE(review): an earlier cell uses 52184 where this one uses 52009 — TODO confirm which is right.
(52009-47486)/47486

## Housing costs

In [None]:
# Combine the full-count and sample-count change tables (inner join on tractid).
df_all_changes=df_changes_full.merge(df_changes_sample,how='inner',on='tractid')

In [None]:
# Save the combined change table to the working directory.
df_all_changes.to_csv('plotly_changes_full.csv')

In [None]:
# Summary statistics for mrent12 in the 2010 frame.
df_2010['mrent12'].describe()

In [None]:
# Summary statistics for MRENT00 in the 2000 frame.
df_2000['MRENT00'].describe()

In [None]:
# Relative change between two hard-coded values; presumably copied from the
# mrent12 / MRENT00 describe() outputs — TODO confirm which statistic.
(1083.5-871.56)/871.56

In [None]:
# Number of tracts where both the renter-occupied share and the median home value increased.
# The home-value condition needs its own `> 0`: without it the raw float column is
# AND-ed with the boolean mask instead of a comparison result.
len(df_all_changes[(df_all_changes['percent_renteroccupied_00_PC']>0) & (df_all_changes['MHMVAL00_PC']>0)])

In [None]:
103/191 # about 54% of tracts experienced an increase in home value while owner occupancy went up
# vs. only 49/191 tracts that experienced an increase in home value while renter occupancy went up

In [None]:
# Summary statistics for mhmval12 in the 2010 frame.
df_2010['mhmval12'].describe()

In [None]:
# Summary statistics for MHMVAL00 in the 2000 frame.
df_2000['MHMVAL00'].describe()

In [None]:
# The county's median of tract median home values rose 41%.
# The two values are hard-coded, presumably from the describe() outputs above — TODO confirm.
(337150-239077)/239077

In [None]:
# Number of tracts whose median rent increased (MRENT00_PC > 0).
# NOTE(review): the original comment here spoke of home value although the code tests
# the rent column; the "95%" figure presumably refers to one of the hard-coded ratios
# further down — TODO confirm.
len(df_changes_sample[df_changes_sample['MRENT00_PC']>0])

In [None]:
# Number of tracts whose median home value increased (MHMVAL00_PC > 0).
# NOTE(review): the original comment here spoke of rent although the code tests the
# home-value column.
len(df_changes_sample[df_changes_sample['MHMVAL00_PC']>0])

In [None]:
# Hard-coded share of tracts (about 95%).
# NOTE(review): the denominator is 190 here but 191 in the following cell — TODO confirm the tract count.
181/190

In [None]:
# Hard-coded share of tracts (about 96%).
# NOTE(review): the denominator is 191 here but 190 in the preceding cell — TODO confirm the tract count.
183/191

## Visualizing the clusters

In [None]:
# Radar chart comparing the county baseline with all three clusters.
# The r values are hard-coded percent changes, one per entry in `categories`.
import plotly.graph_objects as go

categories=['% Change in Household Income','% Change in Median Home Value','% Change in Median Rent',
         '% Change in Owner Occupied Housing', '% Change in Non-White Population',
         '% Change in Population with College Degree']
fig = go.Figure()

fig.add_trace(go.Scatterpolar(
    r=[8.20,76.14,54.41,6.23,28.04,36.73],
    theta=categories,
    name='Baseline, County Avg. (n=190)',
    marker={'color':'orange'}
))

fig.add_trace(go.Scatterpolar(
    r=[29.95,87.36,69.79,28.49,11.23,129.64],
    theta=categories,
    name='Cluster 1: Gentrifying (n=31)'
))

fig.add_trace(go.Scatterpolar(
    r=[-12.11,72.58,40.70,-7.45,22.16,2.05],
    theta=categories,
    name='Cluster 2: Becoming more affordable (n=77)'
))

# Label matches the cluster_name stored in clusters.csv ('Remaining costly').
fig.add_trace(go.Scatterpolar(
    r=[19.24,81.35,61.29,7.34,40.02,34.12],
    theta=categories,
    name='Cluster 3: Remaining costly (n=82)'
))

fig.update_layout(
    # Horizontal legend placed above the plot; all legend settings live in one dict.
    legend=dict(orientation='h', x=-.1, y=1.2),
    polar=dict(
        # `bgcolor` is not a top-level layout property (update_layout rejects it),
        # so it is set on the polar subplot. Use paper_bgcolor instead if the
        # whole figure background was intended.
        bgcolor='aliceblue',
        radialaxis=dict(visible=True),
    ),
    showlegend=True,
)

fig.show()

In [None]:
# Radar chart: the "affordable" cluster (n=77) against the county baseline.
categories=['% Change in Household Income','% Change in Median Home Value','% Change in Median Rent',
         '% Change in Owner Occupied Housing', '% Change in Non-White Population',
         '% Change in Population with College Degree']
fig = go.Figure()

fig.add_trace(go.Scatterpolar(
    r=[-12.11,72.58,40.70,-7.45,22.16,2.05],
    theta=categories,
    name='Cluster 2: Affordable (n=77)'
))

# These are the county-average values, so the trace is labelled as the baseline
# (it previously repeated the cluster's name, giving two identical legend entries).
fig.add_trace(go.Scatterpolar(
    r=[8.20,76.14,54.41,6.23,28.04,36.73],
    theta=categories,
    name='Baseline, County Avg. (n=190)'
))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            # Lower bound widened from -10 so the -12.11 value is not clipped.
            range=[-15,120]
        ),
    ),
    showlegend=True
)

fig.show()


In [None]:
# Radar chart: the "remaining costly" cluster (n=82) against the county baseline.
categories=['% Change in Household Income','% Change in Median Home Value','% Change in Median Rent',
         '% Change in Owner Occupied Housing', '% Change in Non-White Population',
         '% Change in Population with College Degree']
fig = go.Figure()

# These r values are the ones the combined radar chart plots for the n=82 cluster;
# the previous label ('Gentrified (n=31)') belonged to a different cluster.
fig.add_trace(go.Scatterpolar(
    r=[19.24,81.35,61.29,7.34,40.02,34.12],
    theta=categories,
    name='Cluster 3: Remaining costly (n=82)'
))

fig.add_trace(go.Scatterpolar(
    r=[8.20,76.14,54.41,6.23,28.04,36.73],
    theta=categories,
    name='Baseline'
))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
        ),
    ),
    # With two traces the legend is needed to tell cluster from baseline.
    showlegend=True
)

fig.show()

In [None]:
# Fraction of tracts in df_2000 whose percent_white00 exceeds 75.
len(df_2000[df_2000['percent_white00']>75])/len(df_2000)

In [None]:
import plotly.io as pio

# Scatter of tract population against percent white, for tracts above 4% white.
# `df_joined` is not defined anywhere in this notebook; df_2000 carries both columns
# used here, so it is used instead — NOTE(review): confirm this is the intended frame.
# Rows are filtered in pandas rather than with a trace-level `transforms` entry,
# which plotly has deprecated.
tracts_shown = df_2000[df_2000['percent_white00'] > 4]

population = tracts_shown.POP00
percent_white = tracts_shown.percent_white00

data = [dict(
  type = 'scatter',
  x = population,
  y = percent_white,
  mode = 'markers'
)]

# Descriptive titles replace the leftover example title 'Scores > 4'.
layout = dict(
    title = 'Tract Population vs. Percent White, 2000 (tracts above 4% white)',
    xaxis = dict(title = 'Population (POP00)'),
    yaxis = dict(title = 'Percent white (percent_white00)')
)

fig_dict = dict(data=data, layout=layout)

pio.show(fig_dict, validate=False)