## Data for Cambridge and Boston
This document analyzes data available about Cambridge and Boston from the 1990, 2000 and 2010 censuses. 

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from scipy import stats

In [2]:
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

## Analyzing Demographic Data
This section retrieves the population counts of white, Black, Native American, Asian, Pacific Islander, and other-race residents of Cambridge and Boston.

In [3]:
# Census API Key
# Read the key from the CENSUS_API_KEY environment variable when it is set, so the
# credential does not have to be edited into the notebook. The literal is kept only
# as a backward-compatible fallback.
# NOTE(review): this key is stored in plain text; consider rotating it and removing the fallback.
key = os.environ.get("CENSUS_API_KEY", "d923b916d08af136ade78b021cc31f162cbb2d2f")

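The function below fetches the census data: it requests each variable code for a given year, state, and place, and returns the counts as a dictionary.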

In [4]:
def fetch_census(year, census_codes, state_code, place, api_key=None):
    """
    Fetch census (SF1) counts for a single place.

    One request is made per entry in ``census_codes``.

    Args:
        year (int): Census year used in the API path (e.g. 2010).
        census_codes (dict): Maps a category label to the variable code to request.
        state_code (int or str): Value placed in the ``in=state:`` clause.
        place (int or str): Value placed in the ``for=place:`` clause. Pass a string
            when the code has a leading zero (e.g. '07000').
        api_key (str, optional): Census API key. When omitted, the notebook-level
            ``key`` variable is used.

    Returns:
        dict: Maps each category label to the returned count as an ``int``.
    """
    if api_key is None:
        api_key = key
    url_template = 'https://api.census.gov/data/{}/sf1?get={}&in=state:{}&for=place:{}&key={}'
    results = {}
    for category, variable in census_codes.items():
        out = pd.read_json(url_template.format(year, variable, state_code, place, api_key))
        # The requested value is read from row 1, column 0 of the parsed response
        # (row 0 is presumably the header row -- confirm against the API).
        # Selecting a scalar avoids calling int() on a one-element Series,
        # which recent pandas releases deprecate.
        results[category] = int(out.iloc[1, 0])
    return results

The dictionaries below map each population category to the census variable code used to request it, one dictionary per census year.

In [5]:
# Each dictionary maps a population category to the variable code requested for
# that census year. Key order is kept as-is because it determines column order downstream.
census_2010 = {
    "total population": "P0030001",
    "white population": "P0030002",
    "black population": "P0030003",
    "native population": "P0030004",
    "asian population": "P0030005",
    "pacific population": "P0030006",
    "other population": "P0030007",
}

# 2000: P003001 is the table total. The code used previously, P003002, is the
# "population of one race" subtotal, which leaves out multiracial residents.
census_2000 = {
    "total population": "P003001",
    "white population": "P003003",
    "black population": "P003004",
    "native population": "P003005",
    "asian population": "P003006",
    "pacific population": "P003007",
    "other population": "P003008",
}

# The 1990 codes have no separate "pacific population" entry.
census_1990 = {
    "total population": "P0010001",
    "white population": "P0060001",
    "black population": "P0060002",
    "native population": "P0060003",
    "asian population": "P0060004",
    "other population": "P0060005",
}

Fetch the data for the two cities, then calculate the combined categories and clean the results.

In [10]:
# Boston (state code 25, place '07000' -- a string so the leading zero survives).
# Requests are issued newest census first, one dictionary of counts per year.
b2010, b2000, b1990 = [
    fetch_census(census_year, codes, 25, '07000')
    for census_year, codes in ((2010, census_2010), (2000, census_2000), (1990, census_1990))
]

# Cambridge (state code 25, place 11000), same years and code tables.
c2010, c2000, c1990 = [
    fetch_census(census_year, codes, 25, 11000)
    for census_year, codes in ((2010, census_2010), (2000, census_2000), (1990, census_1990))
]

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:749)>

In [8]:
# One row per census year: a leading 'year' column followed by the fetched counts.
# Categories that a given census did not provide are filled with 0.
boston = pd.concat(
    [pd.Series(['1990', '2000', '2010'], name='year'), pd.DataFrame([b1990, b2000, b2010])],
    axis=1
).fillna(0)

# Combined groups used by the charts further down.
boston['asian and pacific islander population'] = boston['asian population'].add(boston['pacific population'])
boston['white and asian population'] = boston['asian and pacific islander population'].add(boston['white population'])
boston['poc population'] = (
    boston['asian and pacific islander population']
    .add(boston['black population'])
    .add(boston['native population'])
    .add(boston['other population'])
)

# Coerce every column, including the string year labels, to a numeric dtype.
boston = boston.apply(pd.to_numeric)

NameError: name 'b1990' is not defined

In [None]:
# One row per census year: a leading 'year' column followed by the fetched counts.
# Categories that a given census did not provide are filled with 0.
cambridge = pd.concat(
    [pd.Series([1990, 2000, 2010], name='year'), pd.DataFrame([c1990, c2000, c2010])],
    axis=1
).fillna(0)

# Combined groups used by the charts further down.
cambridge['asian and pacific islander population'] = cambridge['asian population'].add(cambridge['pacific population'])
cambridge['white and asian population'] = cambridge['asian and pacific islander population'].add(cambridge['white population'])
cambridge['poc population'] = (
    cambridge['asian and pacific islander population']
    .add(cambridge['black population'])
    .add(cambridge['native population'])
    .add(cambridge['other population'])
)

# Coerce every column to a numeric dtype.
cambridge = cambridge.apply(pd.to_numeric)

Focus on the change in the populations over the years.

In [None]:
# Fractional change of every population column from one census row to the next;
# the 'year' column is left out so it is not treated as a quantity.
c_change = cambridge.drop(labels=['year'], axis='columns').pct_change()
b_change = boston.drop(labels=['year'], axis='columns').pct_change()

In [None]:
# Fractional change measured across two rows, so the last row compares the
# final census with the first one.
c_change2 = cambridge.drop(labels=['year'], axis='columns').pct_change(periods=2)
b_change2 = boston.drop(labels=['year'], axis='columns').pct_change(periods=2)

Next, the change in these populations (Black, Asian and Pacific Islander, non-white, and white) between 1990 and 2010 is visualized.

In [None]:
# Demographic changes (bars)

# Bar labels and the column each bar is read from, kept in matching order.
# The earlier version passed a fifth value ('white and asian population') that had
# no matching x label; it is dropped so x and y line up one-to-one.
bar_labels = ['Black', 'AAPI', 'Non-White', 'White']
bar_columns = [
    'black population',
    'asian and pacific islander population',
    'poc population',
    'white population',
]

# Row 2 of the two-period change frames is the last census row measured against the first.
# .tolist() hands plotly plain floats instead of one-element Series objects.
trace1 = go.Bar(
    x=bar_labels,
    y=b_change2.iloc[2][bar_columns].tolist(),
    name='Boston'
)
trace2 = go.Bar(
    x=bar_labels,
    y=c_change2.iloc[2][bar_columns].tolist(),
    name='Cambridge'
)

data = [trace1, trace2]
layout = go.Layout(
    title = "The Change In Population Demographics for Boston and Cambridge Since 1990",
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

Define a function to be used to perform linear regression.

In [None]:
def fitter(x, y, regr_x):
    """
    Fit a least-squares straight line to (x, y) and evaluate it at other points.

    Args:
        x (numpy array): The independent variable.
        y (numpy array): The dependent variable.
        regr_x (numpy array): The x values at which the fitted line is evaluated.

    Returns:
        The fitted line's values at ``regr_x`` (slope * regr_x + intercept).
    """
    fit = stats.linregress(x, y)
    # Only the slope and intercept are needed; the other fit statistics are ignored.
    return fit.slope * regr_x + fit.intercept

In [None]:
# Demographic changes in Boston

# Census years with observed counts, and the years the fitted trend is evaluated on.
# `.values` replaces `.as_matrix()`, which newer pandas releases no longer provide.
years = pd.to_numeric(boston['year']).values
regr = np.arange(1990, 2019)

# (legend name, source column, colour) for each plotted group, in legend order.
# NOTE(review): 'Asian' reads 'asian population' rather than the combined
# 'asian and pacific islander population' column; confirm that is intended, since
# the 1990 codes have no separate pacific entry.
group_specs = [
    ('Black', 'black population', 'rgb(255, 63, 52)'),
    ('Asian', 'asian population', 'rgb(5, 196, 107)'),
    ('Non-White', 'poc population', 'rgb(60, 64, 198)'),
    ('White', 'white population', 'rgb(255, 221, 89)'),
    ('White and Asian', 'white and asian population', 'rgb(52, 231, 228)'),
]

observed_traces = []
fit_traces = []
fit_lines = []
for group_name, column, colour in group_specs:
    observed = boston[column].values
    # Generate the linear fit for this group.
    fitted = fitter(years, observed, regr)
    fit_lines.append(fitted)
    # Plain dicts replace the deprecated go.Marker wrapper.
    observed_traces.append(go.Scatter(
        x=years,
        y=observed,
        name=group_name,
        mode='markers',
        marker=dict(color=colour)
    ))
    fit_traces.append(go.Scatter(
        x=regr,
        y=fitted,
        mode='lines',
        marker=dict(color=colour),
        name=group_name + ' Fit'
    ))

# Observed markers first, then the fitted lines.
data = observed_traces + fit_traces

# Create dictionaries of the demographic estimates (year -> fitted value) for later use.
b_black, b_asian, b_poc, b_white, b_whasian = [dict(zip(regr, line)) for line in fit_lines]

layout = go.Layout(
    title="The Change In Population Demographics for Boston Since 1990",
    font=dict(family='Gotham', size=18),
    yaxis=dict(title='Population Levels'),
    xaxis=dict(title='Year')
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
# Demographic changes in Cambridge

# Census years with observed counts, and the years the fitted trend is evaluated on.
# `.values` replaces `.as_matrix()`, which newer pandas releases no longer provide.
years = pd.to_numeric(cambridge['year']).values
regr = np.arange(1990, 2019)

# (legend name, source column, colour) for each plotted group, in legend order.
# NOTE(review): 'Asian' reads 'asian population' rather than the combined
# 'asian and pacific islander population' column; confirm that is intended, since
# the 1990 codes have no separate pacific entry.
group_specs = [
    ('Black', 'black population', 'rgb(255, 63, 52)'),
    ('Asian', 'asian population', 'rgb(5, 196, 107)'),
    ('Non-White', 'poc population', 'rgb(60, 64, 198)'),
    ('White', 'white population', 'rgb(255, 221, 89)'),
    ('White and Asian', 'white and asian population', 'rgb(52, 231, 228)'),
]

observed_traces = []
fit_traces = []
fit_lines = []
for group_name, column, colour in group_specs:
    observed = cambridge[column].values
    # Generate the linear fit for this group.
    fitted = fitter(years, observed, regr)
    fit_lines.append(fitted)
    # Plain dicts replace the deprecated go.Marker wrapper.
    observed_traces.append(go.Scatter(
        x=years,
        y=observed,
        name=group_name,
        mode='markers',
        marker=dict(color=colour)
    ))
    fit_traces.append(go.Scatter(
        x=regr,
        y=fitted,
        mode='lines',
        marker=dict(color=colour),
        name=group_name + ' Fit'
    ))

# Observed markers first, then the fitted lines.
data = observed_traces + fit_traces

# Create dictionaries of the demographic estimates (year -> fitted value) for later use.
c_black, c_asian, c_poc, c_white, c_whasian = [dict(zip(regr, line)) for line in fit_lines]

layout = go.Layout(
    title="The Change In Population Demographics for Cambridge Since 1990",
    yaxis=dict(title='Population Levels'),
    xaxis=dict(title='Year')
)

fig = go.Figure(data=data, layout=layout)
# Render with the offline iplot like every other figure in this notebook; the
# previous plotly.plotly.iplot call needed Plotly cloud credentials and uploaded the figure.
iplot(fig)

The following is summary data for the above operations. The dataframe contains the estimates for all of the years and demographics.

In [None]:
# Fitted demographic estimates for each city, one {year: value} dictionary per group.
boston_demdata = [b_black, b_asian, b_poc, b_white, b_whasian]
cambridge_demdata = [c_black, c_asian, c_poc, c_white, c_whasian]

# Transposing puts the years on the rows. For Boston the year index is kept as a
# column; for Cambridge it is discarded because both frames cover the same years.
popdata = pd.DataFrame(boston_demdata).T.reset_index()
popdata2 = pd.DataFrame(cambridge_demdata).T.reset_index(drop=True)

popdata.columns = ['year', 'b_black', 'b_asian', 'b_poc', 'b_white', 'b_white and asian']
popdata2.columns = ['c_black', 'c_asian', 'c_poc', 'c_white', 'c_white and asian']

# `join_axes` is no longer accepted by pd.concat in newer pandas; reindexing the
# result on the Boston frame's index keeps the same row alignment.
popdata = pd.concat([popdata, popdata2], axis=1).reindex(popdata.index)

## Analyzing Monetary Data
This section looks at income levels, rent, and housing prices.

The housing in Cambridge was primarily built before 1939 (50.99%), making the housing stock in Cambridge some of the oldest overall in America, although there is a range of ages of homes in Cambridge. The next most important housing age is between 1970-1999 (20.24%), followed by between 1940-1969 (18.47%). There's also some housing in Cambridge built between 2000 and later (10.31%).

In the last 10 years, Cambridge has experienced some of the highest home appreciation rates of any community in the nation. Cambridge real estate appreciated 62.57% over the last ten years, which is an average annual home appreciation rate of 4.98%, putting Cambridge in the top 10% nationally for real estate appreciation. If you are a home buyer or real estate investor, Cambridge definitely has a track record of being one of the best long term real estate investments in America through the last ten years.

Appreciation rates are so strong in Cambridge that despite a nationwide downturn in the housing market, Cambridge real estate has continued to appreciate in value faster than most communities. Looking at just the latest twelve months, Cambridge appreciation rates continue to be some of the highest in America, at 9.83%, which is higher than appreciation rates in 89.78% of the cities and towns in the nation. Based on the last twelve months, short-term real estate investors have found good fortune in Cambridge. Cambridge appreciation rates in the latest quarter were at 1.75%, which equates to an annual appreciation rate of 7.19%.

– https://www.neighborhoodscout.com/ma/cambridge/real-estate

**These are the median income, housing sales prices, and rent prices in Cambridge and Boston.**

### Cambridge Data

In [None]:
# Median income by census year.
# Source: http://www.deptofnumbers.com/income/massachusetts/boston/
# NOTE(review): the source URL points at the Boston page; confirm these are Cambridge figures.
c_income1990 = 33140
c_income2000 = 47979
c_income2010 = 64865

# Housing sale price.
# 2000 source: https://www.trulia.com/real_estate/Cambridge-Massachusetts/market-trends/
c_house2000 = 297000
# 2010 source: http://www.cambridgema.gov/CDD/factsandmaps/demographicfaq
c_house2010 = 739800

# Rent.
# 2000 source: https://www.cambridgema.gov/~/media/Files/CDD/FactsandMaps/profiles/demo_profile_housing_2016.ashx
c_rent2000 = 1100
# 2010 source: http://www.cambridgema.gov/CDD/factsandmaps/demographicfaq
c_rent2010 = 2348

### Boston Data

In [None]:
# Median income by census year.
# Source: http://www.deptofnumbers.com/income/massachusetts/boston/
# NOTE(review): the 1990 figure is far below the 2000 and 2010 values; verify against the source.
b_income1990 = 12350
b_income2000 = 39629
b_income2010 = 55777

# Housing sale price.
# Source: https://www.trulia.com/real_estate/Boston-Massachusetts/market-trends/
b_house2000 = 190000
b_house2010 = 575000

# Rent.
# 2000 source: https://www.census.gov/hhes/www/housing/census/historic/grossrents.html
b_rent2000 = 684
# 2010 source: http://www.deptofnumbers.com/rent/massachusetts/boston/
b_rent2010 = 1316

#### Comparison Graphs
The following graphs compare median income, housing prices, and rent in the two cities. Because housing price and rent figures are only available for 2000 and 2010, a linear fit is used to estimate the other years.

In [None]:
# Income Graphs
years = np.asarray([1990, 2000, 2010])
regr = np.arange(1990, 2019)

# (legend name, observed values in `years` order, colour) for each city.
city_specs = [
    ('Cambridge', [c_income1990, c_income2000, c_income2010], '#601014'),
    ('Boston', [b_income1990, b_income2000, b_income2010], '#D2232A'),
]

observed_traces = []
fit_traces = []
fit_lines = []
for city, values, colour in city_specs:
    observed = np.asarray(values)
    # Generate the linear fit for this city.
    fitted = fitter(years, observed, regr)
    fit_lines.append(fitted)
    # Plain dicts replace the deprecated go.Marker wrapper.
    observed_traces.append(go.Scatter(
        x=years,
        y=observed,
        mode='markers',
        name=city,
        marker=dict(color=colour)
    ))
    fit_traces.append(go.Scatter(
        x=regr,
        y=fitted,
        mode='lines',
        marker=dict(color=colour),
        name=city + ' Fit'
    ))

# Observed markers first, then the fitted lines.
data = observed_traces + fit_traces

# Create dictionaries of the income estimates (year -> fitted value) for later use.
cambridge_income, boston_income = [dict(zip(regr, line)) for line in fit_lines]

layout = go.Layout(
    title="The Change In Median Income of Cambridge and Boston Residents Since 1990",
    font=dict(family='Gotham', size=18),
    yaxis=dict(title='Median Income ($)'),
    xaxis=dict(title='Year')
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="income_delta")

In [None]:
# Housing Price Graphs
years = np.asarray([2000, 2010])
regr = np.arange(1997, 2019)

# (legend name, observed values in `years` order, colour) for each city.
city_specs = [
    ('Cambridge', [c_house2000, c_house2010], '#601014'),
    ('Boston', [b_house2000, b_house2010], '#D2232A'),
]

observed_traces = []
fit_traces = []
fit_lines = []
for city, values, colour in city_specs:
    observed = np.asarray(values)
    # With only two observations the fit is the straight line through both points.
    fitted = fitter(years, observed, regr)
    fit_lines.append(fitted)
    # Plain dicts replace the deprecated go.Marker wrapper.
    observed_traces.append(go.Scatter(
        x=years,
        y=observed,
        mode='markers',
        name=city,
        marker=dict(color=colour)
    ))
    fit_traces.append(go.Scatter(
        x=regr,
        y=fitted,
        mode='lines',
        marker=dict(color=colour),
        name=city + ' Fit'
    ))

# Observed markers first, then the fitted lines.
data = observed_traces + fit_traces

# Create dictionaries of the housing price estimates (year -> fitted value) for later use.
cambridge_houses, boston_houses = [dict(zip(regr, line)) for line in fit_lines]

layout = go.Layout(
    title="The Change In Housing Prices in Cambridge and Boston Since 2000",
    font=dict(family='Gotham', size=18),
    yaxis=dict(title='Median House Price ($)'),
    xaxis=dict(title='Year')
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="housing_delta")

In [None]:
# Rent Graphs
years = np.asarray([2000, 2010])
regr = np.arange(1997, 2019)

# (legend name, observed values in `years` order, colour) for each city.
city_specs = [
    ('Cambridge', [c_rent2000, c_rent2010], '#601014'),
    ('Boston', [b_rent2000, b_rent2010], '#D2232A'),
]

observed_traces = []
fit_traces = []
fit_lines = []
for city, values, colour in city_specs:
    observed = np.asarray(values)
    # With only two observations the fit is the straight line through both points.
    fitted = fitter(years, observed, regr)
    fit_lines.append(fitted)
    # Plain dicts replace the deprecated go.Marker wrapper.
    observed_traces.append(go.Scatter(
        x=years,
        y=observed,
        mode='markers',
        name=city,
        marker=dict(color=colour)
    ))
    fit_traces.append(go.Scatter(
        x=regr,
        y=fitted,
        mode='lines',
        marker=dict(color=colour),
        name=city + ' Fit'
    ))

# Observed markers first, then the fitted lines.
data = observed_traces + fit_traces

# Create dictionaries of the rent estimates (year -> fitted value) for later use.
cambridge_rent, boston_rent = [dict(zip(regr, line)) for line in fit_lines]

layout = go.Layout(
    title="The Change In Rent in Cambridge and Boston Since 2000",
    font=dict(family='Gotham', size=18),
    yaxis=dict(title='Median Rent ($)'),
    xaxis=dict(title='Year')
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="rent_delta")

The following is summary data for the above operations. The dataframes contain the estimates for all of the years and incomes/housing prices/rent.

In [None]:
# Pair up the per-city estimate dictionaries (Boston first, Cambridge second).
city_income = [boston_income, cambridge_income]
city_housing = [boston_houses, cambridge_houses]
city_rent = [boston_rent, cambridge_rent]

# Transposing puts the years on the rows; reset_index then turns them into the first column.
income_data = pd.DataFrame(city_income).T.reset_index()
housing_data = pd.DataFrame(city_housing).T.reset_index()
rent_data = pd.DataFrame(city_rent).T.reset_index()

# All three tables share the same layout: year, Boston estimate, Cambridge estimate.
money_data = [income_data, housing_data, rent_data]
for frame in money_data:
    frame.columns = ['year', 'boston', 'cambridge']

Save the data for use in another notebook.

In [None]:
# Persist the summary tables so another notebook can load them.
# Create the output directory first so a fresh checkout does not fail on open().
os.makedirs('VariableData', exist_ok=True)
with open('VariableData/money_data.pickle', 'wb') as f:
    pickle.dump(money_data, f)
with open('VariableData/demographic_data.pickle', 'wb') as f:
    pickle.dump(popdata, f)