# Census API Metric Codes

| Race | Code |
|------|------|
| Total|B03002_001E|
| Black|B03002_004E|
| Asian|B03002_006E|
| Native Hawaiian Pacific Islander|B03002_007E|
| Other|B03002_008E|
| Hispanic or Latino|B03002_012E|
| 2 or More Races|B03002_010E|

| Citizenship | Code |
|------|------|
| Total Citizen|B05001_001E|
| Not a U.S. Citizen|B05001_006E|
| Total Foreign Born|B05002_013E|


| Income | Code |
|------|------|
| Total income population|B05010_001E|
| Under poverty line|B05010_002E|

| Education | Code |
|------|------|
| Total Education |B07009_001E|
| Less than HS graduate |B07009_002E|
| High school graduate |B07009_003E|
| Some college or associate's degree |B07009_004E|
| Grad or professional degree |B07009_006E|

| Under 5 | Code |
|------|------|
| Total |B01001_001E|
| Male under 5 |B01001_003E|
| Female under 5 |B01001_027E|

| Housing | Code |
|------|------|
| Total |B07013_001E|
| Renters |B07013_003E|


# Data Pipelining and Processing

In [18]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import pickle as p
import geopandas as gpd
import requests
import pickle

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import (ColumnDataSource,HoverTool, LogColorMapper)
from bokeh.palettes import Viridis6 as palette
from bokeh.tile_providers import STAMEN_TERRAIN
output_notebook()

#If you need to install anything
#import sys
#!conda install --yes --prefix {sys.prefix} numpy

In [19]:
# Load the three local inputs: the FCC tract-level connectivity table, the
# county shapefile, and the Census planning database (PDB).

# Tract codes are read as strings (dtype=object) so they are not parsed as numbers.
fcc = pd.read_csv(
    "/Users/robertdeng/Google Drive/Data Science/W209/Project/Data/tract_map_jun_2016.csv",
    sep=",",
    encoding="latin-1",
    dtype={"tractcode": object},
)
# Flip both connectivity category columns (new value = 5 - old value).
# NOTE(review): presumably done so larger numbers mean better connectivity;
# confirm against the FCC data dictionary.
fcc[["pcat_all", "pcat_10x1"]] = 5 - fcc[["pcat_all", "pcat_10x1"]]
# Positional rename: relies on the CSV having exactly these three columns, in this order.
fcc.columns = ["tractcode", "Internet_connectivity", "High_Internet_Connectivity"]

shp_county = gpd.read_file(
    "/Users/robertdeng/Google Drive/Data Science/W209/Project/Data/cb_2016_us_county_500k/cb_2016_us_county_500k.shp"
)

pdb = pd.read_csv(
    "/Users/robertdeng/Google Drive/Data Science/W209/Project/Data/pdb2016trv8_us.csv",
    sep=",",
    encoding="latin-1",
    dtype={"GIDTR": object},
)
# Keep only the tract id and the two 2010 mail-return columns used later on.
pdb2 = pdb.loc[:, ["GIDTR", "MailBack_Area_Count_CEN_2010", "Census_Mail_Returns_CEN_2010"]]

Let's first check the availability of state data and store the missing url requests in a list

In [20]:
# Candidate two-digit state codes "01" .. "56".
full_state_test = ["%.2d" % i for i in range(1,57)]
# Collects [state_code, http_status] pairs for codes that should be skipped.
bad_apples = []
def state_checker(full_state_test):
    """Probe the ACS5 endpoint once per state code and record the failures.

    For every code in ``full_state_test`` a minimal tract-level request is
    sent. Any response whose HTTP status is not 200 is appended to the
    module-level ``bad_apples`` list as ``[state_code, status_code]``.

    Parameters
    ----------
    full_state_test : iterable of str
        Two-digit state codes to probe.

    Raises
    ------
    requests.exceptions.RequestException
        If a request cannot be completed, including hitting the timeout.
    """
    for state_code in full_state_test:
        # NOTE(review): the API key is hard-coded here; consider loading it
        # from configuration instead of keeping it in the notebook.
        url = ("https://api.census.gov/data/2015/acs5?get=NAME,B03002_001E"+
               "&for=tract:*&in=state:" + state_code + "&key=14ba39dd26088efd8d54c4f01d90023f2d4bfc6d")
        # Without a timeout a single stalled connection would block the whole
        # probe indefinitely.
        response_code = requests.get(url, timeout=60).status_code
        if response_code != 200:
            bad_apples.append([state_code, response_code])
state_checker(full_state_test)
#Also remove Washington DC
# The 204 below is a placeholder so the entry has the same shape as the
# probed ones; it is not a status actually returned for code '11'.
bad_apples.append(['11', 204])
print("These states return no content. Bad Apples :(\n", bad_apples)

These states return no content. Bad Apples :(
 [['03', 204], ['07', 204], ['14', 204], ['43', 204], ['52', 204], ['11', 204]]


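Codes 03, 07, 14, 43 and 52 return no content because they are not assigned to any state, so every state is still covered when we iterate up to 56 (see the ANSI state table linked below).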

https://www.census.gov/geo/reference/ansi_statetables.html

In [21]:
def pull_census(state, url_yes_no):
    """Download the tract-level ACS5 (2015) variables for one state.

    Parameters
    ----------
    state : str
        Two-digit state code, e.g. ``"01"``.
    url_yes_no : bool
        When true, print the request URL before sending it.

    Returns
    -------
    list
        The decoded JSON payload. Callers in this notebook treat the first
        row as the header and the remaining rows as one record per tract.

    Raises
    ------
    requests.exceptions.HTTPError
        If the API answers with a 4xx/5xx status.
    ValueError
        If the API answers with 204 (no content) for this state code.
    requests.exceptions.RequestException
        If the request cannot be completed, including hitting the timeout.
    """
    # The order of the variables fixes the column order of the response,
    # which the positional column rename further down relies on.
    variables = ",".join([
        "NAME",
        "B03002_001E", "B03002_004E", "B03002_006E",
        "B03002_007E", "B03002_008E", "B03002_010E", "B03002_012E",  # race/ethnicity
        "B05001_001E", "B05001_006E",  # citizenship
        "B05002_013E",  # foreign born
        "B05010_001E", "B05010_002E",  # poverty line
        "B07009_001E", "B07009_002E", "B07009_003E", "B07009_004E", "B07009_006E",  # education
        "B07013_001E", "B07013_003E",  # renters
        "B01001_001E", "B01001_003E", "B01001_027E",  # under 5
    ])
    # NOTE(review): the API key is hard-coded here; consider loading it from
    # configuration instead of keeping it in the notebook.
    url = ("https://api.census.gov/data/2015/acs5?get=" + variables +
           "&for=tract:*&in=state:" + state + "&key=14ba39dd26088efd8d54c4f01d90023f2d4bfc6d")
    if url_yes_no:
        print(url)
    # A timeout keeps one stalled connection from hanging the whole pull.
    response = requests.get(url, timeout=60)
    # Surface HTTP errors directly instead of as a confusing JSON decode failure.
    response.raise_for_status()
    if response.status_code == 204:
        raise ValueError("Census API returned no content for state " + state)
    return response.json()

In [22]:
#Make a master list range and remove the bad apples
master_list = ["%.2d" % i for i in range(1,57)]
master_list = [i for i in master_list if i not in [bad_apples[i][0] for i in range(len(bad_apples))]]

#Then stitch together all the data frames for the remaining dataset
for i in master_list:
    if i == "01":
        newstate = pull_census(i, False)
        master = pd.DataFrame(newstate, columns = newstate[0])[1:]
    elif i != "01":
        newstate = pull_census(i, False)
        master = master.append(pd.DataFrame(newstate, columns = newstate[0])[1:])

In [23]:
#Clean Before Merging
master["GEOID"] = master["state"] + master["county"] + master["tract"]
master = master.merge(pdb2, left_on = "GEOID", right_on = "GIDTR", how = "left")
master = master.merge(fcc, left_on = "GEOID", right_on = "tractcode", how = "left")
master = master.drop(["GIDTR", "tractcode", "High_Internet_Connectivity"], axis = 1)
master["GEOID2"] = master.GEOID.astype(str).str[0:5]
master["County_Name"] = master["NAME"].str.split(",").str[1]
master["State_Name"] = master["NAME"].str.split(",").str[2]

In [24]:
#Name the columns
master.columns = ["Census_Tract_Detail", "Total_all_race", "Black", "Asian", "Native_Hawaiian_Pacific_Islander",
                  "Other", "Two_or_more_races", "Hispanic_or_Latino", 
                  "Total_citizen", "Not_a_us_citizen", "Total_foreign_born", 
                  "Total_income_population", "Total_under_poverty_line",
                  "Total_edu", "Less_than_HS", "HS_grad", "College_grad", "Graduate_or_professional", 
                  "Total_housing", "Renters", 
                  "Total_pop_all_age", "Total_male_under5", "Total_female_under5", 
                  "state", "county", "tract", "GEOID", "Census_Mail_Total_Sent", "Census_Mail_Response",
                  "Internet_connectivity", "GEOID2", "County_Name", "State_Name"]

In [25]:
#Change dtype to numeric
# Columns converted from their raw form to numeric dtype.
# 'Total_foreign_born' was previously missing from this list even though it
# is summed during the county-level aggregation further down, so it is now
# converted along with the other count columns.
numeric_columns = [
    'Total_all_race', 'Black', 'Asian', 'Native_Hawaiian_Pacific_Islander', 'Other', 'Two_or_more_races',
    'Hispanic_or_Latino',
    'Total_citizen', 'Not_a_us_citizen', 'Total_foreign_born',
    'Total_income_population', 'Total_under_poverty_line',
    'Total_edu', 'Less_than_HS', 'HS_grad', 'College_grad', 'Graduate_or_professional',
    'Total_housing', 'Renters',
    'Total_pop_all_age', 'Total_male_under5', 'Total_female_under5',
    'Census_Mail_Total_Sent', 'Census_Mail_Response', 'Internet_connectivity',
]
# The list is named once so the left- and right-hand sides cannot drift apart.
master[numeric_columns] = master[numeric_columns].apply(pd.to_numeric)


In [26]:
# Descriptive (non-numeric) columns of every tract.
detail_columns = ["Census_Tract_Detail", "state", "county", "tract",
                  "GEOID2", "County_Name", "State_Name"]
master_details = master[detail_columns]

# One row per county key (GEOID2) holding the set of tract codes seen for it.
tracts_by_county = master_details.groupby("GEOID2")["tract"].apply(list).apply(set)
master_tract = pd.DataFrame(tracts_by_county).reset_index()

In [27]:
# Aggregate tracts up to county level (GEOID2). The positions below refer to
# the column order set when the columns were named:
#   29 = Internet_connectivity, 30 = GEOID2,
#   1..22 = Total_all_race .. Total_female_under5,
#   27, 28 = Census_Mail_Total_Sent, Census_Mail_Response.
mean_positions = [29, 30]
sum_positions = list(range(1, 23)) + [27, 28, 30]

# Connectivity is averaged across a county's tracts.
master_groupby_mean = (
    master.iloc[:, mean_positions]
    .groupby(["GEOID2"], as_index=False)
    .mean()
)
# The count columns are summed across a county's tracts.
master_groupby_sum = (
    master.iloc[:, sum_positions]
    .groupby(["GEOID2"], as_index=False)
    .sum()
)

# Recombine: tract sets, then the summed counts, then the mean connectivity.
master = master_tract.merge(master_groupby_sum, on="GEOID2")
master = master.merge(master_groupby_mean, on="GEOID2")

In [28]:
# Attach the county-level figures to the county shapes. The left join keeps
# every shape, so counties with no matching data end up with nulls.
master = shp_county.merge(master, how="left", left_on="GEOID", right_on="GEOID2")

# Share of rows with no data after the merge, using the Asian column as the probe.
asian_counts = master.Asian
missing_share = asian_counts.isnull().sum() / len(asian_counts)
print(round(missing_share, 2) * 100, "% of the rows have NAs i.e. they didnt have something to merge with")

3.0 % of the rows have NAs i.e. they didnt have something to merge with


In [29]:
#Column Creation
master["Total_minority"]=master["Black"]+master["Asian"]+master["Native_Hawaiian_Pacific_Islander"]+master["Other"]+master["Two_or_more_races"]+master["Hispanic_or_Latino"]
master["Total_under_5"]=master["Total_male_under5"]+master["Total_female_under5"]

master["pct_mail_failure_to_return"] = 1 - master["Census_Mail_Response"]/master["Census_Mail_Total_Sent"]
master["pct_under_5"]=master["Total_under_5"]/master["Total_pop_all_age"]
master["pct_renters"]=master["Renters"]/master["Total_housing"]
master["pct_minority"]=master["Total_minority"]/master["Total_all_race"]
master["pct_not_citizen"]=master["Not_a_us_citizen"]/ master["Total_citizen"]

#Fill NA with 0 to keep county plot and just have a null value
master = master.fillna(0)

In [30]:
# Persist the merged county table. The context manager guarantees the file
# is flushed and closed even if pickling fails part-way.
with open("master.pkl", "wb") as master_file:
    pickle.dump(master, master_file)

In [40]:

# Print every geometry that is not a plain Polygon, showing its first five parts.
# The loop variable is deliberately not called `p`: that name is the
# `pickle as p` alias imported at the top of the notebook.
for geom in master.geometry:
    # geom_type replaces the deprecated `.type` attribute.
    if geom.geom_type != 'Polygon':
        # Multi-part geometries are sliced through `.geoms`; indexing the
        # geometry object itself is not supported in Shapely 2.
        # NOTE(review): assumes every non-Polygon here is multi-part
        # (i.e. has `.geoms`) — confirm for this shapefile.
        print (geom.geom_type, geom.geoms[:5])

MultiPolygon MULTIPOLYGON (((-88.053375 30.506987, -88.05108799999999 30.508857, -88.045647 30.513306, -88.041966 30.517376, -88.03867 30.520405, -88.035099 30.521967, -88.03240699999999 30.521447, -88.02916499999999 30.519648, -88.02622199999999 30.516609, -88.023506 30.492573, -88.02432999999999 30.490774, -88.026748 30.491105, -88.04504299999999 30.501189, -88.05234899999999 30.505591, -88.053375 30.506987)), ((-88.211209 30.322249, -88.209999 30.323199, -88.209559 30.322202, -88.20895399999999 30.320445, -88.208073 30.319543, -88.208733 30.318878, -88.209559 30.31902, -88.210824 30.320777, -88.211209 30.322249)), ((-88.22511999999999 30.321802, -88.22128099999999 30.322233, -88.218694 30.321903, -88.221031 30.32039, -88.22465299999999 30.319886, -88.226522 30.320793, -88.22511999999999 30.321802)), ((-88.26465899999999 30.335541, -88.26367399999999 30.336335, -88.261506 30.335485, -88.26006099999999 30.336449, -88.259141 30.337129, -88.259141 30.339, -88.25841799999999 30.340417, -

MultiPolygon MULTIPOLYGON (((-88.8677 29.861551, -88.86565899999999 29.86262, -88.864312 29.860997, -88.86348099999999 29.858835, -88.862714 29.856784, -88.86207499999999 29.852793, -88.861436 29.850353, -88.861436 29.84797, -88.862331 29.846195, -88.864057 29.846584, -88.86514299999999 29.848469, -88.86456799999999 29.851019, -88.86565499999999 29.855842, -88.8677 29.8595, -88.8677 29.861551)), ((-88.879333 29.874023, -88.878311 29.877016, -88.876649 29.879787, -88.87530599999999 29.881949, -88.874859 29.880618, -88.875434 29.877847, -88.87607299999999 29.875021, -88.878822 29.87236, -88.879333 29.874023)), ((-88.870476 30.049212, -88.855583 30.034414, -88.84122499999999 30.012789, -88.833725 29.998821, -88.824158 29.970461, -88.81701699999999 29.93425, -88.818146 29.889109, -88.826538 29.847092, -88.83270999999999 29.824062, -88.84584599999999 29.799052, -88.867874 29.762305, -88.875091 29.759818, -88.86506199999999 29.781019, -88.863756 29.793564, -88.853033 29.811117, -88.846497 29