## Census API to query data

US Census allows developers to use their API to query different datasets. I am interested in querying the 2016 ACS dataset for 5 parameters (percent unemployment comes from the Bureau of Labor Statistics):

### Economic:
* Poverty
* Percent unemployment
* Median household income

### Demographic:
* Median age
* Percentage minority
* Education - percentage high school grad or higher

In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import pickle
import time

In [2]:
# get census API key from file
with open("API_ignore.txt", 'r') as f:
    t = f.read()
    
MY_API = t.split()[1].strip()

In [3]:
# load county_df so that you can iterate over INCITS
with open("statedf_countydf_states.pkl", "rb") as picklefile:
    [state_df, county_df, states] = pickle.load(picklefile)

In [4]:
county_df.head()

Unnamed: 0,state_code,county_code,name,INCITS
0,1,1,Autauga County,1001
1,1,3,Baldwin County,1003
2,1,5,Barbour County,1005
3,1,7,Bibb County,1007
4,1,9,Blount County,1009


In [17]:
census_query = "https://api.census.gov/data/2016/acs/acs5/profile?get=NAME,{}&for=county:{}&in=state:{}&key={}"

stats_request_dict = {"median_age": "DP05_0017E", 
                      "percent_white": "DP05_0032PE", 
                      "hs_percent": "DP02_0066PE",
                      "median_hh_income": "DP03_0062E",
                      "percent_unemployed": "DP03_0005PE",
                      "percent_poverty" : "DP03_0119PE",
                     }

In [69]:
census_columns = ['INCITS']
census_columns.extend(list(stats_request_dict.keys()))
census_df = pd.DataFrame(columns=census_columns)

In [74]:
mini_county_df = county_df.iloc[0:3]

def get_census_df(county_df):
    stats_request_dict = {"median_age": "DP05_0017E", 
                      "percent_white": "DP05_0032PE", 
                      "hs_percent": "DP02_0066PE",
                      "median_hh_income": "DP03_0062E",
                      "percent_unemployed": "DP03_0005PE",
                      "percent_poverty" : "DP03_0119PE",
                     }
    
    census_columns = ['INCITS']
    census_columns.extend(list(stats_request_dict.keys()))
    census_df = pd.DataFrame(columns=census_columns)
    
    census_query = "https://api.census.gov/data/2016/acs/acs5/profile?get=NAME,{}&for=county:{}&in=state:{}&key={}"
    
    for row in county_df.itertuples():
        row_dict = {'INCITS': row.INCITS}
        for stat_name, req in stats_request_dict.items():
            #print(census_query.format(req, row.county_code, row.state_code, MY_API))
            try:
                response = requests.get(census_query.format(req, row.county_code, row.state_code, MY_API))
                response.raise_for_status()
                val = response.json()
                row_dict[stat_name] = val[1][1]
            except:
                print("There was an error making this query.")
        census_df = census_df.append(row_dict, ignore_index=True)
    
    return census_df

In [75]:
x = get_census_df(mini_county_df)

In [76]:
x

Unnamed: 0,INCITS,median_age,percent_white,hs_percent,median_hh_income,percent_unemployed,percent_poverty
0,1001,37.8,76.9,87.6,53099,3.4,9.4
1,1003,42.3,86.4,90.0,51365,3.7,9.3
2,1005,38.7,46.7,73.8,33956,6.2,20.0


In [100]:
list_county_dfs = []

num = county_df.shape[0]
chunk_size = 160
cycles = county_df.shape[0]//chunk_size
for i in range(0, cycles+1):
    if i == cycles:
        #print(str(i*chunk_size) + '\t' + ':')
        list_county_dfs.append(county_df.iloc[(i*chunk_size):])
    else:
        #print(str(i*chunk_size) + '\t' + str((i+1)*chunk_size))
        list_county_dfs.append(county_df.iloc[i*chunk_size:(i+1)*chunk_size])

In [None]:
# create a list of dataframes.
census_dfs = []

In [120]:
list_county_dfs[1]

Unnamed: 0,state_code,county_code,name,INCITS
160,05,099,Nevada County,05099
161,05,101,Newton County,05101
162,05,103,Ouachita County,05103
163,05,105,Perry County,05105
164,05,107,Phillips County,05107
165,05,109,Pike County,05109
166,05,111,Poinsett County,05111
167,05,113,Polk County,05113
168,05,115,Pope County,05115
169,05,117,Prairie County,05117


In [129]:
%%time
x = get_census_df(list_county_dfs[3])
print(time.strftime("%m/%d/%Y %H:%M:%S"))

# 07/12/2018 19:36:50 for chunk 0; 9 minutes and 57 seconds
# 07/12/2018 20:14:06 for chunk 1; 9min 53s
# 07/12/2018 20:31:35 for chunk 2; 12min 51s
# 07/12/2018 20:43:52 for chunk 3; 9min 39s
# 07/12/2018 21:08:55 for chunk 4; 10min 29s

07/12/2018 22:40:35
CPU times: user 25.3 s, sys: 2.22 s, total: 27.6 s
Wall time: 10min 10s


In [114]:
census_dfs.append(x)

In [131]:
with open('cdf3.pkl', 'wb') as picklefile:
    pickle.dump(x, picklefile)

In [93]:
census_req = "https://api.census.gov/data/2016/acs/acs5/profile?get=NAME,DP05_0017E&for=county:001&in=state:01&key={}"

response = requests.get(census_req.format(MY_API))

try:
    response = requests.get(census_req.format(MY_API))
    response.raise_for_status()
    print(response.json())
except:
    print("Error!")

[['NAME', 'DP05_0017E', 'state', 'county'], ['Autauga County, Alabama', '37.8', '01', '001']]


In [90]:
print(time.strftime("%m/%d/%Y %H:%M:%S"))

07/12/2018 18:20:31


https://api.census.gov/data/2016/acs/acs1/profile/variables.html

https://api.census.gov/data/2016/acs/acs1/profile.html

https://www26.state.nj.us/doh-shad/sharedstatic/StandardizedMortalityRatio.pdf

In [126]:
with open('c.pkl', 'rb') as picklefile:
    c = pickle.load(picklefile)

In [128]:
c.INCITS.unique()

array(['01001', '01003', '01005', '01007', '01009', '01011', '01013',
       '01015', '01017', '01019', '01021', '01023', '01025', '01027',
       '01029', '01031', '01033', '01035', '01037', '01039', '01041',
       '01043', '01045', '01047', '01049', '01051', '01053', '01055',
       '01057', '01059', '01061', '01063', '01065', '01067', '01069',
       '01071', '01073', '01075', '01077', '01079', '01081', '01083',
       '01085', '01087', '01089', '01091', '01093', '01095', '01097',
       '01099', '01101', '01103', '01105', '01107', '01109', '01111',
       '01113', '01115', '01117', '01119', '01121', '01123', '01125',
       '01127', '01129', '01131', '01133', '02013', '02016', '02020',
       '02050', '02060', '02068', '02070', '02090', '02100', '02105',
       '02110', '02122', '02130', '02150', '02158', '02164', '02170',
       '02180', '02185', '02188', '02195', '02198', '02220', '02230',
       '02240', '02261', '02275', '02282', '02290', '04001', '04003',
       '04005', '040