## Census API to query data

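The US Census Bureau lets developers query its datasets through an API. I am interested in the six county-level parameters listed below. Five of them come from the 2016 ACS 5-year dataset; percent unemployment comes from the Bureau of Labor Statistics (note that the query code below also requests an ACS unemployment figure, `DP03_0005PE`):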

### Economic:
* Poverty
* Percent unemployment
* Median household income

### Demographic:
* Median age
* Percentage minority
* Education - percentage high school grad or higher

In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import pickle
import time

In [2]:
# Read the Census API key from a local text file.
# The key is taken to be the second whitespace-separated token in that file.
with open("API_ignore.txt", 'r') as key_file:
    t = key_file.read()

# str.split() with no arguments already discards surrounding whitespace,
# so the token needs no further stripping.
MY_API = t.split()[1]

In [3]:
# Restore the three pickled objects; county_df supplies the INCITS codes that
# the census download iterates over.
# NOTE: pickle.load can execute arbitrary code, so only open pickles you trust.
with open("statedf_countydf_states.pkl", "rb") as pkl_in:
    state_df, county_df, states = pickle.load(pkl_in)

In [4]:
# URL template for the 2016 ACS 5-year data-profile endpoint.
# Placeholders, in order: variable code, county code, state code, API key.
census_query = (
    "https://api.census.gov/data/2016/acs/acs5/profile"
    "?get=NAME,{}&for=county:{}&in=state:{}&key={}"
)

# Output column name -> ACS data-profile variable code to request.
stats_request_dict = dict(
    median_age="DP05_0017E",
    percent_white="DP05_0032PE",
    hs_percent="DP02_0066PE",
    median_hh_income="DP03_0062E",
    percent_unemployed="DP03_0005PE",
    percent_poverty="DP03_0119PE",
)

In [5]:
# Empty frame whose columns are INCITS followed by one column per requested
# statistic, in the order the statistics are declared.
census_columns = ['INCITS'] + list(stats_request_dict)
census_df = pd.DataFrame(columns=census_columns)

In [6]:
#mini_county_df = county_df.iloc[0:3]

def get_census_df(county_df, timeout=30):
    """Download six ACS profile statistics for every county in ``county_df``.

    One HTTP request is sent per (county, statistic) pair to the 2016 ACS
    5-year data-profile endpoint, using the module-level ``MY_API`` key.

    Parameters
    ----------
    county_df : pandas.DataFrame
        One row per county. Must provide ``INCITS``, ``county_code`` and
        ``state_code`` columns; the two codes are substituted verbatim into
        the query URL.
    timeout : float, optional
        Seconds to wait on each request before treating it as failed, so a
        stalled connection cannot hang the whole download.

    Returns
    -------
    pandas.DataFrame
        One row per input county, in input order, with columns ``INCITS``,
        ``median_age``, ``percent_white``, ``hs_percent``,
        ``median_hh_income``, ``percent_unemployed`` and ``percent_poverty``.
        Values are stored exactly as the API returned them (object dtype).
        A statistic whose query failed is left missing (NaN) for that county
        and a one-line message is printed.
    """
    # Kept local so the function does not depend on notebook cell order.
    stats_request_dict = {
        "median_age": "DP05_0017E",
        "percent_white": "DP05_0032PE",
        "hs_percent": "DP02_0066PE",
        "median_hh_income": "DP03_0062E",
        "percent_unemployed": "DP03_0005PE",
        "percent_poverty": "DP03_0119PE",
    }
    census_columns = ['INCITS'] + list(stats_request_dict)
    census_query = "https://api.census.gov/data/2016/acs/acs5/profile?get=NAME,{}&for=county:{}&in=state:{}&key={}"

    # Collect plain dicts and build the frame once at the end: growing a
    # DataFrame row by row is quadratic, and DataFrame.append no longer exists
    # in pandas 2.x.
    records = []
    for row in county_df.itertuples():
        row_dict = {'INCITS': row.INCITS}
        for stat_name, req in stats_request_dict.items():
            url = census_query.format(req, row.county_code, row.state_code, MY_API)
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
                # Row 1, column 1 of the returned JSON table (row 0 is
                # presumably the header and column 0 the NAME field).
                row_dict[stat_name] = response.json()[1][1]
            except (requests.RequestException, ValueError, LookupError, TypeError) as exc:
                # Only the exception type is reported: the exception text can
                # contain the full URL, which includes the API key.
                print("There was an error making this query. "
                      "(INCITS={}, stat={}, error={})".format(
                          row.INCITS, stat_name, type(exc).__name__))
        records.append(row_dict)

    # dtype=object keeps the values untouched, matching what row-wise
    # appending onto an empty frame produced.
    return pd.DataFrame(records, columns=census_columns, dtype=object)

In [75]:
#x = get_census_df(mini_county_df)

In [76]:
#x

Unnamed: 0,INCITS,median_age,percent_white,hs_percent,median_hh_income,percent_unemployed,percent_poverty
0,1001,37.8,76.9,87.6,53099,3.4,9.4
1,1003,42.3,86.4,90.0,51365,3.7,9.3
2,1005,38.7,46.7,73.8,33956,6.2,20.0


In [7]:
# Split county_df into consecutive row chunks so the slow Census download can
# be run, timed and saved one chunk at a time.
chunk_size = 160
num = county_df.shape[0]
# Number of *full* chunks; kept because a later cell displays it.
cycles = num // chunk_size

# Stepping by chunk_size yields every full chunk plus one final partial chunk
# when there is a remainder, and never an empty trailing chunk when num is an
# exact multiple of chunk_size.
list_county_dfs = [
    county_df.iloc[start:start + chunk_size]
    for start in range(0, num, chunk_size)
]

In [8]:
# Start with an empty list intended to hold DataFrames
# (nothing is appended to it in the cells shown here).
census_dfs = list()

In [9]:
cycles

19

In [77]:
%%time
# Download census data for a single chunk (index 5 here), then print the
# wall-clock finish time.
chunk_to_fetch = list_county_dfs[5]
x = get_census_df(chunk_to_fetch)
finished_at = time.strftime("%m/%d/%Y %H:%M:%S")
print(finished_at)

# 07/12/2018 19:36:50 for chunk 0; 9 minutes and 57 seconds
# 07/12/2018 20:14:06 for chunk 1; 9min 53s
# 07/12/2018 20:31:35 for chunk 2; 12min 51s
# 07/12/2018 20:43:52 for chunk 3; 9min 39s
# 07/12/2018 21:08:55 for chunk 4; 10min 29s

# 07/13/2018 08:24:38 for chunk 6; 6min 4s
# 07/13/2018 08:34:04 for chunk 7; 5min 59s
# 07/13/2018 08:46:25 for chunk 8; 6min 1s
# 07/13/2018 09:33:52 for chunk 9; 6min 4s
# 07/13/2018 09:41:10 for chunk 10; 6min 2s
# 07/13/2018 09:55:42 for chunk 11; 6min 3s
# 07/13/2018 10:05:18 for chunk 12; 6min 17s
# 07/13/2018 10:57:08 for chunk 13; 6min 14s
#There was an error making this query.
#07/13/2018 11:37:01
# 07/13/2018 12:48:08 for chunk 14; 6min 5s
# 07/13/2018 13:10:41 for chunk 15; 7min
# 07/13/2018 13:25:38 for chunk 16; 6min 7s
# 07/13/2018 13:32:24 for chunk 17; 6min 23s
# 07/13/2018 13:54:08 for chunk 18; 6min
# 07/13/2018 14:14:47 for chunk 19; 4min 44s

# we are now up to 19.

07/13/2018 14:54:04
CPU times: user 25.7 s, sys: 1.74 s, total: 27.5 s
Wall time: 6min 12s


In [78]:
# Save the freshly downloaded chunk (x) to disk.
with open('cdf5.pkl', 'wb') as pkl_out:
    pickle.dump(x, pkl_out)

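ACS data-profile variable list (1-year `acs1` endpoint; the queries above use the 5-year `acs5` endpoint): https://api.census.gov/data/2016/acs/acs1/profile/variables.html

ACS data-profile dataset overview (1-year `acs1` endpoint): https://api.census.gov/data/2016/acs/acs1/profile.html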

https://www26.state.nj.us/doh-shad/sharedstatic/StandardizedMortalityRatio.pdf

In [52]:
# Load a previously saved census frame. 'c.pkl' is not written by any cell
# shown in this notebook, so it must already exist on disk.
with open('c.pkl', 'rb') as pkl_in:
    c = pickle.load(pkl_in)

In [43]:
# Read back the per-chunk pickles cdf6.pkl .. cdf19.pkl, in chunk order.
list_of_dfs = []

for chunk_idx in range(6, 20):
    chunk_path = 'cdf{}.pkl'.format(chunk_idx)
    with open(chunk_path, 'rb') as pkl_in:
        list_of_dfs.append(pickle.load(pkl_in))

In [45]:
# Stack the chunk frames into one table. ignore_index=True renumbers the rows
# 0..n-1; without it every chunk's own row labels repeat, so label-based
# lookups (.loc) would return several rows.
c_6_19 = pd.concat(list_of_dfs, ignore_index=True)

In [46]:
c_6_19.head()

Unnamed: 0,INCITS,median_age,percent_white,hs_percent,median_hh_income,percent_unemployed,percent_poverty
0,20145,43.7,90.4,89.3,46227,0.6,4.0
1,20147,44.9,96.5,93.6,45940,2.0,7.2
2,20149,34.8,94.0,95.4,62500,2.9,6.2
3,20151,38.8,94.7,90.6,48995,3.2,9.7
4,20153,50.3,96.4,93.7,49327,1.4,5.4


In [47]:
c_6_19.shape

(2182, 7)

In [50]:
# Save the combined frame for chunks 6-19 to disk.
with open('c_6_19.pkl', 'wb') as pkl_out:
    pickle.dump(c_6_19, pkl_out)

In [53]:
c.shape

(800, 7)

In [55]:
# Combine the previously saved frame (c) with the chunk 6-19 frame.
# ignore_index=True gives the result a unique 0..n-1 row index instead of
# carrying over each input's repeated row labels.
census_master_df = pd.concat([c, c_6_19], ignore_index=True)

In [56]:
census_master_df.head()

Unnamed: 0,INCITS,median_age,percent_white,hs_percent,median_hh_income,percent_unemployed,percent_poverty
0,1001,37.8,76.9,87.6,53099,3.4,9.4
1,1003,42.3,86.4,90.0,51365,3.7,9.3
2,1005,38.7,46.7,73.8,33956,6.2,20.0
3,1007,40.2,77.0,80.7,39776,3.5,11.7
4,1009,40.8,95.4,80.0,46212,3.0,12.2


In [57]:
county_df.shape

(3142, 4)

In [71]:
# INCITS codes present in county_df but absent from the census table so far.
expected_incits = set(county_df.INCITS)
fetched_incits = set(census_master_df.INCITS)
missing_incits = list(expected_incits - fetched_incits)

In [None]:
sorted(missing_incits)

In [76]:
list_county_dfs[5].head()

Unnamed: 0,state_code,county_code,name,INCITS
800,19,23,Butler County,19023
801,19,25,Calhoun County,19025
802,19,27,Carroll County,19027
803,19,29,Cass County,19029
804,19,31,Cedar County,19031


In [79]:
# Assemble the master table from its three source pieces instead of appending
# to census_master_df in place, so re-running this cell cannot add chunk 5 (x)
# a second time. Row order is unchanged: c, then c_6_19, then x.
# ignore_index=True gives a unique 0..n-1 row index rather than repeating each
# piece's own row labels.
census_master_df = pd.concat([c, c_6_19, x], ignore_index=True)

In [80]:
census_master_df.shape

(3142, 7)

In [82]:
# Save the complete census table to disk.
with open('census_master_df.pkl', 'wb') as pkl_out:
    pickle.dump(census_master_df, pkl_out)