# 20160110-etl-census-with-python

Related post:  
https://stharrold.github.io/20160110-etl-census-with-python.html

Data documentation:  
https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.2013.html

In [1]:
# Import standard packages.
import collections
import functools
import os
import pdb # Debug with pdb.
import subprocess
import sys
import time
# Import installed packages.
import numpy as np
import pandas as pd
# Import local packages.
# Insert current directory into module search path.
# Autoreload local packages after editing.
# `dsdemos` version: https://github.com/stharrold/dsdemos/releases/tag/v0.0.3
sys.path.insert(0, os.path.join(os.path.curdir, r'dsdemos'))
%reload_ext autoreload
%autoreload 2
import dsdemos as dsd

### Globals

File sources:
* 2013 5-year PUMS data dictionary: [PUMS_Data_Dictionary_2009-2013.txt](http://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMS_Data_Dictionary_2009-2013.txt) (<1&nbsp;MB)
* 2013 5-year PUMS person and housing records for Washington DC:
    * Person records: [csv_pdc.zip](http://www2.census.gov/programs-surveys/acs/data/pums/2013/5-Year/csv_pdc.zip) (5&nbsp;MB compressed, 30&nbsp;MB decompressed)
    * Housing records: [csv_hdc.zip](http://www2.census.gov/programs-surveys/acs/data/pums/2013/5-Year/csv_hdc.zip) (2&nbsp;MB compressed, 13&nbsp;MB decompressed)
* 2013 5-year PUMS estimates for user verification: [pums_estimates_9_13.csv](http://www2.census.gov/programs-surveys/acs/tech_docs/pums/estimates/pums_estimates_9_13.csv) (<1&nbsp;MB)

In [2]:
# File paths
basename = r'20160110-etl-census-with-python'
filename = basename
#path_ipynb = os.path.join(path_static, basename, filename+'.ipynb')
path_disk = os.path.abspath(r'/home/joel/census/')
path_acs = path_disk

#sample constructed by:
#sort -R ss13tot.csv | head -n 200000 > ss13sample.csv
#head -n 1 ss13tot.csv > headers
#cat headers ss13sample.csv > ss13sample_fixed.csv
path_pcsv = os.path.join(path_acs, r'ss13sample_fixed.csv') # 'pdc' = 'person DC'
path_hcsv = os.path.join(path_acs, r'ss13hdc.csv') # 'hdc' = 'housing DC'
path_ecsv = os.path.join(path_acs, r'pums_estimates_9_13.csv')
path_dtxt = os.path.join(path_acs, r'PUMS_Data_Dictionary_2009-2013.txt')

# Weights
pwt = 'PWGTP' # person weight
pwts = [pwt+str(inum) for inum in range(1, 81)]
hwt = 'WGTP' # housing weight
hwts = [hwt+str(inum) for inum in range(1, 81)]

### PUMS data

In [5]:

time_start = time.perf_counter()
for path in [path_pcsv]:
    with open(path) as fobj:
        nlines = sum(1 for _ in fobj)
    with open(path) as fobj:
        first_line = fobj.readline()
        ncols = first_line.count(',')+1
    print("{path}:".format(path=path))
    print("    size (MB)   = {size:.1f}".format(size=os.path.getsize(path)/1e6))
    print("    num lines   = {nlines}".format(nlines=nlines))
    print("    num columns = {ncols}".format(ncols=ncols))
print()

# For ss13pdc.csv, low_memory=False since otherwise pandas raises DtypeWarning.
dfp = pd.read_csv(path_pcsv, low_memory=True)

for (name, df) in [('dfp', dfp)]:
    print("{name} RAM usage (MB) = {mem:.1f}".format(
            name=name, mem=df.memory_usage().sum()/1e6))
time_stop = time.perf_counter()
print()
print("Time elapsed (sec) = {diff:.1f}".format(diff=time_stop-time_start))

`dfp`, `dfh`: Load person and housing records.
/home/joel/census/ss13sample_fixed.csv:
    size (MB)   = 188.4
    num lines   = 200001
    num columns = 283
/home/joel/census/ss13hdc.csv:
    size (MB)   = 13.5
    num lines   = 17501
    num columns = 205

dfp RAM usage (MB) = 452.8
dfh RAM usage (MB) = 28.7

Time elapsed (sec) = 4.9


In [44]:
import numpy as np
import us

state_set = set()
def create_intermediate(sample):
    age = int(sample['AGEP'])
    _mar = sample['MAR']
    _gender = sample['SEX'] 
    _state = "%02d" % sample['ST'] 
    _educ = sample['SCHL']
    
    gender = 'other'
    if _gender==1:
        gender='male'
    if _gender==2:
        gender='female'
    
    educ='error'
    if _educ <=15:
        educ = "no-hs"
    if _educ == 16 or _educ == 17:
        educ = 'hs'
    if _educ >= 18 and _educ <=20:
        educ = 'some-college'
    if _educ == 21:
        educ = 'college'
    if _educ > 21:
        educ = 'post-grad'
    
    race = 'other'
    if sample['RACWHT']==1:
        race = 'white'
    if sample['RACASN']==1:
        race = 'asian'
    if sample['RACBLK']==1:
        race = 'black'
    if sample['RACAIAN']==1:
        race = 'native'
    if sample['RACNH']==1:
        race = 'hawaiian'
            
    state = us.states.lookup(str(_state)).abbr
    if (state=='DC'):
        state = 'VA' #sorry DC 
    state_set.add(state)
    
    mar = None
    if _mar == 1:
     mar = 'married'
    if _mar == 2:
     mar = 'widowed'
    if _mar == 3 or _mar == 4:
     mar = 'divorced'
    if _mar == 5:
     mar = 'single'
    
    return {'age':age,'mar':mar,'state':state,'race':race,'educ':educ,'gender':gender}    

def pull_sample(data,normalized_weights):
    idx = data.index
    sample = np.random.choice(idx,p=normalized_weights)
    return data.loc[sample]

#of voting age; US citizen
US_citizen_filter = np.logical_and(dfp['AGEP']>18,dfp['CIT']!=5)
US_citizen_filter = np.logical_and(US_citizen_filter,dfp['ST']!=72)
filtered_data = dfp[US_citizen_filter]
    
weights = filtered_data[pwt]
normalized_weights = weights/weights.sum()

print(weights.shape)

samples = []
for _ in range(2000):
 sample = pull_sample(filtered_data,normalized_weights)
 intermediate = create_intermediate(sample)
 samples.append(intermediate)
   


(144069,)


In [45]:
import json
outfile = open("census_sample.json","w")
json.dump(samples,outfile)
outfile.close()