In [1]:
import pandas as pd, numpy as np
import zipfile
import glob
import pathlib
import shutil
import subprocess

# Record who ran this notebook and when, as provenance for the outputs below.
! whoami
! date

zmbc
Mon Oct  3 14:30:50 PDT 2022


# Download ACS PUMS data

In [2]:
# Maps each PUMS file type to the base name (without the ".zip" extension) of
# the archive that download_and_extract fetches and unpacks for it.
FILE_NAMES = dict(
    person='csv_pus',
    household='csv_hus',
)

def download_and_extract(year, time_span_years, redownload=False):
    """Download one ACS PUMS release and convert it to one HDF file per file type.

    For the household file and then the person file, this:

    1. downloads the zipped CSVs from the Census website with ``wget``, unless
       a cached zip already exists under ``../data/raw/acs_{year}_{time_span_years}yr``;
    2. extracts the zip into a freshly created ``*_extracted`` directory;
    3. concatenates every extracted ``*.csv`` (in sorted file-name order);
    4. writes the result to
       ``../data/acs_{year}_{time_span_years}yr_{file_type}.hdf`` under key ``acs``.

    Parameters
    ----------
    year
        Release year; interpolated into the download URL and the output names.
    time_span_years
        Number of pooled years of the release; interpolated into the
        ``{time_span_years}-Year`` URL segment and the output names.
    redownload : bool, default False
        If True, fetch the zip again and replace any cached copy. If False,
        an existing zip is reused.

    Raises
    ------
    subprocess.CalledProcessError
        If ``wget`` exits with a non-zero status (e.g. the URL does not exist).
    zipfile.BadZipFile
        If the downloaded archive cannot be opened as a zip file.
    ValueError
        If the archive contains no top-level ``*.csv`` files.
    """
    short_name = f'acs_{year}_{time_span_years}yr'
    raw_data_dir = f'../data/raw/{short_name}'
    pathlib.Path(raw_data_dir).mkdir(parents=True, exist_ok=True)

    for file_type in ['household', 'person']:
        print(f'{file_type} file')
        source_file_name = FILE_NAMES[file_type]
        zip_path = pathlib.Path(f'{raw_data_dir}/{source_file_name}.zip')

        # Read identifier/code columns as strings so pandas does not try to
        # infer a numeric dtype for them.
        dtypes = {"RT": "object", "SERIALNO": "object"}
        if file_type == 'person':
            dtypes.update({"NAICSP": "object", "SOCP": "object"})

        if redownload or not zip_path.is_file():
            print('downloading')
            url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/{time_span_years}-Year/{source_file_name}.zip"
            # Download to a temporary name and rename only on success:
            #  * `wget -P` never overwrites an existing file (it writes
            #    "<name>.zip.1" instead), so redownload=True used to keep
            #    extracting the stale zip; `-O` writes exactly where we ask.
            #  * a failed or interrupted download must not leave a truncated
            #    zip that later runs would mistake for a valid cache.
            partial_path = zip_path.with_name(zip_path.name + '.part')
            try:
                subprocess.run(
                    ["wget", url, "-O", str(partial_path), "--progress=bar:force:noscroll"],
                    check=True,  # surface download failures instead of continuing
                )
            except BaseException:
                # Also covers KeyboardInterrupt from an interrupted notebook cell.
                if partial_path.exists():
                    partial_path.unlink()
                raise
            partial_path.replace(zip_path)

        print('extracting')
        extraction_path = pathlib.Path(f'{raw_data_dir}/{source_file_name}_extracted')
        # Start from an empty directory so files from an earlier extraction
        # cannot be picked up by the glob below.
        if extraction_path.is_dir():
            shutil.rmtree(extraction_path)
        extraction_path.mkdir()

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extraction_path)

        print('combining data')
        # glob returns files in arbitrary order; sort so the row order of the
        # combined file is reproducible across runs and machines.
        csv_paths = sorted(glob.glob(f'{extraction_path}/*.csv'))
        if not csv_paths:
            raise ValueError(f'no CSV files found in {extraction_path} after extracting {zip_path}')
        df = pd.concat([pd.read_csv(fname, dtype=dtypes) for fname in csv_paths])
        print(f'writing file with {len(df):,} {file_type} rows')
        df.to_hdf(f'../data/{short_name}_{file_type}.hdf', key='acs', mode='w')
        # Release the combined frame before loading the next file type.
        del df

## 5-year data

This is what's loaded into the simulation, and what I expect we will use for most research tasks.

Note that the 2018 and 2019 5-year person (and possibly household) files on the Census website are malformed (do not unzip successfully).

In [3]:
%%time

# Build the 2020 5-year extract. An existing zip under ../data/raw/ is reused
# because redownload defaults to False.
download_and_extract(year=2020, time_span_years=5)

household file
extracting
combining data
writing file with 7,426,541 household rows
person file
extracting
combining data
writing file with 15,441,673 person rows


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['RT', 'SERIALNO', 'NAICSP', 'SOCP'], dtype='object')]

  encoding=encoding,


CPU times: user 9min 50s, sys: 4min 10s, total: 14min 1s
Wall time: 14min 42s


## 1-year data

We may use this to answer questions where a rapid time trend makes pooling multiple years of data inappropriate, provided that a single year of data gives a sufficient sample size.

Note that the 2020 version of this file is experimental (not officially released by Census as part of the standard data series).

In [4]:
%%time

# Build the 2019 1-year extract. An existing zip under ../data/raw/ is reused
# because redownload defaults to False.
download_and_extract(year=2019, time_span_years=1)

household file
extracting
combining data
writing file with 1,548,188 household rows
person file
extracting
combining data
writing file with 3,239,553 person rows
CPU times: user 2min 15s, sys: 37.4 s, total: 2min 52s
Wall time: 3min
