In [1]:
import pandas as pd, numpy as np
import pathlib
import subprocess

! whoami
! date

zmbc
Wed Oct 26 15:39:56 PDT 2022


# Download PUMA -> MIGPUMA crosswalk

In [2]:
raw_data_dir = f'../data/raw/migpuma_crosswalk'
path = pathlib.Path(raw_data_dir)
path.mkdir(parents=True, exist_ok=True)

In [3]:
# 2010 Migration PUMA Definitions from IPUMS
# Data file linked from https://usa.ipums.org/usa/volii/10migpuma.shtml
subprocess.run(["wget", f"https://usa.ipums.org/usa/resources/volii/puma_migpuma1_pwpuma00.xls", "-P", raw_data_dir, "--progress=bar:force:noscroll"])

--2022-10-26 15:39:56--  https://usa.ipums.org/usa/resources/volii/puma_migpuma1_pwpuma00.xls
Resolving usa.ipums.org (usa.ipums.org)... 128.101.163.136
Connecting to usa.ipums.org (usa.ipums.org)|128.101.163.136|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 229888 (224K) [application/vnd.ms-excel]
Saving to: ‘../data/raw/migpuma_crosswalk/puma_migpuma1_pwpuma00.xls.2’


2022-10-26 15:39:57 (1.63 MB/s) - ‘../data/raw/migpuma_crosswalk/puma_migpuma1_pwpuma00.xls.2’ saved [229888/229888]



CompletedProcess(args=['wget', 'https://usa.ipums.org/usa/resources/volii/puma_migpuma1_pwpuma00.xls', '-P', '../data/raw/migpuma_crosswalk', '--progress=bar:force:noscroll'], returncode=0)

In [4]:
df = pd.read_excel(pathlib.Path(raw_data_dir) / 'puma_migpuma1_pwpuma00.xls', skiprows=2, skipfooter=5)

In [5]:
# MIGPUMAs do not cross state lines
assert np.all(df['State of Residence (ST)'] == df['Place of Work State (PWSTATE2) or Migration State (MIGPLAC1)'])
# Each PUMA is in a single MIGPUMA
assert len(df) == len(df[['State of Residence (ST)', 'PUMA']].drop_duplicates())

In [6]:
df = df.rename(columns={'State of Residence (ST)': 'ST', 'Place of Work State (PWSTATE2) or Migration State (MIGPLAC1)': 'MIGSP', 'PWPUMA00 or MIGPUMA1': 'MIGPUMA'})
df

Unnamed: 0,ST,PUMA,MIGSP,MIGPUMA
0,1,100,1,190
1,1,200,1,290
2,1,301,1,290
3,1,302,1,290
4,1,400,1,400
...,...,...,...,...
2373,72,902,72,900
2374,72,1001,72,1001
2375,72,1002,72,1002
2376,72,1101,72,1101


## Investigate number of PUMAs per MIGPUMA

In [7]:
df.assign(full_migpuma=lambda x: (x['MIGSP'] * 10_000) + x['MIGPUMA']).full_migpuma.value_counts().sort_values()

561105     1
191800     1
190500     1
190600     1
190700     1
          ..
173400    34
40100     35
250390    38
484600    38
63700     69
Name: full_migpuma, Length: 984, dtype: int64

In [8]:
df.assign(full_migpuma=lambda x: (x['MIGSP'] * 10_000) + x['MIGPUMA']).full_migpuma.value_counts().value_counts()

1     643
2     129
3      61
4      38
5      25
6      19
7      18
8      10
9       7
10      6
13      4
16      3
14      3
11      3
12      2
38      2
15      2
18      2
22      2
25      1
26      1
34      1
35      1
69      1
Name: full_migpuma, dtype: int64

## Save data

In [9]:
df.to_hdf('../data/puma_to_migpuma.hdf', key='puma_to_migpuma', mode='w')