In [1]:
import pandas as pd, numpy as np
import seaborn as sns
from numpy.random import default_rng
import scipy.stats as stats

# When pandas truncates a long frame/series for display, show 20 rows
# instead of the default handful.
pd.set_option('display.min_rows', 20)

# Provenance: record which user ran this notebook and when (IPython shell escapes).
! whoami
! date

zmbc
Tue Jan  2 10:19:42 AM PST 2024


In [2]:
# Directory holding the input data for this notebook.
# NOTE(review): this is an absolute, user-specific path; the notebook only runs
# where this location is mounted and readable.
data_dir = '/ihme/scratch/users/zmbc/vivarium_research_prl/migration/data'
# Load the table stored under the 'acs' key of the HDF file into a DataFrame.
acs = pd.read_hdf(f'{data_dir}/acs_2020_5yr_household.hdf', key='acs')

In [3]:
# Number of records loaded, before any filtering.
acs.shape[0]

7426541

In [4]:
# Preview the records with NP == 0, i.e. households with nobody in them.
# The simulation does not consider such households.
acs.loc[acs['NP'] == 0]

Unnamed: 0,RT,SERIALNO,DIVISION,PUMA,REGION,ST,ADJHSG,ADJINC,WGTP,NP,...,WGTP71,WGTP72,WGTP73,WGTP74,WGTP75,WGTP76,WGTP77,WGTP78,WGTP79,WGTP80
4,H,2016000000123,2,500,1,42,1078664,1086849,15,0,...,4,25,18,17,3,21,16,14,23,4
5,H,2016000000187,2,1300,1,42,1078664,1086849,30,0,...,50,39,10,58,8,28,29,31,31,27
10,H,2016000000252,2,1702,1,42,1078664,1086849,45,0,...,13,47,41,14,44,44,14,43,44,42
15,H,2016000000309,2,1300,1,42,1078664,1086849,9,0,...,7,3,9,14,9,3,3,10,9,8
19,H,2016000000553,2,900,1,42,1078664,1086849,18,0,...,30,5,16,5,19,19,18,5,6,6
20,H,2016000000571,2,300,1,42,1078664,1086849,5,0,...,5,6,1,5,6,8,2,10,9,6
67,H,2016000001852,2,1100,1,42,1078664,1086849,26,0,...,24,42,46,46,23,6,29,46,24,37
92,H,2016000002715,2,4001,1,42,1078664,1086849,11,0,...,10,2,17,11,9,11,10,3,3,10
100,H,2016000002861,2,1701,1,42,1078664,1086849,19,0,...,5,19,34,32,34,20,19,20,20,32
104,H,2016000002932,2,3209,1,42,1078664,1086849,52,0,...,15,93,54,82,52,52,83,54,85,55


In [5]:
# Keep only the records that have at least one person (NP > 0).
acs = acs.loc[acs['NP'] > 0]

In [6]:
# Number of records remaining after the NP > 0 filter.
acs.shape[0]

6807113

In [7]:
# The loaded frame has duplicate index labels (ideally this would be fixed
# upstream in download_acs), so discard the old index and rebuild a fresh one.
# Only the columns used below are kept, which also saves memory.
acs = (
    acs.loc[:, ['SERIALNO', 'TYPEHUGQ', 'ST', 'WGTP']]
    .reset_index(drop=True)
)

In [8]:
# Sanity check: every row must carry its own distinct SERIALNO.
assert acs['SERIALNO'].nunique() == len(acs)

In [9]:
# Largest WGTP value among the remaining records.
acs['WGTP'].max()

2120

In [10]:
# Unweighted number of records with TYPEHUGQ == 1.
len(acs.loc[acs['TYPEHUGQ'] == 1])

6017646

In [11]:
# Total WGTP over TYPEHUGQ == 1 records in ST == 6
# (presumably California by FIPS code -- confirm).
acs.loc[(acs['TYPEHUGQ'] == 1) & (acs['ST'] == 6), 'WGTP'].sum()

13103115

In [12]:
# Total WGTP over all TYPEHUGQ == 1 records, across every state.
acs.loc[acs['TYPEHUGQ'] == 1, 'WGTP'].sum()

122354269

In [13]:
# Per-state sum of WGTP over TYPEHUGQ == 1 records.
# NOTE: despite its name, `state_proportions` holds raw weight totals at this
# point, not proportions. The displayed grand total should match the overall
# WGTP sum for TYPEHUGQ == 1 records.
state_proportions = acs.loc[acs['TYPEHUGQ'] == 1].groupby("ST")['WGTP'].sum()
state_proportions.sum()

122354269

In [14]:
# Target quantity: the share of ACS PUMS *households* located in each state.
# Households are the unit used because only the locations of households are
# independent of one another.
# The GQ population is deliberately left out: its state distribution is known
# to be far off, and that is a separate issue not addressed here.
state_proportions = (
    acs.loc[acs['TYPEHUGQ'] == 1]
    .groupby("ST")['WGTP']
    .sum()
    # Normalize the per-state weight totals so that they sum to 1.
    .pipe(lambda weight_by_state: weight_by_state / weight_by_state.sum())
)
state_proportions

ST
1     0.015435
2     0.002085
4     0.021605
5     0.009567
6     0.107092
8     0.017469
9     0.011323
10    0.003032
11    0.002356
12    0.064822
13    0.031305
15    0.003824
16    0.005307
17    0.039917
18    0.021272
19    0.010412
20    0.009333
21    0.014287
22    0.014319
23    0.004655
24    0.018230
25    0.021634
26    0.032532
27    0.018046
28    0.009126
29    0.019944
30    0.003564
31    0.006266
32    0.009236
33    0.004406
34    0.026743
35    0.006479
36    0.060621
37    0.032950
38    0.002622
39    0.038554
40    0.012207
41    0.013425
42    0.041736
44    0.003390
45    0.016031
46    0.002843
47    0.021572
48    0.080962
49    0.008200
50    0.002148
51    0.026024
53    0.023749
54    0.006001
55    0.019435
56    0.001906
Name: WGTP, dtype: float64

In [15]:
# Number of distinct ST values that received a proportion.
state_proportions.shape[0]

51

In [16]:
import yaml

# Write the per-state household proportions, together with the tolerated
# per-year drift, to the YAML file at the path below.
# NOTE(review): the file is opened in 'w' mode, so any existing content of
# household_structure.yaml is replaced, and the v_and_v_inputs directory must
# already exist relative to the working directory.
with open('v_and_v_inputs/household_structure.yaml', 'w', encoding='utf-8') as f:
    # safe_dump only serializes plain Python types. If a numpy scalar (or any
    # other non-standard object) slips in, it raises here instead of silently
    # writing a python-specific tag that yaml.safe_load could not read back.
    yaml.safe_dump({
        'state_proportions': {
            # Cast explicitly so keys are plain ints and values plain floats,
            # whatever dtypes the pandas index and values happen to carry.
            'states': {int(k): float(v) for k, v in state_proportions.items()},
            # As in domestic migration, we specify a range of multiplicative
            # factors by which we would expect the proportion of the US population
            # in any state to drift for each year of population dynamics applied.
            # This is pretty conservative for big states, but there are some
            # small states that could change rather rapidly.
            'multiplicative_drift_per_year': {
                'lower_bound': 0.75,
                'upper_bound': 1.25,
            },
        },
    }, f, default_flow_style=False)