In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
# For finding and loading multiple files
import os
# Use this to see how much memory the dataframes use
from sys import getsizeof
# To view keyspace.yaml
import yaml

import pseudopeople as psp

!date
!whoami
!uname -a
!python --version
!pwd

Tue 01 Aug 2023 02:40:37 PM PDT
ndbs
Linux int-slurm-sarchive-p0002 5.4.0-135-generic #152-Ubuntu SMP Wed Nov 23 20:19:22 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
Python 3.10.12
/mnt/share/code/ndbs/vivarium_research_prl/model_validation


In [33]:
def sizemb(x):
    """Return the size of `x` in megabytes (1 MB = 1e6 bytes), as reported by `sys.getsizeof`."""
    return getsizeof(x) / 1e6

# See how many SSA events we can fit in a 64-bit int

Suppose we have a max of 10 million simulants per shard and a max of 1000 events per simulant, which should be more than sufficient. That's a max of $10^7 \times 10^3 = 10^{10}$ events per shard (i.e., 10 billion SSA events per shard). Since a 64-bit integer has $2^{64} \approx 1.8 \times 10^{19}$ distinct values, that leaves room for more than $1.8 \times 10^{19} / 10^{10} = 1.8 \times 10^9$ shards (i.e., 1.8 billion shards), which again should be more than sufficient.

In [3]:
# 2**64 is the number of distinct values a 64-bit integer can represent
M = 2**64
M

18446744073709551616

In [7]:
# Number of decimal digits in M
len(str(M))

20

In [10]:
# Number of shards that fit if each shard holds 10 * 10e6 = 1e8 simulants
# (100 million -- note that 10e6 is 1e7, not 1e6) with 1,000 events each,
# i.e. 1e11 event IDs per shard.
M / (10*10e6 * 1000)

184467440.7370955

In [11]:
# Check how close the approximation 1.8e19 is to 2**64 (a ratio near 1 means it is close)
M / 1.8e19

1.0248191152060862

In [14]:
# Number of shards that fit if each shard holds 100 * 1e6 = 1e8 simulants
# (100 million) with 10,000 events each, i.e. 1e12 event IDs per shard.
M / (100*1e6 * 10_000)

18446744.07370955

In [13]:
# Float-literal check: 10e6 means 10 x 10^6 (ten million), not one million
10e6

10000000.0

In [15]:
# Float-literal check: 1e4 is ten thousand
1e4

10000.0

In [16]:
# Confirm that the float literal 1e8 equals one hundred million
1e8 == 100_000_000

True

# Find data

In [17]:
# Base directory of the project on the shared drive
project_dir = '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'

# Directory of one specific (timestamped) model run under the project's results tree
model_dir = '/'.join([
    project_dir,
    'results/release_02_yellow/full_data',
    'united_states_of_america/2023_07_28_08_33_09',
])

# Subdirectory of that run's final results that holds the per-dataset folders
output_dir = '/'.join([
    model_dir,
    'final_results',
    '2023_07_31_08_59_48',
    'pseudopeople_input_data_usa_0.0.2',
])

!ls -halt $output_dir

total 360K
drwxrwsr-x  4 sbachmei IHME-Simulationscience 1.0K Jul 31 10:34 ..
-rw-rw-r--  1 sbachmei IHME-Simulationscience 1.6K Jul 31 10:18 CHANGELOG.rst
drwxrwsr-x 11 sbachmei IHME-Simulationscience 5.0K Jul 31 10:18 .
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Jul 31 10:17 taxes_dependents
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Jul 31 10:15 taxes_1040
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Jul 31 10:07 taxes_w2_and_1099
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Jul 31 09:55 social_security
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Jul 31 09:54 women_infants_and_children
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Jul 31 09:54 current_population_survey
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Jul 31 09:54 american_community_survey
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Jul 31 09:54 decennial_census
drwxrwsr-x  2 sbachmei IHME-Simulationscience 334K Jul 31 09:00 logs


In [18]:
!ls -halt $model_dir

total 1.2M
drwxrwsr-x  4 sbachmei IHME-Simulationscience 2.0K Aug  1 10:52 final_results
drwxrwsr-x  5 sbachmei IHME-Simulationscience 2.5K Jul 31 10:18 ..
drwxrwsr-x  6 sbachmei IHME-Simulationscience 5.0K Jul 31 08:59 .
-rw-rw-r--  1 sbachmei IHME-Simulationscience 1.2M Jul 29 06:01 output.hdf
drwxrwsr-x 10 sbachmei IHME-Simulationscience 4.0K Jul 28 17:09 raw_results
drwxrwsr-x  2 sbachmei IHME-Simulationscience  512 Jul 28 08:33 __pycache__
-rw-rw-r--  1 sbachmei IHME-Simulationscience  110 Jul 28 08:33 settings.py
-rw-rw-r--  1 sbachmei IHME-Simulationscience 1.9K Jul 28 08:33 model_specification.yaml
-rw-rw-r--  1 sbachmei IHME-Simulationscience   50 Jul 28 08:33 branches.yaml
-rw-rw-r--  1 sbachmei IHME-Simulationscience 2.4K Jul 28 08:33 keyspace.yaml
-rw-rw-r--  1 sbachmei IHME-Simulationscience 4.2K Jul 28 08:33 requirements.txt
drwxrwsr-x  3 sbachmei IHME-Simulationscience  512 Jul 28 08:33 logs


# Pick 3 seeds and define directories

In [19]:
!less $model_dir/keyspace.yaml # Had to interrupt process with notebook's stop button

input_draw:1h=
- 602
placeholder_branch_name.scenario:
- baseline
random_seed:
- 4344
- 5616
- 6810
- 2787
- 2284
- 4369
- 3254
- 7359
- 3541
- 5020
- 9672
- 8869
- 3167
- 6817
- 4507
- 9901
- 5949
- 6545
[K:[Kyellow/full_data/united_states_of_america/2023_07_28_08_33_09/keyspace.yaml[m[K

In [25]:
# The first three random seeds listed in keyspace.yaml; data files are named by seed
seeds = [4344, 5616, 6810]

# Per-dataset directories under the pseudopeople input data directory
census_dir = output_dir + '/decennial_census'
w2_dir = output_dir + '/taxes_w2_and_1099'

# Load 3 shards of W2/1099 data

In [26]:
w2 = {}
for seed in seeds:
    %time w2[seed] = pd.read_parquet(f'{w2_dir}/taxes_w2_and_1099_{seed}.parquet')
    print(getsizeof(w2[seed]) / 1e6, 'MB')
w2.keys()

CPU times: user 12.2 s, sys: 5.86 s, total: 18.1 s
Wall time: 18.1 s
3342.734353 MB
CPU times: user 12 s, sys: 6.09 s, total: 18.1 s
Wall time: 17.8 s
3340.263612 MB
CPU times: user 12.1 s, sys: 5.83 s, total: 17.9 s
Wall time: 18 s
3340.91473 MB


dict_keys([4344, 5616, 6810])

In [28]:
w2[4344].dtypes

mailing_address_city                   category
age                                       int64
employer_id                               int64
ssn                                    category
mailing_address_state                  category
wages                                     int64
mailing_address_street_name            category
employer_street_number                 category
date_of_birth                    datetime64[ns]
tax_form                               category
employer_unit_number                   category
mailing_address_zipcode                category
first_name                             category
mailing_address_unit_number            category
employer_city                          category
simulant_id                            category
mailing_address_po_box                  float64
employer_state                         category
middle_initial                         category
household_id                           category
tax_year                                

In [29]:
w2[4344]

Unnamed: 0,mailing_address_city,age,employer_id,ssn,mailing_address_state,wages,mailing_address_street_name,employer_street_number,date_of_birth,tax_form,...,household_id,tax_year,employer_street_name,employer_zipcode,mailing_address_street_number,copy_ssn,employer_name,copy_date_of_birth,last_name,copy_age
0,indianapolis,25,1325244,387-19-0776,IN,35506,emerald pointe circle,,1994-03-11,W2,...,4344_6,2019,edgecliff ct,64056,802,,Mike's Auto Collison & Pet Care,NaT,Rushing,
1,unincorporated,33,72861,750-40-2202,FL,49729,ximeno ave,309,1986-06-29,W2,...,4344_7,2019,n 52nd st,91505,1480,501-50-3083,Calvary Church of Starbucks,1983-07-23,Zagata,36
2,unincorporated,36,830369,761-74-3951,FL,23852,ximeno ave,1960,1983-07-23,W2,...,4344_7,2019,skyview ter,33189,1480,501-50-3083,Jordyn's Deli and Ranch & Beyond Juicery,2018-12-04,Zagata,33
3,murfreesboro,80,1240454,847-91-3259,TN,143691,se royal hls dr,e,1939-12-04,W2,...,4344_8,2019,ince dr,61559,3921,204-42-2959,Advance Auto Inc,1940-05-09,Wade,79
4,murfreesboro,79,1080048,204-42-2959,TN,92228,se royal hls dr,,1940-05-09,W2,...,4344_8,2019,stoney crk cir,17517,3921,847-91-3259,Valley Tennis Of Ramen Factory UAE,1939-12-04,Wade,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21955987,linden,28,580412,805-89-1340,NJ,952,colony club drive,23218,2012-07-02,W2,...,4344_868663,2040,cricket hllw dr,32205,305,,Gresock & Collectibles,NaT,Daniels,
21955988,gilroy,38,1648705,609-56-3084,CA,8863,north gardiner dv,1550,2002-07-01,W2,...,4344_868664,2040,henry st ne,14020,,884-77-4825,Mo Atunrase Ebby Halliday Realtors,2005-10-27,Rascher,6
21955989,birmingham,29,1399604,042-58-2230,AL,3942,camino del norte,,2011-04-06,W2,...,4344_868667,2040,radcliffe lane,77077,3001,578-28-3212,Sheriff's Office Depot,2040-12-05,Saba,0
21955990,scarborough,34,1051108,304-01-5654,ME,6313,juniper road,3405,2006-03-29,W2,...,4344_868668,2040,doubleday pl,90815,523,611-24-6133,Piers,2005-12-11,Le,6


In [30]:
w2[4344].simulant_id

0                 4344_0
1                 4344_1
2                 4344_2
3                 4344_4
4                 4344_5
                ...     
21955987    4344_1385631
21955988    4344_1385632
21955989    4344_1385639
21955990    4344_1385640
21955991    4344_1385641
Name: simulant_id, Length: 21955992, dtype: category
Categories (1134798, object): ['4344_0', '4344_1', '4344_10', '4344_100', ..., '4344_999996', '4344_999997', '4344_999998', '4344_999999']

In [31]:
%%time
# Print the number of missing values in each column, separately for every loaded shard
for df in w2.values():
    print(df.isna().sum(),'\n')

mailing_address_city                    0
age                                     0
employer_id                             0
ssn                                     0
mailing_address_state                   0
wages                                   0
mailing_address_street_name             0
employer_street_number             963812
date_of_birth                           0
tax_form                                0
employer_unit_number             20752387
mailing_address_zipcode                 0
first_name                              0
mailing_address_unit_number      20172695
employer_city                           0
simulant_id                             0
mailing_address_po_box           21250716
employer_state                          0
middle_initial                          0
household_id                            0
tax_year                                0
employer_street_name                    0
employer_zipcode                        0
mailing_address_street_number     

# Concatenate 3 W2 shards into one dataframe

In [32]:
%%time
%time df_w2 = pd.concat(w2)#, ignore_index=True)
df_w2

CPU times: user 55.3 s, sys: 13.7 s, total: 1min 8s
Wall time: 1min 8s
CPU times: user 55.3 s, sys: 13.7 s, total: 1min 8s
Wall time: 1min 8s


Unnamed: 0,Unnamed: 1,mailing_address_city,age,employer_id,ssn,mailing_address_state,wages,mailing_address_street_name,employer_street_number,date_of_birth,tax_form,...,household_id,tax_year,employer_street_name,employer_zipcode,mailing_address_street_number,copy_ssn,employer_name,copy_date_of_birth,last_name,copy_age
4344,0,indianapolis,25,1325244,387-19-0776,IN,35506,emerald pointe circle,,1994-03-11,W2,...,4344_6,2019,edgecliff ct,64056,802,,Mike's Auto Collison & Pet Care,NaT,Rushing,
4344,1,unincorporated,33,72861,750-40-2202,FL,49729,ximeno ave,309,1986-06-29,W2,...,4344_7,2019,n 52nd st,91505,1480,501-50-3083,Calvary Church of Starbucks,1983-07-23,Zagata,36
4344,2,unincorporated,36,830369,761-74-3951,FL,23852,ximeno ave,1960,1983-07-23,W2,...,4344_7,2019,skyview ter,33189,1480,501-50-3083,Jordyn's Deli and Ranch & Beyond Juicery,2018-12-04,Zagata,33
4344,3,murfreesboro,80,1240454,847-91-3259,TN,143691,se royal hls dr,e,1939-12-04,W2,...,4344_8,2019,ince dr,61559,3921,204-42-2959,Advance Auto Inc,1940-05-09,Wade,79
4344,4,murfreesboro,79,1080048,204-42-2959,TN,92228,se royal hls dr,,1940-05-09,W2,...,4344_8,2019,stoney crk cir,17517,3921,847-91-3259,Valley Tennis Of Ramen Factory UAE,1939-12-04,Wade,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6810,21938913,seaside,80,1294165,888-36-7637,CA,4999,wst myrtle aven,2501,1960-02-22,W2,...,6810_868964,2040,state highway 7,24549,3351,,Imoh Oton,NaT,Funes,
6810,21938914,monroe,31,404350,172-46-7125,MI,1498,chapel stree,8415,2009-09-12,W2,...,6810_868965,2040,old peak ln,77357,11418,275-31-4705,Mike Tinder,2007-11-27,Nguyen,33
6810,21938915,monroe,30,141363,275-31-4705,MI,12102,chapel stree,2643,2010-07-08,1099,...,6810_868965,2040,martin luther king jr ave,45715,11418,172-46-7125,Brookdale Senior Living Landscape,2007-11-27,Teja,33
6810,21938916,grand blanc,27,509803,845-69-9165,MI,5438,n forbes rd,1258,2013-01-03,W2,...,6810_868966,2040,fox squirrel dr,22042,3354,,Boris DDS,NaT,Dang,


In [35]:
%time sizemb(df_w2) # 77.3 GB

CPU times: user 1min 45s, sys: 3.14 s, total: 1min 48s
Wall time: 1min 48s


77331.184156

In [36]:
df_w2.dtypes

mailing_address_city                     object
age                                       int64
employer_id                               int64
ssn                                      object
mailing_address_state                  category
wages                                     int64
mailing_address_street_name              object
employer_street_number                   object
date_of_birth                    datetime64[ns]
tax_form                               category
employer_unit_number                     object
mailing_address_zipcode                  object
first_name                               object
mailing_address_unit_number              object
employer_city                            object
simulant_id                              object
mailing_address_po_box                  float64
employer_state                         category
middle_initial                         category
household_id                             object
tax_year                                

# See whether employer addresses match across shards

In [38]:
%%time
emp1 = df_w2.query("employer_id == 72861")
emp1

CPU times: user 857 ms, sys: 923 ms, total: 1.78 s
Wall time: 1.78 s


Unnamed: 0,Unnamed: 1,mailing_address_city,age,employer_id,ssn,mailing_address_state,wages,mailing_address_street_name,employer_street_number,date_of_birth,tax_form,...,household_id,tax_year,employer_street_name,employer_zipcode,mailing_address_street_number,copy_ssn,employer_name,copy_date_of_birth,last_name,copy_age
4344,1,unincorporated,33,72861,750-40-2202,FL,49729,ximeno ave,309,1986-06-29,W2,...,4344_7,2019,n 52nd st,91505,1480,501-50-3083,Calvary Church of Starbucks,1983-07-23,Zagata,36
4344,560042,rochester,29,72861,720-22-6829,NY,3967,e hammond lake dr,309,1990-01-07,W2,...,4344_239181,2019,n 52nd st,91505,375,291-35-9361,Calvary Church of Starbucks,1988-11-08,Batista,3
4344,952034,unincorporated,34,72861,750-40-2202,FL,28416,ximeno ave,309,1986-06-29,W2,...,4344_7,2020,n 52nd st,91505,1480,766-82-2875,Calvary Church of Starbucks,1983-07-23,Zagata,29
4344,992731,san jose,66,72861,419-62-1420,CA,34418,west 12th street,309,1954-08-02,W2,...,4344_17558,2020,n 52nd st,91505,1230,642-91-4073,Calvary Church of Starbucks,1997-10-09,Ouyang,32
4344,1507276,rochester,30,72861,720-22-6829,NY,74075,e hammond lake dr,309,1990-01-07,W2,...,4344_239181,2020,n 52nd st,91505,375,856-03-1138,Calvary Church of Starbucks,2016-06-12,Batista,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6810,19187875,zionsville,32,72861,357-03-4500,IN,86179,delaware ci,7004,2006-05-09,W2,...,6810_62663,2038,braesgate dr,14781,1312,287-75-0797,Calvary Church of Starbucks,2020-11-24,Smith,18
6810,20065955,barrington,36,72861,400-19-9184,RI,34157,willow oak raod,7004,2003-07-23,W2,...,6810_759268,2039,braesgate dr,14781,1812,,Calvary Church of Starbucks,NaT,Dykema,
6810,20440049,ridgeland,39,72861,299-29-9080,OH,11944,woodview dve,7004,2000-10-10,W2,...,6810_724349,2039,braesgate dr,14781,264,645-40-7437,Calvary Church of Starbucks,2034-11-01,Mayol,5
6810,21081723,barrington,37,72861,400-19-9184,RI,34157,willow oak raod,7004,2003-07-23,W2,...,6810_759268,2040,braesgate dr,14781,1812,,Calvary Church of Starbucks,NaT,Dykema,


In [40]:
# Tax-year-2020 rows for this employer, keeping only columns whose name matches 'employer'
emp1.query("tax_year==2020").filter(regex='employer')

Unnamed: 0,Unnamed: 1,employer_id,employer_street_number,employer_unit_number,employer_city,employer_state,employer_street_name,employer_zipcode,employer_name
4344,952034,72861,309,,berkeley,CA,n 52nd st,91505,Calvary Church of Starbucks
4344,992731,72861,309,,berkeley,CA,n 52nd st,91505,Calvary Church of Starbucks
4344,1507276,72861,309,,berkeley,CA,n 52nd st,91505,Calvary Church of Starbucks
6810,1440885,72861,338,,rancho cordova,CA,hunt meet circle,91505,Calvary Church of Starbucks


In [41]:
# Tax-year-2021 rows for this employer, keeping only columns whose name matches 'employer'
emp1.query("tax_year==2021").filter(regex='employer')

Unnamed: 0,Unnamed: 1,employer_id,employer_street_number,employer_unit_number,employer_city,employer_state,employer_street_name,employer_zipcode,employer_name
4344,1942864,72861,309,,berkeley,CA,n 52nd st,91505,Calvary Church of Starbucks
4344,2462672,72861,309,,berkeley,CA,n 52nd st,91505,Calvary Church of Starbucks
5616,2339588,72861,7,,sta rosa,CA,forest hills dr,90068,Calvary Church of Starbucks
6810,2053691,72861,338,,rancho cordova,CA,hunt meet circle,91505,Calvary Church of Starbucks
6810,2395911,72861,338,,rancho cordova,CA,hunt meet circle,91505,Calvary Church of Starbucks


In [43]:
# Tax-year-2040 rows for this employer, keeping only columns whose name matches 'employer'
emp1.query("tax_year==2040").filter(regex='employer')

Unnamed: 0,Unnamed: 1,employer_id,employer_street_number,employer_unit_number,employer_city,employer_state,employer_street_name,employer_zipcode,employer_name
4344,21578819,72861,23359,,brookhaven,NY,blue lake dr,14062,Calvary Church of Starbucks
5616,21754527,72861,3000,,new york,NY,george st,14048,Calvary Church of Starbucks
6810,21081723,72861,7004,,new york,NY,braesgate dr,14781,Calvary Church of Starbucks
6810,21448433,72861,7004,,new york,NY,braesgate dr,14781,Calvary Church of Starbucks


In [44]:
# Tax-year-2022 rows for this employer, keeping only columns whose name matches 'employer'
emp1.query("tax_year==2022").filter(regex='employer')

Unnamed: 0,Unnamed: 1,employer_id,employer_street_number,employer_unit_number,employer_city,employer_state,employer_street_name,employer_zipcode,employer_name
4344,2907096,72861,309,,berkeley,CA,n 52nd st,91505,Calvary Church of Starbucks
4344,3428807,72861,309,,berkeley,CA,n 52nd st,91505,Calvary Church of Starbucks
5616,2954977,72861,7,,sta rosa,CA,forest hills dr,90068,Calvary Church of Starbucks
5616,2966801,72861,7,,sta rosa,CA,forest hills dr,90068,Calvary Church of Starbucks
5616,3048144,72861,7,,sta rosa,CA,forest hills dr,90068,Calvary Church of Starbucks
5616,3211546,72861,7,,sta rosa,CA,forest hills dr,90068,Calvary Church of Starbucks
5616,3303269,72861,7,,sta rosa,CA,forest hills dr,90068,Calvary Church of Starbucks
6810,3018201,72861,338,,rancho cordova,CA,hunt meet circle,91505,Calvary Church of Starbucks
