In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 50)

import os
# Use this to see how much memory the dataframes use
from sys import getsizeof

import yaml

import pseudopeople as pp
from pseudopeople.utilities import get_configuration
from vivarium.framework.randomness import RandomnessStream

from vivarium_research_prl.utils import sizemb, MappingViaAttributes
from vivarium_research_prl import datatypes
from vivarium_research_prl.noise import corruption, fake_names, noisify
from vivarium_research_prl.find_kids import datasets, noisify_data

# Record when, by whom, on which machine, and from which working directory
# this notebook was run.
!date
!whoami
!uname -a
!pwd

Fri 31 Mar 2023 02:53:17 PM PDT
ndbs
Linux gen-slurm-sarchive-p0135 5.4.0-135-generic #152-Ubuntu SMP Wed Nov 23 20:19:22 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing


In [2]:
# Load IPython's autoreload extension and select mode 2, which reloads imported
# modules before each cell runs so edits to them take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

# Find data

```
/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/full_scale_334mil/united_states_of_america/2023_03_30_10_23_13/final_results/2023_03_30_16_02_39
```

In [3]:
# Root directory under which the simulation results are stored.
project_output_dir = (
    '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'
)

# Location, relative to the root, of the final results inspected in this notebook.
output_subdir = ''.join([
    'results/full_scale_334mil',
    '/united_states_of_america/2023_03_30_10_23_13',
    '/final_results/2023_03_30_16_02_39',
])

# Full path to the final results directory.
output_dir = '/'.join((project_output_dir, output_subdir))

# List the results directory: long format, hidden entries included,
# human-readable sizes, newest first.
!ls -halt $output_dir

total 404K
drwxrwsr-x  3 sbachmei IHME-Simulationscience 1.5K Mar 30 17:15 ..
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Mar 30 16:52 tax_dependents_observer
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Mar 30 16:44 tax_1040_observer
drwxrwsr-x  2 sbachmei IHME-Simulationscience 339K Mar 30 16:42 logs
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Mar 30 16:31 tax_w2_observer
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Mar 30 16:28 social_security_observer
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Mar 30 16:28 wic_observer
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Mar 30 16:28 household_survey_observer_cps
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Mar 30 16:28 household_survey_observer_acs
drwxrwsr-x  2 sbachmei IHME-Simulationscience 167K Mar 30 16:26 decennial_census_observer
drwxrwsr-x 11 sbachmei IHME-Simulationscience 4.5K Mar 30 16:21 .


In [4]:
# List the W2 observer output files (newest first) to see what is available
# and how large each compressed file is.
!ls -halt $output_dir/tax_w2_observer

total 199G
-rw-r--r--  1 sbachmei IHME-Simulationscience 614M Mar 30 17:07 tax_w2_observer_3568.csv.bz2
-rw-r--r--  1 sbachmei IHME-Simulationscience 603M Mar 30 17:04 tax_w2_observer_7551.csv.bz2
-rw-r--r--  1 sbachmei IHME-Simulationscience 613M Mar 30 17:04 tax_w2_observer_7086.csv.bz2
-rw-r--r--  1 sbachmei IHME-Simulationscience 613M Mar 30 17:00 tax_w2_observer_5440.csv.bz2
-rw-r--r--  1 sbachmei IHME-Simulationscience 598M Mar 30 16:58 tax_w2_observer_9292.csv.bz2
-rw-r--r--  1 sbachmei IHME-Simulationscience 606M Mar 30 16:42 tax_w2_observer_1282.csv.bz2
-rw-r--r--  1 sbachmei IHME-Simulationscience 611M Mar 30 16:39 tax_w2_observer_1007.csv.bz2
-rw-r--r--  1 sbachmei IHME-Simulationscience 610M Mar 30 16:34 tax_w2_observer_2277.csv.bz2
-rw-r--r--  1 sbachmei IHME-Simulationscience 606M Mar 30 16:34 tax_w2_observer_1483.csv.bz2
-rw-r--r--  1 sbachmei IHME-Simulationscience 597M Mar 30 16:33 tax_w2_observer_1482.csv.bz2
-rw-r--r--  1 sbachmei IHME-Simulationscience 60

In [9]:
# List the top level of the simulation run directory, i.e. the directory that
# contains final_results.
!ls $project_output_dir/results/full_scale_334mil/united_states_of_america/2023_03_30_10_23_13

branches.yaml  logs			 __pycache__	   settings.py
final_results  model_specification.yaml  raw_results
keyspace.yaml  output.hdf		 requirements.txt


# Pick some seeds to load, and define directories for W2 and census data

In [13]:
# Directories holding the W2 and decennial census observer outputs.
w2_dir, census_dir = (
    f'{output_dir}/tax_w2_observer',
    f'{output_dir}/decennial_census_observer',
)
# Seeds whose output files will be loaded; each appears in a file name
# in the W2 directory listing.
seeds = [3568, 7551, 7086]

# Load one W2 file and see how big it is

In [14]:
%%time
w2 = {}
seed = seeds[0]
w2[seed] = pd.read_csv(f'{w2_dir}/tax_w2_observer_{seed}.csv.bz2')
w2[seed]

CPU times: user 1min 58s, sys: 5.36 s, total: 2min 3s
Wall time: 2min 4s


Unnamed: 0,tax_form,mailing_address_street_name,mailing_address_city,mailing_address_po_box,tax_year,employer_street_name,age,mailing_address_state,employer_state,date_of_birth,...,employer_zipcode,mailing_address_street_number,simulant_id,employer_city,employer_id,first_name,income,employer_street_number,last_name,mailing_address_zipcode
0,W2,se gillette ave,durham,0,2020,edgecliff ct,59,NC,VA,1961-10-19,...,22911,1091,3568_1,virginia beach,1090943,Coleen,55705.868112,,Corbin,28138
1,W2,tumwater ln,s diego,0,2020,n 52nd st,87,CA,MO,1933-07-08,...,65721,140,3568_3,raytown,460472,Carol,62177.505499,309,Nelson,95602
2,W2,tumwater ln,s diego,0,2020,skyview ter,51,CA,GA,1969-01-19,...,31707,140,3568_4,augusta,96598,Cindy,4010.481741,1960,Nelson,95602
3,W2,abbott ct,sherman oaks,0,2020,ince dr,30,CA,FL,1990-06-24,...,32825,2410,3568_5,fort myers,494164,Brandi,26190.257958,e,Marquez,94521
4,W2,abbott ct,sherman oaks,0,2020,stoney crk cir,31,CA,ID,1989-01-09,...,83805,2410,3568_6,bayview,1499355,Chad,15138.669538,,Marquez,94521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9763200,W2,13th ave e,s diego,0,2029,gladys ave,23,CA,MA,2006-01-26,...,1536,8401,3568_1172730,boston,725417,Richard,351.326648,6902,Matayoshi,92122
9763201,1099,choctaw ln,tempe,0,2029,hightower court nthwst,60,AZ,GA,1969-09-27,...,30058,19808,3568_1172735,roswell,1299275,Kerry,12665.206674,4522,Mauvais,85205
9763202,W2,n park ave,brandon,0,2029,clay hise ln,59,MS,IA,1970-12-27,...,52248,7217,3568_1172737,des moines,399446,Roy,8574.277827,828,Peoples,39481
9763203,W2,geranium crecent,henderson,0,2029,us 431 s hwy,30,NV,GA,1999-02-02,...,30228,8222,3568_1172738,atlanta,158167,Teresa,3281.288518,607,Alvarez Caraballo,89081


In [15]:
# In-memory size of the frame just loaded (`seed` is still the first seed here).
sizemb(w2[seed]) # 11.3 GB for one seed worth of W2 data

11299.961384

# Load two more seeds of W2 data

In [16]:
# Read the remaining seeds' W2 files into the same dict, timing each read and
# printing the size reported by sizemb for each frame.
for seed in seeds[1:]:
    %time w2[seed] = pd.read_csv(f'{w2_dir}/tax_w2_observer_{seed}.csv.bz2')
    print(sizemb(w2[seed]))
# Show the keys to confirm that every seed is now loaded.
w2.keys()

CPU times: user 2min, sys: 4.9 s, total: 2min 4s
Wall time: 2min 5s
11299.052206
CPU times: user 2min, sys: 5.23 s, total: 2min 5s
Wall time: 2min 6s
11312.750373


dict_keys([3568, 7551, 7086])

In [18]:
# Total in-memory size across all loaded W2 frames.
sum(map(sizemb, w2.values())) # 33.9 GB for 3 W2 files

33911.763963000005

In [19]:
# Display the second seed's frame. Index through `seeds` rather than repeating
# the literal seed value, so this cell keeps working if `seeds` is edited.
w2[seeds[1]]

Unnamed: 0,first_name,date_of_birth,employer_unit_number,mailing_address_unit_number,middle_initial,employer_street_number,tax_year,employer_street_name,employer_id,mailing_address_state,...,mailing_address_po_box,employer_zipcode,mailing_address_street_number,last_name,income,ssn,simulant_id,mailing_address_city,mailing_address_street_name,mailing_address_zipcode
0,John,1940-05-30,,,R,,2020,edgecliff ct,747812,NC,...,0,11581,10663,Mix,383500.979969,483-34-7437,7551_0,statesville,may avnu,27576
1,Kathleen,1952-01-21,,,N,309,2020,n 52nd st,1390512,NC,...,0,97229,10663,Mix,28642.613169,128-35-3084,7551_1,statesville,may avnu,27576
2,Kaelyn,1993-10-02,unit 170,,M,1960,2020,skyview ter,1489871,VA,...,0,92374,18090,Romo,21181.917494,454-54-1807,7551_2,gate cty,south ferguson road,23451
3,Julissa,1997-03-10,,,M,e,2020,ince dr,833925,MN,...,0,97058,3919,Foreman,3849.922875,627-61-1851,7551_3,rochester,randle,55398
4,Julissa,1997-03-10,,,M,,2020,stoney crk cir,1622694,MN,...,0,30016,3919,Foreman,959.607820,627-61-1851,7551_3,rochester,randle,55398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9764266,Adam,1978-09-29,,apartment number b 2,C,38534,2029,maple avenue,943971,NJ,...,0,46992,3873,Woodward,2896.227904,361-35-5654,7551_1172491,long hill twp,w 10th st,7087
9764267,Nia,1979-11-02,,apartment number b 2,C,5444,2029,villa terrace,590022,NJ,...,0,38114,3873,Woodward,3239.621856,712-58-3332,7551_1172492,long hill twp,w 10th st,7087
9764268,Kyle,1985-03-30,apartment 2a,,J,907,2029,scott rd,1404257,IL,...,0,28704,214,Johnsen,3186.105633,035-34-1900,7551_1172495,hoffman estates,norwich rd,60440
9764269,Sarah,1991-06-13,,,M,2139,2029,victorian way,571805,IL,...,0,55416,214,Johnsen,7321.719295,270-13-7963,7551_1172496,hoffman estates,norwich rd,60440


# Concatenate 3 W2 files into one dataframe

In [44]:
%%time
%time df_w2 = pd.concat(w2)#, ignore_index=True)
%time df_w2

CPU times: user 6.53 s, sys: 3.88 s, total: 10.4 s
Wall time: 10.4 s
CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.1 µs
CPU times: user 6.53 s, sys: 3.88 s, total: 10.4 s
Wall time: 10.4 s


Unnamed: 0,Unnamed: 1,tax_form,mailing_address_street_name,mailing_address_city,mailing_address_po_box,tax_year,employer_street_name,age,mailing_address_state,employer_state,date_of_birth,...,employer_zipcode,mailing_address_street_number,simulant_id,employer_city,employer_id,first_name,income,employer_street_number,last_name,mailing_address_zipcode
3568,0,W2,se gillette ave,durham,0,2020,edgecliff ct,59,NC,VA,1961-10-19,...,22911,1091,3568_1,virginia beach,1090943,Coleen,55705.868112,,Corbin,28138
3568,1,W2,tumwater ln,s diego,0,2020,n 52nd st,87,CA,MO,1933-07-08,...,65721,140,3568_3,raytown,460472,Carol,62177.505499,309,Nelson,95602
3568,2,W2,tumwater ln,s diego,0,2020,skyview ter,51,CA,GA,1969-01-19,...,31707,140,3568_4,augusta,96598,Cindy,4010.481741,1960,Nelson,95602
3568,3,W2,abbott ct,sherman oaks,0,2020,ince dr,30,CA,FL,1990-06-24,...,32825,2410,3568_5,fort myers,494164,Brandi,26190.257958,e,Marquez,94521
3568,4,W2,abbott ct,sherman oaks,0,2020,stoney crk cir,31,CA,ID,1989-01-09,...,83805,2410,3568_6,bayview,1499355,Chad,15138.669538,,Marquez,94521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7086,9771288,W2,montgomery ave,san diego,0,2029,badger drive,59,CA,MN,1970-12-29,...,55317,8433,7086_1173356,saint paul,101306,Mindy,11149.321244,2549,Carrano,93268
7086,9771289,W2,amboy road,royal oak,0,2029,s thorson ave,42,MI,MO,1987-11-29,...,63801,992,7086_1173357,columbia,1414918,Vincent,2681.646947,3563,Pender,48098
7086,9771290,W2,smithfield street,e lake,0,2029,sardis ave,41,FL,NY,1988-10-18,...,11518,2598,7086_1173363,new york,1176581,Keaton,6040.637877,6801,Villavicencio,32821
7086,9771291,W2,smithfield street,e lake,0,2029,corporate bl,32,FL,TN,1997-02-05,...,38390,2598,7086_1173364,memphis,1032031,Mercedes,2516.064390,25682,Villavicencio,32821


In [26]:
# Size of the concatenated frame, for comparison with the per-seed total
# computed earlier.
sizemb(df_w2) # Cool, same as above: 33.9 GB for the concatenated dataframe

33911.763675

# Check that output from different seeds has same columns

Yes, the column names match, but for some reason the column order differs between seeds.

In [27]:
# Compare the first two seeds' column labels position by position.
cols_a, cols_b = w2[seeds[0]].columns, w2[seeds[1]].columns
cols_a == cols_b

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True])

In [28]:
# Column labels of the first seed's frame, in file order.
w2[seeds[0]].columns

Index(['tax_form', 'mailing_address_street_name', 'mailing_address_city',
       'mailing_address_po_box', 'tax_year', 'employer_street_name', 'age',
       'mailing_address_state', 'employer_state', 'date_of_birth',
       'employer_unit_number', 'mailing_address_unit_number', 'employer_name',
       'ssn', 'middle_initial', 'employer_zipcode',
       'mailing_address_street_number', 'simulant_id', 'employer_city',
       'employer_id', 'first_name', 'income', 'employer_street_number',
       'last_name', 'mailing_address_zipcode'],
      dtype='object')

In [29]:
# Column labels of the second seed's frame, in file order, for comparison with
# the first seed's.
w2[seeds[1]].columns

Index(['first_name', 'date_of_birth', 'employer_unit_number',
       'mailing_address_unit_number', 'middle_initial',
       'employer_street_number', 'tax_year', 'employer_street_name',
       'employer_id', 'mailing_address_state', 'tax_form', 'employer_name',
       'age', 'employer_state', 'employer_city', 'mailing_address_po_box',
       'employer_zipcode', 'mailing_address_street_number', 'last_name',
       'income', 'ssn', 'simulant_id', 'mailing_address_city',
       'mailing_address_street_name', 'mailing_address_zipcode'],
      dtype='object')

In [32]:
# Sort both label sets first so the comparison depends only on the column
# names, not on the order in which they appear in each file.
sorted_cols_0 = w2[seeds[0]].columns.sort_values()
sorted_cols_1 = w2[seeds[1]].columns.sort_values()
sorted_cols_0 == sorted_cols_1

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [33]:
# Same order-independent check, this time between the first and third seeds.
sorted_cols_0 = w2[seeds[0]].columns.sort_values()
sorted_cols_2 = w2[seeds[2]].columns.sort_values()
sorted_cols_0 == sorted_cols_2

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

# Check that whenever employer ID is the same, so is employer name

In other words, we want to know that the number of employer names for each employer ID equals 1.

## Yes, each employer ID has a single name across all three seeds, but one employer ID has a missing (NaN) name in all three seeds

In [35]:
# Switch automatic module reloading off for the cells that follow (it was set
# to mode 2 near the top of the notebook).
%autoreload 0

In [45]:
# Redisplay the concatenated W2 frame before analysing it.
df_w2

Unnamed: 0,Unnamed: 1,tax_form,mailing_address_street_name,mailing_address_city,mailing_address_po_box,tax_year,employer_street_name,age,mailing_address_state,employer_state,date_of_birth,...,employer_zipcode,mailing_address_street_number,simulant_id,employer_city,employer_id,first_name,income,employer_street_number,last_name,mailing_address_zipcode
3568,0,W2,se gillette ave,durham,0,2020,edgecliff ct,59,NC,VA,1961-10-19,...,22911,1091,3568_1,virginia beach,1090943,Coleen,55705.868112,,Corbin,28138
3568,1,W2,tumwater ln,s diego,0,2020,n 52nd st,87,CA,MO,1933-07-08,...,65721,140,3568_3,raytown,460472,Carol,62177.505499,309,Nelson,95602
3568,2,W2,tumwater ln,s diego,0,2020,skyview ter,51,CA,GA,1969-01-19,...,31707,140,3568_4,augusta,96598,Cindy,4010.481741,1960,Nelson,95602
3568,3,W2,abbott ct,sherman oaks,0,2020,ince dr,30,CA,FL,1990-06-24,...,32825,2410,3568_5,fort myers,494164,Brandi,26190.257958,e,Marquez,94521
3568,4,W2,abbott ct,sherman oaks,0,2020,stoney crk cir,31,CA,ID,1989-01-09,...,83805,2410,3568_6,bayview,1499355,Chad,15138.669538,,Marquez,94521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7086,9771288,W2,montgomery ave,san diego,0,2029,badger drive,59,CA,MN,1970-12-29,...,55317,8433,7086_1173356,saint paul,101306,Mindy,11149.321244,2549,Carrano,93268
7086,9771289,W2,amboy road,royal oak,0,2029,s thorson ave,42,MI,MO,1987-11-29,...,63801,992,7086_1173357,columbia,1414918,Vincent,2681.646947,3563,Pender,48098
7086,9771290,W2,smithfield street,e lake,0,2029,sardis ave,41,FL,NY,1988-10-18,...,11518,2598,7086_1173363,new york,1176581,Keaton,6040.637877,6801,Villavicencio,32821
7086,9771291,W2,smithfield street,e lake,0,2029,corporate bl,32,FL,TN,1997-02-05,...,38390,2598,7086_1173364,memphis,1032031,Mercedes,2516.064390,25682,Villavicencio,32821


In [46]:
%%time
employer_name_counts = df_w2.groupby("employer_id")['employer_name'].nunique()
employer_name_counts

CPU times: user 19.1 s, sys: 848 ms, total: 20 s
Wall time: 20 s


employer_id
1          1
2          1
3          1
5          1
6          1
          ..
1745101    1
1745102    1
1745103    1
1745104    1
1745105    1
Name: employer_name, Length: 1598892, dtype: int64

In [47]:
# Which distinct counts occur? The expectation stated above is that every
# employer ID has exactly one name.
employer_name_counts.unique()

array([1, 0])

In [48]:
# How many employer IDs fall under each distinct count.
employer_name_counts.value_counts()

1    1598891
0          1
Name: employer_name, dtype: int64

In [49]:
# Employer IDs for which no non-null name was found.
employer_name_counts[employer_name_counts.eq(0)]

employer_id
1726494    0
Name: employer_name, dtype: int64

# Look into the employer with a missing name

It occurs in all three seeds.

In [50]:
%%time
df_weird_employer = df_w2.loc[df_w2.employer_id==1726494]
df_weird_employer

CPU times: user 7.78 s, sys: 6.12 s, total: 13.9 s
Wall time: 13.9 s


Unnamed: 0,Unnamed: 1,tax_form,mailing_address_street_name,mailing_address_city,mailing_address_po_box,tax_year,employer_street_name,age,mailing_address_state,employer_state,date_of_birth,...,employer_zipcode,mailing_address_street_number,simulant_id,employer_city,employer_id,first_name,income,employer_street_number,last_name,mailing_address_zipcode
3568,1199313,W2,delrose dr n,buffalo,0,2021,duvan drive,25,NY,NY,1996-04-25,...,13368,5334,3568_391687,ridge,1726494,Jasmine,14294.022167,10228,Noble,11206
3568,1487743,W2,bedard ave,new york,0,2021,duvan drive,36,NY,NY,1985-08-06,...,13368,14857,3568_700485,ridge,1726494,Jamie,21221.537718,10228,Cordova,10469
3568,1834319,W2,blackhawk dr,oshkosh,0,2022,duvan drive,25,WI,NY,1997-06-29,...,13368,905,3568_57225,ridge,1726494,Jose,4951.717395,10228,Witte,54703
3568,2150485,W2,delrose dr n,buffalo,0,2022,duvan drive,26,NY,NY,1996-04-25,...,13368,5334,3568_391687,ridge,1726494,Jasmine,46455.572042,10228,Noble,11206
3568,2442728,W2,bedard ave,new york,0,2022,duvan drive,37,NY,NY,1985-08-06,...,13368,14857,3568_700485,ridge,1726494,Jamie,9094.944736,10228,Cordova,10469
3568,2799678,W2,waterview ct,birmingham,0,2023,duvan drive,26,AL,NY,1997-06-29,...,13368,48715,3568_57225,ridge,1726494,Jose,54468.891344,10228,Witte,35124
3568,3117061,W2,delrose dr n,buffalo,0,2023,duvan drive,27,NY,NY,1996-04-25,...,13368,5334,3568_391687,ridge,1726494,Jasmine,46455.572042,10228,Noble,11206
3568,4094760,W2,delrose dr n,buffalo,0,2024,duvan drive,28,NY,NY,1996-04-25,...,13368,5334,3568_391687,ridge,1726494,Jasmine,46455.572042,10228,Noble,11206
3568,5091871,1099,delrose dr n,buffalo,0,2025,duvan drive,29,NY,NY,1996-04-25,...,13368,5334,3568_391687,ridge,1726494,Jasmine,14294.022167,10228,Noble,11206
3568,5117346,W2,east dundee raod,northbrook,0,2025,duvan drive,86,IL,NY,1939-11-04,...,13368,1030,3568_417644,ridge,1726494,Robert,22471.837264,10228,Beitel,60435


In [51]:
# Employer name on every row of the selected employer, across all seeds.
df_weird_employer['employer_name']

3568  1199313    NaN
      1487743    NaN
      1834319    NaN
      2150485    NaN
      2442728    NaN
      2799678    NaN
      3117061    NaN
      4094760    NaN
      5091871    NaN
      5117346    NaN
      6128562    NaN
      6561378    NaN
      7128783    NaN
      8135231    NaN
      8989458    NaN
      9110737    NaN
7551  293800     NaN
      402280     NaN
      480787     NaN
      626231     NaN
      1166487    NaN
      1288872    NaN
      1377453    NaN
      1541535    NaN
      2497958    NaN
      3496307    NaN
      4002319    NaN
      4474522    NaN
      4996114    NaN
      5483776    NaN
      6486097    NaN
      7486996    NaN
      8493729    NaN
      8595321    NaN
      9608653    NaN
7086  167904     NaN
      1023950    NaN
      1973973    NaN
      2940564    NaN
      3720451    NaN
      4700663    NaN
      5717707    NaN
      9593205    NaN
Name: employer_name, dtype: object

In [52]:
# Employer state on every row of the selected employer, to see whether the
# rest of its address information is populated.
df_weird_employer['employer_state']

3568  1199313    NY
      1487743    NY
      1834319    NY
      2150485    NY
      2442728    NY
      2799678    NY
      3117061    NY
      4094760    NY
      5091871    NY
      5117346    NY
      6128562    NY
      6561378    NY
      7128783    NY
      8135231    NY
      8989458    VA
      9110737    VA
7551  293800     AK
      402280     AK
      480787     AK
      626231     AK
      1166487    NY
      1288872    NY
      1377453    NY
      1541535    NY
      2497958    NY
      3496307    NY
      4002319    NY
      4474522    NY
      4996114    NY
      5483776    NY
      6486097    NY
      7486996    NY
      8493729    NY
      8595321    NY
      9608653    VA
7086  167904     AK
      1023950    NY
      1973973    NY
      2940564    NY
      3720451    NY
      4700663    NY
      5717707    NY
      9593205    VA
Name: employer_state, dtype: object