In [1]:
import pseudopeople as psp
import pandas as pd, numpy as np

from vivarium_research_prl.utils import sizemb, MappingViaAttributes, build_full_address
from vivarium_research_prl import alpha

!date
!whoami
!uname -a
!python --version
!pwd

Wed 09 Aug 2023 05:42:39 PM PDT
ndbs
Linux long-slurm-sarchive-p0046 5.4.0-135-generic #152-Ubuntu SMP Wed Nov 23 20:19:22 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
Python 3.10.12
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing


In [90]:
%load_ext autoreload
%autoreload 2

# Write a function to load all dataframes

In [2]:
dir(psp)

['NO_NOISE',
 '__about__',
 '__author__',
 '__builtins__',
 '__cached__',
 '__copyright__',
 '__doc__',
 '__email__',
 '__file__',
 '__license__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__summary__',
 '__title__',
 '__uri__',
 '__version__',
 'column_getters',
 'configuration',
 'constants',
 'data',
 'entity_types',
 'exceptions',
 'generate_american_community_survey',
 'generate_current_population_survey',
 'generate_decennial_census',
 'generate_social_security',
 'generate_taxes_1040',
 'generate_taxes_w2_and_1099',
 'generate_women_infants_and_children',
 'get_config',
 'interface',
 'loader',
 'noise',
 'noise_entities',
 'noise_functions',
 'noise_scaling',
 'schema_entities',
 'utilities']

In [3]:
[x for x in dir(psp) if 'generate' in x]

['generate_american_community_survey',
 'generate_current_population_survey',
 'generate_decennial_census',
 'generate_social_security',
 'generate_taxes_1040',
 'generate_taxes_w2_and_1099',
 'generate_women_infants_and_children']

In [4]:
getattr(psp, 'generate_american_community_survey')

<function pseudopeople.interface.generate_american_community_survey(source: Union[pathlib.Path, str] = None, seed: int = 0, config: Union[pathlib.Path, str, Dict[str, Dict]] = None, year: Optional[int] = 2020, state: Optional[str] = None, verbose: bool = False) -> pandas.core.frame.DataFrame>

In [5]:
dir(getattr(psp, 'generate_american_community_survey'))

['__annotations__',
 '__builtins__',
 '__call__',
 '__class__',
 '__closure__',
 '__code__',
 '__defaults__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__get__',
 '__getattribute__',
 '__globals__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__kwdefaults__',
 '__le__',
 '__lt__',
 '__module__',
 '__name__',
 '__ne__',
 '__new__',
 '__qualname__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']

In [6]:
getattr(psp, 'generate_american_community_survey').__name__

'generate_american_community_survey'

In [7]:
'generate_american_community_survey'.replace('generate_', '')

'american_community_survey'

## Function to generate all datasets

In [86]:
def generate_data(*args, **kwargs):
    """Generate every pseudopeople dataset with a single call.

    Every attribute of ``psp`` whose name starts with ``generate_`` and
    which is callable is invoked with the same ``*args`` and ``**kwargs``.

    Parameters
    ----------
    *args, **kwargs
        Forwarded unchanged to each generator function
        (for example ``config=psp.NO_NOISE``).

    Returns
    -------
    MappingViaAttributes
        The generated objects keyed by generator name with the leading
        ``generate_`` removed, e.g. ``'decennial_census'``.
    """
    prefix = 'generate_'
    data = {}
    for name in dir(psp):
        # Match on the prefix (not a substring) so unrelated attributes that
        # merely contain "generate" are never treated as dataset generators.
        if not name.startswith(prefix):
            continue
        generate_fn = getattr(psp, name)
        # Skip constants or submodules that happen to share the prefix.
        if not callable(generate_fn):
            continue
        # Strip only the leading prefix; later occurrences stay in the key.
        data[name[len(prefix):]] = generate_fn(*args, **kwargs)
    return MappingViaAttributes(data)

In [9]:
%%time
data = generate_data()
data.keylist()

                                                                                                                    

CPU times: user 40.9 s, sys: 338 ms, total: 41.3 s
Wall time: 41.4 s




['american_community_survey',
 'current_population_survey',
 'decennial_census',
 'social_security',
 'taxes_1040',
 'taxes_w2_and_1099',
 'women_infants_and_children']

In [17]:
sum(sizemb(df) for df in data.values())

64.26209399999999

In [18]:
{name: sizemb(df) for name, df in data.items()}

{'american_community_survey': 0.071779,
 'current_population_survey': 0.052471,
 'decennial_census': 14.722564,
 'social_security': 9.158109,
 'taxes_1040': 15.988987,
 'taxes_w2_and_1099': 24.044507,
 'women_infants_and_children': 0.223677}

# Display columns in all datasets

In [10]:
{name: df.dtypes for name, df in data.items()}

{'american_community_survey': simulant_id                                 object
 household_id                                object
 survey_date                         datetime64[ns]
 first_name                                  object
 middle_initial                              object
 last_name                                   object
 age                                         object
 date_of_birth                               object
 street_number                               object
 street_name                                 object
 unit_number                                 object
 city                                        object
 state                                     category
 zipcode                                     object
 relationship_to_reference_person          category
 sex                                       category
 race_ethnicity                            category
 dtype: object,
 'current_population_survey': simulant_id               object
 househo

# Check which datasets have certain columns

In [11]:
[name for name in data if 'sex' in data[name]]

['american_community_survey',
 'current_population_survey',
 'decennial_census',
 'social_security',
 'women_infants_and_children']

In [12]:
[name for name in data if 'relationship_to_reference_person' in data[name]]

['american_community_survey', 'decennial_census']

In [13]:
[name for name in data if 'middle_name' in data[name]]

['social_security']

In [16]:
[name for name in data if not data[name].filter(regex='po_box').columns.empty]

['taxes_1040', 'taxes_w2_and_1099']

In [50]:
[name for name in data if not data[name].filter(regex='year').columns.empty]

['decennial_census',
 'taxes_1040',
 'taxes_w2_and_1099',
 'women_infants_and_children']

# Get default config

In [19]:
config = psp.get_config()
config

{'decennial_census': {'row_noise': {'do_not_respond': {'row_probability': 0.0145}},
  'column_noise': {'first_name': {'leave_blank': {'cell_probability': 0.01},
    'use_nickname': {'cell_probability': 0.01},
    'use_fake_name': {'cell_probability': 0.01},
    'make_phonetic_errors': {'cell_probability': 0.01,
     'token_probability': 0.1},
    'make_ocr_errors': {'cell_probability': 0.01, 'token_probability': 0.1},
    'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
   'middle_initial': {'leave_blank': {'cell_probability': 0.01},
    'make_phonetic_errors': {'cell_probability': 0.01,
     'token_probability': 0.1},
    'make_ocr_errors': {'cell_probability': 0.01, 'token_probability': 0.1},
    'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
   'last_name': {'leave_blank': {'cell_probability': 0.01},
    'use_fake_name': {'cell_probability': 0.01},
    'make_phonetic_errors': {'cell_probability': 0.01,
     'token_probability': 0.1},
    'ma

# Get a config with no noise

In [22]:
nonoise = psp.get_config(user_config=psp.NO_NOISE)
nonoise

{'decennial_census': {'row_noise': {'do_not_respond': {'row_probability': 0.0}},
  'column_noise': {'first_name': {'leave_blank': {'cell_probability': 0.0},
    'use_nickname': {'cell_probability': 0.0},
    'use_fake_name': {'cell_probability': 0.0},
    'make_phonetic_errors': {'cell_probability': 0.0,
     'token_probability': 0.1},
    'make_ocr_errors': {'cell_probability': 0.0, 'token_probability': 0.1},
    'make_typos': {'cell_probability': 0.0, 'token_probability': 0.1}},
   'middle_initial': {'leave_blank': {'cell_probability': 0.0},
    'make_phonetic_errors': {'cell_probability': 0.0,
     'token_probability': 0.1},
    'make_ocr_errors': {'cell_probability': 0.0, 'token_probability': 0.1},
    'make_typos': {'cell_probability': 0.0, 'token_probability': 0.1}},
   'last_name': {'leave_blank': {'cell_probability': 0.0},
    'use_fake_name': {'cell_probability': 0.0},
    'make_phonetic_errors': {'cell_probability': 0.0,
     'token_probability': 0.1},
    'make_ocr_errors': 

# Find which datasets have omission noise turned on

No dataset has simple row omission (`omit_row`) turned on by default. The census and the two surveys (ACS and CPS) have `do_not_respond` omission set to nonzero values.

In [24]:
[x for x in config]

['decennial_census',
 'american_community_survey',
 'current_population_survey',
 'women_infants_and_children',
 'social_security',
 'taxes_w2_and_1099',
 'taxes_1040']

In [25]:
{name: config[name]['row_noise'] for name in config}

{'decennial_census': {'do_not_respond': {'row_probability': 0.0145}},
 'american_community_survey': {'do_not_respond': {'row_probability': 0.0145}},
 'current_population_survey': {'do_not_respond': {'row_probability': 0.2905}},
 'women_infants_and_children': {'omit_row': {'row_probability': 0.0}},
 'social_security': {'omit_row': {'row_probability': 0.0}},
 'taxes_w2_and_1099': {'omit_row': {'row_probability': 0.0}},
 'taxes_1040': {'omit_row': {'row_probability': 0.0}}}

# Check whether the index is always consecutive, particularly for datasets with rows omitted

In [26]:
data.decennial_census

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relationship_to_reference_person,sex,race_ethnicity,year
0,0_0,0_6,Gerald,R,Sorrell,70,01/18/1950,,barbara cir,,williamsburg,VA,22802,Reference person,Male,White,2020
1,0_1,0_6,Earline,R,Sorrell,63,12/23/1956,,barbara cir,,williamsburg,VA,22802,Opp-sex spouse,Female,White,2020
2,0_2,0_7,Diana,P,Kellv,25,05/06/1994,5112,145th st,,portland,OR,97601,Reference person,Female,White,2020
3,0_3,0_7,Anna,A,,25,09/69/1994,5174,145th st,,portland,OR,97601,Other relative,Female,White,2020
4,0_4,0_8,Eric,R,Stark Lozano,38,05/29/1981,1501,interlake ave n,,bradenton,FL,33174,Reference person,Male,Latino,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19819,0_20449,0_6619,Bridget,L,Ford,22,08/13/1997,100,milling st,,reynoldsburg,OH,45242,Other nonrelative,Female,White,2020
19820,0_20450,0_383,Jonathan,M,Morgan,38,09/18/1981,20430,autumn ridge dr,,bella vista,AR,72704,Sibling,Male,Latino,2020
19821,0_20451,0_8366,Dennis,D,Aagard,58,09/24/1961,1894,nrte 103rd st,,goose cr,SC,29625,Reference person,Male,White,2020
19822,0_20452,0_8366,Lisa,S,Aagard,55,08/05/1964,1894,nrte 103rd st,,goose cr,FL,29625,Opp-sex spouse,Female,White,2020


In [28]:
index = data.decennial_census.index
(index == pd.Series(range(len(index)))).all()

True

In [29]:
index = data.american_community_survey.index
(index == pd.Series(range(len(index)))).all()

True

In [30]:
index = data.current_population_survey.index
(index == pd.Series(range(len(index)))).all()

True

In [35]:
np.arange(5)

array([0, 1, 2, 3, 4])

## Write a function to check for consecutive index

In [33]:
def index_is_consecutive(df):
    """Return whether ``df.index`` is exactly 0, 1, ..., len(df) - 1, in order.

    The index is compared elementwise against ``np.arange(len(df.index))``
    and the result is reduced with ``.all()``.
    """
    expected_labels = np.arange(len(df.index))
    labels_match = df.index == expected_labels
    return labels_match.all()

In [34]:
all(index_is_consecutive(df) for df in data.values())

True

# Try generating a dataset with more missing rows and see if index is still consecutive

Yes, it is.

In [37]:
data.social_security

Unnamed: 0,simulant_id,first_name,middle_name,last_name,date_of_birth,sex,ssn,event_type,event_date
0,0_19979,Evelyn,Nancy,Hines,19191204,Female,786-77-6454,creation,19191204
1,0_6846,George,Robert,Dickens,19210616,Male,688-88-6377,creation,19210616
2,0_19983,Beatrice,Jennie,Fackler,19220113,Female,651-33-9561,creation,19220113
3,0_262,Eura,Nadine,Crusen,19220305,Female,665-25-7858,creation,19220305
4,0_12473,Roberta,Ruth,Muilenburg,19220306,Female,875-10-2359,creation,19220306
...,...,...,...,...,...,...,...,...,...
20289,0_20687,Matthew,Michael,Phillips,19800224,Male,183-90-0619,creation,20201229
20290,0_20686,Jermey,Tyler,Wimmer,19860415,Male,803-81-8527,creation,20201229
20291,0_20692,Brittanie,Lauren,Thao,19950118,Female,170-62-5253,creation,20201229
20292,0_20662,Marcus,Jasper,Murphy,20201230,Male,281-88-9330,creation,20201230


In [40]:
missing_config = {'social_security': {'row_noise': {'omit_row': {'row_probability': 0.5}}}}

In [41]:
ssa_missing = psp.generate_social_security(config=missing_config)
ssa_missing

                                                                                                                    

Unnamed: 0,simulant_id,first_name,middle_name,last_name,date_of_birth,sex,ssn,event_type,event_date
0,0_19979,Evelyn,Nancy,Hines,19191204,Female,786-77-6454,creation,19191204
1,0_19983,Beatrice,Jennie,Fackler,19220113,Female,651-33-9561,creation,19220113
2,0_19941,Betty,Mary,Bee,19220307,Female,420-19-3737,death,19220307
3,0_7141,Betty,Rhoda,Stutler,19220508,Female,494-11-1947,creation,19220508
4,0_18338,Lois,Mary,Dewispelaere,19220710,Female,325-04-6183,creation,19220710
...,...,...,...,...,...,...,...,...,...
10142,0_8740,Jay,Stephen,Chamberlain,19570606,Male,404-84-1009,death,20201229
10143,0_20688,Efrain,Paul,Harrison,19590520,Male,161-06-9252,creation,
10144,0_19726,David,Harvey,Coffman,19651221,Male,112-13-5339,death,20201229
10145,0_20690,Jeremiah,Carlos,Gooden,19800105,Male,438-57-6936,creation,20201229


In [42]:
index_is_consecutive(ssa_missing)

True

In [46]:
data.social_security.merge(ssa_missing, on=['simulant_id'], how='left')

Unnamed: 0,simulant_id,first_name_x,middle_name_x,last_name_x,date_of_birth_x,sex_x,ssn_x,event_type_x,event_date_x,first_name_y,middle_name_y,last_name_y,date_of_birth_y,sex_y,ssn_y,event_type_y,event_date_y
0,0_19979,Evelyn,Nancy,Hines,19191204,Female,786-77-6454,creation,19191204,Evelyn,Nancy,Hines,19191204,Female,786-77-6454,creation,19191204
1,0_6846,George,Robert,Dickens,19210616,Male,688-88-6377,creation,19210616,,,,,,,,
2,0_19983,Beatrice,Jennie,Fackler,19220113,Female,651-33-9561,creation,19220113,Beatrice,Jennie,Fackler,19220113,Female,651-33-9561,creation,19220113
3,0_19983,Beatrice,Jennie,Fackler,19220113,Female,651-33-9561,creation,19220113,Beatrice,Jennie,Fackler,19220113,Female,651-33-9561,death,20191231
4,0_262,Eura,Nadine,Crusen,19220305,Female,665-25-7858,creation,19220305,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20471,0_20687,Matthew,Michael,Phillips,19800224,Male,183-90-0619,creation,20201229,,,,,,,,
20472,0_20686,Jermey,Tyler,Wimmer,19860415,Male,803-81-8527,creation,20201229,,,,,,,,
20473,0_20692,Brittanie,Lauren,Thao,19950118,Female,170-62-5253,creation,20201229,,,,,,,,
20474,0_20662,Marcus,Jasper,Murphy,20201230,Male,281-88-9330,creation,20201230,Marcus,Jasper,Murphy,20201230,Male,281-88-9330,creation,20201230


## Verify that the index does not "leak data" by telling us which rows are omitted

In [65]:
# This verifies that corresponding rows almost always have different index numbers in the two versions of the SSA data.
# Thus we don't have a "data leakage" problem in the index.
data.social_security.join(ssa_missing, rsuffix='_m')

Unnamed: 0,simulant_id,first_name,middle_name,last_name,date_of_birth,sex,ssn,event_type,event_date,simulant_id_m,first_name_m,middle_name_m,last_name_m,date_of_birth_m,sex_m,ssn_m,event_type_m,event_date_m
0,0_19979,Evelyn,Nancy,Hines,19191204,Female,786-77-6454,creation,19191204,0_19979,Evelyn,Nancy,Hines,19191204,Female,786-77-6454,creation,19191204
1,0_6846,George,Robert,Dickens,19210616,Male,688-88-6377,creation,19210616,0_19983,Beatrice,Jennie,Fackler,19220113,Female,651-33-9561,creation,19220113
2,0_19983,Beatrice,Jennie,Fackler,19220113,Female,651-33-9561,creation,19220113,0_19941,Betty,Mary,Bee,19220307,Female,420-19-3737,death,19220307
3,0_262,Eura,Nadine,Crusen,19220305,Female,665-25-7858,creation,19220305,0_7141,Betty,Rhoda,Stutler,19220508,Female,494-11-1947,creation,19220508
4,0_12473,Roberta,Ruth,Muilenburg,19220306,Female,875-10-2359,creation,19220306,0_18338,Lois,Mary,Dewispelaere,19220710,Female,325-04-6183,creation,19220710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20289,0_20687,Matthew,Michael,Phillips,19800224,Male,183-90-0619,creation,20201229,,,,,,,,,
20290,0_20686,Jermey,Tyler,Wimmer,19860415,Male,803-81-8527,creation,20201229,,,,,,,,,
20291,0_20692,Brittanie,Lauren,Thao,19950118,Female,170-62-5253,creation,20201229,,,,,,,,,
20292,0_20662,Marcus,Jasper,Murphy,20201230,Male,281-88-9330,creation,20201230,,,,,,,,,


# Look at 1040 tax form

The `tax_year` column is in the wrong place -- it should be the last column, but it's in the middle.

In [47]:
data.taxes_1040

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,...,dependent_1_ssn,dependent_2_first_name,dependent_2_last_name,dependent_2_ssn,dependent_3_first_name,dependent_3_last_name,dependent_3_ssn,dependent_4_first_name,dependent_4_last_name,dependent_4_ssn
0,0_0,0_6,Gerald,R,Sorrell,,barbara cir,,,williamsburg,...,,,,,,,,,,
1,0_2,0_7,Diana,P,Kelly,5112,,,,portland,...,,,,,,,,,,
2,0_8,0_9,Elizabeth,P,Gonzalez,12,meridian st,,,albany,...,395-30-9975,,,,,,,,,
3,0_14,0_11,Gerald,R,Hutchison,502,n bingham st,,,oakland,...,462-01-8027,Dominic,Hutchison,178-81-4204,,,,,,
4,0_17,0_12,Gerald,R,Mcintyre,2105,hyde park ln,,,s bend,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12353,0_20750,0_5,Michael,E,Longley,4000,nw skycrest pkwy,,,new caney,...,,,,,,,,,,
12354,0_20751,0_3336,Brody,J,Dominguez,120,32 avenue,,,e brunswick,...,,,,,,,,,,
12355,0_20753,0_2825,Jennifer,T,Concepcion,4920,10th way,,,painesville,...,,,,,,,,,,
12356,0_20754,0_6380,Jazmine,A,Hughes,2690,norh e 65th street,,,birmingham,...,,,,,,,,,,


In [49]:
data.taxes_1040.memory_usage(deep=True) / len(data.taxes_1040)

Index                             0.010358
simulant_id                      63.466904
household_id                     62.722447
first_name                       62.622188
middle_initial                   57.768652
last_name                        63.312753
mailing_address_street_number    59.097993
mailing_address_street_name      68.429358
mailing_address_unit_number      34.784917
mailing_address_po_box           32.025247
mailing_address_city             65.610050
mailing_address_state             1.413740
mailing_address_zipcode          61.687328
ssn                              66.517964
tax_year                         36.000000
spouse_first_name                35.105357
spouse_middle_initial            34.621460
spouse_last_name                 35.179155
spouse_ssn                       35.414226
dependent_1_first_name           38.008820
dependent_1_last_name            38.166289
dependent_1_ssn                  38.939230
dependent_2_first_name           35.165965
dependent_2

In [54]:
data.taxes_1040.memory_usage(deep=True).sum() / 1e6

15.988971

In [55]:
sizemb(data.taxes_1040)

15.988987

In [58]:
data.taxes_1040.memory_usage(deep=True) / 1000

Index                              0.128
simulant_id                      784.324
household_id                     775.124
first_name                       773.885
middle_initial                   713.905
last_name                        782.419
mailing_address_street_number    730.333
mailing_address_street_name      845.650
mailing_address_unit_number      429.872
mailing_address_po_box           395.768
mailing_address_city             810.809
mailing_address_state             17.471
mailing_address_zipcode          762.332
ssn                              822.029
tax_year                         444.888
spouse_first_name                433.832
spouse_middle_initial            427.852
spouse_last_name                 434.744
spouse_ssn                       437.649
dependent_1_first_name           469.713
dependent_1_last_name            471.659
dependent_1_ssn                  481.211
dependent_2_first_name           434.581
dependent_2_last_name            435.753
dependent_2_ssn 

In [59]:
s = data.taxes_1040.dependent_1_ssn.astype('category')
s

0                NaN
1                NaN
2        395-30-9975
3        462-01-8027
4                NaN
            ...     
12353            NaN
12354            NaN
12355            NaN
12356            NaN
12357            NaN
Name: dependent_1_ssn, Length: 12358, dtype: category
Categories (2158, object): ['001-02-4588', '002-05-7114', '002-08-7357', '002-36-2766', ..., '897-53-4216', '899-23-4006', '899-23-6208', '899-86-3167']

In [64]:
s.memory_usage(deep=True) / 1000

237.679

# It looks like we have addresses from all over the US instead of just in Anytown, US

In [73]:
 data.taxes_1040.filter(like='mailing')

Unnamed: 0,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,mailing_address_state,mailing_address_zipcode
0,,barbara cir,,,williamsburg,VA,22802
1,5112,,,,portland,OR,97601
2,12,meridian st,,,albany,NY,12308
3,502,n bingham st,,,oakland,CA,95678
4,2105,hyde park ln,,,s bend,IN,46033
...,...,...,...,...,...,...,...
12353,4000,nw skycrest pkwy,,,new caney,TX,78645
12354,120,32 avenue,,,e brunswick,NJ,07047
12355,4920,10th way,,,painesville,OH,44095
12356,2690,norh e 65th street,,,birmingham,AL,36117


In [71]:
data.decennial_census

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relationship_to_reference_person,sex,race_ethnicity,year
0,0_0,0_6,Gerald,R,Sorrell,70,01/18/1950,,barbara cir,,williamsburg,VA,22802,Reference person,Male,White,2020
1,0_1,0_6,Earline,R,Sorrell,63,12/23/1956,,barbara cir,,williamsburg,VA,22802,Opp-sex spouse,Female,White,2020
2,0_2,0_7,Diana,P,Kellv,25,05/06/1994,5112,145th st,,portland,OR,97601,Reference person,Female,White,2020
3,0_3,0_7,Anna,A,,25,09/69/1994,5174,145th st,,portland,OR,97601,Other relative,Female,White,2020
4,0_4,0_8,Eric,R,Stark Lozano,38,05/29/1981,1501,interlake ave n,,bradenton,FL,33174,Reference person,Male,Latino,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19819,0_20449,0_6619,Bridget,L,Ford,22,08/13/1997,100,milling st,,reynoldsburg,OH,45242,Other nonrelative,Female,White,2020
19820,0_20450,0_383,Jonathan,M,Morgan,38,09/18/1981,20430,autumn ridge dr,,bella vista,AR,72704,Sibling,Male,Latino,2020
19821,0_20451,0_8366,Dennis,D,Aagard,58,09/24/1961,1894,nrte 103rd st,,goose cr,SC,29625,Reference person,Male,White,2020
19822,0_20452,0_8366,Lisa,S,Aagard,55,08/05/1964,1894,nrte 103rd st,,goose cr,FL,29625,Opp-sex spouse,Female,White,2020


In [72]:
data.american_community_survey

Unnamed: 0,simulant_id,household_id,survey_date,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relationship_to_reference_person,sex,race_ethnicity
0,0_6874,0_2799,2020-01-28,Heather,J,Howard,45,11/21/1974,3670,pearl dr,,lawrence,KS,66046,Opp-sex spouse,Female,White
1,0_6875,0_2799,2020-01-28,Cristina,E,Howard,9,09/12/2010,3670,pearl dr,,lawrence,KS,66046,Biological child,Female,White
2,0_6876,0_2799,2020-01-28,Chloe,G,Howard,8,11/23/2011,3670,pearl dr,,lawrence,KS,66046,Biological child,Female,White
3,0_1804,0_3,2020-04-21,Benjamin,M,Mcmillon,21,01/09/1999,8203,west farwell avenue,,augusta,GA,30350,Noninstitutionalized GQ pop,Male,White
4,0_1928,0_3,2020-04-21,Lucille,C,Smith,81,12/10/1938,8203,west farwell avenue,,augusta,GA,30350,Noninstitutionalized GQ pop,Female,Black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,0_15064,0_6067,2020-08-11,Jeffrey,J,Deakins,50,05/10/1977,4209,yost ln,,bixby,OK,73013,Same-sex spouse,Male,White
85,0_12124,0_4898,2020-10-06,Raymond,J,Valdez,16,10/22/2003,820,cameron road,,new york cty,NY,14120,Biological child,Male,Asian
86,0_20535,0_4898,2020-10-06,Ronin,K,Valdez,0,08/04/2020,820,cameron road,,new york cty,NY,14120,Biological child,Male,Asian
87,0_4986,0_2040,2020-11-03,Sarah,G,Wilbanks,61,06/28/1959,282,rathford dr,no 682 level 6,brookhaven,NY,14201,Reference person,Female,White


# Does SSN column in 1040 contain both SSNs and ITINs, or are ITIN rows just missing?

Answer from SWEs: The SSN column currently only contains SSNs, with no ITINs. So presumably people with ITINs will just have NaN for their SSN in the current data.

In [74]:
data.taxes_1040.ssn.isna().sum()

509

In [76]:
# Percent of 1040 rows whose SSN is missing (NaN)
100 * data.taxes_1040.ssn.isna().sum() / len(data.taxes_1040)

4.118789448130766

In [78]:
# There should only be 1% missing if missingness were just due to noise
config['taxes_1040']['column_noise']['ssn']

{'leave_blank': {'cell_probability': 0.01},
 'copy_from_household_member': {'cell_probability': 0.01},
 'write_wrong_digits': {'cell_probability': 0.01, 'token_probability': 0.1},
 'make_ocr_errors': {'cell_probability': 0.01, 'token_probability': 0.1},
 'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}}

# Does it work to pass `NO_NOISE` when generating a dataset?

In [80]:
%%time
wic0 = psp.generate_women_infants_and_children(config=psp.NO_NOISE)
wic0

                                                                                                                    

CPU times: user 468 ms, sys: 3.06 ms, total: 471 ms
Wall time: 468 ms




Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,sex,race_ethnicity,year
0,0_58,0_27,Anna,A,Terrell,03071986,20,jill pl,,detroit,MI,49635,Female,White,2020
1,0_725,0_288,Samantha,S,Ohara,07281999,,keemont dr,# 63,auburn,KS,67460,Female,White,2020
2,0_1176,0_474,Stephanie,A,Martinez Burgos,10041996,2509,w venturi dr,,portland,OR,97301,Female,Latino,2020
3,0_1177,0_475,Anna,M,Dixon,11011994,4525,princess anne r,,philadelphia,PA,19124,Female,Black,2020
4,0_1312,0_527,Sara,S,Farrell,11291986,829,second ave,,steamboat sprngs,CO,80204,Female,Latino,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,0_20356,0_6751,Wyatt,H,Lopez,01192020,716,armory drive,,carlsbad,CA,91340,Male,Latino,2020
305,0_20357,0_6765,Benjamin,R,Quigley,01222020,13209,ne milton st,,lauderhill,FL,32563,Male,White,2020
306,0_20362,0_7561,Miles,M,Kinneman,01072020,6258,pinestead dr,,geneva,IL,60540,Male,White,2020
307,0_20363,0_5570,Donovan,P,Hansley,01202020,9618,western ave,,old bridge twp,NJ,08846,Male,Black,2020


In [83]:
%%time
w2_0 = psp.generate_taxes_w2_and_1099(config=psp.NO_NOISE)
w2_0

                                                                                                                    

CPU times: user 1.9 s, sys: 75.8 ms, total: 1.97 s
Wall time: 1.96 s




Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,...,employer_id,employer_name,employer_street_number,employer_street_name,employer_unit_number,employer_city,employer_state,employer_zipcode,tax_form,tax_year
0,0_0,0_6,Gerald,R,Sorrell,70,01/18/1950,,barbara cir,,...,100,Twin Boro Auto Repair Service (USPS),,edgecliff ct,,kingston,NY,12180,W2,2020
1,0_1,0_6,Earline,R,Sorrell,64,12/23/1956,,barbara cir,,...,78,Autism and Bob's Big Louie's,1960,skyview ter,unit 170,houston,TX,78543,W2,2020
2,0_2,0_7,Diana,P,Kelly,26,05/06/1994,5112,145th st,,...,27,The Wicked Pho Licious Burgers,5525,r r property,,independence,MO,65401,W2,2020
3,0_2,0_7,Diana,P,Kelly,26,05/06/1994,5112,145th st,,...,95,Pikes Creek Campground,e,ince dr,,virginia beach,VA,22026,W2,2020
4,0_3,0_7,Anna,A,Kelly,26,09/29/1994,5112,145th st,,...,23,Greenway Med Spa Services,631,regent ave,,mena,AR,71744,W2,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18737,0_20629,0_8599,Jamone,B,Tercero,27,05/01/1993,11001,e johnson ave,,...,104,San Benito Martinez Landscape Supply,4212,morning vista dr,,beavercreek,OH,44281,W2,2020
18738,0_20631,0_8600,Ashley,E,Cooke,33,08/23/1987,,,,...,60,Freeway Insurance Agency,1105,largess ln,,milwaukee,WI,53140,W2,2020
18739,0_20657,0_3505,Yesenia,K,Kilgore,19,10/25/2001,710,n dancer rd,,...,90,Northwell Health Center Inc,,barge roa sw,,washington,DC,20016,1099,2020
18740,0_20659,0_5225,Eric,C,Mcilwee,43,11/11/1977,3195,woodglen dr,,...,41,Aquarium,2916,4th ave w,,citrus heights,CA,92118,W2,2020


In [85]:
w2_0.isna().sum()

simulant_id                          0
household_id                         0
first_name                           0
middle_initial                       0
last_name                            0
age                                  0
date_of_birth                        0
mailing_address_street_number      686
mailing_address_street_name          0
mailing_address_unit_number      17110
mailing_address_po_box           18068
mailing_address_city                 0
mailing_address_state                0
mailing_address_zipcode              0
ssn                                  0
wages                                0
employer_id                          0
employer_name                        0
employer_street_number             765
employer_street_name                 0
employer_unit_number             17831
employer_city                        0
employer_state                       0
employer_zipcode                     0
tax_form                             0
tax_year                 

In [82]:
%%time
t1040_0 = psp.generate_taxes_1040(config=psp.NO_NOISE)
t1040_0

                                                                                                                    

CPU times: user 24.6 s, sys: 41.9 ms, total: 24.6 s
Wall time: 24.6 s




Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,...,dependent_1_ssn,dependent_2_first_name,dependent_2_last_name,dependent_2_ssn,dependent_3_first_name,dependent_3_last_name,dependent_3_ssn,dependent_4_first_name,dependent_4_last_name,dependent_4_ssn
0,0_0,0_6,Gerald,R,Sorrell,,barbara cir,,,williamsburg,...,,,,,,,,,,
1,0_2,0_7,Diana,P,Kelly,5112,145th st,,,portland,...,,,,,,,,,,
2,0_8,0_9,Elizabeth,P,Gonzalez,12,meridian st,,,albany,...,268-18-4020,,,,,,,,,
3,0_14,0_11,Gerald,R,Hutchison,502,n bingham st,,,oakland,...,462-01-8027,Dominic,Hutchison,178-81-4204,,,,,,
4,0_17,0_12,Gerald,R,Mcintyre,2105,hyde park ln,,,s bend,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12353,0_20750,0_5,Michael,E,Longley,4000,nw skycrest pkwy,,,new caney,...,,,,,,,,,,
12354,0_20751,0_3336,Brody,J,Dominguez,120,32 avenue,,,e brunswick,...,,,,,,,,,,
12355,0_20753,0_2825,Jennifer,T,Concepcion,4920,10th way,,,painesville,...,,,,,,,,,,
12356,0_20754,0_6380,Jazmine,A,Hughes,2690,norh e 65th street,,,birmingham,...,,,,,,,,,,


In [84]:
t1040_0.isna().sum()

simulant_id                          0
household_id                         0
first_name                           0
middle_initial                       0
last_name                            0
mailing_address_street_number      485
mailing_address_street_name          0
mailing_address_unit_number      11231
mailing_address_po_box           11909
mailing_address_city                 0
mailing_address_state                0
mailing_address_zipcode              0
ssn                                401
tax_year                             0
spouse_first_name                11101
spouse_middle_initial            11101
spouse_last_name                 11101
spouse_ssn                       11172
dependent_1_first_name            9927
dependent_1_last_name             9927
dependent_1_ssn                   9948
dependent_2_first_name           11076
dependent_2_last_name            11076
dependent_2_ssn                  11084
dependent_3_first_name           11900
dependent_3_last_name    

# Try generating all datasets with no noise

In [87]:
%%time
data0 = generate_data(config=psp.NO_NOISE)
data0.keylist()

                                                                                                                    

CPU times: user 29.3 s, sys: 209 ms, total: 29.5 s
Wall time: 29.5 s




['american_community_survey',
 'current_population_survey',
 'decennial_census',
 'social_security',
 'taxes_1040',
 'taxes_w2_and_1099',
 'women_infants_and_children']

In [88]:
data0.women_infants_and_children.equals(wic0)

True

In [91]:
data0.taxes_1040.equals(t1040_0)

True

In [93]:
data0.taxes_1040.equals(data.taxes_1040)

False

# Check how noisy some datasets are

In [115]:
alpha.percent_of_rows_with_difference(data.women_infants_and_children, data0.women_infants_and_children)

21.035598705501616

In [99]:
alpha.percent_different_in_columns(data.women_infants_and_children, data0.women_infants_and_children)

simulant_id        0.000000
household_id       0.000000
first_name         2.912621
middle_initial     1.618123
last_name          1.941748
date_of_birth      3.883495
street_number      3.883495
street_name        2.588997
unit_number       94.174757
city               3.559871
state              1.941748
zipcode            1.294498
sex                2.265372
race_ethnicity     0.647249
year               0.000000
dtype: float64

In [113]:
alpha.percent_different_in_columns(data.women_infants_and_children, data0.women_infants_and_children)

simulant_id       0.000000
household_id      0.000000
first_name        2.912621
middle_initial    1.618123
last_name         1.941748
date_of_birth     3.883495
street_number     0.647249
street_name       2.588997
unit_number       0.000000
city              3.559871
state             1.941748
zipcode           1.294498
sex               2.265372
race_ethnicity    0.647249
year              0.000000
dtype: float64

In [114]:
alpha.percent_different_in_columns(data.taxes_1040, data0.taxes_1040)

simulant_id                      0.000000
household_id                     0.000000
first_name                       3.892216
middle_initial                   1.019582
last_name                        3.341965
mailing_address_street_number    1.764039
mailing_address_street_name      2.767438
mailing_address_unit_number      0.080919
mailing_address_po_box           0.097103
mailing_address_city             2.468037
mailing_address_state            1.925878
mailing_address_zipcode          2.492313
ssn                              3.261045
tax_year                         0.000000
spouse_first_name                0.461240
spouse_middle_initial            0.089011
spouse_last_name                 0.364137
spouse_ssn                       1.302800
dependent_1_first_name           0.712089
dependent_1_last_name            0.550251
dependent_1_ssn                  1.715488
dependent_2_first_name           0.404596
dependent_2_last_name            0.388412
dependent_2_ssn                  1

# Write a function to check percent missingness

In [94]:
def percent_missing(df):
    """Return the percentage of missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose cells are checked with ``isna()``.

    Returns
    -------
    pandas.Series
        Indexed by column name; each entry is 100 times the count of NA
        cells in that column divided by the number of rows in ``df``.
    """
    n_rows = len(df)
    na_counts = df.isna().sum()
    return 100 * na_counts / n_rows

In [95]:
percent_missing(data.taxes_1040)

simulant_id                       0.000000
household_id                      0.000000
first_name                        0.995307
middle_initial                    0.890112
last_name                         1.027674
mailing_address_street_number     4.976533
mailing_address_street_name       1.043858
mailing_address_unit_number      90.904677
mailing_address_po_box           96.382910
mailing_address_city              0.962939
mailing_address_state             0.954847
mailing_address_zipcode           1.043858
ssn                               4.118789
tax_year                          0.000000
spouse_first_name                89.933646
spouse_middle_initial            89.917462
spouse_last_name                 89.933646
spouse_ssn                       90.516265
dependent_1_first_name           80.538922
dependent_1_last_name            80.522738
dependent_1_ssn                  80.725036
dependent_2_first_name           89.771808
dependent_2_last_name            89.731348
dependent_2

# Test module functions after copying code to `alpha.py`

In [111]:
%%time
# Exercise the module version of the loader: generate every dataset with the
# psp.NO_NOISE config, then list the dataset names it returned.
# (Keep %%time as the first line of the cell -- cell magics must come first.)
data00 = alpha.generate_datasets(config=psp.NO_NOISE)
data00.keylist()

                                                                                                                    

CPU times: user 30 s, sys: 140 ms, total: 30.1 s
Wall time: 30.1 s




['american_community_survey',
 'current_population_survey',
 'decennial_census',
 'social_security',
 'taxes_1040',
 'taxes_w2_and_1099',
 'women_infants_and_children']

In [101]:
# Eyeball the ACS table produced by the NO_NOISE run (datasets are exposed as attributes).
data00.american_community_survey

Unnamed: 0,simulant_id,household_id,survey_date,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relationship_to_reference_person,sex,race_ethnicity
0,0_6874,0_2799,2020-01-28,Heather,J,Howard,45,11/21/1974,3670,pearl dr,,lawrence,KS,66046,Opp-sex spouse,Female,White
1,0_6875,0_2799,2020-01-28,Cristina,E,Howard,9,09/12/2010,3670,pearl dr,,lawrence,KS,66046,Biological child,Female,White
2,0_6876,0_2799,2020-01-28,Chloe,G,Howard,8,11/23/2011,3670,pearl dr,,lawrence,KS,66046,Biological child,Female,White
3,0_1804,0_3,2020-04-21,Benjamin,M,Mcmillon,21,01/09/1999,8203,west farwell avenue,,augusta,GA,30350,Noninstitutionalized GQ pop,Male,White
4,0_1928,0_3,2020-04-21,Lucille,C,Smith,81,12/10/1938,8203,west farwell avenue,,augusta,GA,30350,Noninstitutionalized GQ pop,Female,Black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,0_15064,0_6067,2020-08-11,Jeffrey,J,Deakins,50,08/06/1969,4209,yost ln,,bixby,OK,73013,Same-sex spouse,Male,White
85,0_12124,0_4898,2020-10-06,Raymond,J,Valdez,16,10/22/2003,820,cameron road,,new york cty,NY,14120,Biological child,Male,Asian
86,0_20535,0_4898,2020-10-06,Ronin,K,Valdez,0,08/04/2020,820,cameron road,,new york cty,NY,14120,Biological child,Male,Asian
87,0_4986,0_2040,2020-11-03,Sarah,G,Wilbanks,61,06/28/1959,282,rathford dr,no 682 level 6,brookhaven,NY,14201,Reference person,Female,White


In [102]:
# Fetch the zero-noise configuration from the alpha module and display it;
# the visible part of the output shows row/cell probabilities set to 0.
config0 = alpha.get_zero_noise_config()
config0

{'decennial_census': {'row_noise': {'do_not_respond': {'row_probability': 0}},
  'column_noise': {'first_name': {'leave_blank': {'cell_probability': 0},
    'use_nickname': {'cell_probability': 0},
    'use_fake_name': {'cell_probability': 0},
    'make_phonetic_errors': {'cell_probability': 0, 'token_probability': 0.1},
    'make_ocr_errors': {'cell_probability': 0, 'token_probability': 0.1},
    'make_typos': {'cell_probability': 0, 'token_probability': 0.1}},
   'middle_initial': {'leave_blank': {'cell_probability': 0},
    'make_phonetic_errors': {'cell_probability': 0, 'token_probability': 0.1},
    'make_ocr_errors': {'cell_probability': 0, 'token_probability': 0.1},
    'make_typos': {'cell_probability': 0, 'token_probability': 0.1}},
   'last_name': {'leave_blank': {'cell_probability': 0},
    'use_fake_name': {'cell_probability': 0},
    'make_phonetic_errors': {'cell_probability': 0, 'token_probability': 0.1},
    'make_ocr_errors': {'cell_probability': 0, 'token_probability'

In [104]:
# Look the generator up by its string name and echo it to confirm that it
# resolves to pseudopeople.interface.generate_taxes_1040.
f = getattr(psp, 'generate_taxes_1040')
f

<function pseudopeople.interface.generate_taxes_1040(source: Union[pathlib.Path, str] = None, seed: int = 0, config: Union[pathlib.Path, str, Dict[str, Dict]] = None, year: Optional[int] = 2020, state: Optional[str] = None, verbose: bool = False) -> pandas.core.frame.DataFrame>

In [109]:
# Confirm the looked-up object is a plain Python function.
f.__class__

function

In [116]:
conda list

# packages in environment at /ihme/homes/ndbs/miniconda3/envs/pppl310:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
alabaster                 0.7.13                   pypi_0    pypi
anyio                     3.7.1                    pypi_0    pypi
argon2-cffi               21.3.0                   pypi_0    pypi
argon2-cffi-bindings      21.2.0                   pypi_0    pypi
arrow                     1.2.3                    pypi_0    pypi
asttokens                 2.2.1                    pypi_0    pypi
async-lru                 2.0.4                    pypi_0    pypi
attrs                     23.1.0                   pypi_0    pypi
babel                     2.12.1                   pypi_0    pypi
backcall                  0.2.0                    pypi_0    pypi
beautifulsoup4            4.12.2                   pyp

In [117]:
# Call the 1040 generator directly with all default arguments (no alpha helper involved)
# to see what pseudopeople returns on its own.
psp.generate_taxes_1040()

                                                                                                                    

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,...,dependent_1_ssn,dependent_2_first_name,dependent_2_last_name,dependent_2_ssn,dependent_3_first_name,dependent_3_last_name,dependent_3_ssn,dependent_4_first_name,dependent_4_last_name,dependent_4_ssn
0,0_0,0_6,Gerald,R,Sorrell,,barbara cir,,,williamsburg,...,,,,,,,,,,
1,0_2,0_7,Diana,P,Kelly,5112,,,,portland,...,,,,,,,,,,
2,0_8,0_9,Elizabeth,P,Gonzalez,12,meridian st,,,albany,...,395-30-9975,,,,,,,,,
3,0_14,0_11,Gerald,R,Hutchison,502,n bingham st,,,oakland,...,462-01-8027,Dominic,Hutchison,178-81-4204,,,,,,
4,0_17,0_12,Gerald,R,Mcintyre,2105,hyde park ln,,,s bend,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12353,0_20750,0_5,Michael,E,Longley,4000,nw skycrest pkwy,,,new caney,...,,,,,,,,,,
12354,0_20751,0_3336,Brody,J,Dominguez,120,32 avenue,,,e brunswick,...,,,,,,,,,,
12355,0_20753,0_2825,Jennifer,T,Concepcion,4920,10th way,,,painesville,...,,,,,,,,,,
12356,0_20754,0_6380,Jazmine,A,Hughes,2690,norh e 65th street,,,birmingham,...,,,,,,,,,,


In [118]:
conda_env_other = """
# packages in environment at /ihme/homes/ndbs/miniconda3/envs/pppl310:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
alabaster                 0.7.13                   pypi_0    pypi
anyio                     3.7.1                    pypi_0    pypi
argon2-cffi               21.3.0                   pypi_0    pypi
argon2-cffi-bindings      21.2.0                   pypi_0    pypi
arrow                     1.2.3                    pypi_0    pypi
asttokens                 2.2.1                    pypi_0    pypi
async-lru                 2.0.4                    pypi_0    pypi
attrs                     23.1.0                   pypi_0    pypi
babel                     2.12.1                   pypi_0    pypi
backcall                  0.2.0                    pypi_0    pypi
beautifulsoup4            4.12.2                   pypi_0    pypi
bleach                    6.0.0                    pypi_0    pypi
blosc2                    2.0.0                    pypi_0    pypi
bzip2                     1.0.8                h7f98852_4    conda-forge
ca-certificates           2023.7.22            hbcca054_0    conda-forge
certifi                   2023.7.22                pypi_0    pypi
cffi                      1.15.1                   pypi_0    pypi
charset-normalizer        3.2.0                    pypi_0    pypi
click                     8.1.6                    pypi_0    pypi
comm                      0.1.3                    pypi_0    pypi
contourpy                 1.1.0                    pypi_0    pypi
cycler                    0.11.0                   pypi_0    pypi
cython                    3.0.0                    pypi_0    pypi
debugpy                   1.6.7                    pypi_0    pypi
decorator                 5.1.1                    pypi_0    pypi
defusedxml                0.7.1                    pypi_0    pypi
docutils                  0.18.1                   pypi_0    pypi
exceptiongroup            1.1.2                    pypi_0    pypi
executing                 1.2.0                    pypi_0    pypi
fastjsonschema            2.18.0                   pypi_0    pypi
fonttools                 4.41.1                   pypi_0    pypi
fqdn                      1.5.1                    pypi_0    pypi
idna                      3.4                      pypi_0    pypi
imagesize                 1.4.1                    pypi_0    pypi
iniconfig                 2.0.0                    pypi_0    pypi
ipykernel                 6.25.0                   pypi_0    pypi
ipython                   8.14.0                   pypi_0    pypi
ipython-genutils          0.2.0                    pypi_0    pypi
ipywidgets                8.0.7                    pypi_0    pypi
isoduration               20.11.0                  pypi_0    pypi
jedi                      0.18.2                   pypi_0    pypi
jinja2                    3.1.2                    pypi_0    pypi
json5                     0.9.14                   pypi_0    pypi
jsonpointer               2.4                      pypi_0    pypi
jsonschema                4.18.4                   pypi_0    pypi
jsonschema-specifications 2023.7.1                 pypi_0    pypi
jupyter                   1.0.0                    pypi_0    pypi
jupyter-client            8.3.0                    pypi_0    pypi
jupyter-console           6.6.3                    pypi_0    pypi
jupyter-core              5.3.1                    pypi_0    pypi
jupyter-events            0.6.3                    pypi_0    pypi
jupyter-lsp               2.2.0                    pypi_0    pypi
jupyter-server            2.7.0                    pypi_0    pypi
jupyter-server-terminals  0.4.4                    pypi_0    pypi
jupyterlab                4.0.3                    pypi_0    pypi
jupyterlab-pygments       0.2.2                    pypi_0    pypi
jupyterlab-server         2.24.0                   pypi_0    pypi
jupyterlab-widgets        3.0.8                    pypi_0    pypi
kiwisolver                1.4.4                    pypi_0    pypi
ld_impl_linux-64          2.40                 h41732ed_0    conda-forge
libffi                    3.4.2                h7f98852_5    conda-forge
libgcc-ng                 13.1.0               he5830b7_0    conda-forge
libgomp                   13.1.0               he5830b7_0    conda-forge
libnsl                    2.0.0                h7f98852_0    conda-forge
libsqlite                 3.42.0               h2797004_0    conda-forge
libuuid                   2.38.1               h0b41bf4_0    conda-forge
libzlib                   1.2.13               hd590300_5    conda-forge
loguru                    0.7.0                    pypi_0    pypi
markupsafe                2.1.3                    pypi_0    pypi
matplotlib                3.7.2                    pypi_0    pypi
matplotlib-inline         0.1.6                    pypi_0    pypi
mistune                   3.0.1                    pypi_0    pypi
msgpack                   1.0.5                    pypi_0    pypi
nbclient                  0.8.0                    pypi_0    pypi
nbconvert                 7.7.3                    pypi_0    pypi
nbformat                  5.9.1                    pypi_0    pypi
ncurses                   6.4                  hcb278e6_0    conda-forge
nest-asyncio              1.5.6                    pypi_0    pypi
networkx                  3.1                      pypi_0    pypi
notebook                  7.0.0                    pypi_0    pypi
notebook-shim             0.2.3                    pypi_0    pypi
numexpr                   2.8.4                    pypi_0    pypi
numpy                     1.25.1                   pypi_0    pypi
openssl                   3.1.1                hd590300_1    conda-forge
overrides                 7.3.1                    pypi_0    pypi
packaging                 23.1                     pypi_0    pypi
pandas                    1.5.3                    pypi_0    pypi
pandocfilters             1.5.0                    pypi_0    pypi
parso                     0.8.3                    pypi_0    pypi
pexpect                   4.8.0                    pypi_0    pypi
pickleshare               0.7.5                    pypi_0    pypi
pillow                    10.0.0                   pypi_0    pypi
pip                       23.2.1             pyhd8ed1ab_0    conda-forge
platformdirs              3.9.1                    pypi_0    pypi
pluggy                    1.2.0                    pypi_0    pypi
prometheus-client         0.17.1                   pypi_0    pypi
prompt-toolkit            3.0.39                   pypi_0    pypi
pseudopeople              0.6.5                    pypi_0    pypi
psutil                    5.9.5                    pypi_0    pypi
ptyprocess                0.7.0                    pypi_0    pypi
pure-eval                 0.2.2                    pypi_0    pypi
py-cpuinfo                9.0.0                    pypi_0    pypi
pyarrow                   12.0.1                   pypi_0    pypi
pycparser                 2.21                     pypi_0    pypi
pygments                  2.15.1                   pypi_0    pypi
pyparsing                 3.0.9                    pypi_0    pypi
pytest                    7.4.0                    pypi_0    pypi
pytest-mock               3.11.1                   pypi_0    pypi
python                    3.10.12         hd12c33a_0_cpython    conda-forge
python-dateutil           2.8.2                    pypi_0    pypi
python-json-logger        2.0.7                    pypi_0    pypi
pytz                      2023.3                   pypi_0    pypi
pyyaml                    6.0.1                    pypi_0    pypi
pyzmq                     25.1.0                   pypi_0    pypi
qtconsole                 5.4.3                    pypi_0    pypi
qtpy                      2.3.1                    pypi_0    pypi
readline                  8.2                  h8228510_1    conda-forge
referencing               0.30.0                   pypi_0    pypi
requests                  2.31.0                   pypi_0    pypi
rfc3339-validator         0.1.4                    pypi_0    pypi
rfc3986-validator         0.1.1                    pypi_0    pypi
rpds-py                   0.9.2                    pypi_0    pypi
scipy                     1.11.1                   pypi_0    pypi
send2trash                1.8.2                    pypi_0    pypi
setuptools                68.0.0             pyhd8ed1ab_0    conda-forge
six                       1.16.0                   pypi_0    pypi
sniffio                   1.3.0                    pypi_0    pypi
snowballstemmer           2.2.0                    pypi_0    pypi
soupsieve                 2.4.1                    pypi_0    pypi
sphinx                    6.2.1                    pypi_0    pypi
sphinx-click              4.4.0                    pypi_0    pypi
sphinx-rtd-theme          1.2.2                    pypi_0    pypi
sphinxcontrib-applehelp   1.0.4                    pypi_0    pypi
sphinxcontrib-devhelp     1.0.2                    pypi_0    pypi
sphinxcontrib-htmlhelp    2.0.1                    pypi_0    pypi
sphinxcontrib-jquery      4.1                      pypi_0    pypi
sphinxcontrib-jsmath      1.0.1                    pypi_0    pypi
sphinxcontrib-qthelp      1.0.3                    pypi_0    pypi
sphinxcontrib-serializinghtml 1.1.5                    pypi_0    pypi
stack-data                0.6.2                    pypi_0    pypi
tables                    3.8.0                    pypi_0    pypi
terminado                 0.17.1                   pypi_0    pypi
tinycss2                  1.2.1                    pypi_0    pypi
tk                        8.6.12               h27826a3_0    conda-forge
tomli                     2.0.1                    pypi_0    pypi
tornado                   6.3.2                    pypi_0    pypi
tqdm                      4.65.0                   pypi_0    pypi
traitlets                 5.9.0                    pypi_0    pypi
typing-extensions         4.7.1                    pypi_0    pypi
tzdata                    2023.3                   pypi_0    pypi
uri-template              1.3.0                    pypi_0    pypi
urllib3                   2.0.4                    pypi_0    pypi
vivarium                  1.2.1                    pypi_0    pypi
vivarium-research-prl     0.0.1                     dev_0    <develop>
wcwidth                   0.2.6                    pypi_0    pypi
webcolors                 1.13                     pypi_0    pypi
webencodings              0.5.1                    pypi_0    pypi
websocket-client          1.6.1                    pypi_0    pypi
wheel                     0.41.0             pyhd8ed1ab_0    conda-forge
widgetsnbextension        4.0.8                    pypi_0    pypi
xz                        5.2.6                h166bdaf_0    conda-forge

Note: you may need to restart the kernel to use updated packages.
"""

In [119]:
conda_env = """
# packages in environment at /ihme/homes/ndbs/miniconda3/envs/pppl310:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
alabaster                 0.7.13                   pypi_0    pypi
anyio                     3.7.1                    pypi_0    pypi
argon2-cffi               21.3.0                   pypi_0    pypi
argon2-cffi-bindings      21.2.0                   pypi_0    pypi
arrow                     1.2.3                    pypi_0    pypi
asttokens                 2.2.1                    pypi_0    pypi
async-lru                 2.0.4                    pypi_0    pypi
attrs                     23.1.0                   pypi_0    pypi
babel                     2.12.1                   pypi_0    pypi
backcall                  0.2.0                    pypi_0    pypi
beautifulsoup4            4.12.2                   pypi_0    pypi
bleach                    6.0.0                    pypi_0    pypi
blosc2                    2.0.0                    pypi_0    pypi
bzip2                     1.0.8                h7f98852_4    conda-forge
ca-certificates           2023.7.22            hbcca054_0    conda-forge
certifi                   2023.7.22                pypi_0    pypi
cffi                      1.15.1                   pypi_0    pypi
charset-normalizer        3.2.0                    pypi_0    pypi
click                     8.1.6                    pypi_0    pypi
comm                      0.1.3                    pypi_0    pypi
contourpy                 1.1.0                    pypi_0    pypi
cycler                    0.11.0                   pypi_0    pypi
cython                    3.0.0                    pypi_0    pypi
debugpy                   1.6.7                    pypi_0    pypi
decorator                 5.1.1                    pypi_0    pypi
defusedxml                0.7.1                    pypi_0    pypi
docutils                  0.18.1                   pypi_0    pypi
exceptiongroup            1.1.2                    pypi_0    pypi
executing                 1.2.0                    pypi_0    pypi
fastjsonschema            2.18.0                   pypi_0    pypi
fonttools                 4.41.1                   pypi_0    pypi
fqdn                      1.5.1                    pypi_0    pypi
idna                      3.4                      pypi_0    pypi
imagesize                 1.4.1                    pypi_0    pypi
iniconfig                 2.0.0                    pypi_0    pypi
ipykernel                 6.25.0                   pypi_0    pypi
ipython                   8.14.0                   pypi_0    pypi
ipython-genutils          0.2.0                    pypi_0    pypi
ipywidgets                8.0.7                    pypi_0    pypi
isoduration               20.11.0                  pypi_0    pypi
jedi                      0.18.2                   pypi_0    pypi
jinja2                    3.1.2                    pypi_0    pypi
json5                     0.9.14                   pypi_0    pypi
jsonpointer               2.4                      pypi_0    pypi
jsonschema                4.18.4                   pypi_0    pypi
jsonschema-specifications 2023.7.1                 pypi_0    pypi
jupyter                   1.0.0                    pypi_0    pypi
jupyter-client            8.3.0                    pypi_0    pypi
jupyter-console           6.6.3                    pypi_0    pypi
jupyter-core              5.3.1                    pypi_0    pypi
jupyter-events            0.6.3                    pypi_0    pypi
jupyter-lsp               2.2.0                    pypi_0    pypi
jupyter-server            2.7.0                    pypi_0    pypi
jupyter-server-terminals  0.4.4                    pypi_0    pypi
jupyterlab                4.0.3                    pypi_0    pypi
jupyterlab-pygments       0.2.2                    pypi_0    pypi
jupyterlab-server         2.24.0                   pypi_0    pypi
jupyterlab-widgets        3.0.8                    pypi_0    pypi
kiwisolver                1.4.4                    pypi_0    pypi
ld_impl_linux-64          2.40                 h41732ed_0    conda-forge
libffi                    3.4.2                h7f98852_5    conda-forge
libgcc-ng                 13.1.0               he5830b7_0    conda-forge
libgomp                   13.1.0               he5830b7_0    conda-forge
libnsl                    2.0.0                h7f98852_0    conda-forge
libsqlite                 3.42.0               h2797004_0    conda-forge
libuuid                   2.38.1               h0b41bf4_0    conda-forge
libzlib                   1.2.13               hd590300_5    conda-forge
loguru                    0.7.0                    pypi_0    pypi
markupsafe                2.1.3                    pypi_0    pypi
matplotlib                3.7.2                    pypi_0    pypi
matplotlib-inline         0.1.6                    pypi_0    pypi
mistune                   3.0.1                    pypi_0    pypi
msgpack                   1.0.5                    pypi_0    pypi
nbclient                  0.8.0                    pypi_0    pypi
nbconvert                 7.7.3                    pypi_0    pypi
nbformat                  5.9.1                    pypi_0    pypi
ncurses                   6.4                  hcb278e6_0    conda-forge
nest-asyncio              1.5.6                    pypi_0    pypi
networkx                  3.1                      pypi_0    pypi
notebook                  7.0.0                    pypi_0    pypi
notebook-shim             0.2.3                    pypi_0    pypi
numexpr                   2.8.4                    pypi_0    pypi
numpy                     1.25.1                   pypi_0    pypi
openssl                   3.1.1                hd590300_1    conda-forge
overrides                 7.3.1                    pypi_0    pypi
packaging                 23.1                     pypi_0    pypi
pandas                    1.5.3                    pypi_0    pypi
pandocfilters             1.5.0                    pypi_0    pypi
parso                     0.8.3                    pypi_0    pypi
pexpect                   4.8.0                    pypi_0    pypi
pickleshare               0.7.5                    pypi_0    pypi
pillow                    10.0.0                   pypi_0    pypi
pip                       23.2.1             pyhd8ed1ab_0    conda-forge
platformdirs              3.9.1                    pypi_0    pypi
pluggy                    1.2.0                    pypi_0    pypi
prometheus-client         0.17.1                   pypi_0    pypi
prompt-toolkit            3.0.39                   pypi_0    pypi
pseudopeople              0.6.5                    pypi_0    pypi
psutil                    5.9.5                    pypi_0    pypi
ptyprocess                0.7.0                    pypi_0    pypi
pure-eval                 0.2.2                    pypi_0    pypi
py-cpuinfo                9.0.0                    pypi_0    pypi
pyarrow                   12.0.1                   pypi_0    pypi
pycparser                 2.21                     pypi_0    pypi
pygments                  2.15.1                   pypi_0    pypi
pyparsing                 3.0.9                    pypi_0    pypi
pytest                    7.4.0                    pypi_0    pypi
pytest-mock               3.11.1                   pypi_0    pypi
python                    3.10.12         hd12c33a_0_cpython    conda-forge
python-dateutil           2.8.2                    pypi_0    pypi
python-json-logger        2.0.7                    pypi_0    pypi
pytz                      2023.3                   pypi_0    pypi
pyyaml                    6.0.1                    pypi_0    pypi
pyzmq                     25.1.0                   pypi_0    pypi
qtconsole                 5.4.3                    pypi_0    pypi
qtpy                      2.3.1                    pypi_0    pypi
readline                  8.2                  h8228510_1    conda-forge
referencing               0.30.0                   pypi_0    pypi
requests                  2.31.0                   pypi_0    pypi
rfc3339-validator         0.1.4                    pypi_0    pypi
rfc3986-validator         0.1.1                    pypi_0    pypi
rpds-py                   0.9.2                    pypi_0    pypi
scipy                     1.11.1                   pypi_0    pypi
send2trash                1.8.2                    pypi_0    pypi
setuptools                68.0.0             pyhd8ed1ab_0    conda-forge
six                       1.16.0                   pypi_0    pypi
sniffio                   1.3.0                    pypi_0    pypi
snowballstemmer           2.2.0                    pypi_0    pypi
soupsieve                 2.4.1                    pypi_0    pypi
sphinx                    6.2.1                    pypi_0    pypi
sphinx-click              4.4.0                    pypi_0    pypi
sphinx-rtd-theme          1.2.2                    pypi_0    pypi
sphinxcontrib-applehelp   1.0.4                    pypi_0    pypi
sphinxcontrib-devhelp     1.0.2                    pypi_0    pypi
sphinxcontrib-htmlhelp    2.0.1                    pypi_0    pypi
sphinxcontrib-jquery      4.1                      pypi_0    pypi
sphinxcontrib-jsmath      1.0.1                    pypi_0    pypi
sphinxcontrib-qthelp      1.0.3                    pypi_0    pypi
sphinxcontrib-serializinghtml 1.1.5                    pypi_0    pypi
stack-data                0.6.2                    pypi_0    pypi
tables                    3.8.0                    pypi_0    pypi
terminado                 0.17.1                   pypi_0    pypi
tinycss2                  1.2.1                    pypi_0    pypi
tk                        8.6.12               h27826a3_0    conda-forge
tomli                     2.0.1                    pypi_0    pypi
tornado                   6.3.2                    pypi_0    pypi
tqdm                      4.65.0                   pypi_0    pypi
traitlets                 5.9.0                    pypi_0    pypi
typing-extensions         4.7.1                    pypi_0    pypi
tzdata                    2023.3                   pypi_0    pypi
uri-template              1.3.0                    pypi_0    pypi
urllib3                   2.0.4                    pypi_0    pypi
vivarium                  1.2.1                    pypi_0    pypi
vivarium-research-prl     0.0.1                     dev_0    <develop>
wcwidth                   0.2.6                    pypi_0    pypi
webcolors                 1.13                     pypi_0    pypi
webencodings              0.5.1                    pypi_0    pypi
websocket-client          1.6.1                    pypi_0    pypi
wheel                     0.41.0             pyhd8ed1ab_0    conda-forge
widgetsnbextension        4.0.8                    pypi_0    pypi
xz                        5.2.6                h166bdaf_0    conda-forge

Note: you may need to restart the kernel to use updated packages.
"""

In [120]:
# Check whether the two pasted `conda list` dumps are character-for-character identical.
conda_env == conda_env_other

True