In [1]:
import pseudopeople as psp
import pandas as pd, numpy as np

from vivarium_research_prl.utils import sizemb, MappingViaAttributes, build_full_address
from vivarium_research_prl import alpha

!date
!whoami
!uname -a
!python --version
!pwd

Tue 05 Sep 2023 02:23:00 PM PDT
ndbs
Linux long-slurm-sarchive-p0005 5.4.0-135-generic #152-Ubuntu SMP Wed Nov 23 20:19:22 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
Python 3.10.12
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing


In [2]:
%load_ext autoreload
%autoreload 2

# Generate all sample datasets and list columns in each

In [3]:
%%time
data = alpha.generate_datasets()
data.keylist()

                                                                                                

CPU times: user 14.7 s, sys: 211 ms, total: 14.9 s
Wall time: 15.1 s




['american_community_survey',
 'current_population_survey',
 'decennial_census',
 'social_security',
 'taxes_1040',
 'taxes_w2_and_1099',
 'women_infants_and_children']

In [4]:
{name: df.dtypes for name, df in data.items()}

{'american_community_survey': simulant_id                                 object
 household_id                                object
 survey_date                         datetime64[ns]
 first_name                                  object
 middle_initial                              object
 last_name                                   object
 age                                         object
 date_of_birth                               object
 street_number                               object
 street_name                                 object
 unit_number                                 object
 city                                        object
 state                                     category
 zipcode                                     object
 housing_type                                object
 relationship_to_reference_person          category
 sex                                       category
 race_ethnicity                            category
 dtype: object,
 'current_populatio

# Check whether pseudopeople errors are now `Exception`s instead of `BaseException`s

Yes. Good.

In [5]:
issubclass(psp.exceptions.DataSourceError, Exception)

True

# Get a default configuration

In [6]:
config = psp.get_config()
config.keys()

dict_keys(['decennial_census', 'american_community_survey', 'current_population_survey', 'women_infants_and_children', 'social_security', 'taxes_w2_and_1099', 'taxes_1040'])

In [7]:
config['taxes_w2_and_1099']

{'row_noise': {'omit_row': {'row_probability': 0.005}},
 'column_noise': {'ssn': {'leave_blank': {'cell_probability': 0.01},
   'copy_from_household_member': {'cell_probability': 0.0},
   'write_wrong_digits': {'cell_probability': 0.01, 'token_probability': 0.1},
   'make_ocr_errors': {'cell_probability': 0.01, 'token_probability': 0.1},
   'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
  'wages': {'leave_blank': {'cell_probability': 0.01},
   'write_wrong_digits': {'cell_probability': 0.01, 'token_probability': 0.1},
   'make_ocr_errors': {'cell_probability': 0.01, 'token_probability': 0.1},
   'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
  'employer_name': {'leave_blank': {'cell_probability': 0.01},
   'make_ocr_errors': {'cell_probability': 0.01, 'token_probability': 0.1},
   'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
  'employer_street_number': {'leave_blank': {'cell_probability': 0.01},
   'write_wrong_digits'

# Write a function to flatten the config dictionary

In [None]:
# def find(key, d):
#     instances = []
#     def _find(key, d):
#         if d == key:
            

In [74]:
def f():
    """Show that a nested function can rebind a local of its enclosing function.

    The inner function prints the shared counter, increments it through
    ``nonlocal`` and calls itself again until the counter has reached 6,
    so the call prints 4, 5 and 6 on separate lines.
    """
    counter = 4

    def step():
        # `nonlocal` makes the increment below rebind f's `counter`
        # instead of creating a new local inside `step`.
        nonlocal counter
        print(counter)
        if counter < 6:
            counter += 1
            step()

    step()

f()

4
5
6


In [49]:
def keys_to_tuples(d, max_depth=None):
    """Flatten a nested dictionary into a single dictionary keyed by tuples.

    Each key of the result is the tuple of keys leading from the top of ``d``
    down to a value, and the associated value is that value.

    Parameters
    ----------
    d : dict
        The nested dictionary to flatten. A non-dict argument is returned
        wrapped as ``{(): d}``.
    max_depth : int, optional
        Maximum length of the key tuples. Once a path contains ``max_depth``
        keys, the value found there is stored as-is even if it is itself a
        dict. This keeps dict-valued leaves intact instead of flattening
        them too. The default ``None`` flattens all the way down.

    Returns
    -------
    dict
        A new dict mapping key-path tuples to values. An empty nested dict
        contributes no entries unless it sits exactly at ``max_depth``.

    Raises
    ------
    ValueError
        If ``max_depth`` is given and is smaller than 1.
    """
    if max_depth is not None and max_depth < 1:
        raise ValueError(
            f"max_depth must be a positive integer or None, got {max_depth!r}"
        )

    flattened = {}

    def _flatten(dict_or_val, path):
        depth_reached = max_depth is not None and len(path) >= max_depth
        if depth_reached or not isinstance(dict_or_val, dict):
            flattened[path] = dict_or_val
            return
        for key, val in dict_or_val.items():
            # Build a new tuple per level so no shared mutable path state
            # has to be pushed and popped around the recursive call.
            _flatten(val, path + (key,))

    _flatten(d, ())
    return flattened

In [50]:
flattened = keys_to_tuples(config['taxes_w2_and_1099'])
len(flattened)

128

In [51]:
set(len(k) for k in flattened)

{3, 4}

In [52]:
[k for k in flattened]

[('row_noise', 'omit_row', 'row_probability'),
 ('column_noise', 'ssn', 'leave_blank', 'cell_probability'),
 ('column_noise', 'ssn', 'copy_from_household_member', 'cell_probability'),
 ('column_noise', 'ssn', 'write_wrong_digits', 'cell_probability'),
 ('column_noise', 'ssn', 'write_wrong_digits', 'token_probability'),
 ('column_noise', 'ssn', 'make_ocr_errors', 'cell_probability'),
 ('column_noise', 'ssn', 'make_ocr_errors', 'token_probability'),
 ('column_noise', 'ssn', 'make_typos', 'cell_probability'),
 ('column_noise', 'ssn', 'make_typos', 'token_probability'),
 ('column_noise', 'wages', 'leave_blank', 'cell_probability'),
 ('column_noise', 'wages', 'write_wrong_digits', 'cell_probability'),
 ('column_noise', 'wages', 'write_wrong_digits', 'token_probability'),
 ('column_noise', 'wages', 'make_ocr_errors', 'cell_probability'),
 ('column_noise', 'wages', 'make_ocr_errors', 'token_probability'),
 ('column_noise', 'wages', 'make_typos', 'cell_probability'),
 ('column_noise', 'wages',

# Convert the flattened config into a Series with a MultiIndex

In [54]:
s = pd.Series(flattened)
s

row_noise     omit_row                 row_probability               0.005
column_noise  ssn                      leave_blank                    0.01
                                       copy_from_household_member      0.0
                                       write_wrong_digits             0.01
                                       write_wrong_digits              0.1
                                                                     ...  
              mailing_address_zipcode  make_ocr_errors                 0.1
                                       make_typos                     0.01
                                       make_typos                      0.1
              tax_form                 leave_blank                    0.01
                                       choose_wrong_option            0.01
Length: 128, dtype: object

In [56]:
s.index.nlevels

3

In [59]:
cdict = {t: val for t, val in flattened.items() if 'column_noise' in t}
len(cdict)

127

In [60]:
cs = pd.Series(cdict)
cs

column_noise  ssn                      leave_blank                 cell_probability     0.01
                                       copy_from_household_member  cell_probability      0.0
                                       write_wrong_digits          cell_probability     0.01
                                                                   token_probability     0.1
                                       make_ocr_errors             cell_probability     0.01
                                                                                        ... 
              mailing_address_zipcode  make_ocr_errors             token_probability     0.1
                                       make_typos                  cell_probability     0.01
                                                                   token_probability     0.1
              tax_form                 leave_blank                 cell_probability     0.01
                                       choose_wrong_option         cel

In [61]:
cs.filter(like='household')

column_noise  ssn  copy_from_household_member  cell_probability    0.0
dtype: object

# Find all the places with copy-from-household-member noise

First, I'll need to deal with the fact that we have keys of differing lengths, since converting the entire dict to a Series will truncate keys to the shortest length.

In [62]:
flat_config = keys_to_tuples(config)
len(flat_config)

735

In [63]:
set(len(k) for k in flat_config)

{4, 5, 6}

In [64]:
key_series = pd.Series(flat_config.keys())
key_series.map(len).value_counts()

5    716
6     12
4      7
dtype: int64

## Which keys have length 6?

Hmm, `misreport_age`, because my function also flattened the `possible_age_differences` dictionary. I don't see an easy way to avoid that when using recursion. The only foolproof way I see to avoid this is to explicitly specify the levels we're flattening in nested for loops.

In [65]:
{k:v for k,v in flat_config.items() if len(k) == 6}

{('decennial_census',
  'column_noise',
  'age',
  'misreport_age',
  'possible_age_differences',
  -2): 0.1,
 ('decennial_census',
  'column_noise',
  'age',
  'misreport_age',
  'possible_age_differences',
  -1): 0.4,
 ('decennial_census',
  'column_noise',
  'age',
  'misreport_age',
  'possible_age_differences',
  1): 0.4,
 ('decennial_census',
  'column_noise',
  'age',
  'misreport_age',
  'possible_age_differences',
  2): 0.1,
 ('american_community_survey',
  'column_noise',
  'age',
  'misreport_age',
  'possible_age_differences',
  -2): 0.1,
 ('american_community_survey',
  'column_noise',
  'age',
  'misreport_age',
  'possible_age_differences',
  -1): 0.4,
 ('american_community_survey',
  'column_noise',
  'age',
  'misreport_age',
  'possible_age_differences',
  1): 0.4,
 ('american_community_survey',
  'column_noise',
  'age',
  'misreport_age',
  'possible_age_differences',
  2): 0.1,
 ('current_population_survey',
  'column_noise',
  'age',
  'misreport_age',
  'possible

In [66]:
config['decennial_census']['column_noise']['age']['misreport_age']

{'cell_probability': 0.01,
 'possible_age_differences': {-2: 0.1, -1: 0.4, 1: 0.4, 2: 0.1}}

## Which keys have length 4?

Good, it looks like it's precisely the `row_noise` keys.

In [67]:
{k:v for k,v in flat_config.items() if len(k) == 4}

{('decennial_census',
  'row_noise',
  'do_not_respond',
  'row_probability'): 0.0145,
 ('american_community_survey',
  'row_noise',
  'do_not_respond',
  'row_probability'): 0.0145,
 ('current_population_survey',
  'row_noise',
  'do_not_respond',
  'row_probability'): 0.2905,
 ('women_infants_and_children',
  'row_noise',
  'omit_row',
  'row_probability'): 0.0,
 ('social_security', 'row_noise', 'omit_row', 'row_probability'): 0.0,
 ('taxes_w2_and_1099', 'row_noise', 'omit_row', 'row_probability'): 0.005,
 ('taxes_1040', 'row_noise', 'omit_row', 'row_probability'): 0.0}

# Filter to keys of length 5, and find instances of copy-from-household-member noise

In [68]:
config_depth_5 = {k:v for k,v in flat_config.items() if len(k) == 5}
col_config = pd.Series(config_depth_5)
col_config

decennial_census  column_noise  first_name       leave_blank           cell_probability     0.01
                                                 use_nickname          cell_probability     0.01
                                                 use_fake_name         cell_probability     0.01
                                                 make_phonetic_errors  cell_probability     0.01
                                                                       token_probability     0.1
                                                                                            ... 
taxes_1040        column_noise  dependent_4_ssn  write_wrong_digits    token_probability     0.1
                                                 make_ocr_errors       cell_probability     0.01
                                                                       token_probability     0.1
                                                 make_typos            cell_probability     0.01
                              

In [69]:
col_config.index.nlevels

5

In [70]:
col_config.filter(like='household')

decennial_census            column_noise  age              copy_from_household_member  cell_probability    0.01
                                          date_of_birth    copy_from_household_member  cell_probability    0.01
american_community_survey   column_noise  age              copy_from_household_member  cell_probability    0.01
                                          date_of_birth    copy_from_household_member  cell_probability    0.01
current_population_survey   column_noise  age              copy_from_household_member  cell_probability    0.01
                                          date_of_birth    copy_from_household_member  cell_probability    0.01
women_infants_and_children  column_noise  date_of_birth    copy_from_household_member  cell_probability    0.01
social_security             column_noise  ssn              copy_from_household_member  cell_probability     0.0
                                          date_of_birth    copy_from_household_member  cell_probability 

# See what happens when we request a large probability of copy_from_household_member noise

It silently noises a smaller fraction, because we haven't implemented the user warning for when the requested fraction is not possible.

In [80]:
# Override only the copy-from-household-member probability for the SSN column;
# nothing else is specified in this config.
override_copy_noise = {
    'social_security': {
        'column_noise': {
            'ssn': {
                'copy_from_household_member': {'cell_probability': 0.95},
            },
        },
    },
}
ssa_noisy = psp.generate_social_security(config=override_copy_noise)
ssa_noisy

                                                                                                

Unnamed: 0,simulant_id,ssn,first_name,middle_name,last_name,date_of_birth,sex,event_type,event_date
0,0_19979,786-77-6454,Evelyn,Nancy,Hines,19191204,Female,creation,19191204
1,0_6846,688-88-6377,George,Robert,Dickens,19211606,Male,creation,19210616
2,0_19983,651-33-9561,Beatrice,Jennie,Fackler,19220113,Female,creation,19220113
3,0_262,448-43-1664,Eura,Nadine,Crusen,19220305,Female,creation,19220305
4,0_12473,875-10-2359,Roberta,Ruth,Muilenburg,19420306,Female,creation,19220306
...,...,...,...,...,...,...,...,...,...
16492,0_20687,183-90-0619,Matthew,Michael,Phillips,19800224,Male,creation,20201229
16493,0_20686,803-81-8527,Jermey,Tyler,Wimmer,19860415,Male,creation,20201229
16494,0_20692,446-63-2138,Brittanie,Lauren,Thao,19950118,Female,creation,20201229
16495,0_20662,702-65-5925,Marcus,Jasper,Murphy,20201230,Male,creation,20201230


In [81]:
ssa_nonoise = psp.generate_social_security(config=psp.NO_NOISE)
ssa_nonoise

                                                                                                

Unnamed: 0,simulant_id,ssn,first_name,middle_name,last_name,date_of_birth,sex,event_type,event_date
0,0_19979,786-77-6454,Evelyn,Nancy,Hines,19191204,Female,creation,19191204
1,0_6846,688-88-6377,George,Robert,Dickens,19210616,Male,creation,19210616
2,0_19983,651-33-9561,Beatrice,Jennie,Fackler,19220113,Female,creation,19220113
3,0_262,665-25-7858,Eura,Nadine,Crusen,19220305,Female,creation,19220305
4,0_12473,875-10-2359,Roberta,Ruth,Muilenburg,19220306,Female,creation,19220306
...,...,...,...,...,...,...,...,...,...
16492,0_20687,183-90-0619,Matthew,Michael,Phillips,19800224,Male,creation,20201229
16493,0_20686,803-81-8527,Jermey,Tyler,Wimmer,19860415,Male,creation,20201229
16494,0_20692,170-62-5253,Brittanie,Lauren,Thao,19950118,Female,creation,20201229
16495,0_20662,281-88-9330,Marcus,Jasper,Murphy,20201230,Male,creation,20201230


In [82]:
ssa_noisy.ssn.compare(ssa_nonoise.ssn)

Unnamed: 0,self,other
3,448-43-1664,665-25-7858
6,134-68-8831,102-60-0838
12,881-81-1306,047-34-5906
14,580-70-8826,765-44-4521
15,209-06-3951,136-85-7371
...,...,...
16487,361-66-9187,885-72-7577
16489,833-17-4847,161-06-9252
16494,446-63-2138,170-62-5253
16495,702-65-5925,281-88-9330


In [123]:
# Fraction of rows whose SSN differs between the noisy and the no-noise data.
# The numerator is computed from the comparison itself instead of a hard-coded
# count (the original literal 9235 was presumably the number of rows in that
# comparison -- TODO confirm), so the result stays correct if the data changes.
# Looks like the maximum fraction of noise-able rows was 56%
len(ssa_noisy.ssn.compare(ssa_nonoise.ssn)) / len(ssa_nonoise)

0.559798751288113

# See what the path is to the sample data

The directory is called `sample_datasets`, whereas with our updated terminology, it would be more appropriate to call it `sample_population`.

In [83]:
psp.constants.paths.SAMPLE_DATA_ROOT

PosixPath('/mnt/share/code/ndbs/pseudopeople/src/pseudopeople/data/sample_datasets')

In [84]:
data.american_community_survey.dtypes

simulant_id                                 object
household_id                                object
survey_date                         datetime64[ns]
first_name                                  object
middle_initial                              object
last_name                                   object
age                                         object
date_of_birth                               object
street_number                               object
street_name                                 object
unit_number                                 object
city                                        object
state                                     category
zipcode                                     object
housing_type                                object
relationship_to_reference_person          category
sex                                       category
race_ethnicity                            category
dtype: object

# Check whether survey datasets have a `survey_date` column, and what the dtype and date format are

In [85]:
data.current_population_survey.dtypes

simulant_id               object
household_id              object
survey_date       datetime64[ns]
first_name                object
middle_initial            object
last_name                 object
age                       object
date_of_birth             object
street_number             object
street_name               object
unit_number               object
city                      object
state                   category
zipcode                   object
sex                     category
race_ethnicity          category
dtype: object

In [86]:
col_config.filter(like='survey')

american_community_survey  column_noise  first_name      leave_blank           cell_probability     0.01
                                                         use_nickname          cell_probability     0.01
                                                         use_fake_name         cell_probability     0.01
                                                         make_phonetic_errors  cell_probability     0.01
                                                                               token_probability     0.1
                                                                                                    ... 
current_population_survey  column_noise  zipcode         make_typos            token_probability     0.1
                                         sex             leave_blank           cell_probability     0.01
                                                         choose_wrong_option   cell_probability     0.01
                                         race_ethnicity

In [87]:
col_config.xs('american_community_survey')

column_noise  first_name                        leave_blank           cell_probability     0.01
                                                use_nickname          cell_probability     0.01
                                                use_fake_name         cell_probability     0.01
                                                make_phonetic_errors  cell_probability     0.01
                                                                      token_probability     0.1
                                                                                           ... 
              relationship_to_reference_person  choose_wrong_option   cell_probability     0.01
              sex                               leave_blank           cell_probability     0.01
                                                choose_wrong_option   cell_probability     0.01
              race_ethnicity                    leave_blank           cell_probability     0.01
                                        

## Also check whether `survey_date` column is noise-able

Looks like it's not. According to Abie and Zeb, we are treating this as metadata, similar to year or tax year in the other datasets.

In [89]:
col_config.xs('american_community_survey').filter(like='survey')

Series([], dtype: object)

In [90]:
data.american_community_survey

Unnamed: 0,simulant_id,household_id,survey_date,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity
0,0_12123,0_4898,2020-10-06,Jajes,N,Valdez,43,07/04/1977,820,cameron road,,Anytown,WA,00000,Household,Opposite-sex spouse,Female,Asian
1,0_12124,0_4898,2020-10-06,Raymond,J,Valdez,16,10/22/2003,820,cameron road,,Anytown,WA,00000,Household,Biological child,Male,Asian
2,0_20535,0_4898,2020-10-06,Ronin,K,Valdez,0,08/04/2020,820,cameron road,,Anytown,WA,00000,Household,Biological child,Male,Asian
3,0_8831,0_3589,2020-03-24,Edward,J,Hayden,58,12/23/1961,20,oakbridge py,,Anytown,WA,00000,Household,Reference person,,White
4,0_727,0_3,2020-04-21,Kari,,Cannon,36,05/05/1983,8203,west farwell avenue,,Anytown,WA,00000,College,Noninstitutionalized group quarters population,Female,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,0_19709,0_3,2020-04-21,Erik,G,Underwood,23,09/21/1996,8203,west farwdll avenye,,Anytown,WA,00000,College,Noninstitutionalized group quarters population,Male,White
70,0_19779,0_3,2020-04-21,Eric,B,Murphy,40,08/10/1979,8203,west farwell avenue,,Anytown,WA,00000,College,Noninstitutionalized group quarters population,Male,Black
71,0_20082,0_3,2020-04-21,Eugene,G,Wu,19,10/12/2000,8203,west farwell avenue,,Anytown,WA,00000,College,Noninstitutionalized group quarters population,Male,Asian
72,0_20329,0_3,2020-04-21,Zachary,J,Walker,17,01/20/2003,8203,west farwell avenue,,Anyfown,WA,00000,College,Noninstitutionalized group quarters population,Male,White


In [91]:
data.current_population_survey

Unnamed: 0,simulant_id,household_id,survey_date,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,sex,race_ethnicity
0,0_16607,0_6703,2020-05-19,Delores,P,Maholtz,84.0,10/08/1935,2100,hvn harbour way,,Anytown,WA,00000,Female,White
1,0_5893,0_2394,2020-01-28,Cameron,J,Duff,83.0,01/25/1936,3256,mclaurin cir,,Anytown,WA,00000,Male,White
2,0_5894,0_2394,2020-01-28,Lois,M,Duff,81.0,12/03/1938,3256,mclaurin cir,,Anyton,WA,00000,Female,White
3,0_9229,0_3752,2020-02-25,Greg,J,Schick,69.0,03/14/1950,14375,wagon train dr,,Anytown,WA,00000,Male,White
4,0_15857,0_6402,2020-02-25,Doreen,M,Mason,78.0,09/08/1941,1048,revuelta ct,,Anytown,WA,00000,Female,White
5,0_15858,0_6402,2020-02-25,Samuel,P,Mason,81.0,09/07/1938,1048,revuelta ct,,Anytown,WA,00000,Male,White
6,0_5055,0_2070,2020-03-24,Carolyn,V,Teske,63.0,09/01/1956,430,hagenson st,,Anytown,WA,00000,Female,White
7,0_5056,0_2070,2020-03-24,Zoe,A,Tecke,18.0,12/04/2001,430,hagenson st,,Anytown,WA,00000,Female,White
8,0_5057,0_2070,2020-03-24,Steve,R,Teske,62.0,07/23/1957,430,hagenson st,,Anytown,WA,00000,Male,White
9,0_12893,0_5190,2020-03-24,Randal,W,Grigsby,66.0,02/24/1954,2430,pleasant avenue,,Anytown,WA,00000,Male,Multiracial or Other


In [93]:
data.american_community_survey.survey_date.dt.strftime("%d-%m-%Y")

0     06-10-2020
1     06-10-2020
2     06-10-2020
3     24-03-2020
4     21-04-2020
         ...    
69    21-04-2020
70    21-04-2020
71    21-04-2020
72    21-04-2020
73    01-12-2020
Name: survey_date, Length: 74, dtype: object

In [94]:
data.american_community_survey.survey_date.dt.strftime("%d-%m-%Y").map(type).unique()

array([<class 'str'>], dtype=object)

In [95]:
type(data.american_community_survey.survey_date.iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [96]:
type(data.current_population_survey.survey_date.iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [97]:
data.american_community_survey.survey_date

0    2020-10-06
1    2020-10-06
2    2020-10-06
3    2020-03-24
4    2020-04-21
        ...    
69   2020-04-21
70   2020-04-21
71   2020-04-21
72   2020-04-21
73   2020-12-01
Name: survey_date, Length: 74, dtype: datetime64[ns]

In [98]:
isinstance(data.current_population_survey.survey_date.iloc[0], pd.Timestamp)

True

In [99]:
t = data.current_population_survey.survey_date.iloc[0]
t

Timestamp('2020-05-19 00:00:00')

# Check what happens when we request SSA data for different years

You can request years farther in the future than the end of the sim, but not too much farther (e.g., `year=2200` works, but `year=2300` does not). If you request a year before the first record (e.g., 1918 or 1800), you get the expected `ValueError` because there is no data. You may also get a `ValueError` saying the year is invalid (e.g., for 0, 50, 2300, or 3000) because the time is too far into the future or the past, raising an `OutOfBoundsDatetime` when attempting to convert to a Timestamp object.

In [100]:
psp.generate_social_security(year=1930)

                                                                                                         

Unnamed: 0,simulant_id,ssn,first_name,middle_name,last_name,date_of_birth,sex,event_type,event_date
0,0_19979,786-77-6454,Evelyn,Nancy,Hines,19191204,Female,creation,19191204
1,0_6846,688-88-6377,George,Robert,Dickens,19210616,Male,creation,19210616
2,0_19983,651-33-9561,Wife,Jennie,Fackler,19220113,Female,creation,19220113
3,0_262,665-25-7858,Eura,Nadine,Crusen,19220305,Female,creation,19220305
4,0_12473,875-10-2359,Roberta,Ruth,Muilenburg,19220306,Female,creation,19220306
...,...,...,...,...,...,...,...,...,...
134,0_5123,384-46-7224,Beatrice,Jennie,Makar,19300922,Female,creation,19300922
135,0_12683,567-54-2095,Norman,Charles,Adult,19300929,Male,creation,19300929
136,0_10201,613-15-1140,Betty,Priscilla,Kennedy,19301007,Female,creation,19301007
137,0_4377,519-55-9356,Shirley,Barbara,Mcdonald,19301112,Female,creation,19301112


In [101]:
psp.generate_social_security(year=2040)

                                                                                                         

Unnamed: 0,simulant_id,ssn,first_name,middle_name,last_name,date_of_birth,sex,event_type,event_date
0,0_19979,786-77-6454,Evelyn,Nancy,Hines,19191204,Female,creation,1919|204
1,0_6846,688-88-6377,George,Robert,Dickens,19210616,Male,creation,19210616
2,0_19983,651-33-9561,Beatrice,Jennie,Fackler,19220113,Female,creation,19220115
3,0_262,665-25-7858,Eura,Nadine,Crusen,19220305,Female,creation,19220305
4,0_12473,875-10-2359,Roberta,Ruth,Muilenburg,19220306,Female,creation,19220306
...,...,...,...,...,...,...,...,...,...
24066,0_27230,356-58-9816,Mariah,Lydia,Cline,20381218,Female,creation,20401204
24067,0_27218,576-40-3082,Claire,Cecilia,Bhakta,20401206,Female,creation,20401206
24068,0_27217,496-15-1541,Matilda,Isabella,Johnson,20401217,Female,creation,20401217
24069,0_27216,212-20-8121,Giovanni,Moshe,Home,20401219,Male,creation,20401219


In [102]:
psp.generate_social_security(year=2041)

                                                                                                         

Unnamed: 0,simulant_id,ssn,first_name,middle_name,last_name,date_of_birth,sex,event_type,event_date
0,0_19979,786-77-6454,Evelyn,Nancy,Hines,19191204,Female,creation,19191204
1,0_6846,688-88-6377,George,Robert,Dickens,19210616,Male,creation,19210616
2,0_19983,651-33-9561,Beatrice,Jennie,Fackler,19220113,Female,creation,19220113
3,0_262,665-25-7858,Eura,Nadine,Crusen,19220305,Female,creation,19220305
4,0_12473,875-10-2359,Roberta,Ruth,Muilenburg,19220306,Female,creation,19220306
...,...,...,...,...,...,...,...,...,...
24200,0_7196,085-58-1731,Tony,Timothy,Ni,19620912,Male,death,20410521
24201,0_2554,506-80-3608,Kevin,,Cook,19630319,Male,death,
24202,0_16187,059-96-9698,Jeffrey,Brannon,Fox,19691219,Male,death,20410521
24203,0_14782,714-38-0891,Jennifer,Jessica,Arthur,19800909,Female,death,20410521


In [103]:
psp.generate_social_security(year=2042)

                                                                                                         

Unnamed: 0,simulant_id,ssn,first_name,middle_name,last_name,date_of_birth,sex,event_type,event_date
0,0_19979,786-77-6454,Evelyn,Nancy,Hines,19191204,Female,creation,19191204
1,0_6846,688-88-6377,George,Robert,Dickens,19210616,Male,creation,19210616
2,0_19983,651-33-9561,Beatrice,Jennie,Fackler,19220113,Female,creation,19220113
3,0_262,665-25-7858,Eura,Nadine,Crusen,19220305,Female,creation,19220305
4,0_12473,875-10-2359,Roberta,Ruth,Muiilenburg,19220306,Female,creation,19220306
...,...,...,...,...,...,...,...,...,...
24200,0_7196,085-58-1731,Tony,Timothy,Ni,19620912,Male,death,z04105Z|
24201,0_2554,506-80-3608,Kevin,Charlie,Cook,19630319,Male,death,20410521
24202,0_16187,059-96-9698,Jeffrey,Brannon,Fox,19691219,Male,death,20410521
24203,0_14782,714-38-0891,Hwnnifer,Jessica,Arthur,19800909,Female,death,20410521


In [104]:
psp.generate_social_security(year=2050)

                                                                                                         

Unnamed: 0,simulant_id,ssn,first_name,middle_name,last_name,date_of_birth,sex,event_type,event_date
0,0_19979,786-77-6454,Evelyn,Nancy,Hines,19191204,Female,creation,19191204
1,0_6846,688-88-6377,George,Robert,Dickens,19210616,Male,creation,19210616
2,0_19983,651-33-9561,Beatrice,Jennie,Fackler,19221301,Female,creation,19220113
3,0_262,665-25-7858,Eura,Nadine,Crusen,19220305,Female,creation,19220305
4,0_12473,875-10-2359,Roberta,Female,Muilenburg,19220306,Female,creation,19220306
...,...,...,...,...,...,...,...,...,...
24200,0_7196,085-58-1731,Anthony,Timothy,Ni,19620912,Male,death,20410521
24201,0_2554,506-80-3608,Kevin,Charlie,Cook,19630319,Male,death,
24202,0_16187,059-96-9698,Jeffrey,Brannon,Fox,19691219,Male,death,20410521
24203,0_14782,714-38-0891,Jennifer,Jessica,Arthur,19800909,Female,,20410521


In [105]:
psp.generate_social_security(year=1919)

                                                                                                         

Unnamed: 0,simulant_id,ssn,first_name,middle_name,last_name,date_of_birth,sex,event_type,event_date
0,0_19979,786-77-6454,Evelyn,Nancy,Hines,19191204,Female,creation,19191204


In [106]:
psp.generate_social_security(year=1918)

ValueError: Invalid value provided for 'state' or 'year'. No data found with the user provided 'state' or 'year' filters at /mnt/share/code/ndbs/pseudopeople/src/pseudopeople/data/sample_datasets/social_security/social_security.parquet.

In [107]:
psp.generate_social_security(year=0)

ValueError: Invalid year provided: '0'

In [108]:
psp.generate_social_security(year=1)

ValueError: Invalid year provided: '1'

In [109]:
psp.generate_social_security(year=1800)

ValueError: Invalid value provided for 'state' or 'year'. No data found with the user provided 'state' or 'year' filters at /mnt/share/code/ndbs/pseudopeople/src/pseudopeople/data/sample_datasets/social_security/social_security.parquet.

In [110]:
psp.generate_social_security(year=50)

ValueError: Invalid year provided: '50'

In [111]:
psp.generate_social_security(year=5000)

ValueError: Invalid year provided: '5000'

In [112]:
psp.generate_social_security(year=3000)

ValueError: Invalid year provided: '3000'

In [113]:
psp.generate_social_security(year=2100)

                                                                                                         

Unnamed: 0,simulant_id,ssn,first_name,middle_name,last_name,date_of_birth,sex,event_type,event_date
0,0_19979,786-77-6454,Evelyn,Nancy,Hines,19191204,Female,creation,19191204
1,0_6846,688-88-6377,George,Robert,Dickens,19210616,Male,creation,19210616
2,0_19983,651-33-9561,Beatrice,Jennie,Fackler,19220113,Female,creation,19220113
3,0_262,665-25-7858,Eura,Nadine,Crusen,19220503,Female,creation,19220305
4,0_12473,875-10-2359,Roberta,Ruth,Muilenburg,19220306,Female,creation,19220306
...,...,...,...,...,...,...,...,...,...
24200,0_7196,085-58-1731,Tony,Timothy,Ni,19620912,Male,death,20410521
24201,0_2554,506-80-3608,Kevin,Charlie,Cook,19630319,Male,death,20410521
24202,0_16187,059-96-9698,Jeffrey,Brannon,Fox,19691219,Male,death,20410521
24203,0_14782,714-38-0891,Jennifer,Jessica,Arthur,19800909,Female,death,20410521


In [116]:
psp.generate_social_security(year=2300)

ValueError: Invalid year provided: '2300'

In [117]:
psp.generate_social_security(year=2200)

                                                                                                         

Unnamed: 0,simulant_id,ssn,first_name,middle_name,last_name,date_of_birth,sex,event_type,event_date
0,0_19979,786-77-6454,Evelyn,Nancy,Hines,19191204,Female,creation,19191204
1,0_6846,688-88-6377,George,Robert,Dickens,19210616,Male,creation,19210616
2,0_19983,651-33-9561,Beatrice,Jennie,Fackler,19220113,Female,creation,19220113
3,0_262,665-25-7858,Eura,Nadine,Crusen,19220305,Female,creation,19220305
4,0_12473,875-10-2359,Roberta,Ruth,Muilenburg,19220306,Female,creation,19220306
...,...,...,...,...,...,...,...,...,...
24200,0_7196,085-58-1731,Tony,Timothy,Ni,19620912,Male,death,20410521
24201,0_2554,506-80-3608,Kevin,Charlie,Cook,19630319,Male,death,20410521
24202,0_16187,059-96-9698,Jeffrey,Brannon,Fox,19691219,Male,death,20410521
24203,0_14782,714-38-0891,Jennifer,Jessica,Arthur,19800909,Female,death,20410521


In [118]:
type(None)

NoneType

In [119]:
type(True)

bool

In [122]:
isinstance(None, type(None))

True

In [124]:
pd.Timestamp('2003-02-05')

Timestamp('2003-02-05 00:00:00')

In [125]:
pd.Timestamp('0003-02-05')

OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 3-02-05 00:00:00

# Write a function to pad the tuples in the flattened config dictionary

In [139]:
def pad_tuples(tuples, pad_val=np.nan):
    """Right-pad every tuple in ``tuples`` with ``pad_val`` to a common length.

    Parameters
    ----------
    tuples : iterable of tuple
        The tuples to pad. Any iterable is accepted, including one-shot
        iterators and dict key views; it is snapshotted when this function
        is called.
    pad_val : object, default ``np.nan``
        Value appended to shorter tuples until they reach the length of the
        longest tuple.

    Returns
    -------
    generator of tuple
        The padded tuples, in the original order. Empty input yields an
        empty generator.
    """
    # Snapshot the input first: computing the maximum length would otherwise
    # exhaust a one-shot iterator and leave nothing for the generator below.
    tuples = list(tuples)
    # default=0 keeps empty input from raising ValueError inside max().
    max_len = max(map(len, tuples), default=0)
    new_tuples = ((*t, *((max_len - len(t)) * [pad_val])) for t in tuples)
    return new_tuples

In [140]:
# pad_tuples returns a generator, so the bare call only displays the generator object.
pad_tuples(flat_config.keys())

<generator object pad_tuples.<locals>.<genexpr> at 0x7fc238cdc900>

In [143]:
# Materialize the generator and show the first five padded key tuples.
list(pad_tuples(flat_config.keys()))[:5]

[('decennial_census',
  'row_noise',
  'do_not_respond',
  'row_probability',
  nan,
  nan),
 ('decennial_census',
  'column_noise',
  'first_name',
  'leave_blank',
  'cell_probability',
  nan),
 ('decennial_census',
  'column_noise',
  'first_name',
  'use_nickname',
  'cell_probability',
  nan),
 ('decennial_census',
  'column_noise',
  'first_name',
  'use_fake_name',
  'cell_probability',
  nan),
 ('decennial_census',
  'column_noise',
  'first_name',
  'make_phonetic_errors',
  'cell_probability',
  nan)]

# Write a function to create a new dictionary with padded tuples as keys

In [145]:
def pad_flattened_dict(d, pad_val=np.nan):
    """Return a copy of ``d`` whose tuple keys are right-padded to equal length.

    Parameters
    ----------
    d : dict
        Mapping whose keys are tuples, possibly of differing lengths.
    pad_val : object, default ``np.nan``
        Value appended to shorter keys until they reach the length of the
        longest key.

    Returns
    -------
    dict
        A new dict with the padded tuples as keys and the original values,
        in the original order. An empty ``d`` gives an empty dict.
    """
    # default=0 lets an empty dict pass through instead of raising ValueError.
    max_len = max(map(len, d.keys()), default=0)

    def pad_tuple(t):
        # Append as many copies of pad_val as the key is short of max_len.
        return (*t, *((max_len - len(t)) * [pad_val]))

    new_dict = {pad_tuple(key): val for key, val in d.items()}
    return new_dict

In [146]:
# Pad every key tuple of the flattened config to a common length, then build
# a Series indexed by those padded tuples.
padded_flat_config = pad_flattened_dict(flat_config)
flat_config_series = pd.Series(padded_flat_config)
flat_config_series

decennial_census  row_noise     do_not_respond   row_probability       NaN                NaN    0.0145
                  column_noise  first_name       leave_blank           cell_probability   NaN      0.01
                                                 use_nickname          cell_probability   NaN      0.01
                                                 use_fake_name         cell_probability   NaN      0.01
                                                 make_phonetic_errors  cell_probability   NaN      0.01
                                                                                                  ...  
taxes_1040        column_noise  dependent_4_ssn  write_wrong_digits    token_probability  NaN       0.1
                                                 make_ocr_errors       cell_probability   NaN      0.01
                                                                       token_probability  NaN       0.1
                                                 make_typos     

In [147]:
# Substring filter on the index labels; the result contains only the row_noise entries.
flat_config_series.filter(like='row')

decennial_census            row_noise  do_not_respond  row_probability  NaN  NaN    0.0145
american_community_survey   row_noise  do_not_respond  row_probability  NaN  NaN    0.0145
current_population_survey   row_noise  do_not_respond  row_probability  NaN  NaN    0.2905
women_infants_and_children  row_noise  omit_row        row_probability  NaN  NaN       0.0
social_security             row_noise  omit_row        row_probability  NaN  NaN       0.0
taxes_w2_and_1099           row_noise  omit_row        row_probability  NaN  NaN     0.005
taxes_1040                  row_noise  omit_row        row_probability  NaN  NaN       0.0
dtype: object

In [148]:
# NOTE: items= does not select the decennial_census entries here; the result
# is a single NaN entry labelled 'decennial_census'.
flat_config_series.filter(items=['decennial_census'])

decennial_census    NaN
dtype: object

In [149]:
# Filtering by an inner-level label with items= returns an empty Series.
flat_config_series.filter(items=['do_not_respond'])

Series([], dtype: object)

In [150]:
# .loc on the first index level selects the decennial_census entries; that
# level is dropped from the result's index.
flat_config_series.loc['decennial_census']

row_noise     do_not_respond                    row_probability       NaN               NaN    0.0145
column_noise  first_name                        leave_blank           cell_probability  NaN      0.01
                                                use_nickname          cell_probability  NaN      0.01
                                                use_fake_name         cell_probability  NaN      0.01
                                                make_phonetic_errors  cell_probability  NaN      0.01
                                                                                                ...  
              relationship_to_reference_person  choose_wrong_option   cell_probability  NaN      0.01
              sex                               leave_blank           cell_probability  NaN      0.01
                                                choose_wrong_option   cell_probability  NaN      0.01
              race_ethnicity                    leave_blank           cell_probabi

In [152]:
# Cross-section on the second index level (level=1): selects the row_noise
# entries of every dataset and drops that level from the result.
flat_config_series.xs('row_noise', level=1)

decennial_census            do_not_respond  row_probability  NaN  NaN    0.0145
american_community_survey   do_not_respond  row_probability  NaN  NaN    0.0145
current_population_survey   do_not_respond  row_probability  NaN  NaN    0.2905
women_infants_and_children  omit_row        row_probability  NaN  NaN       0.0
social_security             omit_row        row_probability  NaN  NaN       0.0
taxes_w2_and_1099           omit_row        row_probability  NaN  NaN     0.005
taxes_1040                  omit_row        row_probability  NaN  NaN       0.0
dtype: object

In [153]:
# Try Ellipsis as the pad value; it is displayed as "Ellipsis" in the index.
pd.Series(pad_flattened_dict(flat_config, pad_val=...))

decennial_census  row_noise     do_not_respond   row_probability       Ellipsis           Ellipsis    0.0145
                  column_noise  first_name       leave_blank           cell_probability   Ellipsis      0.01
                                                 use_nickname          cell_probability   Ellipsis      0.01
                                                 use_fake_name         cell_probability   Ellipsis      0.01
                                                 make_phonetic_errors  cell_probability   Ellipsis      0.01
                                                                                                       ...  
taxes_1040        column_noise  dependent_4_ssn  write_wrong_digits    token_probability  Ellipsis       0.1
                                                 make_ocr_errors       cell_probability   Ellipsis      0.01
                                                                       token_probability  Ellipsis       0.1
                   

In [154]:
# Try the empty string as the pad value; padded index levels are displayed as blanks.
pd.Series(pad_flattened_dict(flat_config, pad_val=''))

decennial_census  row_noise     do_not_respond   row_probability                              0.0145
                  column_noise  first_name       leave_blank           cell_probability         0.01
                                                 use_nickname          cell_probability         0.01
                                                 use_fake_name         cell_probability         0.01
                                                 make_phonetic_errors  cell_probability         0.01
                                                                                               ...  
taxes_1040        column_noise  dependent_4_ssn  write_wrong_digits    token_probability         0.1
                                                 make_ocr_errors       cell_probability         0.01
                                                                       token_probability         0.1
                                                 make_typos            cell_probability    

# Test functions after copying to module

In [156]:
# Repeat the Series construction using the copies of the helpers in the
# `alpha` module (presumably equivalent to the notebook versions; the output
# has the same layout as the earlier NaN-padded Series).
pd.Series(alpha.pad_flattened_dict(alpha.flatten(config), pad_val=np.nan))

decennial_census  row_noise     do_not_respond   row_probability       NaN                NaN    0.0145
                  column_noise  first_name       leave_blank           cell_probability   NaN      0.01
                                                 use_nickname          cell_probability   NaN      0.01
                                                 use_fake_name         cell_probability   NaN      0.01
                                                 make_phonetic_errors  cell_probability   NaN      0.01
                                                                                                  ...  
taxes_1040        column_noise  dependent_4_ssn  write_wrong_digits    token_probability  NaN       0.1
                                                 make_ocr_errors       cell_probability   NaN      0.01
                                                                       token_probability  NaN       0.1
                                                 make_typos     

In [157]:
# With pad_after={'row_noise': np.nan}, the output shows a NaN level inserted
# directly after 'row_noise' in the row_noise keys.
pd.Series(alpha.pad_flattened_dict(alpha.flatten(config, pad_after={'row_noise': np.nan})))

decennial_census  row_noise     NaN              do_not_respond        row_probability    NaN    0.0145
                  column_noise  first_name       leave_blank           cell_probability   NaN      0.01
                                                 use_nickname          cell_probability   NaN      0.01
                                                 use_fake_name         cell_probability   NaN      0.01
                                                 make_phonetic_errors  cell_probability   NaN      0.01
                                                                                                  ...  
taxes_1040        column_noise  dependent_4_ssn  write_wrong_digits    token_probability  NaN       0.1
                                                 make_ocr_errors       cell_probability   NaN      0.01
                                                                       token_probability  NaN       0.1
                                                 make_typos     