In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import logging
logging.basicConfig(level=logging.DEBUG, format='{asctime} - {name} - {levelname} - {message}', style='{')
import numpy as np, matplotlib.pyplot as plt, pandas as pd
# For finding and loading multiple files
import os
# Use this to see how much memory the dataframes use
from sys import getsizeof
from pathlib import Path
from linetimer import CodeTimer, linetimer

import pseudopeople as psp

from vivarium_research_prl.utils import sizemb, MappingViaAttributes, build_full_address
from vivarium_research_prl import alpha

# Record when, by whom, on which machine and with which interpreter / package
# versions this notebook was run, so the outputs below can be traced to an environment.
!date
!whoami
!uname -a
!pwd
!python --version
# Only the active conda environment (conda marks it with '*')
!conda info --envs | grep '\*'
# Versions of the packages this notebook depends on
!conda list | grep -e pandas -e numpy -e vivarium -e pseudopeople

Fri 19 Jan 2024 02:01:12 PM PST
ndbs
Linux gen-slurm-sarchive-p0126 5.4.0-156-generic #173-Ubuntu SMP Tue Jul 11 07:25:22 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing
Python 3.11.7
ppplv1.0-311          *  /ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311
numpy                     1.26.3                   pypi_0    pypi
pandas                    2.1.4                    pypi_0    pypi
pseudopeople              0.7.2                    pypi_0    pypi
vivarium                  2.3.1                    pypi_0    pypi
vivarium-research-prl     0.0.1                     dev_0    <develop>


In [3]:
# pip reports the requirement pseudopeople was installed from (a git commit in the output below)
!pip freeze | grep pseudopeople

pseudopeople @ git+https://github.com/ihmeuw/pseudopeople.git@7d7e1db36125700fdd75ae9c667706d20b451bfd


In [4]:
# A clickable link to the GitHub UI to see what version I'm using
! pip freeze | grep pseudopeople | sed -e 's|pseudopeople @ ||' | sed -e 's|git+||' | sed -e 's|\.git@|/tree/|'

https://github.com/ihmeuw/pseudopeople/tree/7d7e1db36125700fdd75ae9c667706d20b451bfd


# Generate all sample datasets

In [25]:
%%time
# Build every sample dataset; the INFO log lines below report the time taken per dataset.
# NOTE(review): no config is passed, so this presumably uses the default noise
# settings — confirm in alpha.generate_datasets.
data = alpha.generate_datasets()

2024-01-19 14:19:02,321 - vivarium_research_prl.alpha - INFO - Code block 'generate_american_community_survey' took: 0.02437 m
2024-01-19 14:19:03,228 - vivarium_research_prl.alpha - INFO - Code block 'generate_current_population_survey' took: 0.01512 m
2024-01-19 14:19:06,719 - vivarium_research_prl.alpha - INFO - Code block 'generate_decennial_census' took: 0.05817 m
2024-01-19 14:19:08,621 - vivarium_research_prl.alpha - INFO - Code block 'generate_social_security' took: 0.03168 m
2024-01-19 14:19:14,034 - vivarium_research_prl.alpha - INFO - Code block 'generate_taxes_1040' took: 0.09021 m
2024-01-19 14:19:18,847 - vivarium_research_prl.alpha - INFO - Code block 'generate_taxes_w2_and_1099' took: 0.08019 m
2024-01-19 14:19:20,145 - vivarium_research_prl.alpha - INFO - Code block 'generate_women_infants_and_children' took: 0.02162 m


CPU times: user 19.1 s, sys: 264 ms, total: 19.3 s
Wall time: 19.3 s


# Generate unnoised sample datasets

In [26]:
# Same datasets generated with psp.NO_NOISE as the config; kept as the unnoised
# baseline that `data` is compared against in the cells below.
data0 = alpha.generate_datasets(config=psp.NO_NOISE)

2024-01-19 14:20:53,415 - vivarium_research_prl.alpha - INFO - Code block 'generate_american_community_survey' took: 0.01403 m
2024-01-19 14:20:54,047 - vivarium_research_prl.alpha - INFO - Code block 'generate_current_population_survey' took: 0.01051 m
2024-01-19 14:20:55,372 - vivarium_research_prl.alpha - INFO - Code block 'generate_decennial_census' took: 0.02206 m
2024-01-19 14:20:56,125 - vivarium_research_prl.alpha - INFO - Code block 'generate_social_security' took: 0.01255 m
2024-01-19 14:20:58,298 - vivarium_research_prl.alpha - INFO - Code block 'generate_taxes_1040' took: 0.03621 m
2024-01-19 14:20:59,458 - vivarium_research_prl.alpha - INFO - Code block 'generate_taxes_w2_and_1099' took: 0.01931 m
2024-01-19 14:21:00,010 - vivarium_research_prl.alpha - INFO - Code block 'generate_women_infants_and_children' took: 0.00920 m


# Make sure indices in the noised data are consecutive

In [27]:
# Dataset name -> result of alpha.index_is_consecutive for the noised dataset
{name: alpha.index_is_consecutive(frame) for name, frame in data.items()}

{'american_community_survey': True,
 'current_population_survey': True,
 'decennial_census': True,
 'social_security': True,
 'taxes_1040': True,
 'taxes_w2_and_1099': True,
 'women_infants_and_children': True}

In [28]:
# Dataset name -> result of alpha.index_is_consecutive for the unnoised dataset
{name: alpha.index_is_consecutive(frame) for name, frame in data0.items()}

{'american_community_survey': True,
 'current_population_survey': True,
 'decennial_census': True,
 'social_security': True,
 'taxes_1040': True,
 'taxes_w2_and_1099': True,
 'women_infants_and_children': True}

# Check lengths of each dataset vs. unnoised version

In [29]:
# Dataset name -> number of rows in the noised dataset
{name: len(frame) for name, frame in data.items()}

{'american_community_survey': 74,
 'current_population_survey': 40,
 'decennial_census': 10229,
 'social_security': 16497,
 'taxes_1040': 6518,
 'taxes_w2_and_1099': 9916,
 'women_infants_and_children': 141}

In [30]:
# Dataset name -> number of rows in the unnoised dataset
{name: len(frame) for name, frame in data0.items()}

{'american_community_survey': 76,
 'current_population_survey': 51,
 'decennial_census': 10387,
 'social_security': 16497,
 'taxes_1040': 6518,
 'taxes_w2_and_1099': 9972,
 'women_infants_and_children': 141}

# Find duplicated simulant IDs

Hmm, looks like the duplicated rows all appear at the end of the dataframe...

In [35]:
# Boolean mask over the census rows: True wherever the simulant_id occurs more than
# once (keep=False flags every copy, not only the repeats).
duplicated_ids = data.decennial_census['simulant_id'].duplicated(keep=False)
# The flagged rows themselves
duplicates = data.decennial_census[duplicated_ids]
duplicates

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
1301,0_9434,0_6613,Rene,J,,17,08/04/2002,11922.0,wagoner dr,,Anytown,WA,0,Household,Other nonrelative,Male,Multiracial or Other,2020
1885,0_4634,0_3,David,Q,Prudente,17,09/25/2002,8203.0,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Male,Latino,2020
8929,0_9314,0_3784,Alina,R,Cordova,14,04/19/2005,899.0,spruce avenue,,Anytown,WA,0,Household,,Female,Latino,2020
10226,0_9434,0_3837,Rene,J,Rogers,17,08/04/2002,,bennington hills court,,Anytown,WA,0,Household,Other relative,Male,Multiracial or Other,2020
10227,0_9314,0_1,Alina,R,Cordova,14,04/19/2005,34.0,bowen cir sw,,Anytown,WA,0,Nursing home,Other relative,Female,Latino,2020
10228,0_4634,0_1890,David,Q,Prudente,17,09/25/2002,1975.0,calvin ave,,Anytown,WA,0,Household,Other relative,Male,Latino,2020


In [42]:
# Every census row belonging to a household that contains at least one duplicated
# simulant. The household IDs are handed to query() through an @-reference rather
# than being interpolated into the expression text, so the filter does not depend on
# how the list's repr happens to parse (quotes in a value, NaN, very long lists).
duplicated_household_id_list = duplicates.household_id.to_list()
dup_household_ids = data.decennial_census.query(
    "household_id in @duplicated_household_id_list")
dup_household_ids

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
291,0_121,0_1,Drew,P,Crane,19,09/04/2000,34,bowen cir sw,,Anytown,WA,00000,Nursing home,Institutionalized group quarters population,Male,White,2020
292,0_279,0_1,Avery,A,Wallace,24,10/02/1995,34,bowen cir sw,,Anytown,WA,00000,Nursing home,Institutionalized group quarters population,Male,White,2020
293,0_390,0_1,Dexter,R,Donegan,25,11/27/1994,34,bowen cir sw,,Anytown,WA,00000,Nursing home,Institutionalized group quarters population,Male,White,2020
294,0_875,0_1,Julia,L,Nelson,z1,11/10/1998,34,bowen cir sw,,Anytown,WA,00000,Nursing home,Institutionalized group quarters population,Female,Multiracial or Other,2020
295,0_909,0_1,Samuel,S,Solo,31,03/26/1988,34,bowen cir sw,,Anytown,WA,00000,Nursing home,Institutionalized group quarters population,Male,White,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8929,0_9314,0_3784,Alina,R,Cordova,14,04/19/2005,899,spruce avenue,,Anytown,WA,00000,Household,,Female,Latino,2020
8930,0_9315,0_3784,Aiden,J,Cordova,12,02/10/2008,899,spruce avenue,,Anytown,WA,00000,Household,Biological child,Male,Latino,2020
10226,0_9434,0_3837,Rene,J,Rogers,17,08/04/2002,,bennington hills court,,Anytown,WA,00000,Household,Other relative,Male,Multiracial or Other,2020
10227,0_9314,0_1,Alina,R,Cordova,14,04/19/2005,34,bowen cir sw,,Anytown,WA,00000,Nursing home,Other relative,Female,Latino,2020


In [43]:
dup_household_ids.housing_type.value_counts()

housing_type
Nursing home              140
College                   126
Household                  16
Other noninstitutional      1
Other institutional         1
Name: count, dtype: int64

In [44]:
dup_household_ids.household_id.value_counts()

household_id
0_1       146
0_3       128
0_6613      4
0_3837      4
0_1890      4
0_3784      4
Name: count, dtype: int64

In [49]:
# Census rows from households that contain a duplicated simulant, leaving out
# households 0_1 and 0_3 (the two with well over a hundred members in the
# value_counts output above). Both ID lists are passed to query() as @-references
# instead of being formatted into the expression string, which keeps the filter
# independent of how a Python list repr parses.
excluded_household_ids = ['0_1', '0_3']
duplicated_household_id_list = duplicates.household_id.to_list()
dup_household_ids_not_0103 = data.decennial_census.query(
    "household_id not in @excluded_household_ids"
    " and household_id in @duplicated_household_id_list")
dup_household_ids_not_0103

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
1301,0_9434,0_6613,Rene,J,,17,08/04/2002,11922.0,wagoner dr,,Anytown,WA,0,Household,Other nonrelative,Male,Multiracial or Other,2020
1302,0_13164,0_6613,Richard,M,Rafkin,23,08/09/1996,11922.0,wagoner dr,,Anytown,WA,0,Household,Other nonrelative,Male,White,2020
1303,0_16393,0_6613,Daniel,,Mcconchie,62,09/18/1957,11922.0,wagoner dr,,Anytown,WA,0,Household,Reference person,Male,White,2020
1305,0_19924,0_6613,Brian,J,Martin,32,04/07/1987,11922.0,wagoner dr,,Anytown,WA,0,Household,Other nonrelative,,AIAN,2020
3854,0_9431,0_3837,Denise,H,Rogers,33,09/28/1986,,bennington hills court,,Anytown,WA,0,Household,Reference person,Female,Multiracial or Other,2020
3855,0_9432,0_3837,Logan,L,Rogers,9,01/15/2011,,bennington hills court,,Anytown,WA,0,Household,Biological child,Male,Multiracial or Other,2020
3856,0_9433,0_3837,David,M,Rogers,5,10/18/2014,,bennington hills court,,Anytown,WA,0,Household,Biological child,Male,Multiracial or Other,2020
7358,0_4631,0_1890,Consuelo,R,Prudente,43,09/01/1976,1975.0,calvin ave,,Anytown,WA,0,Household,Reference person,Female,White,2020
7359,0_4632,0_1890,Gabriel,P,Prudente,49,10/29/1970,1975.0,calvin ave,,Anytown,WA,0,Household,Opposite-sex spouse,Male,Latino,2020
7360,0_4633,0_1890,Jazmin,K,Prudente,24,01/01/1996,1975.0,calvin ave,,Anytown,WA,0,Household,Biological child,Female,White,2020


In [50]:
dup_household_ids_not_0103.housing_type.value_counts()

housing_type
Household    15
Name: count, dtype: int64

# Why do we have one simulant each in 'Other institutional' and 'Other noninstitutional'?

Based on the unnoised data, it looks like they're actually in college, and the other housing types must be the result of "choose the wrong option" noise.

In [46]:
dup_household_ids.query("housing_type in ['Other noninstitutional', 'Other institutional']")

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
1900,0_11077,0_3,Carlos,I,Das,20,06/22/1999,8203,west farwell avenue,,Anytown,WA,0,Other noninstitutional,Noninstitutionalized group quarters population,Male,Asian,2020
1905,0_13076,0_3,Nicholas,A,Mckinney,20,10/14/1999,8203,west farwell avenue,,Anytown,WA,0,Other institutional,Noninstitutionalized group quarters population,Male,White,2020


In [48]:
data0.decennial_census.query("simulant_id in ['0_11077', '0_13076']")

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
1934,0_11077,0_3,Carlos,I,Das,20,06/22/1999,8203,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Male,Asian,2020
1939,0_13076,0_3,Nicholas,A,Mckinney,20,10/14/1999,8203,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Male,White,2020


# Let's see if people in the same household are grouped together in the dataframes

Not entirely, but it seems close...

In [55]:
# All noised-census rows of household 0_3, then a check of whether their index
# labels form one unbroken run (i.e. whether the household is stored contiguously).
in_03 = data.decennial_census.query("household_id == '0_3'")
alpha.index_is_consecutive(in_03)

False

In [58]:
# Difference between each index label of household 0_3 and the label before it;
# a value of 1 means the two rows are adjacent in the census dataframe, anything
# else means rows from other households sit in between.
diffs = in_03.index[1:] - in_03.index[:-1]
diffs

Index([1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       ...
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      dtype='int64', length=127)

In [59]:
(diffs != 1).sum()

4

In [60]:
in_03

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
1870,0_727,0_3,Kari,L,Henderson,36,05/05/1983,8203,west farwell avenue,,Anytown,WA,00000,College,Noninstitutionalized group quarters population,Female,White,2020
1871,0_1297,0_3,Marian,A,Mcfrazier,25,07/24/1994,8203,west farwell avenue,,Anytown,WA,00000,College,Noninstitutionalized group quarters population,Female,Black,2020
1873,0_1804,0_3,Benjamin,M,Ford,21,01/09/1999,8203,west farwell avenue,,Anytown,WA,00000,College,Noninstitutionalized group quarters population,Male,White,2020
1874,0_1928,0_3,Lucille,C,Spicer,81,12/10/1938,8203,west farwell avenue,,Anytown,WA,00000,College,Noninstitutionalized group quarters population,Female,Black,2020
1875,0_2730,0_3,Matthew,B,Sabal,25,02/26/1995,8203,west farwell avenue,,Anytown,WA,00000,College,Noninstitutionalized group quarters population,Male,Asian,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1997,0_20027,0_3,Bryce,J,Lam,21,08/17/1998,8203,west farwell avenue,,Anytown,WA,00000,College,Noninstitutionalized group quarters population,Male,Asian,2020
1998,0_20082,0_3,Eugene,G,Kong,19,10/12/2000,8203,west farwell avenue,,Anytown,WA,00000,College,Noninstitutionalized group quarters population,Male,Asian,2020
1999,0_20209,0_3,Elizabeth,J,Frost,23,08/29/1996,8203,west farwell avenue,,Anytown,WA,00000,College,Noninstitutionalized group quarters population,Female,White,2020
2000,0_20329,0_3,Zachary,J,Collier,17,01/20/2003,8203,west farwell avenue,,Anytown,WA,00000,College,Noninstitutionalized group quarters population,Male,White,2020


# Why is there a random person in a different household inserted at 1872?

I don't know...

In [61]:
# Index label 1872 is skipped between household 0_3's rows 1871 and 1873 in the
# listing above; look up the census row that occupies it.
data.decennial_census.loc[1872]

simulant_id                                   0_1546
household_id                                   0_629
first_name                                    Dionne
middle_initial                                     S
last_name                                    Schreck
age                                               45
date_of_birth                             03/07/1975
street_number                                  11591
street_name                                  k ih dr
unit_number                                      NaN
city                                         Anytown
state                                             WA
zipcode                                        00000
housing_type                               Household
relationship_to_reference_person    Reference person
sex                                           Female
race_ethnicity                                 White
year                                            2020
Name: 1872, dtype: object

In [62]:
data.decennial_census.query("household_id == '0_629'")

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
1872,0_1546,0_629,Dionne,S,Schreck,45,03/07/1975,11591,k ih dr,,Anytown,WA,0,Household,Reference person,Female,White,2020


In [63]:
data0.decennial_census.query("household_id == '0_629'")

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
1906,0_1546,0_629,Dionne,S,Schreck,45,03/07/1975,11591,k ih dr,,Anytown,WA,0,Household,Reference person,Female,White,2020


# Looks like there are several single random people inserted in the middle of household `0_3`

In [64]:
# Household 0_3 rows that come immediately BEFORE a gap: diffs[i] compares row i
# with row i+1, so masking in_03[:-1] selects the earlier row of each non-adjacent pair.
in_03[:-1].loc[diffs != 1]

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
1871,0_1297,0_3,Marian,A,Mcfrazier,25,07/24/1994,8203,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Female,Black,2020
1881,0_3858,0_3,Martha,R,Hackleman,91,10/17/1928,8203,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Female,White,2020
1897,0_10089,0_3,James,D,Starr,35,05/09/1984,8203,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Male,White,2020
1915,0_18775,0_3,Asia,H,Norton,20,12/30/1999,8203,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Female,White,2020


In [65]:
# Household 0_3 rows that come immediately AFTER a gap: masking in_03[1:] with the
# same condition selects the later row of each non-adjacent pair.
in_03[1:].loc[diffs != 1]

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
1873,0_1804,0_3,Benjamin,M,Ford,21,01/09/1999,8203,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Male,White,2020
1883,0_4150,0_3,Sophia,M,Fleischmann,18,04/21/2001,8203,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Female,White,2020
1899,0_10884,0_3,Cynthia,M,Easley,21,01/11/1999,8203,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Female,Black,2020
1917,0_19404,0_3,Makayla,L,Butler,19,07/10/1999,8203,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Female,White,2020


# Let's try something more sophisticated...

Yes, the households that have members that are very far apart in the dataframe are precisely those with a simulant duplicated at a guardian's address, namely the last three rows in the census dataframe.

In [67]:
data.decennial_census.household_id.nunique()

4065

In [72]:
def index_span_minus_size(df):
    """Return how many index labels within ``df``'s index span are not rows of ``df``.

    The span runs from the first index label to the last one, inclusive, so the
    result is 0 when the labels are consecutive integers and grows by one for
    every label that is skipped in between. Because only the first and last
    labels are inspected, the index is expected to hold integers in ascending
    order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are examined.

    Returns
    -------
    int
        ``(last_label - first_label + 1) - len(df)``, or 0 for an empty frame.
    """
    # An empty frame has no first/last label to read; it spans nothing and skips nothing.
    if len(df) == 0:
        return 0
    return (df.index[-1] - df.index[0] + 1) - len(df)

index_span_minus_size(in_03)

4

In [73]:
# For every household, count the index labels between its first and last row that
# are not rows of that household (0 means the household is stored contiguously).
hh_grouping_discrepancies = data.decennial_census.groupby('household_id').apply(index_span_minus_size)
# index is household_id, values are the discrepancy for that household
hh_grouping_discrepancies

household_id
0_0          7
0_1       9791
0_100        0
0_1000       0
0_1002       0
          ... 
0_993        0
0_994        0
0_995        0
0_996        0
0_998        0
Length: 4065, dtype: int64

In [74]:
# index is the discrepancy, values are the number of households with that discrepancy
hh_grouping_discrepancies.value_counts()

0       3798
3         45
2         38
5         35
4         32
1         31
6         21
7         18
10         9
8          8
11         8
12         6
14         4
9          3
13         3
17         2
2867       1
19         1
9791       1
6369       1
Name: count, dtype: int64

In [80]:
hh_grouping_discrepancies.loc[hh_grouping_discrepancies > 100]

household_id
0_1       9791
0_1890    2867
0_3837    6369
dtype: int64

In [78]:
duplicates.household_id.unique()

array(['0_6613', '0_3', '0_3784', '0_3837', '0_1', '0_1890'], dtype=object)

In [81]:
hh_grouping_discrepancies.loc[duplicates.household_id.unique()]

household_id
0_6613       1
0_3          4
0_3784       1
0_3837    6369
0_1       9791
0_1890    2867
dtype: int64

# What do some of the other households with large-ish numbers look like?

Hmm, not sure why sometimes a simulant is several rows below the rest of their household.

In [83]:
# Households whose discrepancy lies strictly between 10 and 100 (both ends excluded)
hh_grouping_discrepancies[hh_grouping_discrepancies.between(10, 100, inclusive='neither')]

household_id
0_1436    13
0_1528    11
0_1645    17
0_1865    19
0_2050    11
0_2111    13
0_2131    14
0_2385    11
0_307     12
0_3189    12
0_3482    14
0_3730    11
0_485     17
0_5528    14
0_6024    11
0_6218    12
0_6278    12
0_6712    14
0_726     11
0_7852    12
0_810     11
0_8207    13
0_902     12
0_987     11
dtype: int64

In [84]:
data.decennial_census.query("household_id == '0_1436'")

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
5991,0_3530,0_1436,Terrence,P,Stewart,43,04/12/1976,11,south melbourne drive,,Anytown,,0,Household,Reference person,Male,Latino,2020
5992,0_3531,0_1436,Lamonica,K,Lady,38,01/14/1982,11,south melbourne drive,,Anytown,WA,0,Household,Opposite-sex spouse,Female,Latino,2020
5993,0_3532,0_1436,Justice,E,Stewart,9,03/29/2010,11,soyth melgojrne drivd,,Anytown,WA,0,Household,Biological child,Male,Latino,2020
5994,0_3533,0_1436,Josue,C,Stewart,4,06/26/2015,11,south melbourne drive,,Anytown,WA,0,Household,Biological child,Male,Latino,2020
6008,0_20338,0_1436,Annabelle,A,Stewart,0,01/12/2020,11,south melbourne drive,,Anytiwn,WA,0,Household,Biological child,Female,Latino,2020


In [85]:
# Show the stretch of the census from the first to the last row of household 0_1436,
# including the rows of other households that sit in between. The slice bounds are
# taken from the household's own index labels rather than typed in by hand, so the
# cell keeps pointing at the right rows if the generated data changes.
hh_1436_index = data.decennial_census.query("household_id == '0_1436'").index
data.decennial_census.loc[hh_1436_index[0]:hh_1436_index[-1]]

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
5991,0_3530,0_1436,Terrence,P,Stewart,43,04/12/1976,11,south melbourne drive,,Anytown,,0.0,Household,Reference person,Male,Latino,2020
5992,0_3531,0_1436,Lamonica,K,Lady,38,01/14/1982,11,south melbourne drive,,Anytown,WA,0.0,Household,Opposite-sex spouse,Female,Latino,2020
5993,0_3532,0_1436,Justice,E,Stewart,9,03/29/2010,11,soyth melgojrne drivd,,Anytown,WA,0.0,Household,Biological child,Male,Latino,2020
5994,0_3533,0_1436,Josue,C,Stewart,4,06/26/2015,11,south melbourne drive,,Anytown,WA,0.0,Household,Biological child,Male,Latino,2020
5995,0_6379,0_2596,Donnie,N,Emrick,61,04/28/1958,8572,diana pl,,Anytown,WA,0.0,Household,Reference person,Male,White,2020
5996,0_11830,0_4781,Willie,I,Loniewski,39,10/28/1980,12651,pasadena dr,,Anytown,WA,0.0,Household,Reference person,Male,White,2020
5997,0_11831,0_4781,Heather,K,Loniewski,38,02/19/1982,12651,pasadena dr,,Anytown,WA,0.0,Household,Opposite-sex spouse,Female,White,2020
5998,0_11832,0_4781,Evan,L,Dont Know,12,04/07/2007,12651,pasadena dr,,Anytown,WA,0.0,Household,Biological child,Male,White,2020
5999,0_11833,0_4781,Chelsea,M,Loniewski,10,03/13/2010,12651,pasadena dr,,Anytown,WA,0.0,Household,Biological child,Female,White,2020
6000,0_11834,0_4781,Zamarion,E,Loniewski,8,07/10/2011,12651,pasadena dr,,Anytown,WA,0.0,Household,Biological child,Male,White,2020


# Let's generate a census with more guardian duplication

In [88]:
default_config = psp.get_config()
default_config['decennial_census']

{'row_noise': {'do_not_respond': {'row_probability': 0.0145},
  'omit_row': {'row_probability': 0.0},
  'duplicate_with_guardian': {'row_probability_in_households_under_18': 0.02,
   'row_probability_in_college_group_quarters_under_24': 0.05}},
 'column_noise': {'first_name': {'leave_blank': {'cell_probability': 0.01},
   'use_nickname': {'cell_probability': 0.01},
   'use_fake_name': {'cell_probability': 0.01},
   'make_phonetic_errors': {'cell_probability': 0.01,
    'token_probability': 0.1},
   'make_ocr_errors': {'cell_probability': 0.01, 'token_probability': 0.1},
   'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
  'middle_initial': {'leave_blank': {'cell_probability': 0.01},
   'make_phonetic_errors': {'cell_probability': 0.01,
    'token_probability': 0.1},
   'make_ocr_errors': {'cell_probability': 0.01, 'token_probability': 0.1},
   'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
  'last_name': {'leave_blank': {'cell_probability': 0.

In [89]:
# Guardian-duplication probabilities for the decennial census, set far above the
# default values shown in the config output above (0.02 and 0.05) so that many
# more duplicated rows are produced.
guardian_duplication_probabilities = {
    'row_probability_in_households_under_18': 0.50,
    'row_probability_in_college_group_quarters_under_24': 0.80,
}
overrides = {
    'decennial_census': {
        'row_noise': {'duplicate_with_guardian': guardian_duplication_probabilities},
    },
}

In [90]:
# Generate only the decennial census, passing the guardian-duplication overrides as the config.
# NOTE(review): the warnings in the output below say both requested probabilities exceed
# the maximum the sample data supports, so pseudopeople noises as many rows as possible
# rather than the configured share.
bad_census = psp.generate_decennial_census(config=overrides)
bad_census

[32m2024-01-19 16:21:28.353[0m | [36mpseudopeople.configuration.validator[0m:[36mvalidate_noise_level_proportions[0m:[36m335[0m - [33m[1mThe configured 'row_probability_in_households_under_18' noise level for row_noise 'nan' is 0.5, which is higher than the maximum possible value based on the provided data for 'decennial_census'. Noising as many rows as possible. [0m
[32m2024-01-19 16:21:28.355[0m | [36mpseudopeople.configuration.validator[0m:[36mvalidate_noise_level_proportions[0m:[36m335[0m - [33m[1mThe configured 'row_probability_in_college_group_quarters_under_24' noise level for row_noise 'nan' is 0.8, which is higher than the maximum possible value based on the provided data for 'decennial_census'. Noising as many rows as possible. [0m


                                                                                                        

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
0,0_2,0_7,Diana,P,Kofron,25,05/06/1994,5112,145th st,,Anytown,WA,00000,Household,Reference person,Female,,2020
1,0_3,0_7,Anna,A,Kofron,25,09/29/1994,5112,145th st,,Anytown,WA,00000,Household,Other relative,Female,White,2020
2,0_923,0_8033,Gerald,R,Butler,76,11/03/1943,1130,mallory ln,,Anytown,WA,00000,Household,Reference person,Male,Black,2020
3,0_2641,0_1066,Loretta,T,Carley,61,06/01/1958,32597,delacorte dr,,Anytown,WA,00000,Household,Reference person,Female,White,2020
4,0_2801,0_1138,Richard,R,Jones,73,03/03/1947,950,caribou lane,,Anytown,WA,00000,Household,Reference person,Male,White,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10324,0_19541,0_5710,Maria,L,Woodhouse,22,06/11/1997,13614,burtz street northwest,,Anytown,WA,00000,Household,Other relative,Female,Black,2020
10325,0_19580,0_2254,Brooke,E,Morales,23,03/27/1996,5451,northwestrn pwy,,Anytown,WA,00000,Household,Other relative,Female,Latino,2020
10326,0_19683,0_3633,James,V,No,23,08/13/1996,580,breeze hill rd,,Anytown,WA,00000,Household,Other relative,Male,White,2020
10327,0_19697,0_5337,Lindsey,L,Milam,23,12/02/1996,925,sawmill rd,,Anytown,WA,00000,Household,Other relative,Female,White,2020


In [92]:
# Boolean mask over bad_census: True wherever the simulant_id occurs more than once
# (keep=False flags every copy, not only the repeats).
bad_duplicated_ids = bad_census['simulant_id'].duplicated(keep=False)
# The flagged rows themselves
bad_duplicates = bad_census[bad_duplicated_ids]
bad_duplicates

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
46,0_5338,0_2183,Grayson,X,Lopez,3,03/15/2017,1613,cr ledge dr,,Anytown,WA,00000,Household,Biological child,Male,Latino,2020
81,0_17232,0_4484,Wade,J,Widmann,3,07/17/2016,3303,viceroy dr,,Anytown,WA,00000,Household,Other nonrelative,Male,White,2020
197,0_7454,0_3037,,Z,Kruse,2,05/27/2017,6470,cr 430 rd,lot 28,Anytown,WA,00000,Household,Biological child,Male,White,2020
938,0_6936,0_2824,Ella,G,Leftwich,3,10/02/2016,17631,n roxboro st,,Anytown,WA,00000,Household,Biological child,Female,White,2020
978,0_5656,0_2302,Edison,A,Diamond,3,03/13/2017,235,pleasant vy rd,,Anytown,WA,00000,Household,Grandchild,Male,Black,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10324,0_19541,0_5710,Maria,L,Woodhouse,22,06/11/1997,13614,burtz street northwest,,Anytown,WA,00000,Household,Other relative,Female,Black,2020
10325,0_19580,0_2254,Brooke,E,Morales,23,03/27/1996,5451,northwestrn pwy,,Anytown,WA,00000,Household,Other relative,Female,Latino,2020
10326,0_19683,0_3633,James,V,No,23,08/13/1996,580,breeze hill rd,,Anytown,WA,00000,Household,Other relative,Male,White,2020
10327,0_19697,0_5337,Lindsey,L,Milam,23,12/02/1996,925,sawmill rd,,Anytown,WA,00000,Household,Other relative,Female,White,2020


In [97]:
bad_duplicates.index[-100:]

Index([10227, 10228, 10229, 10230, 10231, 10232, 10233, 10234, 10235, 10236,
       10237, 10238, 10239, 10240, 10241, 10242, 10243, 10244, 10245, 10246,
       10247, 10248, 10249, 10250, 10251, 10252, 10253, 10254, 10255, 10256,
       10257, 10258, 10259, 10260, 10261, 10262, 10263, 10264, 10265, 10266,
       10267, 10268, 10269, 10271, 10272, 10273, 10274, 10275, 10276, 10277,
       10278, 10279, 10280, 10281, 10282, 10283, 10284, 10285, 10286, 10287,
       10288, 10289, 10290, 10291, 10292, 10293, 10294, 10295, 10296, 10297,
       10298, 10299, 10300, 10301, 10302, 10303, 10304, 10305, 10306, 10307,
       10308, 10309, 10311, 10312, 10313, 10314, 10315, 10316, 10317, 10318,
       10319, 10320, 10321, 10322, 10323, 10324, 10325, 10326, 10327, 10328],
      dtype='int64')

In [98]:
bad_census.index[-100:]

RangeIndex(start=10229, stop=10329, step=1)

In [99]:
data0.decennial_census

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
0,0_2,0_7,Diana,P,Kofron,25,05/06/1994,5112,145th st,,Anytown,WA,00000,Household,Reference person,Female,White,2020
1,0_3,0_7,Anna,A,Kofron,25,09/29/1994,5112,145th st,,Anytown,WA,00000,Household,Other relative,Female,White,2020
2,0_923,0_8033,Gerald,R,Butler,76,11/03/1943,1130,mallory ln,,Anytown,WA,00000,Household,Reference person,Male,Black,2020
3,0_2641,0_1066,Loretta,T,Carley,61,06/01/1958,32597,delacorte dr,,Anytown,WA,00000,Household,Reference person,Female,White,2020
4,0_2801,0_1138,Richard,R,Jones,73,03/03/1947,950,caribou lane,,Anytown,WA,00000,Household,Reference person,Male,White,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10382,0_18969,0_7630,Patty,E,Seligmann,87,01/11/1933,1706,lincoln ave se,,Anytown,WA,00000,Household,Opposite-sex spouse,Female,White,2020
10383,0_19008,0_8361,John,V,Thomas,58,12/29/1961,7736,trophy dr,,Anytown,WA,00000,Household,Reference person,Male,Black,2020
10384,0_20165,0_7999,Kimberly,K,Smith,65,04/05/1955,3506,oneill avenue,,Anytown,WA,00000,Household,Reference person,Female,White,2020
10385,0_19020,0_8130,Virginia,G,Sanderson,93,10/02/1926,201,edinburgh rd,,Anytown,WA,00000,Household,Reference person,Female,White,2020
