In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import logging
logging.basicConfig(level=logging.DEBUG, format='{asctime} - {name} - {levelname} - {message}', style='{')
import numpy as np, matplotlib.pyplot as plt, pandas as pd
# For finding and loading multiple files
import os
# Use this to see how much memory the dataframes use
from sys import getsizeof
from pathlib import Path
from linetimer import CodeTimer, linetimer

import pseudopeople as psp

from vivarium_research_prl.utils import sizemb, MappingViaAttributes
from vivarium_research_prl import alpha

!date
!whoami
!uname -a
!pwd
!python --version
!conda info --envs | grep '\*'
!conda list | grep -e pandas -e numpy -e vivarium -e pseudopeople

2024-02-01 16:03:17,476 - matplotlib - DEBUG - matplotlib data path: /ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311/lib/python3.11/site-packages/matplotlib/mpl-data
2024-02-01 16:03:17,488 - matplotlib - DEBUG - CONFIGDIR=/ihme/homes/ndbs/.config/matplotlib
2024-02-01 16:03:17,490 - matplotlib - DEBUG - interactive is False
2024-02-01 16:03:17,491 - matplotlib - DEBUG - platform is linux
2024-02-01 16:03:17,619 - matplotlib - DEBUG - CACHEDIR=/ihme/homes/ndbs/.cache/matplotlib
2024-02-01 16:03:17,623 - matplotlib.font_manager - DEBUG - Using fontManager instance from /ihme/homes/ndbs/.cache/matplotlib/fontlist-v330.json
2024-02-01 16:03:18,443 - numexpr.utils - INFO - Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.


Thu 01 Feb 2024 04:03:24 PM PST
ndbs
Linux int-slurm-sarchive-p0005 5.4.0-167-generic #184-Ubuntu SMP Tue Oct 31 09:21:49 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing
Python 3.11.7
ppplv1.0-311          *  /ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311
numpy                     1.26.3                   pypi_0    pypi
pandas                    2.1.4                    pypi_0    pypi
pseudopeople              0.8.4.dev32+g8c8f99b          pypi_0    pypi
vivarium                  2.3.1                    pypi_0    pypi
vivarium-research-prl     0.0.1                     dev_0    <develop>


In [3]:
!pip freeze | grep pseudopeople

pseudopeople @ git+https://github.com/ihmeuw/pseudopeople.git@8c8f99b4ed3111e3bb0b9cd812c923049e39f7a8


In [4]:
# A clickable link to the GitHub UI to see what version I'm using
! pip freeze | grep pseudopeople | sed -e 's|pseudopeople @ ||' | sed -e 's|git+||' | sed -e 's|\.git@|/tree/|'

https://github.com/ihmeuw/pseudopeople/tree/8c8f99b4ed3111e3bb0b9cd812c923049e39f7a8


# Generate all sample datasets

In [5]:
%%time
data = alpha.generate_datasets()

2024-02-01 16:03:33,469 - vivarium_research_prl.alpha - INFO - Code block 'generate_american_community_survey' took: 0.00692 m
2024-02-01 16:03:33,674 - vivarium_research_prl.alpha - INFO - Code block 'generate_current_population_survey' took: 0.00341 m
2024-02-01 16:03:34,377 - vivarium_research_prl.alpha - INFO - Code block 'generate_decennial_census' took: 0.01169 m
2024-02-01 16:03:34,854 - vivarium_research_prl.alpha - INFO - Code block 'generate_social_security' took: 0.00794 m
2024-02-01 16:03:35,970 - vivarium_research_prl.alpha - INFO - Code block 'generate_taxes_1040' took: 0.01858 m    
2024-02-01 16:03:36,822 - vivarium_research_prl.alpha - INFO - Code block 'generate_taxes_w2_and_1099' took: 0.01420 m
2024-02-01 16:03:37,064 - vivarium_research_prl.alpha - INFO - Code block 'generate_women_infants_and_children' took: 0.00403 m


CPU times: user 3.85 s, sys: 102 ms, total: 3.95 s
Wall time: 4.01 s


# Generate unnoised sample datasets

In [6]:
data0 = alpha.generate_datasets(config=psp.NO_NOISE)

2024-02-01 16:03:37,222 - vivarium_research_prl.alpha - INFO - Code block 'generate_american_community_survey' took: 0.00219 m
2024-02-01 16:03:37,351 - vivarium_research_prl.alpha - INFO - Code block 'generate_current_population_survey' took: 0.00213 m
2024-02-01 16:03:37,695 - vivarium_research_prl.alpha - INFO - Code block 'generate_decennial_census' took: 0.00573 m
2024-02-01 16:03:37,932 - vivarium_research_prl.alpha - INFO - Code block 'generate_social_security' took: 0.00393 m
2024-02-01 16:03:38,424 - vivarium_research_prl.alpha - INFO - Code block 'generate_taxes_1040' took: 0.00819 m    
2024-02-01 16:03:38,858 - vivarium_research_prl.alpha - INFO - Code block 'generate_taxes_w2_and_1099' took: 0.00723 m
2024-02-01 16:03:38,960 - vivarium_research_prl.alpha - INFO - Code block 'generate_women_infants_and_children' took: 0.00169 m


# Make sure indices in the noised data are consecutive

In [7]:
{k: alpha.index_is_consecutive(df) for k, df in data.items()}

{'american_community_survey': True,
 'current_population_survey': True,
 'decennial_census': True,
 'social_security': True,
 'taxes_1040': True,
 'taxes_w2_and_1099': True,
 'women_infants_and_children': True}

In [8]:
{k: alpha.index_is_consecutive(df) for k, df in data0.items()}

{'american_community_survey': True,
 'current_population_survey': True,
 'decennial_census': True,
 'social_security': True,
 'taxes_1040': True,
 'taxes_w2_and_1099': True,
 'women_infants_and_children': True}

# Check lengths of each dataset vs. unnoised version

In [9]:
{k: len(df) for k, df in data.items()}

{'american_community_survey': 74,
 'current_population_survey': 40,
 'decennial_census': 10225,
 'social_security': 16497,
 'taxes_1040': 6518,
 'taxes_w2_and_1099': 9903,
 'women_infants_and_children': 140}

In [10]:
{k: len(df) for k, df in data0.items()}

{'american_community_survey': 76,
 'current_population_survey': 51,
 'decennial_census': 10387,
 'social_security': 16497,
 'taxes_1040': 6518,
 'taxes_w2_and_1099': 9972,
 'women_infants_and_children': 141}

# Find duplicated simulant IDs

Hmm, looks like the duplicated rows all appear at the end of the dataframe...

In [11]:
duplicated_ids = data.decennial_census.simulant_id.duplicated(keep=False)
duplicates = data.decennial_census.loc[duplicated_ids]
duplicates

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
1266,0_19579,0_1802,Brielle,L,Gonzalez,19,11/27/2000,233,saint peters road,,Anytown,WA,0,Household,Other relative,Female,Latino,2020
1752,0_19556,0_2064,8eniamin,F,Allen,19,Oz/26/z0o1,2002,203rd pl se,,Anytown,WA,0,Household,Other relative,Male,Black,2020
3011,0_19556,0_3,Benjamin,C,Allen,19,02/26/2001,8203,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Male,Black,2020
3015,0_19579,0_3,Brielle,L,Gonzalez,19,11/27/2000,8203,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Female,Latino,2020
3034,0_19666,0_3,Kyle,L,Nims,19,03/31/2000,8203,west farwell avenue,,Anytown,MS,0,College,Noninstitutionalized group quarters population,Male,White,2020
3043,0_19693,0_3,Johana,M,Huang,20,08/04/1999,8203,west farwell avenue,,Anytown,WA,0,College,Noninstitutionalized group quarters population,Female,Asian,2020
5608,0_11994,0_4843,Lauren,H,Consul,17,10/25/2002,10949,delaware ave,,Anytown,WA,0,Household,Reference person,Female,White,2020
7401,0_19693,0_6152,Johana,M,Huang,20,08/04/1999,1095,ernst st,,Anytown,WA,0,Household,Other relative,Female,Asian,2020
9767,0_11994,0_8051,Lauren,H,Consul,17,10/25/2002,3304,ethan allen way,unit 200,Anytown,WA,0,Household,Other relative,Female,White,2020
10054,0_19666,0_881,Kyle,L,Nims,19,03/31/2000,224,s moraine st,,Anytown,WA,0,Household,Other relative,Male,White,2020


In [12]:
duplicates.simulant_id.value_counts()

simulant_id
0_19579    2
0_19556    2
0_19666    2
0_19693    2
0_11994    2
Name: count, dtype: int64

# Looks like duplicated simulants are still at the end of the dataframe...

In [13]:
data.decennial_census

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
0,0_123,0_0,Angela,A,Mckinney,19,01/14/2001,1231,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Female,White,2020
1,0_244,0_0,Marcus,S,Chamberlain,20,01/30/2000,1231,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Male,White,2020
2,0_691,0_0,Kelvin,M,Higgins,59,06/04/1960,1231,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Male,Multiracial or Other,2020
3,0_2606,0_0,Michael,M,Scorzelli,14,12/05/2005,1231,riverside dr,,Anytown,WA,,Carceral,Institutionalized group quarters population,Male,Multiracial or Other,2020
4,0_3692,0_0,Dustin,M,Ormiston,30,08/05/1989,,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Male,Multiracial or Other,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10220,0_2482,0_996,Craig,S,Levi,69,04/24/1950,15200,a forest st,,Anytown,WA,00000,Household,Parent,Male,White,2020
10221,0_2483,0_996,Donna,R,Dont Know,69,11/30/1950,10200,a forest st,,Anytown,WA,00000,Household,Parent,Female,White,2020
10222,0_2487,0_998,Charles,J,Taylor,70,05/13/1949,1121,lida ct,,Anytown,WA,00000,Household,Reference person,Male,White,2020
10223,0_2488,0_998,Rebecca,R,Taylor,52,04/13/1967,1121,lida ct,,Anytown,WA,00000,Household,Opposite-sex spouse,,White,2020


In [14]:
data.decennial_census.index

RangeIndex(start=0, stop=10225, step=1)

# Calculate the "non-consecutiveness" of each household

In [15]:
data.decennial_census.household_id.nunique()

4061

In [16]:
# People in household 0_3
in_03 = data.decennial_census.query("household_id == '0_3'")
alpha.index_is_consecutive(in_03)

True

In [17]:
len(in_03)

128

In [18]:
def index_span_minus_size(df):
    """Calculates how far a dataframe index is
    from being consecutive.
    """
    return (df.index[-1] - df.index[0] + 1) - len(df)

index_span_minus_size(in_03)

0

In [19]:
hh_grouping_discrepancies = data.decennial_census.groupby('household_id').apply(index_span_minus_size)
# index is household_id, values are the discrepancy for that household
hh_grouping_discrepancies

household_id
0_0       0
0_1       0
0_100     0
0_1000    0
0_1002    0
         ..
0_993     0
0_994     0
0_995     0
0_996     0
0_998     0
Length: 4061, dtype: int64

In [20]:
# index is the discrepancy, values are the number of households with that discrepancy
hh_grouping_discrepancies.value_counts()

0    4061
Name: count, dtype: int64

In [21]:
hh_grouping_discrepancies.loc[hh_grouping_discrepancies > 100]

Series([], dtype: int64)

# Sort the census and compare with the version generated above

In [22]:
data.decennial_census.year.unique()

array([2020], dtype=object)

In [23]:
sorted_census = data.decennial_census.sort_values(['year', 'household_id']).reset_index(drop=True)
sorted_census

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
0,0_123,0_0,Angela,A,Mckinney,19,01/14/2001,1231,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Female,White,2020
1,0_244,0_0,Marcus,S,Chamberlain,20,01/30/2000,1231,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Male,White,2020
2,0_691,0_0,Kelvin,M,Higgins,59,06/04/1960,1231,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Male,Multiracial or Other,2020
3,0_2606,0_0,Michael,M,Scorzelli,14,12/05/2005,1231,riverside dr,,Anytown,WA,,Carceral,Institutionalized group quarters population,Male,Multiracial or Other,2020
4,0_3692,0_0,Dustin,M,Ormiston,30,08/05/1989,,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Male,Multiracial or Other,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10220,0_2482,0_996,Craig,S,Levi,69,04/24/1950,15200,a forest st,,Anytown,WA,00000,Household,Parent,Male,White,2020
10221,0_2483,0_996,Donna,R,Dont Know,69,11/30/1950,10200,a forest st,,Anytown,WA,00000,Household,Parent,Female,White,2020
10222,0_2487,0_998,Charles,J,Taylor,70,05/13/1949,1121,lida ct,,Anytown,WA,00000,Household,Reference person,Male,White,2020
10223,0_2488,0_998,Rebecca,R,Taylor,52,04/13/1967,1121,lida ct,,Anytown,WA,00000,Household,Opposite-sex spouse,,White,2020


In [24]:
sorted_census.index

RangeIndex(start=0, stop=10225, step=1)

In [25]:
dup_household_ids = sorted_census.query(f"household_id in {duplicates.household_id.to_list()}")
dup_household_ids

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
1263,0_4413,0_1802,John,M,Ventura,51,04/11/1968,233,saint peters road,,Anytown,WA,00000,Household,Reference person,Male,Latino,2020
1264,0_4414,0_1802,Zachary,C,Ventura,31,04/11/1968,233,saint peters road,,Anytown,WA,00000,Household,Biological child,Male,Latino,2020
1265,0_4415,0_1802,Sullivan,M,Ventura,6,12/06/2013,233,saint peters road,,Anytown,WA,00000,Household,Biological child,Male,White,2020
1266,0_19579,0_1802,Brielle,L,Gonzalez,19,11/27/2000,233,saint peters road,,Anytown,WA,00000,Household,Other relative,Female,Latino,2020
1746,0_5040,0_2064,Scott,T,Sanders,57,05/08/1962,2002,203rd pl se,,Anytown,WA,00000,Household,Reference person,Male,,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10050,0_2188,0_881,Michelle,B,Smith,45,06/13/1974,224,s moraine st,,Anytown,WA,00000,Household,Reference person,Female,White,2020
10051,0_2189,0_881,David,J,Smith,45,08/26/1974,224,s moraine st,,Anytown,WA,00000,Household,Opposite-sex spouse,Male,White,2020
10052,0_2190,0_881,Jimmy,A,Smith,14,01/21/2006,224,s moraine st,,Anytown,WA,,Household,Biological child,Male,White,2020
10053,0_2191,0_881,Benjamin,R,Smith,10,02/13/2010,224,s moraine st,,Anytown,WA,00000,Household,Biological child,Male,White,2020


In [26]:
dup_household_ids.housing_type.value_counts()

housing_type
College      127
Household     22
Name: count, dtype: int64

In [27]:
dup_household_ids.query("housing_type == 'Household'")

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
1263,0_4413,0_1802,John,M,Ventura,51,04/11/1968,233,saint peters road,,Anytown,WA,0.0,Household,Reference person,Male,Latino,2020
1264,0_4414,0_1802,Zachary,C,Ventura,31,04/11/1968,233,saint peters road,,Anytown,WA,0.0,Household,Biological child,Male,Latino,2020
1265,0_4415,0_1802,Sullivan,M,Ventura,6,12/06/2013,233,saint peters road,,Anytown,WA,0.0,Household,Biological child,Male,White,2020
1266,0_19579,0_1802,Brielle,L,Gonzalez,19,11/27/2000,233,saint peters road,,Anytown,WA,0.0,Household,Other relative,Female,Latino,2020
1746,0_5040,0_2064,Scott,T,Sanders,57,05/08/1962,2002,203rd pl se,,Anytown,WA,0.0,Household,Reference person,Male,,2020
1747,0_5041,0_2064,Sandra,H,Sanders,38,10/29/1981,2002,203rd pl se,,Anytown,WA,0.0,Household,Opposite-sex spouse,Female,Black,2020
1748,0_5042,0_2064,Samantha,M,Sanders,16,04/25/2003,2002,203rd pl se,,Anytown,WA,0.0,Household,Biological child,Female,Black,2020
1749,0_5043,0_2064,Kimberly,Z,Sanders,14,02/23/2006,2002,203rd pl se,,Anytown,WA,0.0,Household,Biological child,Female,Black,2020
1750,0_5044,0_2064,Christopher,L,Sanders,10,04/15/2009,2002,203rd pl se,,Anytown,WA,0.0,Household,Biological child,Male,Black,2020
1751,0_20285,0_2064,Evan,J,Sanders,0,11/15/2019,2002,203rd pl se,,Anytown,WA,0.0,Household,Grandchild,Male,Black,2020


# Generate a census with more duplication and check household types

Looks like duplicated simulants are having their relationship to reference person correctly assigned.

In [28]:
config = {
    'decennial_census': {
        'row_noise': {
            'duplicate_with_guardian': {
                'row_probability_in_households_under_18': 0.70,
                'row_probability_in_college_group_quarters_under_24': 0.90
            }
        },
        'column_noise': {
            'housing_type': {
                'choose_wrong_option': {'cell_probability': 0.00}
            },
            'relationship_to_reference_person': {
                'choose_wrong_option': {'cell_probability': 0.00}
            }
        }
    }
}

In [29]:
bad_census = psp.generate_decennial_census(config=config)
bad_census

[32m2024-02-01 16:03:39.763[0m | [36mpseudopeople.configuration.validator[0m:[36mvalidate_noise_level_proportions[0m:[36m335[0m - [33m[1mThe configured 'row_probability_in_households_under_18' noise level for row_noise 'nan' is 0.7, which is higher than the maximum possible value based on the provided data for 'decennial_census'. Noising as many rows as possible. [0m
[32m2024-02-01 16:03:39.764[0m | [36mpseudopeople.configuration.validator[0m:[36mvalidate_noise_level_proportions[0m:[36m335[0m - [33m[1mThe configured 'row_probability_in_college_group_quarters_under_24' noise level for row_noise 'nan' is 0.9, which is higher than the maximum possible value based on the provided data for 'decennial_census'. Noising as many rows as possible. [0m


                                                                                                                   

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
0,0_123,0_0,Angela,A,Mckinney,19,01/14/2001,1231,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Female,White,2020
1,0_244,0_0,Marcus,S,Chamberlain,20,01/30/2000,1231,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Male,White,2020
2,0_691,0_0,Kelvin,M,Higgins,59,06/04/1960,1231,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Male,Multiracial or Other,2020
3,0_2606,0_0,Michael,M,Scorzelli,14,12/05/2005,1231,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Male,Multiracial or Other,2020
4,0_3692,0_0,Dustin,M,Ormiston,30,08/05/1989,1231,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Male,Multiracial or Other,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10350,0_2482,0_996,Craig,S,Levi,69,04/24/1950,10200,a forest st,,Anytown,WA,00000,Household,Parent,Male,White,2020
10351,0_2483,0_996,Donna,R,Ref,69,11/30/1950,10200,a forest st,,Anytown,WA,00000,Household,Parent,Female,White,2020
10352,0_2487,0_998,Charles,J,Taylor,70,05/13/1949,1121,lida ct,,Anytown,WA,00000,Household,Reference person,Male,White,2020
10353,0_2488,0_998,Rebecca,R,Taylor,52,04/13/1967,1121,lida ct,,Anytown,WA,00000,Household,Opposite-sex spouse,,White,2020


In [30]:
bad_dup_ids = bad_census.simulant_id.duplicated(keep=False)
bad_duplicates = bad_census.loc[bad_dup_ids]
bad_duplicates

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
107,0_6935,0_0,Logan,D,Leftwich,6,12/05/2013,1231,riverside dr,,Anytown,WA,00000,,Institutionalized group quarters population,Male,White,2020
108,0_6936,0_0,Ella,G,Leftwich,3,10/02/2016,1231,riverside dr,,Anytown,WA,,Carceral,Institutionalized group quarters population,Female,White,2020
109,0_10552,0_0,Alan,K,Revels,10,12/14/2009,1231,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Male,White,2020
110,0_5431,0_0,Christopher,N,Yager,5,06/23/2014,1231,riverside dr,,Anytown,WA,00000,Carceral,Institutionalized group quarters population,Male,White,2020
255,0_9820,0_1,Allison,L,Curtis,3,12/26/2016,34,bowen cir sw,,Anytown,WA,00001,Nursing home,Institutionalized group quarters population,Female,White,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10260,0_7512,0_926,Dakota,M,Heller,6,11/19/2013,4010,e highway 12,,Anytown,WA,00000,Household,Other relative,Male,White,2020
10261,0_7513,0_926,Benjamin,W,Heller,2,11/08/2017,4010,e highway 12,,Anytown,WA,00000,Household,Other relative,Male,White,2020
10291,0_2367,0_957,Benjamin,R,Moore,14,04/27/2005,3333,hideaway ln,,Anytown,WA,00000,Household,Biological child,Male,White,2020
10299,0_2377,0_961,Luis,P,Brumbelow,16,07/21/2003,703,franklin avn,,Anytown,WA,00000,Household,Other relative,Male,White,2020


In [31]:
bad_duplicates.housing_type.value_counts()

housing_type
Household              218
College                 32
Nursing home             5
Carceral                 3
Other institutional      3
Name: count, dtype: int64

In [32]:
mismatched = bad_duplicates.query("housing_type != 'Household' and relationship_to_reference_person == 'Other relative'")
mismatched

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
8698,0_9019,0_7064,Aaliyah,I,Mason,9,02/27/2011,4444,caldwell,,Anytown,WA,0,,Other relative,Female,White,2020


In [33]:
bad_duplicates.query(f"simulant_id.isin({mismatched.simulant_id.to_list()})")

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
4017,0_9019,0_3662,Aaliyah,I,Mason,9,02/27/2011,2255,vint hill rd,unit # 1013,Anytown,NJ,0,Household,Biological child,Female,White,2020
8698,0_9019,0_7064,Aaliyah,I,Mason,9,02/27/2011,4444,caldwell,,Anytown,WA,0,,Other relative,Female,White,2020


# Check whether there are guardians living in group quarters

Yes.

In [34]:
# HACK -- this only works because the duplicate rows
# appear at the end of the dataframe
n_dups = len(bad_duplicates) // 2
sim_rows = bad_duplicates.iloc[:n_dups+1]
guardian_rows = bad_duplicates.iloc[n_dups:]
assert set(sim_rows.simulant_id) == set(guardian_rows.simulant_id)
guardian_rows

AssertionError: 

In [None]:
guardian_rows.housing_type.value_counts()

In [None]:
sim_rows.housing_type.value_counts()

In [None]:
guardian_rows.query("housing_type != 'Household'")

In [None]:
issubclass(psp.exceptions.DataSourceError, Exception)