In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import logging
logging.basicConfig(level=logging.DEBUG, format='{asctime} - {name} - {levelname} - {message}', style='{')
import numpy as np, matplotlib.pyplot as plt, pandas as pd
# For finding and loading multiple files
import os
import traceback
# Use this to see how much memory the dataframes use
from sys import getsizeof
from pathlib import Path
from linetimer import CodeTimer, linetimer

import pseudopeople as psp

from vivarium_research_prl.utils import sizemb, MappingViaAttributes
from vivarium_research_prl import alpha, datatypes, utils

!date
!whoami
!uname -a
!pwd
!python --version
!conda info --envs | grep '\*'
!conda list | grep -e pandas -e numpy -e vivarium -e pseudopeople
!pip freeze | grep pseudopeople

2024-02-06 19:19:58,804 - matplotlib - DEBUG - matplotlib data path: /ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311/lib/python3.11/site-packages/matplotlib/mpl-data
2024-02-06 19:19:58,811 - matplotlib - DEBUG - CONFIGDIR=/ihme/homes/ndbs/.config/matplotlib
2024-02-06 19:19:58,814 - matplotlib - DEBUG - interactive is False
2024-02-06 19:19:58,815 - matplotlib - DEBUG - platform is linux
2024-02-06 19:19:58,904 - matplotlib - DEBUG - CACHEDIR=/ihme/homes/ndbs/.cache/matplotlib
2024-02-06 19:19:58,906 - matplotlib.font_manager - DEBUG - Using fontManager instance from /ihme/homes/ndbs/.cache/matplotlib/fontlist-v330.json
2024-02-06 19:19:59,266 - numexpr.utils - INFO - Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.


Tue 06 Feb 2024 07:20:02 PM PST
ndbs
Linux long-slurm-sarchive-p0050 5.4.0-167-generic #184-Ubuntu SMP Tue Oct 31 09:21:49 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing
Python 3.11.7
ppplv1.0-311          *  /ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311
numpy                     1.26.3                   pypi_0    pypi
pandas                    2.1.4                    pypi_0    pypi
pseudopeople              0.8.4.dev34+gc5d0c15          pypi_0    pypi
vivarium                  2.3.1                    pypi_0    pypi
vivarium-research-prl     0.0.1                     dev_0    <develop>
pseudopeople @ git+https://github.com/ihmeuw/pseudopeople.git@c5d0c15dc889b0ef51f56fa9757c2949b8531f3b


In [3]:
# A clickable link to the GitHub UI to see what version I'm using
! pip freeze | grep pseudopeople | sed -e 's|pseudopeople @ ||' | sed -e 's|git+||' | sed -e 's|\.git@|/tree/|'

https://github.com/ihmeuw/pseudopeople/tree/c5d0c15dc889b0ef51f56fa9757c2949b8531f3b


# Find data

Project directory:

```
/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop
```

My initial attempt to find RI data (this is the `best` run):
```
'/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_07_28_08_33_09/final_results/2023_07_31_08_59_48/states/pseudopeople_input_data_rhode_island_0.0.2'
```

Directory [Zeb sent in Slack](https://ihme.slack.com/archives/C02KUQ9LX32/p1705720235528949?thread_ts=1705713714.552259&cid=C02KUQ9LX32), saying that this is the latest zipped RI data (this is similar to the `latest` run, but the postprocessing date is different):
```
/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2023_08_31_15_58_01/
```

In [4]:
# Root of the simulated-population project on the shared drive, and the
# directory holding all full-USA runs for this release
project_path = Path('/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop')
usa_path = project_path / 'results/release_02_yellow/full_data/united_states_of_america'

# Not sure what these runs are for...
# `latest` is a symlink; per the `ls` output in this notebook it resolved to
# the directory stored in `latest_run_path` at the time of this run.
# NOTE(review): `best_run_path` is presumably where `best` points (the notes
# above call that run the `best` run) -- confirm by listing the link.
latest_run = usa_path / 'latest'
best_run = usa_path / 'best'
latest_run_path = usa_path / '2023_08_21_16_35_27/final_results/2024_01_11_07_27_51/pseudopeople_input_data_usa_2.0.0'
best_run_path = usa_path / '2023_07_28_08_33_09/final_results/2023_07_31_08_59_48/'

# I think this is the run we should be using...
# Same simulation run as `latest`, but an earlier postprocessing timestamp
last_zipped_path = usa_path / '2023_08_21_16_35_27/final_results/2023_08_31_15_58_01'

# Directories passed to data generation: Rhode Island only, and full USA
ri_data_dir = last_zipped_path / 'states/pseudopeople_simulated_population_rhode_island_2_0_0'
usa_data_dir = last_zipped_path / 'pseudopeople_simulated_population_usa_2_0_0'


In [5]:
!ls -halt $project_path/results/release_02_yellow/full_data/united_states_of_america/latest

lrwxrwxrwx 1 albrja IHME-Simulationscience 218 Jan 11 12:08 /mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/latest -> /mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2024_01_11_07_27_51/pseudopeople_input_data_usa_2.0.0


In [6]:
# Last zipped path (from Zeb)
!ls $usa_path/2023_08_21_16_35_27/final_results/2023_08_31_15_58_01/

pseudopeople_simulated_population_rhode_island_2_0_0.zip
pseudopeople_simulated_population_usa_2_0_0
pseudopeople_simulated_population_usa_2_0_0.zip
states


In [7]:
ri_data_dir

PosixPath('/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2023_08_31_15_58_01/states/pseudopeople_simulated_population_rhode_island_2_0_0')

In [8]:
!ls $ri_data_dir

american_community_survey  decennial_census  taxes_dependents
CHANGELOG.rst		   social_security   taxes_w2_and_1099
current_population_survey  taxes_1040	     women_infants_and_children


# Set up a logger to log to file

In [9]:
logs_directory = Path('logs')
# logging.FileHandler does not create parent directories; without this the
# cell fails on a fresh checkout where `logs/` does not exist yet.
logs_directory.mkdir(parents=True, exist_ok=True)
log_file_path = logs_directory / 'data_generation_ri_20240206.log'

# Set up a logger
# getLogger returns the (shared) logger named after the current module,
# which is `__main__` when run from a notebook
file_logger = logging.getLogger(__name__)

# Re-running this cell must not stack a second handler for the same file,
# otherwise every record would be written to the log twice. Detach and close
# any handler from a previous execution before creating a new one.
for stale_handler in [
    handler for handler in file_logger.handlers
    if isinstance(handler, logging.FileHandler)
    and Path(handler.baseFilename) == log_file_path.absolute()
]:
    file_logger.removeHandler(stale_handler)
    stale_handler.close()

# Create and configure file log handler
file_handler = logging.FileHandler(log_file_path)
# Set level to the lowest threshold (DEBUG) to capture all messages
file_handler.setLevel(logging.DEBUG)
file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_format)

# Add handlers to the logger
file_logger.addHandler(file_handler)

# Verify that there's no longer a bug when all values in a column are missing

Originally, generating ACS for Rhode Island failed because one shard had a column with all values missing.

### Good, looks like it works now

In [22]:
# Noise configuration: apply `leave_blank` to the census `first_name` column
# with cell probability 1.0
lots_missing = {
    'decennial_census': {
        'column_noise': {
            'first_name': {'leave_blank': {'cell_probability': 1.0}},
        },
    },
}

# If generation raises, print the full traceback rather than aborting the
# notebook run, so the failure itself can be inspected.
try:
    bad_census = psp.generate_decennial_census(config=lots_missing)
except Exception as err:
    traceback.print_exception(err)

                                                                                                                           

In [23]:
psp.generate_decennial_census(config=lots_missing)

                                                                                                                           

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
0,0_2,0_7,,P,Kofron,25,05/06/1994,5112,145th st,,Anytown,WA,00000,Household,Reference person,Female,White,2020
1,0_3,0_7,,A,Kofron,25,09/29/1994,5112,145th st,,Anytown,WA,00000,Household,Other relative,Female,White,2020
2,0_923,0_8033,,R,Butler,76,11/03/1943,1130,mallory ln,,Anytown,WA,00000,Household,Reference person,Male,Black,2020
3,0_2641,0_1066,,T,Carley,61,07/71/1958,,delacorte dr,,Anytown,WA,00000,Household,Reference person,Female,White,2020
4,0_2801,0_1138,,R,Jones,63,03/03/1947,950,caribou lane,,Anytown,WA,00000,Household,Reference person,Male,White,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10220,0_11994,0_8051,,H,Consul,17,10/25/2002,3304,ethan allen way,unit 200,Anytown,WA,00000,Household,Other relative,Female,White,2020
10221,0_19693,0_6152,,M,Huang,20,08/04/1999,1095,ernst st,,Anytown,WA,00000,Household,Other relative,Female,Asian,2020
10222,0_19556,0_2064,,F,Allen,19,Oz/26/z0o1,2002,203rd pl se,,Anytown,WA,00000,Household,Other relative,Male,Black,2020
10223,0_19579,0_1802,,L,Gonzalez,19,11/27/2000,233,saint peters road,,Anytown,WA,00000,Household,Other relative,Female,Latino,2020


# Test `generate_datasets` with new `skip` parameter

When I originally tried to load the Rhode Island data on 2024-02-01, generating the social security data killed my kernel, which had 15 GiB of memory. So on 2024-02-02 I used the `skip` parameter as below to load all RI datasets except SSA.

On 2024-02-06, I requested a node with 500 GiB RAM so that I could load all RI datasets including SSA (and more!), which is what I do below.

In [10]:
%%time
sample_data = alpha.generate_datasets(skip='social_security')
sample_data.keylist()

2024-02-06 19:20:09,386 - vivarium_research_prl.alpha - INFO - Function 'generate_datasets' called with args=(), kwargs={}
2024-02-06 19:20:09,387 - vivarium_research_prl.alpha - DEBUG - skip_pattern=re.compile('social_security')
2024-02-06 19:20:09,389 - vivarium_research_prl.alpha - INFO - Calling function generate_american_community_survey
2024-02-06 19:20:09,816 - vivarium_research_prl.alpha - INFO - Code block 'generate_american_community_survey' took: 0.00711 m
2024-02-06 19:20:09,819 - vivarium_research_prl.alpha - INFO - <class 'pandas.core.frame.DataFrame'> american_community_survey occupies 0.067614 MB in memory
2024-02-06 19:20:09,820 - vivarium_research_prl.alpha - INFO - Calling function generate_current_population_survey
2024-02-06 19:20:10,052 - vivarium_research_prl.alpha - INFO - Code block 'generate_current_population_survey' took: 0.00384 m
2024-02-06 19:20:10,054 - vivarium_research_prl.alpha - INFO - <class 'pandas.core.frame.DataFrame'> current_population_survey o

CPU times: user 4.13 s, sys: 181 ms, total: 4.31 s
Wall time: 4.29 s


['american_community_survey',
 'current_population_survey',
 'decennial_census',
 'taxes_1040',
 'taxes_w2_and_1099',
 'women_infants_and_children']

# Generate all Rhode Island datasets, logging runtimes and memory to file

In a previous run, it took about 112 minutes and 3.06 GB of memory for all RI datasets except ACS and social security.

Below, it took about 301 minutes and 157.8 GB of memory for everything including ACS and SSA.

The SSA dataset by itself took about 122 minutes and 154.7 GB of memory.

In [11]:
%%time
file_logger.info(f"Generating all Rhode Island datasets from directory {ri_data_dir}")
with CodeTimer("RI data generation", unit='m', logger_func=file_logger.info):
    data = alpha.generate_datasets(str(ri_data_dir), logger=file_logger)

2024-02-06 19:20:13,716 - __main__ - INFO - Generating all Rhode Island datasets from directory /mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2023_08_31_15_58_01/states/pseudopeople_simulated_population_rhode_island_2_0_0
2024-02-06 19:20:13,718 - __main__ - INFO - Function 'generate_datasets' called with args=('/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2023_08_31_15_58_01/states/pseudopeople_simulated_population_rhode_island_2_0_0',), kwargs={}
2024-02-06 19:20:13,720 - __main__ - DEBUG - skip_pattern=re.compile('(?!)')
2024-02-06 19:20:13,722 - __main__ - INFO - Calling function generate_american_community_survey
Noising data:   0%|                                                                                | 0/334 [00:00<?, ?it/s]
Applying no

CPU times: user 4h 27min 25s, sys: 34min 34s, total: 5h 2min
Wall time: 5h 1min 9s


In [12]:
data.keylist()

['american_community_survey',
 'current_population_survey',
 'decennial_census',
 'social_security',
 'taxes_1040',
 'taxes_w2_and_1099',
 'women_infants_and_children']

# Try "compressing" the RI data

It goes from about 157.8 GB to 34.9 GB. Woo hoo!

In [25]:
%%time
with CodeTimer("RI datatype conversion", unit='m', logger_func=file_logger.info):
    cdata = MappingViaAttributes(
        {k: datatypes.to_int_and_categorical(df, exclude='ssn')
         # Filter to DataFrames (vs. Exceptions) in case we got any errors on data generation
         for k, df in data.items() if isinstance(df, pd.DataFrame)})
cdata.keylist()

2024-02-08 14:42:58,130 - __main__ - INFO - Code block 'RI datatype conversion' took: 23.16865 m


CPU times: user 18min 54s, sys: 4min 18s, total: 23min 13s
Wall time: 23min 10s


['american_community_survey',
 'current_population_survey',
 'decennial_census',
 'social_security',
 'taxes_1040',
 'taxes_w2_and_1099',
 'women_infants_and_children']

In [27]:
file_logger.info(f"Converted to ints and categoricals, the RI datasets occupy {sum(sizemb(df) for df in cdata.values())} MB in memory")

2024-02-08 14:47:23,387 - __main__ - INFO - Converted to ints and categoricals, the RI datasets occupy 34851.847759000004 MB in memory


# Compare memory usage between original and compressed

In [28]:
from IPython.display import display, Markdown

# One section per converted dataset: per-column memory of the original frame
# minus that of the converted frame (positive = the conversion saved memory).
for name in cdata:
    display(Markdown(f'### {name}'))
    original_usage = data[name].memory_usage(deep=True)
    converted_usage = cdata[name].memory_usage(deep=True)
    display(original_usage - converted_usage)

### american_community_survey

Index                                   0
simulant_id                         63403
household_id                        63193
survey_date                             0
first_name                          10324
middle_initial                      57548
last_name                            8093
age                                 53361
date_of_birth                      -32965
street_number                        8554
street_name                          8378
unit_number                         31728
city                                61566
state                                   0
zipcode                             57638
housing_type                        68091
relationship_to_reference_person        0
sex                                     0
race_ethnicity                          0
dtype: int64

### current_population_survey

Index                  0
simulant_id       240543
household_id      239739
survey_date            0
first_name        125366
middle_initial    225420
last_name          40404
age               221503
date_of_birth    -120752
street_number      40839
street_name         8381
unit_number       114189
city              245994
state                  0
zipcode           232908
sex                    0
race_ethnicity         0
dtype: int64

### decennial_census

Index                                      0
simulant_id                         64205913
household_id                        63758268
first_name                          63273760
middle_initial                      60908192
last_name                           52475823
age                                 60750437
date_of_birth                       61559096
street_number                       55783655
street_name                         42351780
unit_number                         32706926
city                                68437679
state                                      0
zipcode                             63758542
housing_type                        69712671
relationship_to_reference_person           0
sex                                        0
race_ethnicity                             0
year                                37574204
dtype: int64

### social_security

Index                      0
simulant_id      20313111418
ssn                        0
first_name       19844580754
middle_name      19844676660
last_name        19650498385
date_of_birth    20411522544
sex                        0
event_type                 0
event_date       20413057639
dtype: int64

### taxes_1040

Index                                   0
simulant_id                      40278068
household_id                     39941144
first_name                       39258599
middle_initial                   38189401
last_name                        31672300
ssn                                     0
mailing_address_street_number    33322747
mailing_address_street_name      18300522
mailing_address_unit_number      20304869
mailing_address_po_box           19957606
mailing_address_city             42909116
mailing_address_state                   0
mailing_address_zipcode          39967553
spouse_first_name                21813256
spouse_middle_initial            22542279
spouse_last_name                 20101868
spouse_ssn                       14676268
dependent_1_first_name           23244091
dependent_1_last_name            19259805
dependent_1_ssn                  11025252
dependent_2_first_name           21650936
dependent_2_last_name            20145391
dependent_2_ssn                  1

### taxes_w2_and_1099

Index                                   0
simulant_id                      62574768
household_id                     62055220
employer_id                      11542792
ssn                                     0
wages                            45657239
employer_name                    14962372
employer_street_number           53996849
employer_street_name             37610813
employer_unit_number             31752627
employer_city                    63509081
employer_state                          0
employer_zipcode                 57058992
first_name                       61876840
middle_initial                   59366682
last_name                        51044549
mailing_address_street_number    53485926
mailing_address_street_name      40594626
mailing_address_unit_number      32064459
mailing_address_po_box           31457978
mailing_address_city             66687788
mailing_address_state                   0
mailing_address_zipcode          62143771
tax_form                          

### women_infants_and_children

Index                  0
simulant_id       852504
household_id      845247
first_name        600991
middle_initial    801577
last_name         365995
date_of_birth     451307
street_number     360082
street_name        29575
unit_number       402995
city              886576
state                  0
zipcode           824788
sex                    0
race_ethnicity         0
year              496149
dtype: int64

# Compare original SSA to compressed SSA

In [29]:
data.social_security.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339564253 entries, 0 to 339564252
Data columns (total 9 columns):
 #   Column         Dtype   
---  ------         -----   
 0   simulant_id    object  
 1   ssn            object  
 2   first_name     object  
 3   middle_name    object  
 4   last_name      object  
 5   date_of_birth  object  
 6   sex            category
 7   event_type     category
 8   event_date     object  
dtypes: category(2), object(7)
memory usage: 18.3+ GB


In [30]:
cdata.social_security.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339564253 entries, 0 to 339564252
Data columns (total 9 columns):
 #   Column         Dtype   
---  ------         -----   
 0   simulant_id    int64   
 1   ssn            object  
 2   first_name     category
 3   middle_name    category
 4   last_name      category
 5   date_of_birth  category
 6   sex            category
 7   event_type     category
 8   event_date     category
dtypes: category(7), int64(1), object(1)
memory usage: 12.4+ GB


# Compare original ACS with compressed ACS

In [31]:
data.american_community_survey

Unnamed: 0,simulant_id,household_id,survey_date,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity
0,103_652707,103_262256,2020-06-16,Ryan,P,Saenz,37,07/29/1982,18651,keen street,,lincoln,RI,02860,Household,Reference person,Male,Latino
1,1061_245665,1061_98878,2020-03-24,Lorin,R,Zieba,61,11/03/1958,8114,w 127th pl,,bristol,RI,02813,Household,Reference person,Male,White
2,1061_754487,1061_303835,2020-05-19,Robert,S,Stoll,15,12/21/2004,28,hampton road,,newport,RI,02895,Household,Biological child,Male,White
3,1061_754488,1061_303835,2020-05-19,Kaylee,H,Stoll,10,10/26/2009,28,hampton road,,newport,RI,02895,Household,Biological child,Female,White
4,1069_717890,1069_288897,2020-10-06,Benjamin,C,Ho,46,09/23/1973,2838,frm brook way,,coventry,RI,02859,Household,Reference person,Male,Asian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1055,9971_944282,9971_379364,2020-03-24,Anna,V,Muller,38,07/15/1981,132,apache ave,,east greenwich,RI,02863,Household,Reference person,Female,White
1056,9971_944283,9971_379364,2020-03-24,Timothy,L,Muller,36,01/21/1984,132,apache ave,,east greenwich,,02863,Household,Opposite-sex spouse,Male,White
1057,9971_944284,9971_379364,2020-03-24,Rayden,H,Muller,7,03/02/2012,132,apache ave,,east greenwich,RI,02863,Household,Biological child,Male,White
1058,9971_944285,9971_379364,2020-03-24,Charlotte,A,Muller,4,07/07/2015,132,apache ave,,east greenwich,RI,02863,Household,Biological child,Female,White


In [32]:
cdata.american_community_survey

Unnamed: 0,simulant_id,household_id,survey_date,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity
0,103000652707,103000262256,2020-06-16,Ryan,P,Saenz,37,07/29/1982,18651,keen street,,lincoln,RI,02860,Household,Reference person,Male,Latino
1,1061000245665,1061000098878,2020-03-24,Lorin,R,Zieba,61,11/03/1958,8114,w 127th pl,,bristol,RI,02813,Household,Reference person,Male,White
2,1061000754487,1061000303835,2020-05-19,Robert,S,Stoll,15,12/21/2004,28,hampton road,,newport,RI,02895,Household,Biological child,Male,White
3,1061000754488,1061000303835,2020-05-19,Kaylee,H,Stoll,10,10/26/2009,28,hampton road,,newport,RI,02895,Household,Biological child,Female,White
4,1069000717890,1069000288897,2020-10-06,Benjamin,C,Ho,46,09/23/1973,2838,frm brook way,,coventry,RI,02859,Household,Reference person,Male,Asian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1055,9971000944282,9971000379364,2020-03-24,Anna,V,Muller,38,07/15/1981,132,apache ave,,east greenwich,RI,02863,Household,Reference person,Female,White
1056,9971000944283,9971000379364,2020-03-24,Timothy,L,Muller,36,01/21/1984,132,apache ave,,east greenwich,,02863,Household,Opposite-sex spouse,Male,White
1057,9971000944284,9971000379364,2020-03-24,Rayden,H,Muller,7,03/02/2012,132,apache ave,,east greenwich,RI,02863,Household,Biological child,Male,White
1058,9971000944285,9971000379364,2020-03-24,Charlotte,A,Muller,4,07/07/2015,132,apache ave,,east greenwich,RI,02863,Household,Biological child,Female,White


# Delete `data` to free up memory

In [33]:
del data

In [34]:
cdata.decennial_census

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
0,1007000000574,1007000000258,Harry,A,Cook,76,11/05/1943,2216,ronnie rd,,east providence,RI,02908,Household,Reference person,Male,Black,2020
1,1007000000709,1007000000322,Tonya,J,Troyer,32,01/03/1988,6615,spruce,,portsmouth,RI,02812,Household,Reference person,Female,White,2020
2,1007000000710,1007000000322,Daniel,I,Troyer,32,02/06/1988,6615,spruce,,portsmouth,RI,02812,Household,Sibling,Male,White,2020
3,1007000000711,1007000000322,Francisco,W,Troyer,31,12/17/1988,6615,spruce,,portsmouth,RI,02812,Household,Sibling,Male,White,2020
4,1007000001281,1007000000547,Christina,J,Smith,44,07/17/1975,60,putters pl,,pawtucket,RI,02861,Household,Reference person,Female,White,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1073547,9971001022747,9971000417220,Terry,M,Carrasco,64,10/28/1955,7597,noah road,,warwick,RI,02908,Household,Reference person,Male,Latino,2020
1073548,9971001022748,9971000417220,Madison,K,Carrasco,17,08/17/2002,7597,noah road,,warwick,RI,02908,Household,Other relative,Female,Latino,2020
1073549,9971001022756,9971000417225,Kyle,J,Lembke,32,08/14/1987,47,palmer square west,,westerly,RI,02915,Household,Reference person,Male,White,2020
1073550,9971001022757,9971000417225,Erin,L,Lembke,,04/20/1983,47,palmer square west,,westerly,RI,02915,Household,Opposite-sex spouse,Female,White,2020


# Investigate guardian-based duplication in the census

## First find duplicates

Looks like there are 5 duplicated simulants who each appear twice. Two of them are 20-somethings in college, and the other three are children in households.

In [35]:
# NOTE: despite its name, `dup_ids` is a boolean row mask, not a list of IDs.
# keep=False marks every occurrence of a repeated simulant_id (not only the
# second and later ones), so `duplicates` holds all rows of each repeated simulant.
dup_ids = cdata.decennial_census.simulant_id.duplicated(keep=False)
duplicates = cdata.decennial_census.loc[dup_ids]
duplicates

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
75378,1490000611092,1490000245525,Cole,W,Jones,10,11/13/2009,13712.0,w tappen park ln,,pawtucket,RI,2919,Household,Grandchild,Male,White,2020
76598,1490000611092,1490000261075,Cole,W,Jones,10,11/13/2009,2951.0,spruce st,,westerly,RI,2889,Household,Other relative,Male,White,2020
272319,3374000590091,3374000237308,Avery,,Valliere,4,03/07/2016,1910.0,durfee str,,cranston,RI,2860,Household,Biological child,Male,White,2020
273536,3374000590091,3374000083634,Avery,D,Valliere,4,03/07/2016,,257th st,,pawtucket,RI,2888,Household,Grandchild,Male,White,2020
494338,4943000982836,4943000000003,Esmeralda,R,Hanson,22,04/30/1997,4619.0,arbutus st,,warwick,RI,2864,College,Noninstitutionalized group quarters population,Female,White,2020
494865,4943000984671,4943000000003,Morgan,A,Zuluaga,20,10/22/1999,4619.0,arbutus st,,warwick,RI,2864,College,Noninstitutionalized group quarters population,Female,Latino,2020
495574,4943000984671,4943000146365,Morgan,A,Zuluaga,20,10/22/1999,91.0,placer ave,,pawtucket,RI,2904,Household,Other relative,Female,Latino,2020
495575,4943000982836,4943000079553,Esmeralda,R,Hanson,22,04/30/1997,601.0,e columbus st,,exeter,RI,2906,Household,Other relative,Female,White,2020
1050160,9859000421173,9859000174887,Ahlani,I,Torres,3,05/01/2016,802.0,farrcroft grn,,warren,RI,2905,Household,Other nonrelative,Female,Latino,2020
1051914,9859000421173,9859000395266,Ahlani,I,Torres,3,05/01/2016,40463.0,stover av sw,,warwick,RI,2825,Household,Other relative,Female,Latino,2020


In [39]:
len(duplicates)

10

In [40]:
duplicates.simulant_id.value_counts()

simulant_id
1490000611092    2
3374000590091    2
4943000982836    2
4943000984671    2
9859000421173    2
Name: count, dtype: int64

# See if duplicates appear at the end of the shard

Yes, looks like they do, at least for shard 4943.

In [36]:
cdata.decennial_census.loc[495572:495576]

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
495572,4943001022483,4943000000003,Jessica,C,Davis,18,12/29/2001,4619,arbutus st,,warwick,RI,22864,College,Noninstitutionalized group quarters population,Female,Black,2020
495573,4943001022487,4943000000003,Nicholas,B,Freddie,30,07/11/1989,4619,arbutus st,,warwick,RI,2864,College,Noninstitutionalized group quarters population,Male,Black,2020
495574,4943000984671,4943000146365,Morgan,A,Zuluaga,20,10/22/1999,91,placer ave,,pawtucket,RI,2904,Household,Other relative,Female,Latino,2020
495575,4943000982836,4943000079553,Esmeralda,R,Hanson,22,04/30/1997,601,e columbus st,,exeter,RI,2906,Household,Other relative,Female,White,2020
495576,4950000001391,4950000000571,Jennifer,J,Leonard,45,07/11/1974,50240,n 975 w,,warwick,RI,2816,Household,Reference person,Female,White,2020


In [37]:
cdata.decennial_census.index

RangeIndex(start=0, stop=1073552, step=1)

In [38]:
cdata.decennial_census.household_id.nunique()

451584

# Which households are 'College'?

Looks like the real 'College' is `4943000000003`, and others labeled as 'College' are probably due to 'choose the wrong option' noise.

In [41]:
cdata.decennial_census.query("housing_type == 'College'")['household_id'].value_counts()

household_id
4943000000003    6496
1609000000000      14
7384000000005      14
3298000000005      11
446000000001        8
                 ... 
3793000257102       1
3793000143326       1
3793000014373       1
3793000003407       1
386000060212        1
Name: count, Length: 1743, dtype: int64

In [43]:
cdata.decennial_census.query("household_id == 1609000000000")['housing_type'].value_counts()

housing_type
Carceral                  6147
College                     14
Household                   14
Nursing home                12
Other noninstitutional      12
Military                    11
Other institutional          9
Name: count, dtype: int64

In [44]:
cdata.decennial_census.query("household_id == 7384000000005")['housing_type'].value_counts()

housing_type
Other noninstitutional    6320
College                     14
Military                    13
Household                   12
Carceral                     9
Other institutional          8
Nursing home                 7
Name: count, dtype: int64

In [45]:
# People in household 4943000000003 (College)
in_college = cdata.decennial_census.query("household_id == 4943000000003")
alpha.index_is_consecutive(in_college)

False

In [46]:
len(in_college)

6635

# Calculate the "non-consecutiveness" of each household

In [47]:
def index_span_minus_size(df):
    """Measure how far a dataframe's index is from being consecutive.

    Computes the number of integer labels spanned by the index (from its
    first label to its last label, inclusive) minus the number of rows.

    Only the first and last index labels are inspected, so the result is
    meaningful for a unique integer index sorted in ascending order. In
    that case it is 0 exactly when the labels are consecutive, and
    otherwise it counts the labels "missing" between the first and last
    rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Any object exposing ``.index`` and ``len()`` works (e.g. a Series).

    Returns
    -------
    int
        ``(last_label - first_label + 1) - len(df)``, or 0 for an empty
        input.
    """
    n_rows = len(df)
    if n_rows == 0:
        # An empty frame has no gaps; also avoids IndexError on df.index[-1].
        return 0
    first_label, last_label = df.index[0], df.index[-1]
    return (last_label - first_label + 1) - n_rows

index_span_minus_size(in_college)

3032

In [48]:
# Apply index_span_minus_size to each household's rows.
# Result: index is household_id, values are the discrepancy for that household.
hh_grouping_discrepancies = (
    cdata.decennial_census
    .groupby('household_id')
    .apply(index_span_minus_size)
)
hh_grouping_discrepancies

household_id
28000000332         0
28000000549         0
28000000563         0
28000000579         0
28000001155      2990
                 ... 
9971000417175       0
9971000417178       0
9971000417208       0
9971000417220       0
9971000417225       0
Length: 451584, dtype: int64

In [49]:
# index is the discrepancy, values are the number of households with that discrepancy
hh_grouping_discrepancies.value_counts()

0       406236
1           70
256         36
109         36
102         34
         ...  
6248         1
6378         1
6402         1
6238         1
5860         1
Name: count, Length: 3629, dtype: int64

In [53]:
hh_grouping_discrepancies.loc[hh_grouping_discrepancies > 9000]

household_id
446000007216     9160
446000009425     9106
446000009892     9020
1219000000575    9343
1219000001450    9362
1219000004416    9255
1219000012795    9205
1219000025722    9027
1219000027100    9020
1609000009557    9104
3298000005288    9471
3298000015636    9245
3298000027047    9210
3298000028358    9291
3298000035232    9181
3298000041591    9058
3298000337136    9307
4943000032319    9169
4943000036674    9070
4943000042361    9116
4943000043846    9043
4943000052748    9011
4943000088485    9395
4943000300418    9271
5114000009642    9055
5114000010046    9110
5114000022848    9002
5114000027323    9130
5114000065579    9241
7264000010303    9113
7264000017556    9017
dtype: int64

# Find household IDs for duplicated simulants

In [56]:
# Household IDs of every row in `duplicates`, as a plain Python list
# (repeats are kept, one entry per row).
dup_household_ids = duplicates['household_id'].tolist()
dup_household_ids

[1490000245525,
 1490000261075,
 3374000237308,
 3374000083634,
 4943000000003,
 4943000000003,
 4943000146365,
 4943000079553,
 9859000174887,
 9859000395266]

In [57]:
# All census rows belonging to any household that contains a duplicated simulant.
# Reference the list with `@` rather than pasting its repr into the query string:
# the expression stays short and does not depend on the list's repr being parseable.
dup_households = cdata.decennial_census.query("household_id in @dup_household_ids")
dup_households

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
75375,1490000611089,1490000261075,Cathy,A,Jones,44,11/13/1975,,spruce st,,westerly,RI,02889,Household,Other nonrelative,Female,White,2020
75376,1490000611090,1490000245525,Chuck,L,Jones,45,01/27/1975,13712,w tappen park ln,,pawtucket,RI,02919,Household,Child-in-law,Male,White,2020
75377,1490000611091,1490000245525,Devyn,A,Jones,13,05/17/2006,13712,w tappen park ln,,pawtucket,RI,02919,Household,Grandchild,Male,White,2020
75378,1490000611092,1490000245525,Cole,W,Jones,10,11/13/2009,13712,w tappen park ln,,pawtucket,RI,02919,Household,Grandchild,Male,White,2020
75379,1490000611093,1490000245525,Charles,G,Jones,70,02/20/1950,13712,w tappen park ln,,pawtucket,RI,02919,Household,Reference person,Male,White,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050160,9859000421173,9859000174887,Ahlani,I,Torres,3,05/01/2016,802,farrcroft grn,,warren,RI,02905,Household,Other nonrelative,Female,Latino,2020
1050196,9859000434658,9859000174887,David,J,Negrete-Sanabria,49,11/24/1970,802,farrcroft grn,,warren,RI,02905,Household,Reference person,Male,Latino,2020
1050197,9859000434659,9859000174887,Kristin,T,Negrete-Sanabria,35,11/03/1984,802,farrcroft grn,,warren,RI,02905,Household,Foster child,Female,Latino,2020
1050198,9859000434660,9859000174887,Nkah,H,Negrete-Sanabria,7,10/06/2012,802,farrcroft grn,,warren,RI,02905,Household,Biological child,Male,Latino,2020


In [58]:
dup_households.housing_type.value_counts()

housing_type
College                   6496
Household                   44
Nursing home                17
Other noninstitutional      14
Carceral                    12
Military                    11
Other institutional          8
Name: count, dtype: int64

# Check the discrepancy for the duplicate households

Huh, these are not even the largest discrepancies. But what I really want is the guardians' households, so let's do a hack to get those specifically...

In [59]:
hh_grouping_discrepancies.loc[dup_household_ids]

household_id
1490000245525       0
1490000261075    1219
3374000237308       1
3374000083634    2469
4943000000003    3032
4943000000003    3032
4943000146365    7914
4943000079553    8752
9859000174887      35
9859000395266    1754
dtype: int64

# HACK: Get the indices of the duplicated rows

This only works because duplicates always appear at the end of the shard.

In [67]:
# For each duplicated simulant_id, take the largest row label among its rows
# in `duplicates`.
# reset_index() moves the row labels into a regular column; selecting ['index']
# assumes the index of `duplicates` is unnamed, so that column is called 'index'.
guardian_dup_index = duplicates.reset_index().groupby('simulant_id')['index'].max()
guardian_dup_index

simulant_id
1490000611092      76598
3374000590091     273536
4943000982836     495575
4943000984671     495574
9859000421173    1051914
Name: index, dtype: int64

In [69]:
duplicates

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
75378,1490000611092,1490000245525,Cole,W,Jones,10,11/13/2009,13712.0,w tappen park ln,,pawtucket,RI,2919,Household,Grandchild,Male,White,2020
76598,1490000611092,1490000261075,Cole,W,Jones,10,11/13/2009,2951.0,spruce st,,westerly,RI,2889,Household,Other relative,Male,White,2020
272319,3374000590091,3374000237308,Avery,,Valliere,4,03/07/2016,1910.0,durfee str,,cranston,RI,2860,Household,Biological child,Male,White,2020
273536,3374000590091,3374000083634,Avery,D,Valliere,4,03/07/2016,,257th st,,pawtucket,RI,2888,Household,Grandchild,Male,White,2020
494338,4943000982836,4943000000003,Esmeralda,R,Hanson,22,04/30/1997,4619.0,arbutus st,,warwick,RI,2864,College,Noninstitutionalized group quarters population,Female,White,2020
494865,4943000984671,4943000000003,Morgan,A,Zuluaga,20,10/22/1999,4619.0,arbutus st,,warwick,RI,2864,College,Noninstitutionalized group quarters population,Female,Latino,2020
495574,4943000984671,4943000146365,Morgan,A,Zuluaga,20,10/22/1999,91.0,placer ave,,pawtucket,RI,2904,Household,Other relative,Female,Latino,2020
495575,4943000982836,4943000079553,Esmeralda,R,Hanson,22,04/30/1997,601.0,e columbus st,,exeter,RI,2906,Household,Other relative,Female,White,2020
1050160,9859000421173,9859000174887,Ahlani,I,Torres,3,05/01/2016,802.0,farrcroft grn,,warren,RI,2905,Household,Other nonrelative,Female,Latino,2020
1051914,9859000421173,9859000395266,Ahlani,I,Torres,3,05/01/2016,40463.0,stover av sw,,warwick,RI,2825,Household,Other relative,Female,Latino,2020


In [68]:
duplicates.loc[guardian_dup_index]

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
76598,1490000611092,1490000261075,Cole,W,Jones,10,11/13/2009,2951.0,spruce st,,westerly,RI,2889,Household,Other relative,Male,White,2020
273536,3374000590091,3374000083634,Avery,D,Valliere,4,03/07/2016,,257th st,,pawtucket,RI,2888,Household,Grandchild,Male,White,2020
495575,4943000982836,4943000079553,Esmeralda,R,Hanson,22,04/30/1997,601.0,e columbus st,,exeter,RI,2906,Household,Other relative,Female,White,2020
495574,4943000984671,4943000146365,Morgan,A,Zuluaga,20,10/22/1999,91.0,placer ave,,pawtucket,RI,2904,Household,Other relative,Female,Latino,2020
1051914,9859000421173,9859000395266,Ahlani,I,Torres,3,05/01/2016,40463.0,stover av sw,,warwick,RI,2825,Household,Other relative,Female,Latino,2020


In [70]:
# Household IDs of the rows picked out by guardian_dup_index, in that order.
guardian_hh_ids = duplicates['household_id'].loc[guardian_dup_index].tolist()
guardian_hh_ids

[1490000261075, 3374000083634, 4943000079553, 4943000146365, 9859000395266]

# Check the discrepancy for the guardians' households

They're somewhere in the middle.

In [71]:
hh_grouping_discrepancies.loc[guardian_hh_ids]

household_id
1490000261075    1219
3374000083634    2469
4943000079553    8752
4943000146365    7914
9859000395266    1754
dtype: int64

# Look at some of the higher discrepancy households

Hmm, not sure what's going on, but it looks like maybe when the simulant IDs are consecutive, so are the rows in the dataframe.

In [72]:
cdata.decennial_census.query("household_id == 446000007216")

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
421203,446000017978,446000007216,Travis,S,Johnson,43,02/25/1977,4406,frontier trl,,south kingstown,RI,2816,Household,Reference person,Male,White,2020
421204,446000017979,446000007216,Jessica,A,Johnson,34,07/05/1985,4406,frontier trl,,south kingstown,RI,2816,Household,Opposite-sex spouse,Female,White,2020
421205,446000017980,446000007216,Aiden,J,Johnson,14,01/17/2006,4406,frontier trl,,south kingstown,RI,2816,Household,Biological child,Male,White,2020
421206,446000017981,446000007216,Dani,N,Johnson,12,15/10/2007,4406,frontier trl,,south kingstown,RI,2816,Household,Biological child,Female,White,2020
421207,446000017982,446000007216,Leonardo,J,Johnson,10,07/16/2009,4406,frontier trl,,south kingstown,RI,2816,Household,Biological child,Male,White,2020
421208,446000017983,446000007216,Aiden,M,Johnson,6,11/23/2013,4406,frontier trl,,south kingstown,RI,2816,Household,Biological child,Male,White,2020
421209,446000017984,446000007216,Zaylee,G,Johnson,1,10/07/2018,4406,frontier trl,,south kingstovvn,RI,2816,Household,Biological child,Female,White,2020
430370,446001021650,446000007216,Owen,K,Johnson,0,03/29/2020,4406,frontier trl,,south kingstown,RI,2816,Household,Biological child,Male,White,2020


In [73]:
cdata.decennial_census.query("household_id == 446000009425")

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
421240,446000023482,446000009425,Douglas,R,Parada,47,08/02/1972,650,red fir road,,coventry,RI,2906,Household,Reference person,Male,Latino,2020
421241,446000023483,446000009425,Danielle,E,Parada,40,02/26/1980,650,red fir road,,coventry,RI,2906,Household,Opposite-sex spouse,Female,Latino,2020
421242,446000023484,446000009425,Kaleb,M,Parada,13,07/23/2006,650,red fir road,,coventry,RI,2906,Household,Biological child,Male,Latino,2020
421243,446000023485,446000009425,Daniel,T,Parada,6,10/14/2013,650,red fir road,,coventahy,RI,2906,Household,Biological child,Male,Latino,2020
430350,446001017316,446000009425,Mariam,E,Bland,3,03/27/2017,650,red fir road,,coventry,RI,2906,Household,Other nonrelative,Female,White,2020


In [74]:
cdata.decennial_census.query("household_id == 7264000017556")

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
775059,7264000043747,7264000017556,Javier,C,Chapuis,29.0,07/18/1990,120,old town rd,,newport,RI,2869,Household,Reference person,Male,White,2020
775060,7264000043748,7264000017556,Arielle,C,Chapuis,29.0,,120,old town rd,,newport,RI,2860,Household,Opposite-sex unmarried partner,Female,White,2020
778190,7264000669907,7264000017556,Anne,E,Bixler,5.0,07/13/2014,120,old town rd,,newport,RI,2860,Household,Other nonrelative,Female,White,2020
784079,7264001016289,7264000017556,Nolan,H,Chapuis,,12/27/2019,120,old town rd,,newport,RI,2860,Household,Biological child,Male,White,2020


# Let's try to see if the fraction of duplicates in each category is approximately correct...

In [83]:
# Get list of ages under 18 because we can't directly do comparisons with Categoricals
# `age` is a Categorical, so it can't be compared numerically; instead match it
# against the string labels '0' through '17'.
under_18 = [str(age) for age in range(18)]
# Reference the list with `@` instead of interpolating its repr into the query string.
in_households_under_18 = cdata.decennial_census.query("age in @under_18 and housing_type == 'Household'")
in_households_under_18

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
6,1007000001283,1007000000547,William,I,Smith,12,10/23/2007,60,putters pl,,pawtucket,RI,02861,Household,Biological child,Male,White,2020
7,1007000001284,1007000000547,Emma,Y,Smith,9,04/08/2010,60,,,pawtucket,RI,02861,Household,Biological child,Female,White,2020
13,1007000008067,1007000003290,Ernest,J,Garza,8,12/12/2011,8701,vanesta drive,,pawtucket,RI,02920,Household,Biological child,Male,Latino,2020
14,1007000008068,1007000003290,Kaylee,E,Garza,5,06/23/2014,8701,vanesta drive,,pawtucket,RI,02920,Household,Biological child,Female,Latino,2020
25,1007000010361,1007000004233,Alexis,M,Shay,15,10/09/2004,12458,e mohave rd,,cranston,RI,02852,Household,Biological child,Female,White,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1073539,9971001020892,9971000343984,Malachi,A,Stannard,0,03/21/2020,13110,noble street,,north kingstown,RI,02904,Household,Biological child,Male,White,2020
1073541,9971001021724,9971000106535,Michael,F,Wright,0,03/29/2020,1817,via sun,,smithfield,RI,02837,Household,Biological child,Male,White,2020
1073542,9971001022156,9971000300395,Reese,C,Dohse,0,04/11/2020,4420,north pearl street,,warwick,RI,02825,Household,Biological child,Female,White,2020
1073548,9971001022748,9971000417220,Madison,K,Carrasco,17,08/17/2002,7597,noah road,,warwick,RI,02908,Household,Other relative,Female,Latino,2020


## See what row probabilities we're supposed to have

It's 2% for "in households under 18" and 5% for "in college under 24".

We have 204,387 simulants under 18 living in households, 2% of which is about 4,088, which is way more than the 3 simulants in this category that we actually have duplicated...

In [81]:
# Fetch the configuration that pseudopeople returns when get_config() is called
# with no arguments.
default_config = psp.get_config()
# Flatten the config and wrap it in a Series so it can be displayed and filtered.
# NOTE(review): alpha.flatten / alpha.pad_flattened_dict are project helpers not
# shown here; presumably they turn the nested config into equal-length tuple keys
# (which would explain the NaN padding in the displayed index) -- confirm.
config_series = pd.Series(alpha.pad_flattened_dict(alpha.flatten(default_config)))
config_series

decennial_census  row_noise     do_not_respond           row_probability                                     NaN                NaN    0.0145
                                omit_row                 row_probability                                     NaN                NaN       0.0
                                duplicate_with_guardian  row_probability_in_households_under_18              NaN                NaN      0.02
                                                         row_probability_in_college_group_quarters_under_24  NaN                NaN      0.05
                  column_noise  first_name               leave_blank                                         cell_probability   NaN      0.01
                                                                                                                                        ...  
taxes_1040        column_noise  dependent_4_ssn          write_wrong_digits                                  token_probability  NaN       0.1
      

In [82]:
config_series.filter(like="duplicate")

decennial_census  row_noise  duplicate_with_guardian  row_probability_in_households_under_18              NaN  NaN    0.02
                                                      row_probability_in_college_group_quarters_under_24  NaN  NaN    0.05
dtype: object

In [86]:
len(in_households_under_18)

204387

In [85]:
0.02 * len(in_households_under_18)

4087.7400000000002

# Let's look at college kids

Similarly, we should have duplicates for about 5% of 4313 college students, or about 216 duplicates. But we only have two...

In [87]:
# Get list of ages under 24 because we can't directly do comparisons with Categoricals
# String labels '0' through '23', for matching against the Categorical `age` column.
under_24 = [str(age) for age in range(24)]
# Reference the list with `@` instead of interpolating its repr into the query string.
in_college_under_24 = cdata.decennial_census.query("age in @under_24 and housing_type == 'College'")
in_college_under_24

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
4899,103000542759,103000217988,Tracy,T,Jeffries,12,01/22/1978,,west virgo rd,unit # 2,middletown,RI,02809,College,Reference person,Female,White,2020
6098,103000901996,103000362303,Benjamin,L,Bremer,9,06/27/2010,25601,obsidian ct,,blk island,RI,02882,College,Biological child,Male,White,2020
6693,1061000083841,1061000033591,Justin,A,Thacker,22,04/13/1997,9440,s grant ave,,ltl compton,RI,02852,College,Adopted child,Male,White,2020
7078,1061000207708,1061000083603,Leona,I,Perez,11,11/15/2008,32,eagle run cir,,west warwick,RI,02907,College,Biological child,Female,Asian,2020
17847,1074000680390,1074000273167,Abby,J,Overton,17,02/17/2003,7201,n st james dr,,portsmouth,RI,02885,College,Biological child,Female,White,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066705,9901000816406,9901000328644,Tyler,D,Day,10,05/23/2009,512,country way,,cumberland,RI,02911,College,Biological child,Male,White,2020
1067356,9911000028642,9911000011428,Garrett,S,Garza,22,05/26/1997,230,wachusett st,,coventry,RI,02904,College,Biological child,Male,Latino,2020
1068048,9911000251860,9911000101110,Izabella,D,Bhatti,7,03/25/2012,7034,sw brasada ranch rd,,providence,RI,02908,College,Biological child,Female,Black,2020
1069762,9911000812384,9911000326831,Ruby,,Long,4,04/18/2015,6400,cliff street,,east providence,RI,02818,College,Biological child,Female,White,2020


In [88]:
len(in_college_under_24)

4313

In [89]:
0.05 * len(in_college_under_24)

215.65

In [90]:
# String labels for ages 18 through 23 inclusive (range's upper bound 24 is excluded).
age_18_to_24 = [str(age) for age in range(18, 24)]
# Reference the list with `@` instead of interpolating its repr into the query string.
in_college_18_to_24 = cdata.decennial_census.query("age in @age_18_to_24 and housing_type == 'College'")
in_college_18_to_24

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
6693,1061000083841,1061000033591,Justin,A,Thacker,22,04/13/1997,9440,s grant ave,,ltl compton,RI,02852,College,Adopted child,Male,White,2020
23606,1154000578619,1154000232474,Ryan,C,Green,21,06/15/1998,2193,primrose la,,north providence,RI,02909,College,Biological child,Male,Latino,2020
23621,1154000582571,1154000234078,Alissa,H,Brown,22,09/26/1997,43,drummer way,,portsmouth,RI,02806,College,Reference person,Female,White,2020
36146,1219000459948,1219000000000,Brooklynn,A,Barker,20,08/18/1999,1616,rock creek villa dr,,pawtucket,RI,02891,College,Institutionalized group quarters population,Female,Latino,2020
38440,1219000914494,1219000367671,Jonathan,J,Harvey,18,12/01/2001,1085,stillridge dr,,south kingstown,RI,02852,College,Biological child,Male,Asian,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018689,9696000194393,9696000078294,Jena,E,Connolly,22,07/30/1997,6301,carr st,,portsmouth,RI,02908,College,Biological child,Female,White,2020
1030465,9768000086476,9768000034801,Nathanieo,H,Froebel,19,12/07/2000,30705,white pelican cir,,charlestown,RI,02809,College,Biological child,Male,White,2020
1033593,9772000171503,9772000304238,Cheyenne,T,Rojas,23,02/06/1997,808,s 34th ln,,newport,RI,02852,College,Other nonrelative,Female,White,2020
1065505,9901000421084,9901000053904,Elijah,J,Hall,20,03/26/1999,6968,41st avenu swest,,cranston,RI,02863,College,Other nonrelative,Male,White,2020


In [92]:
in_college_18_to_24.household_id.value_counts()

household_id
4943000000003    3546
7384000000005       8
3298000000005       7
1609000000000       4
5114000000005       2
                 ... 
3621000275570       1
3585000157669       1
3545000375565       1
338000378893        1
4621000383123       1
Name: count, Length: 106, dtype: int64

In [94]:
in_college.query("age in @under_24")

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
485912,4943000001669,4943000000003,Paige,K,Four,23,07/17/1996,4619,arbutus st,,,RI,02864,College,Noninstitutionalized group quarters population,Female,Latino,2020
485921,4943000003014,4943000000003,Adam,R,Staten,22,10/18/1997,4619,arbutus st,,warwick,RI,,College,Noninstitutionalized group quarters population,Male,Black,2020
485923,4943000003329,4943000000003,Chad,H,Davis,22,02/14/1998,4619,arbutus st,,warwick,RI,02864,College,Noninstitutionalized group quarters population,Male,White,2020
485936,4943000005520,4943000000003,Elizabeth,K,Davis,22,01/27/1998,4619,arbutus st,,warwick,RI,02864,College,Noninstitutionalized group quarters population,Female,Asian,2020
485938,4943000006673,4943000000003,Jessica,S,Allen,20,09/24/1999,4619,arbutus st,,warwick,RI,02864,College,Noninstitutionalized group quarters population,Female,White,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495565,4943001022469,4943000000003,Phoebe,J,Broughton,18,09/17/2001,4619,arbutus st,,warwick,RI,02864,College,Noninstitutionalized group quarters population,Female,White,2020
495566,4943001022472,4943000000003,Brendan,D,Lakhani,20,10/29/1999,4619,arbutus st,,warwick,RI,02864,College,Noninstitutionalized group quarters population,Female,Asian,2020
495567,4943001022474,4943000000003,Haley,M,Jennings,20,07/14/1999,4619,arbutus st,,waahwycck,RI,02864,College,Noninstitutionalized group quarters population,Female,White,2020
495571,4943001022482,4943000000003,Christopher,M,Marcano,12,08/25/2007,4619,arbutus st,,warwick,RI,02864,College,Noninstitutionalized group quarters population,Male,Latino,2020


In [None]:
# Count non-null values per (first_name, middle_initial, last_name) combination.
# Oops, an earlier run killed my kernel because I forgot to specify `observed=True`
# when grouping by Categoricals: with the current default (`observed=False`) pandas
# builds a row for every combination of categories, whether or not it occurs.
# That default is deprecated since pandas 2.1.0 and will change to `observed=True`
# in the future; until then, pass it explicitly.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html
in_college.query("age in @under_24").groupby(['first_name', 'middle_initial', 'last_name'], observed=True).count()