In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import logging
# Set level to the lowest threshold (DEBUG) to capture all messages
logging.basicConfig(level=logging.DEBUG, format='{asctime} - {name} - {levelname} - {message}', style='{')
import numpy as np, matplotlib.pyplot as plt, pandas as pd
# For finding and loading multiple files
import os
# Use this to see how much memory the dataframes use
from sys import getsizeof
from pathlib import Path
from linetimer import CodeTimer, linetimer

import pseudopeople as psp

from vivarium_research_prl.utils import sizemb, MappingViaAttributes
from vivarium_research_prl import alpha, datatypes

!date
!whoami
!uname -a
!pwd
!python --version
!conda info --envs | grep '\*'
!conda list | grep -e pandas -e numpy -e vivarium -e pseudopeople
!pip freeze | grep pseudopeople

2024-02-09 02:44:40,178 - matplotlib - DEBUG - matplotlib data path: /ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311/lib/python3.11/site-packages/matplotlib/mpl-data
2024-02-09 02:44:40,189 - matplotlib - DEBUG - CONFIGDIR=/ihme/homes/ndbs/.config/matplotlib
2024-02-09 02:44:40,193 - matplotlib - DEBUG - interactive is False
2024-02-09 02:44:40,194 - matplotlib - DEBUG - platform is linux
2024-02-09 02:44:40,319 - matplotlib - DEBUG - CACHEDIR=/ihme/homes/ndbs/.cache/matplotlib
2024-02-09 02:44:40,324 - matplotlib.font_manager - DEBUG - Using fontManager instance from /ihme/homes/ndbs/.cache/matplotlib/fontlist-v330.json
2024-02-09 02:44:41,080 - numexpr.utils - INFO - Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.


Fri 09 Feb 2024 02:44:46 AM PST
ndbs
Linux long-slurm-sarchive-p0050 5.4.0-167-generic #184-Ubuntu SMP Tue Oct 31 09:21:49 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing
Python 3.11.7
ppplv1.0-311          *  /ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311
numpy                     1.26.3                   pypi_0    pypi
pandas                    2.1.4                    pypi_0    pypi
pseudopeople              0.8.4.dev34+gc5d0c15          pypi_0    pypi
vivarium                  2.3.1                    pypi_0    pypi
vivarium-research-prl     0.0.1                     dev_0    <develop>
pseudopeople @ git+https://github.com/ihmeuw/pseudopeople.git@c5d0c15dc889b0ef51f56fa9757c2949b8531f3b


In [3]:
# A clickable link to the GitHub UI to see what version I'm using
! pip freeze | grep pseudopeople | sed -e 's|pseudopeople @ ||' | sed -e 's|git+||' | sed -e 's|\.git@|/tree/|'

https://github.com/ihmeuw/pseudopeople/tree/c5d0c15dc889b0ef51f56fa9757c2949b8531f3b


# Find data

```
/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop
```

In [4]:
project_path = Path('/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop')
usa_path = project_path / 'results/release_02_yellow/full_data/united_states_of_america'

# Not sure what these runs are for...
latest_run = usa_path / 'latest'
best_run = usa_path / 'best'
latest_run_path = usa_path / '2023_08_21_16_35_27/final_results/2024_01_11_07_27_51/pseudopeople_input_data_usa_2.0.0'
best_run_path = usa_path / '2023_07_28_08_33_09/final_results/2023_07_31_08_59_48/'

# I think this is the run we should be using...
last_zipped_path = usa_path / '2023_08_21_16_35_27/final_results/2023_08_31_15_58_01'

ri_data_dir = last_zipped_path / 'states/pseudopeople_simulated_population_rhode_island_2_0_0'
usa_data_dir = last_zipped_path / 'pseudopeople_simulated_population_usa_2_0_0'


In [5]:
!ls -halt $project_path/results/release_02_yellow/full_data/united_states_of_america/latest

lrwxrwxrwx 1 albrja IHME-Simulationscience 218 Jan 11 12:08 /mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/latest -> /mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2024_01_11_07_27_51/pseudopeople_input_data_usa_2.0.0


In [6]:
!ls $usa_data_dir

american_community_survey  decennial_census  taxes_1040
CHANGELOG.rst		   logs		     taxes_w2_and_1099
current_population_survey  social_security   women_infants_and_children


In [7]:
usa_data_dir

PosixPath('/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2023_08_31_15_58_01/pseudopeople_simulated_population_usa_2_0_0')

# Set up a logger to log to file

In [8]:
logs_directory = Path('logs')

# Set up a logger
file_logger = logging.getLogger(__name__) # This gets a new logger for the current, __main__ module

# Create and configure file log handler
file_handler = logging.FileHandler(logs_directory / 'usa_decennial_census_20240209.log')
# Set level to the lowest threshold (DEBUG) to capture all messages
file_handler.setLevel(logging.DEBUG)
file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_format)

# Add handlers to the logger
# logger.addHandler(console_handler)
file_logger.addHandler(file_handler)

# Generate full USA decennial census

In [9]:
%%time
file_logger.info(f"Generating USA decennial census from directory {usa_data_dir}")
file_logger.info(f"Calling function {psp.generate_decennial_census.__name__}")
with CodeTimer("USA decennial census generation", unit='h', logger_func=file_logger.info):
    census = psp.generate_decennial_census(usa_data_dir)

2024-02-09 02:44:58,659 - __main__ - INFO - Generating USA decennial census from directory /mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2023_08_31_15_58_01/pseudopeople_simulated_population_usa_2_0_0
2024-02-09 02:44:58,659 - __main__ - INFO - Calling function generate_decennial_census
Noising data:   0%|                                                                                | 0/334 [00:00<?, ?it/s]
Applying noise:   0%|                                                                             | 0/15 [00:00<?, ?type/s][A
Applying noise:   7%|████▌                                                                | 1/15 [00:06<01:29,  6.40s/type][A
Applying noise:  13%|█████████▏                                                           | 2/15 [00:08<00:47,  3.67s/type][A
Applying noise:  20%|█████████████▊                                                     

CPU times: user 2h 48min 42s, sys: 18min 1s, total: 3h 6min 43s
Wall time: 3h 7min 9s


In [10]:
file_logger.info(f"{type(census)} census occupies {sizemb(census)} MB in memory")

2024-02-09 06:06:34,931 - __main__ - INFO - <class 'pandas.core.frame.DataFrame'> census occupies 277575.444814 MB in memory


In [11]:
file_logger.info(f"DataFrame info: {census.info()}")

2024-02-09 06:06:35,578 - __main__ - INFO - DataFrame info: None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330811542 entries, 0 to 330811541
Data columns (total 18 columns):
 #   Column                            Dtype   
---  ------                            -----   
 0   simulant_id                       object  
 1   household_id                      object  
 2   first_name                        object  
 3   middle_initial                    object  
 4   last_name                         object  
 5   age                               object  
 6   date_of_birth                     object  
 7   street_number                     object  
 8   street_name                       object  
 9   unit_number                       object  
 10  city                              object  
 11  state                             category
 12  zipcode                           object  
 13  housing_type                      object  
 14  relationship_to_reference_person  category
 15  sex                               category
 16  race_ethnicity

In [12]:
file_logger.info(f"Detailed DataFrame memory usage: {census.memory_usage(deep=True)}")

2024-02-09 06:21:06,175 - __main__ - INFO - Detailed DataFrame memory usage: Index                                       132
simulant_id                         22431980013
household_id                        22308622678
first_name                          20707715312
middle_initial                      19101141995
last_name                           20925307730
age                                 19390311234
date_of_birth                       22048792559
street_number                       19551188244
street_name                         22753729177
unit_number                         11209476244
city                                21724459018
state                                 330816655
zipcode                             20411153783
housing_type                        21779096905
relationship_to_reference_person      330813526
sex                                   330811774
race_ethnicity                        330812291
year                                11909215512
dtype: int6

In [13]:
%%time
file_logger.info("Converting to ints and categoricals...")
with CodeTimer("US census datatype conversion", unit='m', logger_func=file_logger.info):
    census = datatypes.to_int_and_categorical(census)

2024-02-09 06:21:06,947 - __main__ - INFO - Converting to ints and categoricals...
2024-02-09 07:08:58,906 - __main__ - INFO - Code block 'US census datatype conversion' took: 47.86595 m


CPU times: user 37min 48s, sys: 10min 10s, total: 47min 59s
Wall time: 47min 51s


In [14]:
file_logger.info(f"After converting to ints and categoricals, {type(census)} census occupies {sizemb(census)} MB in memory")

2024-02-09 07:09:08,513 - __main__ - INFO - After converting to ints and categoricals, <class 'pandas.core.frame.DataFrame'> census occupies 20641.082617 MB in memory


In [15]:
file_logger.info(f"Detailed DataFrame memory usage after converting: {census.memory_usage(deep=True)}")

2024-02-09 07:09:14,300 - __main__ - INFO - Detailed DataFrame memory usage after converting: Index                                      132
simulant_id                         2646492336
household_id                        2646492336
first_name                          1373800152
middle_initial                       661637661
last_name                           1791896428
age                                  661742943
date_of_birth                       1513028204
street_number                       1352244039
street_name                         1856733640
unit_number                         1347389322
city                                1449424652
state                                330816655
zipcode                             1355322510
housing_type                         330812326
relationship_to_reference_person     330813526
sex                                  330811774
race_ethnicity                       330812291
year                                 330811658
dtype: int64


# Investigate guardian-based duplication in the census

## First find duplicates

In [16]:
%%time
dup_ids = census.simulant_id.duplicated(keep=False)
duplicates = census.loc[dup_ids]
duplicates

CPU times: user 1min 9s, sys: 6.45 s, total: 1min 15s
Wall time: 1min 15s


Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
17,1007000000017,1007000000015,Aaliyah,E,Wiles,12,05/21/2007,7458,pawley ave,,stokes,OH,43506,Household,Sibling,Female,White,2020
2898,1007000002998,1007000001255,Janiyah,C,Nannini,9,02/04/2011,18005,isleview dr,,greenville,SC,29040,Household,Biological child,Female,White,2020
4506,1007000004678,1007000001947,Ayden,D,Williams,9,06/13/2010,1523,green valley rd,,boonville,IN,46342,Household,Biological child,Male,Black,2020
5059,1007000005250,1007000002157,Sean,A,Hopkins,10,10/05/2009,,sweetwood dr,,pine bluff,AR,71801,Household,Stepchild,Male,White,2020
7852,1007000008108,1007000003309,Daniel,A,Taylor,12,02/16/2008,,n ramunno dr,,maryville,TN,37210,Household,Roommate or housemate,Male,Black,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330811537,9971000970069,9971000288034,Lacie,K,Bennett,23,09/10/1996,3200,broad st,,pittsburgh,PA,19128,Household,Other relative,Female,White,2020
330811538,9971000980981,9971000178884,Stephanie,G,Pearlman,19,10/28/2000,2143,w 5500 s,,young,AZ,85387,Household,Other relative,Female,White,2020
330811539,9971000975589,9971000163772,Brendon,B,Moss,20,01/05/2000,230,cialella ps,,ellenwood,GA,31522,Household,Other relative,Male,White,2020
330811540,9971000979343,9971000310654,Sean,J,Charles,21,01/28/1999,1048,10th place se,,brandon,MS,39117,Household,Other relative,Male,Black,2020


In [17]:
len(duplicates)/2

235244.0

In [18]:
duplicates.simulant_id.value_counts().unique()

array([2])

# Figure out household IDs for group quarters

I think the GQ household IDs within each shard are 0,1,2,3,4,5, and there's one of each type in each of the 334 shards.

In [20]:
%%time
college_hh_counts = census.query("housing_type == 'College'")['household_id'].value_counts()
college_hh_counts

CPU times: user 2.24 s, sys: 1.38 s, total: 3.62 s
Wall time: 3.62 s


household_id
3713000000003    6642
9588000000003    6622
778000000003     6612
3624000000003    6611
5628000000003    6603
                 ... 
3793000278641       1
3793000279461       1
3793000279601       1
3793000279838       1
3793000278018       1
Name: count, Length: 525865, dtype: int64

In [22]:
college_hh_counts.loc[college_hh_counts > 6000]

household_id
3713000000003    6642
9588000000003    6622
778000000003     6612
3624000000003    6611
5628000000003    6603
                 ... 
2787000000003    6200
9859000000003    6185
5300000000003    6176
3731000000003    6174
1154000000003    6168
Name: count, Length: 334, dtype: int64

In [23]:
college_hh_counts.loc[(college_hh_counts > 1000) & (college_hh_counts < 6000)]

Series([], Name: count, dtype: int64)

In [25]:
college_hh_counts.loc[(college_hh_counts > 10) & (college_hh_counts < 6000)]

household_id
8305000000001    22
103000000000     22
6929000000002    22
4950000000001    21
8501000000001    21
                 ..
1559000000002    11
1632000000004    11
2500000000001    11
6520000000005    11
338000000005     11
Name: count, Length: 771, dtype: int64

In [26]:
%%time
census.query("household_id == 8305000000001")['housing_type'].value_counts()

CPU times: user 1.49 s, sys: 1.42 s, total: 2.91 s
Wall time: 2.92 s


housing_type
Nursing home              6187
College                     22
Military                    14
Carceral                    12
Other institutional         10
Household                    8
Other noninstitutional       6
Name: count, dtype: int64

In [27]:
%%time
census.query("household_id == 103000000000")['housing_type'].value_counts()

CPU times: user 1.53 s, sys: 1.36 s, total: 2.89 s
Wall time: 2.89 s


housing_type
Carceral                  6139
College                     22
Nursing home                20
Military                    13
Other institutional         10
Other noninstitutional       6
Household                    5
Name: count, dtype: int64

In [28]:
%%time
census.query("household_id == 6929000000002")['housing_type'].value_counts()

CPU times: user 1.54 s, sys: 1.38 s, total: 2.92 s
Wall time: 2.92 s


housing_type
Other institutional       6131
College                     22
Carceral                    15
Nursing home                13
Military                     9
Other noninstitutional       9
Household                    6
Name: count, dtype: int64

In [29]:
%%time
census.query("household_id == 338000000005")['housing_type'].value_counts()

CPU times: user 1.55 s, sys: 1.37 s, total: 2.92 s
Wall time: 2.92 s


housing_type
Other noninstitutional    6217
Household                   12
College                     11
Military                    11
Carceral                    10
Nursing home                 9
Other institutional          6
Name: count, dtype: int64

In [30]:
datatypes.ID_PAD_WIDTH

9

In [31]:
%%time
hh_id_1 = census.query(f"household_id % {10**datatypes.ID_PAD_WIDTH} == 1")
hh_id_1

CPU times: user 6.78 s, sys: 1.41 s, total: 8.2 s
Wall time: 8.19 s


Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
104,1007000000114,1007000000001,Dell,F,Graff,88,05/01/1931,600,n blair ave,,vadito,NM,87001,Nursing home,Institutionalized group quarters population,Female,White,2020
413,1007000000432,1007000000001,Aaron,E,Cavicchio,17,10/12/2002,600,n blair ave,,vadito,AL,87001,Nursing home,Institutionalized group quarters population,Male,White,2020
954,1007000000985,1007000000001,Salvador,Z,Roach,19,07/28/2000,600,n blair ave,,vadito,NM,87001,Nursing home,Institutionalized group quarters population,Male,Asian,2020
2033,1007000002110,1007000000001,Michael,J,Russell,46,01/02/1974,600,n blair ave,,vadito,NM,87001,Nursing home,Institutionalized group quarters population,Male,Multiracial or Other,2020
2313,1007000002397,1007000000001,Louis,G,Murphy,19,02/03/2001,600,n blair ave,,vadito,NM,87001,Nursing home,Institutionalized group quarters population,Male,White,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330810958,9971000583451,9971000000001,Helen,C,Coleman,14,01/27/2006,11801,,,statesville,NC,27011,Nursing home,Institutionalized group quarters population,Female,Black,2020
330811036,9971000448939,9971000000001,Amari,B,Delatejera,10,03/16/2010,11801,west 26th street,,statesville,NC,27011,Nursing home,Institutionalized group quarters population,Male,Latino,2020
330811116,9971000173458,9971000000001,Cortez,D,Joa,12,11/20/2007,11801,west 26th street,,statesville,NC,27011,Nursing home,Institutionalized group quarters population,Male,White,2020
330811147,9971000360446,9971000000001,Leila,I,Moore,9,11/08/2010,11801,west 26th street,,statesville,NC,27011,Nursing home,Institutionalized group quarters population,Female,AIAN,2020


In [32]:
hh_id_1['housing_type'].value_counts()

housing_type
Nursing home              2043321
Other institutional          3452
Carceral                     3442
College                      3412
Household                    3389
Other noninstitutional       3350
Military                     3327
Name: count, dtype: int64

In [33]:
hh_id_1.household_id.nunique()

334

# Let's do some quick and dirty calculations of the number of simulants in each duplication category

In [34]:
# Get list of ages under 18 because we can't directly do comparisons with Categoricals
under_18_ages = list(map(str, range(18)))
in_households_under_18 = census.query(f"age in {under_18_ages} and housing_type == 'Household'")
in_households_under_18

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
17,1007000000017,1007000000015,Aaliyah,E,Wiles,12,05/21/2007,7458,pawley ave,,stokes,OH,43506,Household,Sibling,Female,White,2020
25,1007000000025,1007000000018,Aaron,E,Roberts,16,07/22/2003,2103,south bdwy,,spring,TX,75165,Household,Biological child,Male,White,2020
28,1007000000028,1007000000019,Aaron,E,Jackson,11,05/04/2008,1146,ocean avnu,,smyrna,TN,37334,Household,Biological child,Male,White,2020
34,1007000000034,1007000000021,Olivia,J,Whiddon,12,11/25/2007,1480,high st,,st. clair shores,MI,49638,Household,Biological child,Female,White,2020
35,1007000000035,1007000000021,Aaron,E,Whiddon,9,01/12/2011,1480,high st,,st. clair shores,MI,49638,Household,Grandchild,Male,White,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330811465,9971000665249,9971000267275,Ernesto,W,Bryant,14,01/14/2003,116-09,w payran st,4th platform,san francisco,CA,90036,Household,Other relative,Male,Black,2020
330811504,9971000182680,9971000073564,Robert,B,Torres-Mejia,17,03/02/2003,10195,chases grove ro,,portland,OR,97140,Household,Other relative,Male,Latino,2020
330811512,9971000610423,9971000405219,Christopher,A,Gutierrez,17,10/22/2002,36206,,,green oak twp,MI,49315,Household,Other relative,Male,Latino,2020
330811514,9971000973328,9971000308163,Justice,P,King,17,05/29/2002,21509,gregory st,,woodstock,GA,30442,Household,Other relative,Female,Black,2020


In [35]:
in_households_under_18_dups = duplicates.query(f"age in {under_18_ages} and housing_type == 'Household'")
in_households_under_18_dups

Unnamed: 0,simulant_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year
17,1007000000017,1007000000015,Aaliyah,E,Wiles,12,05/21/2007,7458,pawley ave,,stokes,OH,43506,Household,Sibling,Female,White,2020
2898,1007000002998,1007000001255,Janiyah,C,Nannini,9,02/04/2011,18005,isleview dr,,greenville,SC,29040,Household,Biological child,Female,White,2020
4506,1007000004678,1007000001947,Ayden,D,Williams,9,06/13/2010,1523,green valley rd,,boonville,IN,46342,Household,Biological child,Male,Black,2020
5059,1007000005250,1007000002157,Sean,A,Hopkins,10,10/05/2009,,sweetwood dr,,pine bluff,AR,71801,Household,Stepchild,Male,White,2020
7852,1007000008108,1007000003309,Daniel,A,Taylor,12,02/16/2008,,n ramunno dr,,maryville,TN,37210,Household,Roommate or housemate,Male,Black,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330811465,9971000665249,9971000267275,Ernesto,W,Bryant,14,01/14/2003,116-09,w payran st,4th platform,san francisco,CA,90036,Household,Other relative,Male,Black,2020
330811504,9971000182680,9971000073564,Robert,B,Torres-Mejia,17,03/02/2003,10195,chases grove ro,,portland,OR,97140,Household,Other relative,Male,Latino,2020
330811512,9971000610423,9971000405219,Christopher,A,Gutierrez,17,10/22/2002,36206,,,green oak twp,MI,49315,Household,Other relative,Male,Latino,2020
330811514,9971000973328,9971000308163,Justice,P,King,17,05/29/2002,21509,gregory st,,woodstock,GA,30442,Household,Other relative,Female,Black,2020


In [36]:
len(in_households_under_18_dups) / len(in_households_under_18)

0.004761901106310395