In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the locally developed vivarium_research_prl package) are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import logging
logging.basicConfig(level=logging.DEBUG, format='{asctime} - {name} - {levelname} - {message}', style='{')
import numpy as np, matplotlib.pyplot as plt, pandas as pd
# For finding and loading multiple files
import os
import traceback
# Use this to see how much memory the dataframes use
from sys import getsizeof
from pathlib import Path
from linetimer import CodeTimer, linetimer

import pseudopeople as psp

from vivarium_research_prl.utils import sizemb, MappingViaAttributes
from vivarium_research_prl import alpha, datatypes, utils

# Record provenance for this run: timestamp, user, host/kernel, working
# directory, Python version, the active conda environment, and the installed
# versions of the key packages (pandas, numpy, vivarium*, pseudopeople).
!date
!whoami
!uname -a
!pwd
!python --version
!conda info --envs | grep '\*'
!conda list | grep -e pandas -e numpy -e vivarium -e pseudopeople
!pip freeze | grep pseudopeople

2024-02-02 01:45:42,689 - matplotlib - DEBUG - matplotlib data path: /ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311/lib/python3.11/site-packages/matplotlib/mpl-data
2024-02-02 01:45:42,696 - matplotlib - DEBUG - CONFIGDIR=/ihme/homes/ndbs/.config/matplotlib
2024-02-02 01:45:42,698 - matplotlib - DEBUG - interactive is False
2024-02-02 01:45:42,699 - matplotlib - DEBUG - platform is linux
2024-02-02 01:45:42,804 - matplotlib - DEBUG - CACHEDIR=/ihme/homes/ndbs/.cache/matplotlib
2024-02-02 01:45:42,806 - matplotlib.font_manager - DEBUG - Using fontManager instance from /ihme/homes/ndbs/.cache/matplotlib/fontlist-v330.json
2024-02-02 01:45:43,358 - numexpr.utils - INFO - Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.


Fri 02 Feb 2024 01:45:47 AM PST
ndbs
Linux int-slurm-sarchive-p0005 5.4.0-167-generic #184-Ubuntu SMP Tue Oct 31 09:21:49 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing
Python 3.11.7
ppplv1.0-311          *  /ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311
numpy                     1.26.3                   pypi_0    pypi
pandas                    2.1.4                    pypi_0    pypi
pseudopeople              0.8.4.dev32+g8c8f99b          pypi_0    pypi
vivarium                  2.3.1                    pypi_0    pypi
vivarium-research-prl     0.0.1                     dev_0    <develop>
pseudopeople @ git+https://github.com/ihmeuw/pseudopeople.git@8c8f99b4ed3111e3bb0b9cd812c923049e39f7a8


In [3]:
# A clickable link to the GitHub UI to see what version I'm using.
# The sed steps strip the "pseudopeople @ " and "git+" prefixes from the
# `pip freeze` line and rewrite ".git@<commit>" as "/tree/<commit>".
! pip freeze | grep pseudopeople | sed -e 's|pseudopeople @ ||' | sed -e 's|git+||' | sed -e 's|\.git@|/tree/|'

https://github.com/ihmeuw/pseudopeople/tree/8c8f99b4ed3111e3bb0b9cd812c923049e39f7a8


# Find data

Project directory:

```
/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop
```

My initial attempt to find RI data (this is the `best` run):
```
'/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_07_28_08_33_09/final_results/2023_07_31_08_59_48/states/pseudopeople_input_data_rhode_island_0.0.2'
```

Directory [Zeb sent in Slack](https://ihme.slack.com/archives/C02KUQ9LX32/p1705720235528949?thread_ts=1705713714.552259&cid=C02KUQ9LX32), saying that this is the latest zipped RI data (this is similar to the `latest` run, but the postprocessing date is different):
```
/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2023_08_31_15_58_01/
```

In [4]:
# Root of the shared project directory and of the full-USA results inside it.
project_path = Path('/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop')
usa_path = project_path.joinpath('results/release_02_yellow/full_data/united_states_of_america')

# Candidate runs whose purpose is still unclear:
#   * `latest` / `best` are entries directly under the USA results directory.
#   * The `*_path` variants spell out specific run/postprocessing timestamps.
latest_run = usa_path.joinpath('latest')
best_run = usa_path.joinpath('best')
latest_run_path = usa_path.joinpath(
    '2023_08_21_16_35_27/final_results/2024_01_11_07_27_51/pseudopeople_input_data_usa_2.0.0'
)
best_run_path = usa_path.joinpath('2023_07_28_08_33_09/final_results/2023_07_31_08_59_48/')

# Probably the run we should be using: the most recently zipped results.
last_zipped_path = usa_path.joinpath('2023_08_21_16_35_27/final_results/2023_08_31_15_58_01')

# Input directories for the Rhode Island and full-USA simulated populations.
ri_data_dir = last_zipped_path.joinpath('states/pseudopeople_simulated_population_rhode_island_2_0_0')
usa_data_dir = last_zipped_path.joinpath('pseudopeople_simulated_population_usa_2_0_0')


In [5]:
# Long listing of the `latest` entry (same location as `latest_run`) to see
# what it points to.
!ls -halt $project_path/results/release_02_yellow/full_data/united_states_of_america/latest

lrwxrwxrwx 1 albrja IHME-Simulationscience 218 Jan 11 12:08 /mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/latest -> /mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2024_01_11_07_27_51/pseudopeople_input_data_usa_2.0.0


In [6]:
# Last zipped path (from Zeb); this is the same directory as `last_zipped_path`
!ls $usa_path/2023_08_21_16_35_27/final_results/2023_08_31_15_58_01/

pseudopeople_simulated_population_rhode_island_2_0_0.zip
pseudopeople_simulated_population_usa_2_0_0
pseudopeople_simulated_population_usa_2_0_0.zip
states


In [7]:
# Display the full Rhode Island data directory path
ri_data_dir

PosixPath('/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2023_08_31_15_58_01/states/pseudopeople_simulated_population_rhode_island_2_0_0')

In [8]:
# List the contents of the Rhode Island data directory
!ls $ri_data_dir

american_community_survey  decennial_census  taxes_dependents
CHANGELOG.rst		   social_security   taxes_w2_and_1099
current_population_survey  taxes_1040	     women_infants_and_children


# Set up a logger to log to file

In [9]:
logs_directory = Path('logs')
# logging.FileHandler does not create missing parent directories, so make sure
# the log directory exists before the log file is opened.
logs_directory.mkdir(parents=True, exist_ok=True)
log_file_path = logs_directory / 'data_generation_ri_20240202.log'

# Get (or create) the logger named after the current module (`__main__` when
# run from the notebook).
file_logger = logging.getLogger(__name__)

# Loggers are process-wide objects, so re-running this cell would otherwise
# attach a second handler for the same file and every message would be written
# twice. Reuse the handler if one for this log file is already attached.
file_handler = next(
    (
        handler for handler in file_logger.handlers
        if isinstance(handler, logging.FileHandler)
        and handler.baseFilename == os.path.abspath(log_file_path)
    ),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler(log_file_path)
    file_logger.addHandler(file_handler)

# Set level to the lowest threshold (DEBUG) to capture all messages
file_handler.setLevel(logging.DEBUG)
file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_format)

# Test `generate_datasets` with new `skip` parameter

When I originally tried to load the Rhode Island data on 2024-02-01, it killed my kernel with 15 GiB memory when trying to generate the social security data. So let's try skipping that one.

In [10]:
%%time
# Smoke-test the `skip` argument without passing a data directory.
# The DEBUG output shows `skip` being compiled into a regex, and
# 'social_security' is absent from the resulting key list.
sample_data = alpha.generate_datasets(skip='social_security')
sample_data.keylist()

2024-02-02 01:45:55,836 - vivarium_research_prl.alpha - DEBUG - skip_pattern=re.compile('social_security')
2024-02-02 01:45:55,837 - vivarium_research_prl.alpha - INFO - Calling function generate_american_community_survey
2024-02-02 01:45:56,227 - vivarium_research_prl.alpha - INFO - Code block 'generate_american_community_survey' took: 0.00649 m
2024-02-02 01:45:56,229 - vivarium_research_prl.alpha - INFO - <class 'pandas.core.frame.DataFrame'> american_community_survey occupies 0.067614 MB in memory
2024-02-02 01:45:56,231 - vivarium_research_prl.alpha - INFO - Calling function generate_current_population_survey
2024-02-02 01:45:56,425 - vivarium_research_prl.alpha - INFO - Code block 'generate_current_population_survey' took: 0.00323 m
2024-02-02 01:45:56,427 - vivarium_research_prl.alpha - INFO - <class 'pandas.core.frame.DataFrame'> current_population_survey occupies 0.035549 MB in memory
2024-02-02 01:45:56,428 - vivarium_research_prl.alpha - INFO - Calling function generate_dece

CPU times: user 3.41 s, sys: 125 ms, total: 3.54 s
Wall time: 3.52 s


['american_community_survey',
 'current_population_survey',
 'decennial_census',
 'taxes_1040',
 'taxes_w2_and_1099',
 'women_infants_and_children']

# Generate all Rhode Island datasets except social security, logging runtimes and memory to file

In [11]:
%%time
# NOTE: although the log message says "all", social_security is skipped below.
file_logger.info(f"Generating all Rhode Island datasets from directory {ri_data_dir}")
# CodeTimer reports the elapsed time of the block (in minutes) through
# file_logger.info, so the runtime is written to the log file as well.
with CodeTimer("RI data generation", unit='m', logger_func=file_logger.info):
    # The directory is passed as a string; logger=file_logger makes the
    # per-dataset messages show up under the `__main__` logger (see output).
    data = alpha.generate_datasets(str(ri_data_dir), skip='social_security', logger=file_logger)

2024-02-02 01:45:59,377 - __main__ - INFO - Generating all Rhode Island datasets from directory /mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2023_08_31_15_58_01/states/pseudopeople_simulated_population_rhode_island_2_0_0
2024-02-02 01:45:59,378 - __main__ - DEBUG - skip_pattern=re.compile('social_security')
2024-02-02 01:45:59,379 - __main__ - INFO - Calling function generate_american_community_survey
Noising data:   0%|                                                                        | 0/334 [00:00<?, ?it/s]
Applying noise:   0%|                                                                     | 0/15 [00:00<?, ?type/s][A
Noising data:   1%|▍                                                               | 2/334 [00:00<00:52,  6.37it/s][A
Applying noise:   0%|                                                                     | 0/15 [00:00<?, ?type/s][A
No

In [12]:
# List the dataset names present in the RI result (social_security was skipped)
data.keylist()

['american_community_survey',
 'current_population_survey',
 'decennial_census',
 'taxes_1040',
 'taxes_w2_and_1099',
 'women_infants_and_children']

In [13]:
# Total size of everything in `data` (sizemb presumably returns megabytes --
# TODO confirm against vivarium_research_prl.utils).
# 3.06 GB for all RI datasets except ACS and social security
sum(sizemb(df) for df in data.values())

3060.6762830000002

In [14]:
# Write the same total to the log file (the sum is recomputed here)
file_logger.info(f"Altogether, RI datasets occupy {sum(sizemb(df) for df in data.values())} MB in memory")

2024-02-02 16:41:25,280 - __main__ - INFO - Altogether, RI datasets occupy 3060.6762830000002 MB in memory


# Verify that there's a bug when all values in a column are missing

This appears to be what happened with ACS above.

In [41]:
lots_missing = {
    'decennial_census': {
        'column_noise': {
            'first_name': {
                'leave_blank': {'cell_probability': 1.0}
            }
        }
    }
}
try:
    bad_census = psp.generate_decennial_census(config=lots_missing)
except Exception as e:
    # traceback.print_tb(e.__traceback__)
    traceback.print_exception(e)

Traceback (most recent call last):                                                                                 
  File "/tmp/ipykernel_1588941/3865727153.py", line 11, in <module>
    bad_census = psp.generate_decennial_census(config=lots_missing)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311/lib/python3.11/site-packages/pseudopeople/interface.py", line 302, in generate_decennial_census
    _generate_dataset(DATASETS.census, source, seed, config, user_filters, verbose)
  File "/ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311/lib/python3.11/site-packages/pseudopeople/interface.py", line 88, in _generate_dataset
    noised_data = noise_dataset(dataset, data, configuration_tree, data_path_seed)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311/lib/python3.11/site-packages/pseudopeople/noise.py", line 94, in noise_dataset
    da

In [42]:
# Same call as in the try/except cell, but uncaught so the error is shown
# directly; the saved output of this cell ends in a FloatingPointError.
psp.generate_decennial_census(config=lots_missing)

                                                                                                                   

FloatingPointError: invalid value encountered in scalar divide

# Can we estimate how much memory the full SSN data will need?

## First, see how big it is for the sample data.

Hmm, in the sample data the social security dataset is almost as big as the census. For RI, its size relative to the other datasets should be much larger than in the sample data, because the social security data contains simulants from the entire USA, not just RI...

In [15]:
%%time
# No `skip` argument this time, so 'social_security' is generated too (it
# appears in the key list below).
# NOTE: this rebinds `sample_data`, replacing the result of the earlier
# skip='social_security' call.
sample_data = alpha.generate_datasets()
sample_data.keylist()

2024-02-02 16:43:21,339 - vivarium_research_prl.alpha - DEBUG - skip_pattern=re.compile('(?!)')
2024-02-02 16:43:21,340 - vivarium_research_prl.alpha - INFO - Calling function generate_american_community_survey
2024-02-02 16:43:21,629 - vivarium_research_prl.alpha - INFO - Code block 'generate_american_community_survey' took: 0.00479 m
2024-02-02 16:43:21,630 - vivarium_research_prl.alpha - INFO - <class 'pandas.core.frame.DataFrame'> american_community_survey occupies 0.067614 MB in memory
2024-02-02 16:43:21,631 - vivarium_research_prl.alpha - INFO - Calling function generate_current_population_survey
2024-02-02 16:43:21,834 - vivarium_research_prl.alpha - INFO - Code block 'generate_current_population_survey' took: 0.00336 m
2024-02-02 16:43:21,836 - vivarium_research_prl.alpha - INFO - <class 'pandas.core.frame.DataFrame'> current_population_survey occupies 0.035549 MB in memory
2024-02-02 16:43:21,837 - vivarium_research_prl.alpha - INFO - Calling function generate_decennial_censu

CPU times: user 3.93 s, sys: 105 ms, total: 4.04 s
Wall time: 4.03 s


['american_community_survey',
 'current_population_survey',
 'decennial_census',
 'social_security',
 'taxes_1040',
 'taxes_w2_and_1099',
 'women_infants_and_children']

# Try "compressing" the RI data

In [18]:
%%time
with CodeTimer("RI datatype conversion", unit='m', logger_func=file_logger.info):
    cdata = MappingViaAttributes(
        {k: datatypes.to_int_and_categorical(df, exclude='ssn')
         for k, df in data.items() if isinstance(df, pd.DataFrame)})
cdata.keylist()

2024-02-03 18:52:11,678 - __main__ - INFO - Code block 'RI datatype conversion' took: 0.35429 m


CPU times: user 21.1 s, sys: 146 ms, total: 21.3 s
Wall time: 21.3 s


['current_population_survey',
 'decennial_census',
 'taxes_1040',
 'taxes_w2_and_1099',
 'women_infants_and_children']

In [19]:
# Total size of the converted datasets, for comparison with the total for `data`
sum(sizemb(df) for df in cdata.values())

622.204032

In [48]:
# Write the converted total to the log file (the sum is recomputed here)
file_logger.info(f"Converted to ints and categoricals, the RI datasets occupy {sum(sizemb(df) for df in cdata.values())} MB in memory")

2024-02-04 00:33:43,196 - __main__ - INFO - Converted to ints and categoricals, the RI datasets occupy 622.204032 MB in memory


# Compare memory usage between original and compressed

In [47]:
from IPython.display import display, Markdown

# For each converted dataset, render its name as a heading and then show the
# per-column memory saved: bytes used by the original frame minus bytes used
# by the converted one (a negative entry means the column grew).
for name in cdata:
    heading = Markdown(f'### {name}')
    display(heading)
    original_bytes = data[name].memory_usage(deep=True)
    converted_bytes = cdata[name].memory_usage(deep=True)
    display(original_bytes - converted_bytes)

### current_population_survey

Index                  0
simulant_id       240543
household_id      239739
survey_date            0
first_name        125366
middle_initial    225420
last_name          40404
age               221503
date_of_birth    -120752
street_number      40839
street_name         8381
unit_number       114189
city              245994
state                  0
zipcode           232908
sex                    0
race_ethnicity         0
dtype: int64

### decennial_census

Index                                      0
simulant_id                         64205913
household_id                        63758268
first_name                          63273760
middle_initial                      60908192
last_name                           52475823
age                                 60750437
date_of_birth                       61559096
street_number                       55783655
street_name                         42351780
unit_number                         32706926
city                                68437679
state                                      0
zipcode                             63758542
housing_type                        69712671
relationship_to_reference_person           0
sex                                        0
race_ethnicity                             0
year                                37574204
dtype: int64

### taxes_1040

Index                                   0
simulant_id                      40278068
household_id                     39941144
first_name                       39258599
middle_initial                   38189401
last_name                        31672300
ssn                                     0
mailing_address_street_number    33322747
mailing_address_street_name      18300522
mailing_address_unit_number      20304869
mailing_address_po_box           19957606
mailing_address_city             42909116
mailing_address_state                   0
mailing_address_zipcode          39967553
spouse_first_name                21813256
spouse_middle_initial            22542279
spouse_last_name                 20101868
spouse_ssn                       14676268
dependent_1_first_name           23244091
dependent_1_last_name            19259805
dependent_1_ssn                  11025252
dependent_2_first_name           21650936
dependent_2_last_name            20145391
dependent_2_ssn                  1

### taxes_w2_and_1099

Index                                   0
simulant_id                      62574768
household_id                     62055220
employer_id                      11542792
ssn                                     0
wages                            45657239
employer_name                    14962372
employer_street_number           53996849
employer_street_name             37610813
employer_unit_number             31752627
employer_city                    63509081
employer_state                          0
employer_zipcode                 57058992
first_name                       61876840
middle_initial                   59366682
last_name                        51044549
mailing_address_street_number    53485926
mailing_address_street_name      40594626
mailing_address_unit_number      32064459
mailing_address_po_box           31457978
mailing_address_city             66687788
mailing_address_state                   0
mailing_address_zipcode          62143771
tax_form                          

### women_infants_and_children

Index                  0
simulant_id       852504
household_id      845247
first_name        600991
middle_initial    801577
last_name         365995
date_of_birth     451307
street_number     360082
street_name        29575
unit_number       402995
city              886576
state                  0
zipcode           824788
sex                    0
race_ethnicity         0
year              496149
dtype: int64

In [43]:
# Per-column memory difference (original minus converted) for the CPS only;
# this repeats the current_population_survey section printed by the loop cell.
data.current_population_survey.memory_usage(deep=True) - cdata.current_population_survey.memory_usage(deep=True)

Index                  0
simulant_id       240543
household_id      239739
survey_date            0
first_name        125366
middle_initial    225420
last_name          40404
age               221503
date_of_birth    -120752
street_number      40839
street_name         8381
unit_number       114189
city              245994
state                  0
zipcode           232908
sex                    0
race_ethnicity         0
dtype: int64

In [44]:
# Per-column memory difference (original minus converted) for the census only;
# this repeats the decennial_census section printed by the loop cell.
data.decennial_census.memory_usage(deep=True) - cdata.decennial_census.memory_usage(deep=True)

Index                                      0
simulant_id                         64205913
household_id                        63758268
first_name                          63273760
middle_initial                      60908192
last_name                           52475823
age                                 60750437
date_of_birth                       61559096
street_number                       55783655
street_name                         42351780
unit_number                         32706926
city                                68437679
state                                      0
zipcode                             63758542
housing_type                        69712671
relationship_to_reference_person           0
sex                                        0
race_ethnicity                             0
year                                37574204
dtype: int64