In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import logging
# Set level to the lowest threshold (DEBUG) to capture all messages
logging.basicConfig(level=logging.DEBUG, format='{asctime} - {name} - {levelname} - {message}', style='{')
import numpy as np, matplotlib.pyplot as plt, pandas as pd
# For finding and loading multiple files
import os
# Use this to see how much memory the dataframes use
from sys import getsizeof
from pathlib import Path
from linetimer import CodeTimer, linetimer

import pseudopeople as psp

from vivarium_research_prl.utils import sizemb, MappingViaAttributes
from vivarium_research_prl import alpha, datatypes

!date
!whoami
!uname -a
!pwd
!python --version
!conda info --envs | grep '\*'
!conda list | grep -e pandas -e numpy -e vivarium -e pseudopeople
!pip freeze | grep pseudopeople

2024-02-06 18:10:28,661 - matplotlib - DEBUG - matplotlib data path: /ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311/lib/python3.11/site-packages/matplotlib/mpl-data
2024-02-06 18:10:28,679 - matplotlib - DEBUG - CONFIGDIR=/ihme/homes/ndbs/.config/matplotlib
2024-02-06 18:10:28,683 - matplotlib - DEBUG - interactive is False
2024-02-06 18:10:28,684 - matplotlib - DEBUG - platform is linux
2024-02-06 18:10:28,840 - matplotlib - DEBUG - CACHEDIR=/ihme/homes/ndbs/.cache/matplotlib
2024-02-06 18:10:28,846 - matplotlib.font_manager - DEBUG - Using fontManager instance from /ihme/homes/ndbs/.cache/matplotlib/fontlist-v330.json
2024-02-06 18:10:30,191 - numexpr.utils - INFO - Note: NumExpr detected 56 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.


Tue 06 Feb 2024 06:10:37 PM PST
ndbs
Linux long-slurm-sarchive-p0040 5.4.0-167-generic #184-Ubuntu SMP Tue Oct 31 09:21:49 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing
Python 3.11.7
ppplv1.0-311          *  /ihme/homes/ndbs/miniconda3/envs/ppplv1.0-311
numpy                     1.26.3                   pypi_0    pypi
pandas                    2.1.4                    pypi_0    pypi
pseudopeople              0.8.4.dev34+gc5d0c15          pypi_0    pypi
vivarium                  2.3.1                    pypi_0    pypi
vivarium-research-prl     0.0.1                     dev_0    <develop>
pseudopeople @ git+https://github.com/ihmeuw/pseudopeople.git@c5d0c15dc889b0ef51f56fa9757c2949b8531f3b


In [3]:
# A clickable link to the GitHub UI to see what version I'm using
! pip freeze | grep pseudopeople | sed -e 's|pseudopeople @ ||' | sed -e 's|git+||' | sed -e 's|\.git@|/tree/|'

https://github.com/ihmeuw/pseudopeople/tree/c5d0c15dc889b0ef51f56fa9757c2949b8531f3b


# Find data

```
/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop
```

In [4]:
project_path = Path('/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop')
usa_path = project_path / 'results/release_02_yellow/full_data/united_states_of_america'

# Not sure what these runs are for...
latest_run = usa_path / 'latest'
best_run = usa_path / 'best'
latest_run_path = usa_path / '2023_08_21_16_35_27/final_results/2024_01_11_07_27_51/pseudopeople_input_data_usa_2.0.0'
best_run_path = usa_path / '2023_07_28_08_33_09/final_results/2023_07_31_08_59_48/'

# I think this is the run we should be using...
last_zipped_path = usa_path / '2023_08_21_16_35_27/final_results/2023_08_31_15_58_01'

ri_data_dir = last_zipped_path / 'states/pseudopeople_simulated_population_rhode_island_2_0_0'
usa_data_dir = last_zipped_path / 'pseudopeople_simulated_population_usa_2_0_0'


In [5]:
!ls -halt $project_path/results/release_02_yellow/full_data/united_states_of_america/latest

lrwxrwxrwx 1 albrja IHME-Simulationscience 218 Jan 11 12:08 /mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/latest -> /mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2024_01_11_07_27_51/pseudopeople_input_data_usa_2.0.0


In [6]:
!ls $usa_data_dir

american_community_survey  decennial_census  taxes_1040
CHANGELOG.rst		   logs		     taxes_w2_and_1099
current_population_survey  social_security   women_infants_and_children


In [7]:
usa_data_dir

PosixPath('/mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2023_08_31_15_58_01/pseudopeople_simulated_population_usa_2_0_0')

# Set up a logger to log to file

In [8]:
logs_directory = Path('logs')

# Set up a logger
file_logger = logging.getLogger(__name__) # This gets a new logger for the current, __main__ module

# Create and configure file log handler
file_handler = logging.FileHandler(logs_directory / 'usa_decennial_census_20240206.log')
# Set level to the lowest threshold (DEBUG) to capture all messages
file_handler.setLevel(logging.DEBUG)
file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_format)

# Add handlers to the logger
# logger.addHandler(console_handler)
file_logger.addHandler(file_handler)

# Generate full USA decennial census

In [9]:
%%time
file_logger.info(f"Generating USA decennial census from directory {usa_data_dir}")
file_logger.info(f"Calling function {psp.generate_decennial_census.__name__}")
with CodeTimer("USA decennial census generation", unit='h', logger_func=file_logger.info):
    census = psp.generate_decennial_census(usa_data_dir)

2024-02-06 18:17:26,819 - __main__ - INFO - Generating USA decennial census from directory /mnt/team/simulation_science/pub/models/vivarium_census_prl_synth_pop/results/release_02_yellow/full_data/united_states_of_america/2023_08_21_16_35_27/final_results/2023_08_31_15_58_01/pseudopeople_simulated_population_usa_2_0_0
2024-02-06 18:17:26,821 - __main__ - INFO - Calling function generate_decennial_census
Noising data:   0%|                                                                                | 0/334 [00:00<?, ?it/s]
Applying noise:   0%|                                                                             | 0/15 [00:00<?, ?type/s][A
Applying noise:   7%|████▌                                                                | 1/15 [00:08<01:53,  8.10s/type][A
Applying noise:  13%|█████████▏                                                           | 2/15 [00:10<01:00,  4.62s/type][A
Applying noise:  20%|█████████████▊                                                     

In [10]:
file_logger.info(f"{type(census)} census occupies {sizemb(census)} MB in memory")

2024-02-06 21:54:30,228 - __main__ - INFO - <class 'pandas.core.frame.DataFrame'> census occupies 277575.444814 MB in memory


In [11]:
file_logger.info(f"DataFrame info: {census.info()}")

2024-02-06 21:54:30,857 - __main__ - INFO - DataFrame info: None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330811542 entries, 0 to 330811541
Data columns (total 18 columns):
 #   Column                            Dtype   
---  ------                            -----   
 0   simulant_id                       object  
 1   household_id                      object  
 2   first_name                        object  
 3   middle_initial                    object  
 4   last_name                         object  
 5   age                               object  
 6   date_of_birth                     object  
 7   street_number                     object  
 8   street_name                       object  
 9   unit_number                       object  
 10  city                              object  
 11  state                             category
 12  zipcode                           object  
 13  housing_type                      object  
 14  relationship_to_reference_person  category
 15  sex                               category
 16  race_ethnicity

In [12]:
file_logger.info(f"Detailed DataFrame memory usage: {census.memory_usage(deep=True)}")

2024-02-06 22:10:42,944 - __main__ - INFO - Detailed DataFrame memory usage: Index                                       132
simulant_id                         22431980013
household_id                        22308622678
first_name                          20707715312
middle_initial                      19101141995
last_name                           20925307730
age                                 19390311234
date_of_birth                       22048792559
street_number                       19551188244
street_name                         22753729177
unit_number                         11209476244
city                                21724459018
state                                 330816655
zipcode                             20411153783
housing_type                        21779096905
relationship_to_reference_person      330813526
sex                                   330811774
race_ethnicity                        330812291
year                                11909215512
dtype: int6

In [None]:
%%time
file_logger.info("Converting to ints and categoricals...")
with CodeTimer("US census datatype conversion", unit='m', logger_func=file_logger.info):
    census = datatypes.to_int_and_categorical(census)

2024-02-07 00:13:59,890 - __main__ - INFO - Converting to ints and categoricals...


In [None]:
file_logger.info(f"After converting to ints and categoricals, {type(census)} census occupies {sizemb(census)} MB in memory")

In [None]:
file_logger.info(f"Detailed DataFrame memory usage after converting: {census.memory_usage(deep=True)}")