In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 50)

import os
# Use this to see how much memory the dataframes use
from sys import getsizeof

import yaml
import re

import pseudopeople as pp
from pseudopeople.configuration import get_configuration
from vivarium.framework.randomness import RandomnessStream
from vivarium.config_tree import ConfigTree

from vivarium_research_prl.utils import sizemb, MappingViaAttributes
from vivarium_research_prl import datatypes, alpha, data_loading
from vivarium_research_prl.noise import corruption, fake_names, noisify
from vivarium_research_prl.find_kids import datasets, noisify_data


!date
!whoami
!uname -a
!pwd

Sun 16 Apr 2023 10:34:52 PM PDT
ndbs
Linux long-slurm-sarchive-p0041 5.4.0-135-generic #152-Ubuntu SMP Wed Nov 23 20:19:22 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
project_dir = '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'
model_dir = (
    f'{project_dir}/results'
    '/v2.0_three_censuses/united_states_of_america/2023_04_08_14_07_04'
)
parquet_dir = f'{model_dir}/final_results/parquet'
hdf_dir = f'{model_dir}/final_results/hdf'
rhode_island_par_dir = f'{parquet_dir}/states/rhode_island'
usa_par_dir = f'{parquet_dir}/usa'
rhode_island_hdf_dir = f'{hdf_dir}/states/rhode_island'
usa_hdf_dir = f'{hdf_dir}/usa'

!ls -halt $hdf_dir

total 96K
drwxrwsr-x  3 rmudambi IHME-Simulationscience  512 Apr  9 18:49 states
drwxrwsr-x  5 rmudambi IHME-Simulationscience 1.5K Apr  9 18:49 .
drwxrwsr-x 10 rmudambi IHME-Simulationscience 4.0K Apr  9 15:35 usa
drwxrwsr-x  4 rmudambi IHME-Simulationscience 2.0K Apr  9 15:21 ..
drwxrwsr-x  2 rmudambi IHME-Simulationscience 335K Apr  9 11:55 logs


# More categorical tests

In [4]:
s = pd.Series([1, 8, 5, '8', np.nan, 9, 8, '1', np.nan])
s

0      1
1      8
2      5
3      8
4    NaN
5      9
6      8
7      1
8    NaN
dtype: object

In [5]:
s_cat = s.astype('category')
s_cat

0      1
1      8
2      5
3      8
4    NaN
5      9
6      8
7      1
8    NaN
dtype: category
Categories (6, object): [1, 5, 8, 9, '1', '8']

In [7]:
s_cat.cat.codes

0    0
1    2
2    1
3    5
4   -1
5    3
6    2
7    4
8   -1
dtype: int8

In [8]:
cat_map = dict(zip(s_cat.cat.categories, s_cat.cat.categories.astype(str)))
cat_map

{1: '1', 5: '5', 8: '8', 9: '9', '1': '1', '8': '8'}

In [10]:
s_cat.dtype

CategoricalDtype(categories=[1, 5, 8, 9, '1', '8'], ordered=False)

In [9]:
s_cat.cat.categories.map(cat_map)

Index(['1', '5', '8', '9', '1', '8'], dtype='object')

In [14]:
new_cats = s_cat.cat.categories.map(cat_map).unique()
new_cats

Index(['1', '5', '8', '9'], dtype='object')

In [15]:
new_cat_to_code = dict(zip(new_cats, range(len(new_cats))))
new_cat_to_code

{'1': 0, '5': 1, '8': 2, '9': 3}

In [17]:
s_cat.cat.categories[s_cat.cat.codes] # ?? Why -1 -> '8'?

Index([1, 8, 5, '8', '8', 9, 8, '1', '8'], dtype='object')

In [18]:
s_cat

0      1
1      8
2      5
3      8
4    NaN
5      9
6      8
7      1
8    NaN
dtype: category
Categories (6, object): [1, 5, 8, 9, '1', '8']

In [22]:
new_code_arr = s_cat.cat.categories.map(cat_map).map(new_cat_to_code)
new_code_arr

Int64Index([0, 1, 2, 3, 0, 2], dtype='int64')

In [23]:
s_cat.cat.codes

0    0
1    2
2    1
3    5
4   -1
5    3
6    2
7    4
8   -1
dtype: int8

In [24]:
old_cats = s_cat.cat.categories
old_cat_to_old_code = dict(zip(old_cats, range(len(old_cats))))
old_cat_to_old_code

{1: 0, 5: 1, 8: 2, 9: 3, '1': 4, '8': 5}

In [31]:
new_codes = s_cat.cat.codes.map(pd.Series(new_code_arr)).fillna(-1).astype(int)
new_codes

0    0
1    2
2    1
3    2
4   -1
5    3
6    2
7    0
8   -1
dtype: int64

In [28]:
pd.Categorical.from_codes(new_codes, new_cats)

['1', '8', '5', '8', NaN, '9', '8', '1', NaN]
Categories (4, object): ['1', '5', '8', '9']

In [29]:
s

0      1
1      8
2      5
3      8
4    NaN
5      9
6      8
7      1
8    NaN
dtype: object

In [38]:
new_code_map = dict(zip(range(len(new_code_arr)), new_code_arr))
new_code_map.update({-1:-1})
new_code_map

{0: 0, 1: 1, 2: 2, 3: 3, 4: 0, 5: 2, -1: -1}

In [39]:
s_cat.cat.codes.map(new_code_map)

0    0
1    2
2    1
3    2
4   -1
5    3
6    2
7    0
8   -1
dtype: int64

In [80]:
def merge_series_categories(series, category_mapping):
    # https://stackoverflow.com/questions/32262982/pandas-combining-multiple-categories-into-one
    return series.map(category_mapping).astype('category')

def merge_categories(categorical: pd.Categorical, old_cat_to_new_cat: dict):
    new_cat_array = categorical.categories.map(old_cat_to_new_cat)
    new_cats = new_cat_array.unique()
    if len(new_cats) == len(new_cat_array):
        # one-to-one mapping -> no merging is necessary, just renaming of categories
        new_categorical = categorical.rename_categories(new_cats)
    else:
        # Map each new category to its index in the categories array, i.e., its code
        new_cat_to_new_code = dict(zip(new_cats, range(len(new_cats))))
        # This array replaces each old category with the index (code) of the new category
        new_code_array = new_cat_array.map(new_cat_to_new_code)
        # The index (code) of the old category is mapped to the index (code) of the new category
        old_code_to_new_code = dict(zip(range(len(new_code_array)), new_code_array))
        # -1 indicates NaN and needs to stay the same in the new codes
        old_code_to_new_code.update({-1: -1})
        new_codes = categorical.codes.map(old_code_to_new_code)
        new_categorical = pd.Categorical.from_codes(new_codes, new_cats)
    return new_categorical

%time merge_categories(s_cat.cat, cat_map)

CPU times: user 3.96 ms, sys: 0 ns, total: 3.96 ms
Wall time: 3.82 ms


['1', '8', '5', '8', NaN, '9', '8', '1', NaN]
Categories (4, object): ['1', '5', '8', '9']

In [45]:
%timeit merge_categories(s_cat.cat, cat_map)

1.6 ms ± 18.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [47]:
4

4

In [50]:
s_cat.cat.categories.map(cat_map).duplicated().any()

True

In [51]:
s_cat.cat.categories.duplicated().any()

False

In [52]:
s_cat.nbytes

57

In [53]:
s_cat.memory_usage(deep=True)

569

In [54]:
getsizeof(s_cat)

585

In [74]:
size = 1_000_000
rng = np.random.default_rng(99988)
a = rng.choice([1, 2, 3, 4, 4], size=size)
b = rng.choice(['1', '1', '2', '3'], size=size)
ab = pd.concat([pd.Series(a), pd.Series(b)], ignore_index=True).astype('category')
ab

0          4
1          3
2          4
3          2
4          4
          ..
1999995    1
1999996    2
1999997    1
1999998    2
1999999    1
Length: 2000000, dtype: category
Categories (7, object): [1, 2, 3, 4, '1', '2', '3']

In [75]:
ab.value_counts()

1    500467
4    399850
3    250008
2    249525
1    200601
2    200092
3    199457
dtype: int64

In [76]:
d = dict(zip(ab.cat.categories, ab.cat.categories.astype(str)))
d

{1: '1', 2: '2', 3: '3', 4: '4', '1': '1', '2': '2', '3': '3'}

In [81]:
%timeit pd.Series(merge_categories(ab.cat, d))
%timeit merge_series_categories(ab, d)

21 ms ± 103 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
139 ms ± 455 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [82]:
ab_str1 = pd.Series(merge_categories(ab.cat, d))
ab_str2 = merge_series_categories(ab, d)
ab_str1.equals(ab_str2)

True

In [83]:
ab_str1

0          4
1          3
2          4
3          2
4          4
          ..
1999995    1
1999996    2
1999997    1
1999998    2
1999999    1
Length: 2000000, dtype: category
Categories (4, object): ['1', '2', '3', '4']

# See what columns are in current data

In [84]:
!ls -halt $usa_par_dir

total 284K
drwxrwsr-x  5 rmudambi IHME-Simulationscience 1.5K Apr  9 18:48 ..
drwxrwsr-x 10 rmudambi IHME-Simulationscience 4.0K Apr  9 15:41 .
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:21 tax_dependents_observer
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:20 tax_1040_observer
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:15 tax_w2_observer
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:09 social_security_observer
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:09 wic_observer
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:09 household_survey_observer_cps
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:09 household_survey_observer_acs
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:09 decennial_census_observer


In [85]:
!ls -halt $usa_par_dir/household_survey_observer_acs

total 602M
drwxrwsr-x 10 rmudambi IHME-Simulationscience 4.0K Apr  9 15:41 ..
-rw-r--r--  1 rmudambi IHME-Simulationscience 2.2M Apr  9 12:36 household_survey_observer_acs_2689.parquet
-rw-r--r--  1 rmudambi IHME-Simulationscience 2.0M Apr  9 12:27 household_survey_observer_acs_6545.parquet
-rw-r--r--  1 rmudambi IHME-Simulationscience 1.8M Apr  9 12:09 household_survey_observer_acs_9888.parquet
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:09 .
-rw-r--r--  1 rmudambi IHME-Simulationscience 1.7M Apr  9 12:08 household_survey_observer_acs_9901.parquet
-rw-r--r--  1 rmudambi IHME-Simulationscience 1.7M Apr  9 12:08 household_survey_observer_acs_9840.parquet
-rw-r--r--  1 rmudambi IHME-Simulationscience 1.8M Apr  9 12:08 household_survey_observer_acs_9911.parquet
-rw-r--r--  1 rmudambi IHME-Simulationscience 1.8M Apr  9 12:08 household_survey_observer_acs_9971.parquet
-rw-r--r--  1 rmudambi IHME-Simulationscience 1.7M Apr  9 12:08 household_survey_observer_acs_99.

In [None]:
pp.generate_american_communities_survey()

In [None]:
# pick a few seeds
seeds = [2689, 6545, 9888]


In [None]:
221149/