In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 40)

from vivarium_research_prl.noise import corruption, fake_names
from vivarium_research_prl.find_kids import datasets

!date
!whoami
!uname -a
!pwd

Wed 21 Dec 2022 11:25:22 AM PST
ndbs
Linux int-slurm-sarchive-p0001 5.4.0-88-generic #99-Ubuntu SMP Thu Sep 23 17:29:00 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/linkage/wic_case_study


In [2]:
%load_ext autoreload
%autoreload 2

# Load census data and state table for year 2020

In [3]:
# Root directory of the synthetic-population outputs and the specific run
# (special_last_names / florida / 2022_10_14_10_49_32) used in this notebook.
project_output_dir = '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'
# NOTE: output_subdir ends with '/', so output_dir ends with '/' as well.
output_subdir = 'results/special_last_names/florida/2022_10_14_10_49_32/population_table/'
output_dir = f'{project_output_dir}/{output_subdir}'

# List the directory so the available files are visible in the cell output.
!ls -l $output_dir

total 32224
-rw-rw-r-- 1 albrja   IHME-Simulationscience 12622072 Oct 20 23:08 decennial_census.hdf
-rwxrwxrwx 1 beatrixh IHME-Simulationscience 20364830 Nov 14 16:42 state_table.hdf


In [4]:
# Full paths to the two HDF files inside output_dir.
decennial_census_path = f'{output_dir}/decennial_census.hdf'
state_table_path = f'{output_dir}/state_table.hdf'

# Read one key from each file: 'year_2020' from the census file and
# 'ymd_2020_4_1' from the state table.
df_census_orig = pd.read_hdf(decennial_census_path, 'year_2020')
df_state_table = pd.read_hdf(state_table_path, 'ymd_2020_4_1')
# Print the shapes as a quick sanity check on what was loaded.
print(f'{df_census_orig.shape=}')
print(f'{df_state_table.shape=}')

df_census_orig.shape=(47444, 10)
df_state_table.shape=(50000, 27)


# Create a SeedSequence and Generator from the entropy saved in a previous session

In [5]:
# Hard-coded entropy so the SeedSequence (and the generator built from it)
# starts from the same state every time this cell is run.
sq = np.random.SeedSequence(66624024798819663709061712465147975287)
# Echo the entropy so it is recorded in the cell output.
print(sq.entropy)
# Notebook-level generator; later cells pass it to the data-generating and
# noise functions, so its state advances as those cells are (re)run.
rng = np.random.default_rng(sq)
rng

66624024798819663709061712465147975287


Generator(PCG64) at 0x7FCF8EA92D60

# Generate decennial census and WIC data

This step also involves editing the census and WIC data-generating functions so that they omit zipcodes that are not 5 digits.

In [17]:
# Build the simulated census and WIC datasets from the state table.
# Both calls receive the shared notebook-level `rng`, so (presumably) the
# result depends on the generator's state when this cell runs -- re-run the
# seeding cell first to reproduce a given draw. TODO confirm against
# datasets.generate_census_data / generate_wic_data.
# NOTE(review): overall_frac / kid_frac look like inclusion fractions for the
# whole population and for children respectively -- confirm in `datasets`.
df_census = datasets.generate_census_data(
    df_state_table, overall_frac=0.95, kid_frac=0.90, random_state=rng)
df_wic = datasets.generate_wic_data(df_state_table, rng)
# Print the shapes of the two generated tables.
print(f"{df_census.shape=}")
print(f"{df_wic.shape=}")

df_census.shape=(47494, 10)
df_wic.shape=(615, 10)


In [24]:
# Index labels that appear in the WIC data but not in the census data.
# Assumes both tables are indexed by the same identifier -- TODO confirm.
wic_not_census = df_wic.index.difference(df_census.index)
len(wic_not_census)

64

In [25]:
# Index labels of census rows with age < 5 that do not appear in the WIC data.
# Assumes both tables are indexed by the same identifier -- TODO confirm.
census_not_wic = df_census.loc[df_census.age<5].index.difference(df_wic.index)
len(census_not_wic)

1669

# Scratch tests for noise functions (month/day swap, zipcode miswriting)

In [118]:
def swap_month_day(date, date_format="yyyy-mm-dd"):
    """Swap the month and day fields of a date string.

    Parameters
    ----------
    date : str or pandas.Series of str
        The date(s) to modify. A Series is processed element-wise through
        its ``.str`` accessor.
    date_format : str, default "yyyy-mm-dd"
        Layout of ``date`` (case-insensitive), written with the tokens
        ``yyyy``, ``mm`` and ``dd``. Only layouts with a four-digit year at
        offset 0, the month at offset 5 and the day at offset 8 are
        currently supported (e.g. "yyyy-mm-dd" or "yyyy/mm/dd").

    Returns
    -------
    str or pandas.Series
        The date(s) with month and day exchanged, keeping the separators
        found in the input (e.g. '2023-10-31' -> '2023-31-10').

    Raises
    ------
    ValueError
        If ``date_format`` is not one of the supported layouts.
    """
    # The .str accessor accepts the same integer/slice indexing as a plain
    # str, so everything below works unchanged for both input types.
    chars = date.str if isinstance(date, pd.Series) else date
    date_format = date_format.lower()

    # str.find returns -1 when the token is absent. (str.index would raise
    # ValueError instead, which made the two-digit-year fallback unreachable
    # and hid the "unsupported date format" message.)
    year_len = 4
    y_idx = date_format.find("yyyy")
    if y_idx == -1:
        y_idx = date_format.find("yy")  # in case year format is yy not yyyy
        year_len = 2
    m_idx = date_format.find("mm")
    d_idx = date_format.find("dd")

    # Only "yyyy?mm?dd" layouts are handled so far.
    if (year_len, y_idx, m_idx, d_idx) != (4, 0, 5, 8):
        raise ValueError(f"unsupported date format: {date_format}")

    year = chars[y_idx:y_idx + year_len]
    month = chars[m_idx:m_idx + 2]
    day = chars[d_idx:d_idx + 2]
    # Use same separators as in original date
    return year + chars[4] + day + chars[7] + month
swap_month_day('2023-10-31')

'2023-31-10'

In [115]:
# str.find gives the offset of the substring (0 here) and returns -1 when it
# is absent, whereas str.index raises ValueError in that case.
"yyyy-mm-dd".find('yyyy')

0

In [41]:
swap_month_day('2023/10/31')

'2023/31/10'

In [32]:
dob = df_wic.date_of_birth.str
dob

<pandas.core.strings.accessor.StringMethods at 0x7fcf8f51efd0>

In [33]:
dob[:4]

183      2019
203      2019
306      2019
323      2020
401      2018
         ... 
48136    2019
48171    2019
48186    2019
48200    2019
48340    2018
Name: date_of_birth, Length: 615, dtype: object

In [36]:
df_wic.date_of_birth

183      2019-12-12
203      2019-04-09
306      2019-12-30
323      2020-03-03
401      2018-06-23
            ...    
48136    2019-08-21
48171    2019-06-27
48186    2019-09-28
48200    2019-01-18
48340    2018-06-09
Name: date_of_birth, Length: 615, dtype: object

In [119]:
swap_month_day(df_wic.date_of_birth)

183      2019-12-12
203      2019-09-04
306      2019-30-12
323      2020-03-03
401      2018-23-06
            ...    
48136    2019-21-08
48171    2019-27-06
48186    2019-28-09
48200    2019-18-01
48340    2018-09-06
Name: date_of_birth, Length: 615, dtype: object

In [133]:
def miswrite_zipcode(
    zipcode,
    first2_prob=0.001,
    middle_prob=0.005,
    last2_prob=0.02,
    random_state=None
):
    """Randomly overwrite digits in the first five characters of a zipcode.

    Each of the first five characters is independently selected with a
    position-dependent probability and, if selected, replaced by a digit
    drawn uniformly from '0'-'9'. The drawn digit can coincide with the
    original character, so a selected position is not guaranteed to change.
    Any characters after the fifth are returned unchanged.

    Parameters
    ----------
    zipcode : str or pandas.Series of str
        Zipcode(s) to add noise to.
    first2_prob : float, default 0.001
        Selection probability for each of the first two characters.
    middle_prob : float, default 0.005
        Selection probability for the third character.
    last2_prob : float, default 0.02
        Selection probability for each of the fourth and fifth characters.
    random_state : optional
        Passed to ``numpy.random.default_rng``; an existing ``Generator``
        is used as-is, ``None`` creates a freshly seeded one.

    Returns
    -------
    str or pandas.Series
        Same kind as the input. A Series result keeps the input's index and
        name; entries whose input has fewer than five characters (or is
        missing) come out as missing values.

    Raises
    ------
    IndexError
        If ``zipcode`` is a plain string shorter than five characters.
    """
    rng = np.random.default_rng(random_state)
    is_series = isinstance(zipcode, pd.Series)
    # The .str accessor accepts the same integer/slice indexing as a plain
    # str, so the per-position logic below is shared by both input types.
    chars = zipcode.str if is_series else zipcode
    shape = (len(zipcode) if is_series else 1, 5)

    # One row of per-position probabilities, broadcast over all zipcodes.
    threshold = np.array([2*[first2_prob] + [middle_prob] + 2*[last2_prob]])
    # Draw order (uniforms first, then digits) is kept fixed so results for a
    # given seed stay reproducible.
    replace = rng.random(shape) < threshold
    random_digits = rng.choice(list('0123456789'), shape)

    digits = []
    for i in range(5):
        digit = np.where(replace[:, i], random_digits[:, i], chars[i])
        if is_series:
            digit = pd.Series(digit, index=zipcode.index, name=zipcode.name)
        else:
            digit = digit[0]
        digits.append(digit)
    new_zipcode = digits[0] + digits[1] + digits[2] + digits[3] + digits[4]
    # Keep whatever follows the fifth character (e.g. a ZIP+4 suffix) instead
    # of silently truncating it; this is '' for ordinary 5-character zipcodes.
    return new_zipcode + chars[5:]

In [44]:
np.where([True, False, True], [1,2,3], 4)

array([1, 4, 3])

In [45]:
np.where([True, False, True], [1,2,3], [4,5,6])

array([1, 5, 3])

In [47]:
np.where(np.array([True, False, True])[:1], 1, 4)

array([1])

In [53]:
str(np.where(True, 1, 4))

'1'

In [48]:
t = type('abc')
t

str

In [49]:
t(9)

'9'

In [50]:
s = type(df_wic.date_of_birth)
s

pandas.core.series.Series

In [51]:
s('a')

0    a
dtype: object

In [130]:
miswrite_zipcode('12345', .1, 0.5, .8)

'12048'

In [131]:
miswrite_zipcode(df_wic.zipcode, .1, .2, .8)

183      34681
203      33478
306      42536
323      33006
401      24476
         ...  
48136    34797
48171    31201
48186    33771
48200    33610
48340    33862
Name: zipcode, Length: 615, dtype: object

In [85]:
# Scratch check: does a 1-D threshold of shape (5,) broadcast against a
# (1, 5) array of uniform draws without an explicit reshape?
# NOTE: this rebinds the notebook-level names `shape`, `threshold` and
# `replace`, and draws from the shared `rng`.
shape = (1,5)
threshold = np.array(3*[.2] + 2*[.8])
print(threshold, threshold.shape)
replace = rng.random(shape) < threshold#.reshape((1,5))
replace

[0.2 0.2 0.2 0.8 0.8] (5,)


array([[ True, False, False,  True,  True]])

In [93]:
replace[:,3]

True

In [134]:
# Time the vectorised call on the whole Series. No random_state is passed,
# so every timed call also builds a fresh generator.
%timeit miswrite_zipcode(df_wic.zipcode, .1, .2, .8)

1.85 ms ± 8.85 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [107]:
# Baseline for comparison: apply the same noise one zipcode at a time via
# Series.map, passing the shared notebook-level rng to each call.
%timeit df_wic.zipcode.map(lambda z: miswrite_zipcode(z, .1, .2, .8, rng))

21.3 ms ± 963 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [141]:
df_wic.first_name

183         Jose
203        Nolan
306       Emilee
323       Gunner
401      Julissa
          ...   
48136     Xavier
48171    Kailani
48186      Alana
48200     Rylynn
48340     Dakota
Name: first_name, Length: 615, dtype: object

In [143]:
# Indexing the .str accessor past the end of a shorter string gives NaN
# for that row rather than raising.
df_wic.first_name.str[5]

183      NaN
203      NaN
306        e
323        r
401        s
        ... 
48136      r
48171      n
48186    NaN
48200      n
48340      a
Name: first_name, Length: 615, dtype: object