In [1]:
from pathlib import Path

import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 40)

from vivarium_research_prl.noise import corruption, fake_names, noisify
from vivarium_research_prl.find_kids import datasets, noisify_data

# Record provenance for this run: timestamp, user, host/kernel info, and working directory.
!date
!whoami
!uname -a
!pwd

Wed 04 Jan 2023 02:59:36 PM PST
ndbs
Linux int-slurm-sarchive-p0001 5.4.0-88-generic #99-Ubuntu SMP Thu Sep 23 17:29:00 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/linkage/wic_case_study


In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
# Build the directory path holding the result files that are listed and loaded below.
project_output_dir = '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'
output_subdir = 'results/results'
output_dir = '/'.join((project_output_dir, output_subdir))

# List the contents of the output directory (IPython expands $output_dir from the Python variable).
!ls -l $output_dir

total 19624
-rw-r--r-- 1 abie IHME-Simulationscience 15912336 Dec 18 12:52 decennial_census.hdf
-rw-r--r-- 1 abie IHME-Simulationscience  4170840 Dec 18 12:52 wic.hdf


In [10]:
# Paths to the two HDF result files inside the output directory.
decennial_census_path = f'{output_dir}/decennial_census.hdf'
wic_path = f'{output_dir}/wic.hdf'

# Map each file's base name (extension removed) to the list of keys stored in it,
# printing the store summary for each file along the way.
hdf_keys = {}
for hdf_path in [decennial_census_path, wic_path]:
    # Use pathlib for the base name and stem rather than slicing off a fixed
    # number of characters, so the dict key stays correct for any extension length.
    filename = Path(hdf_path).name
    with pd.HDFStore(hdf_path, 'r') as hdf:  # read-only; closed on exit
        print(filename, hdf.info(), sep='\n', end='\n\n')
        hdf_keys[Path(hdf_path).stem] = hdf.keys()

print('Keys:', hdf_keys, sep='\n')

decennial_census.hdf
<class 'pandas.io.pytables.HDFStore'>
File path: /mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/results/decennial_census.hdf
/responses            frame        (shape->[184607,15])

wic.hdf
<class 'pandas.io.pytables.HDFStore'>
File path: /mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/results/wic.hdf
/responses            frame        (shape->[25927,19])

Keys:
{'decennial_census': ['/responses'], 'wic': ['/responses']}


In [7]:
# Load the census and WIC tables from their HDF files (no key is passed to
# read_hdf), then report each frame's shape.
df_census_orig, df_wic_orig = (
    pd.read_hdf(path) for path in (decennial_census_path, wic_path)
)
print(f'{df_census_orig.shape=}', f'{df_wic_orig.shape=}', sep='\n')

df_census_orig.shape=(184607, 15)
df_wic_orig.shape=(25927, 19)


# View loaded census and WIC data

The data contain only an `address_id`, not a street address or ZIP code, so address cannot realistically be used for linking. It might be worth trying to link on name and date of birth only.

In [8]:
# Display the loaded census frame (row display is limited by the display.max_rows option set in the first cell).
df_census_orig

Unnamed: 0,first_name,middle_initial,last_name,age,date_of_birth,address_id,relation_to_household_head,sex,race_ethnicity,census_year,guardian_1,guardian_1_address_id,guardian_2,guardian_2_address_id,housing_type
0,Alice,G,Clark,84.611928,1935-09-18,0.0,Reference person,Female,Black,2020,-1,,-1,,Standard
1,Paul,A,Littlejohn,61.021640,1959-04-22,1.0,Reference person,Male,Black,2020,-1,,-1,,Standard
2,Tiffany,F,Jackson,21.674647,1998-08-26,1.0,Biological child,Female,Black,2020,-1,,-1,,Standard
3,Thomas,K,Cox,29.552490,1990-10-10,1.0,Stepchild,Male,Black,2020,-1,,-1,,Standard
4,Mark,B,Tucker,67.254183,1953-01-26,2.0,Reference person,Male,White,2020,-1,,-1,,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108654,Wyatt,A,Gutierrez,0.050304,2030-04-27,91904.0,Other nonrelative,Male,Latino,2030,94402,91904.0,-1,,Standard
108655,Addison,K,Grandmaison,0.067495,2030-04-21,30474.0,Other nonrelative,Female,White,2030,97094,30474.0,-1,,Standard
108656,Wyatt,A,Martinez,0.070185,2030-04-20,77042.0,Other nonrelative,Male,Latino,2030,97221,77042.0,-1,,Standard
108657,Wyatt,A,Ingram,0.050247,2030-04-27,10440.0,Biological child,Male,Black,2030,24646,10440.0,-1,,Standard


In [9]:
# Display the loaded WIC frame (row display is limited by the display.max_rows option set in the first cell).
df_wic_orig

Unnamed: 0,address_id,first_name,middle_initial,last_name,age,date_of_birth,sex,race_ethnicity,wic_year,guardian_1,guardian_1_address_id,guardian_2,guardian_2_address_id,income,census_year,nominal_age,hh_size,hh_income,wic_eligible
413,192.0,Kristen,A,Lujan,32.852589,1988-03-01,Female,Latino,2021,-1,,-1,,0.0,2021.0,32.0,2.0,0.0,True
449,210.0,Katelyn,Y,Morales,19.120803,2001-11-23,Female,Latino,2021,-1,,-1,,29000.0,2021.0,19.0,7.0,58000.0,True
579,265.0,Jade,E,Kleczkowski,13.844425,2007-03-04,Female,White,2021,-1,,-1,,0.0,2021.0,13.0,4.0,29000.0,True
664,302.0,Vanessa,J,Meade,39.310184,1981-09-15,Female,White,2021,-1,,-1,,0.0,2021.0,39.0,5.0,29000.0,True
811,368.0,Andrea,J,Wilks,27.011747,1994-01-02,Female,Black,2021,-1,,-1,,29000.0,2021.0,27.0,7.0,29000.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108426,15642.0,Gabriel,H,Romero,0.059252,2030-01-02,Male,Latino,2030,81878,15642.0,-1,,0.0,2030.0,0.0,4.0,29000.0,True
108427,34612.0,Gabriel,H,Delacruz,0.016810,2030-01-17,Male,Latino,2030,82145,34612.0,-1,,0.0,2030.0,0.0,5.0,29000.0,True
108428,35351.0,Gabriel,H,Chandler,0.007624,2030-01-21,Male,White,2030,83966,35351.0,-1,,0.0,2030.0,0.0,4.0,29000.0,True
108430,37595.0,Ella,A,Begue,0.031204,2030-01-12,Female,White,2030,89310,37595.0,-1,,0.0,2030.0,0.0,3.0,29000.0,True
