In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
# Limit how many DataFrame rows pandas displays so cell outputs stay short.
pd.set_option('display.max_rows', 40)

from vivarium_research_prl.noise import corruption, fake_names, noisify
from vivarium_research_prl.find_kids import datasets, noisify_data

# Record provenance for this run: timestamp, user, machine, and working directory.
!date
!whoami
!uname -a
!pwd

Fri 23 Dec 2022 03:23:21 PM PST
ndbs
Linux int-slurm-sarchive-p0001 5.4.0-88-generic #99-Ubuntu SMP Thu Sep 23 17:29:00 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/linkage/wic_case_study


In [2]:
# Reload imported modules automatically before each cell executes, so edits to
# the project's source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Goal: Test functions in `noisify_data` module

This module combines all the noise functions and applies them to the fake decennial census and WIC data I generated.

# Load stuff

In [3]:
# Root directory of the project outputs, and the specific results subdirectory
# (relative to that root) that this notebook reads from.
project_output_dir = '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'
output_subdir = 'results/special_last_names/florida/2022_10_14_10_49_32/population_table/'

# Plain '/' join; the trailing slash of `output_subdir` is carried through as-is.
output_dir = '/'.join([project_output_dir, output_subdir])

# List the contents of the output directory.
!ls -l $output_dir


total 32224
-rw-rw-r-- 1 albrja   IHME-Simulationscience 12622072 Oct 20 23:08 decennial_census.hdf
-rwxrwxrwx 1 beatrixh IHME-Simulationscience 20364830 Nov 14 16:42 state_table.hdf


In [4]:
# Locations of the two HDF files inside the output directory.
decennial_census_path = output_dir + '/decennial_census.hdf'
state_table_path = output_dir + '/state_table.hdf'

# Read one table from each file, selected by its HDF key.
df_census_orig = pd.read_hdf(decennial_census_path, key='year_2020')
df_state_table = pd.read_hdf(state_table_path, key='ymd_2020_4_1')

# Report the (rows, columns) of each loaded table.
print(f'{df_census_orig.shape=}')
print(f'{df_state_table.shape=}')

df_census_orig.shape=(47444, 10)
df_state_table.shape=(50000, 27)


In [5]:
# Build the notebook's random Generator from a fixed entropy value so the
# random sampling is reproducible; the entropy is echoed for the record.
sq = np.random.SeedSequence(entropy=66624024798819663709061712465147975287)
print(sq.entropy)
rng = np.random.default_rng(seed=sq)
rng

66624024798819663709061712465147975287


Generator(PCG64) at 0x7F58DC9B8C80

In [6]:
# Generate the census and WIC datasets from the state table. Both calls draw
# from the same `rng`, so their order is kept fixed (census first, then WIC).
df_census = datasets.generate_census_data(
    df_state_table,
    overall_frac=0.95,
    kid_frac=0.90,
    random_state=rng,
)
df_wic = datasets.generate_wic_data(df_state_table, rng)

# Report the (rows, columns) of each generated dataset.
print(f"{df_census.shape=}")
print(f"{df_wic.shape=}")

df_census.shape=(47529, 10)
df_wic.shape=(633, 10)


In [7]:
# Index labels of census rows with age under 5.
census_under_5_index = df_census[df_census['age'] < 5].index

# Labels present in WIC but absent from the census, and labels of census
# under-5 rows that are absent from WIC.
wic_not_census = df_wic.index.difference(df_census.index)
census_not_wic = census_under_5_index.difference(df_wic.index)

print(len(wic_not_census), len(census_not_wic), sep='\n')

64
1672


# Test fake name noise

In [9]:
# Look up a string method on the raw fake-first-name string by its name.
case_method_name = 'title'
f = getattr(fake_names._fake_first_name_string, case_method_name)
f

<function str.title()>

In [10]:
# Call the bound `title` method to view the fake-name string in title case.
f()

'Girl\nMom\nA\nGoh\nMother\nAdult\nGrandchild\nMr\nAdult Male\nGranddaughter\nMrs\nB\nGrandson\nMs\nBaby\nH\nN\nBoy\nHija\nNephew\nBrother\nHijo\nNino\nC\nHouse\nO\nChild\nHusband\nOldest\nChild F\nInmate\nOne\nCoh\nJ\nP\nD\nK\nPerson\nDad\nKid\nR\nDau\nL\nResident\nDaughter\nLady\nRespondent\nDaughter Of\nLady In The\nS\nDoh\nLady Of\nSenor\nE\nLady Of House\nSenora\nF\nLady Of The\nSister\nFather\nLoh\nSoh\nFemale\nM\nSon\nFemale Child\nMale\nSon Of\nFriend\nMale Child\nT\nG\nMan\nV\nGent\nMan In The\nW\nGentelman\nMan Of\nWife\nGentle\nMan Of The\nWoman\nGentleman\nMinor\nYoungest\nGentleman Of\nMiss\nGentlemen\nMoh'

In [14]:
# Request the fake first names with the 'lower' case option.
fake_names.fake_first_names('lower')

['girl',
 'mom',
 'a',
 'goh',
 'mother',
 'adult',
 'grandchild',
 'mr',
 'adult male',
 'granddaughter',
 'mrs',
 'b',
 'grandson',
 'ms',
 'baby',
 'h',
 'n',
 'boy',
 'hija',
 'nephew',
 'brother',
 'hijo',
 'nino',
 'c',
 'house',
 'o',
 'child',
 'husband',
 'oldest',
 'child f',
 'inmate',
 'one',
 'coh',
 'j',
 'p',
 'd',
 'k',
 'person',
 'dad',
 'kid',
 'r',
 'dau',
 'l',
 'resident',
 'daughter',
 'lady',
 'respondent',
 'daughter of',
 'lady in the',
 's',
 'doh',
 'lady of',
 'senor',
 'e',
 'lady of house',
 'senora',
 'f',
 'lady of the',
 'sister',
 'father',
 'loh',
 'soh',
 'female',
 'm',
 'son',
 'female child',
 'male',
 'son of',
 'friend',
 'male child',
 't',
 'g',
 'man',
 'v',
 'gent',
 'man in the',
 'w',
 'gentelman',
 'man of',
 'wife',
 'gentle',
 'man of the',
 'woman',
 'gentleman',
 'minor',
 'youngest',
 'gentleman of',
 'miss',
 'gentlemen',
 'moh']

# Test noise-ification of census data

In [27]:
# A fixed seed makes the noise identical each time this test is re-run.
census_noise_seed = 554433
df_census_noisy = noisify_data.add_noise_to_census(df_census, census_noise_seed)
df_census_noisy

Unnamed: 0,first_name,middle_initial,last_name,date_of_birth,age,sex,race_ethnicity,relation_to_household_head,address,zipcode
0,Margaret,J,Clark,1951-07-27,68.0,Female,Black,Reference person,"1344 winoka rd brooksville, fl",34601
1,Jeffrey,V,Littlejohn,1967-05-03,52.0,Male,Black,Reference person,"927 23rd st clearwater, fl",34698
2,Briana,,Jacmson,2006-09-07,13.0,Female,Black,Biological child,"927 23rd st clearwater, fl",34698
3,Benjamin,D,Cox,1998-10-21,21.0,Male,Black,Stepchild,"927 23rd st clearwater, fl",34698
4,Willie,J,Tucker,1947-10-09,72.0,Male,White,Reference person,"8904 167th place fleming island, fl",32003
...,...,...,...,...,...,...,...,...,...,...
49994,Marcus,S,Roman,1988-07-08,31.0,Male,Multiracial or Other,Institutionalized GQ pop,"2210 henn hyde rd ne hollywood, fl",33021
49996,Nathaniel,,Campbell,1941-01-08,79.0,Male,White,Institutionalized GQ pop,"2210 henn hyde rd ne hollywood, fl",33021
49997,Christian,C,Rosales,1983-12-16,36.0,Male,Latino,Institutionalized GQ pop,"701 haber rd vero beach, fl",32968
49998,Phillip,J,Morton,1985-06-11,34.0,Male,White,Institutionalized GQ pop,"114 s frnt st fort myers, fl",33919


In [28]:
# Show only the rows and columns where the noisy census differs from the
# original ("self" is the original value, "other" is the noisy value).
df_census.compare(df_census_noisy)

Unnamed: 0_level_0,first_name,first_name,middle_initial,middle_initial,last_name,last_name,date_of_birth,date_of_birth,sex,sex,race_ethnicity,race_ethnicity,address,address,zipcode,zipcode
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other
2,,,A,,Jackson,Jacmson,,,,,,,,,,
25,,,,,,,1963-05-21,1963-21-05,,,,,,,,
36,,,G,,,,,,,,,,,,,
37,Henry,Hengy,,,,,,,,,,,,,,
44,,,,,,,,,,,,,"26 cypress crt jacksonville, fl","26 cvpress crt jacksonv'llle, fl",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49973,,,,,,,,,Male,F,,,,,,
49986,Betty,Bettj,,,,,,,,,,,"701 haber rd vero beach, fl","701 haber rd vero peach, fl",,
49988,Bruce,Bahuse,,,,,,,,,,,,,,
49993,,,,,,,,,,,,,"2210 henn hyde rd ne hollywood, fl","2210 henn byde rd ne hollvwood, fl",,


In [29]:
# Fraction of census rows that differ from the original in at least one column.
# Computed from the data instead of hand-copied row counts, so the value stays
# correct if the seed or the input data changes.
# About 16% of records were corrupted when this was last run.
len(df_census.compare(df_census_noisy)) / len(df_census)

0.1587241473626628

## See how many rows for under-5-year-olds are corrupted

About the same percentage. Good, because "corrupted" and "under 5" should be independent events.

In [35]:
# Boolean mask selecting census rows with age under 5; its sum is the number
# of such rows.
under_5 = df_census['age'].lt(5)
under_5.sum()

2241

In [36]:
# Same original-vs-noisy comparison, restricted to the under-5 rows.
df_census.loc[under_5].compare(df_census_noisy.loc[under_5])

Unnamed: 0_level_0,first_name,first_name,middle_initial,middle_initial,last_name,last_name,date_of_birth,date_of_birth,sex,sex,race_ethnicity,race_ethnicity,address,address,zipcode,zipcode
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other
203,,,J,,,,,,,,,,,,,
347,,,,,,,,,Female,F,,,,,,
444,,,,,,,2019-10-14,2019-14-10,,,,,,,,
492,,,H,,,,,,,,,,,,,
531,,,,,Grimm,Grim,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47400,,,,,,,2016-02-14,2016-14-02,,,,,,,,
48121,,,E,,,,,,,,,,,,,
48128,,,,,Allen,Of House,,,,,,,,,,
48235,,,A,,,,,,,,,,,,,


In [37]:
# Fraction of under-5 census rows that differ from the original in at least
# one column. Computed from the data instead of hand-copied row counts, so the
# value stays correct if the seed or the input data changes.
# Still about 16% when this was last run.
n_under_5_corrupted = len(
    df_census.loc[under_5].compare(df_census_noisy.loc[under_5])
)
# int() keeps the result a plain Python float regardless of the NumPy version.
n_under_5_corrupted / int(under_5.sum())

0.1606425702811245

# Test noise-ification of WIC data

In [30]:
# A fixed seed makes the noise identical each time this test is re-run.
wic_noise_seed = 77889900
df_wic_noisy = noisify_data.add_noise_to_wic(df_wic, wic_noise_seed)
df_wic_noisy

Unnamed: 0,first_name,middle_name,last_name,date_of_birth,sex,race_ethnicity,address,zipcode,household_id,wic_id
82,Sadie,Katia,Tidwell,2017-10-15,Female,Black,"w 4th st north port, fl",34287,48,1
83,Liliana,Addisyn,Marshall,2019-12-03,Female,Black,"w 4th st north port, fl",34287,48,2
174,Holly,Emma,Yount,2019-05-17,Female,White,"7944 se 62nd ave unincorporated, fl",32824,88,3
306,Emilee,,Haskew,2019-12-30,Female,Latino,"749 mi ridge ests destin, fl",02541,150,4
323,Gunner,,Parkinson,2020-03-03,Male,White,"600 n maranantha rd hialeah, fl",33016,157,5
...,...,...,...,...,...,...,...,...,...,...
48269,Kaylee,Trinity,Hill,2017-10-20,Female,Black,"98 melanie dr pembroke pines, fl",33026,20380,629
48351,Lev,,Dove,2018-10-18,Male,Black,"671 john muir road spring hill, fl",34610,20422,630
48442,Frederick,,Rodriguez,2019-06-04,Male,Latino,"5765 heards forest dr crestview, fl",32539,20452,631
48456,Liam,,Sardone,2017-01-08,Male,White,"107 brown ave st. petersburg, fl",33704,20458,632


In [31]:
# Show only the rows and columns where the noisy WIC data differs from the
# original ("self" is the original value, "other" is the noisy value).
df_wic.compare(df_wic_noisy)

Unnamed: 0_level_0,first_name,first_name,middle_name,middle_name,last_name,last_name,date_of_birth,date_of_birth,sex,sex,race_ethnicity,race_ethnicity,address,address,zipcode,zipcode
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other
306,,,Guadalupe,,,,,,,,,,,,32541,02541
323,,,Liam,,,,,,,,,,,,,
347,,,Mayra,,,,,,,,,,,,,
870,,,Isabella,,,,,,,,,,,,,
986,,,Sara,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48136,,,Matias,,,,,,,,,,,,,
48171,,,Nyla,,,,,,,,,,,,,
48351,,,Thomas,,,,,,,,,,,,,
48442,,,Cameron,,,,,,,,,,,,,


In [32]:
# Fraction of WIC rows that differ from the original in at least one column.
# Computed from the data instead of hand-copied row counts, so the value stays
# correct if the seed or the input data changes.
# About 62% of rows were corrupted when this was last run -- most are because
# of missing middle name.
len(df_wic.compare(df_wic_noisy)) / len(df_wic)

0.6192733017377567

## Check again with middle names dropped

In [33]:
# Repeat the original-vs-noisy comparison with the middle-name column removed
# from both frames, so the remaining differences are easier to see.
wic_without_middle = df_wic.drop(columns='middle_name')
wic_noisy_without_middle = df_wic_noisy.drop(columns='middle_name')
wic_without_middle.compare(wic_noisy_without_middle)

Unnamed: 0_level_0,first_name,first_name,last_name,last_name,date_of_birth,date_of_birth,sex,sex,race_ethnicity,race_ethnicity,address,address,zipcode,zipcode
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,self,other,self,other
306,,,,,,,,,,,,,32541,02541
1825,,,,,,,,,,,,,33971,33981
3904,,,Abbasi Rehman,Apasi Rhman,,,,,,,,,,
5672,,,,,2017-05-07,2017-07-05,,,,,,,,
6047,,,,,,,,,White,NHOPI,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43752,,,,,,,,,,,,,34743,34745
43970,,,Madrigal Mendoza,Madngal Mendoza,,,,,,,,,,
44885,,,,,,,,,,,,,32952,35953
44920,Noah,Nosh,,,,,,,,,,,,


In [34]:
# Fraction of WIC rows that differ from the original in some column other than
# middle name. Computed from the data instead of hand-copied row counts, so the
# value stays correct if the seed or the input data changes.
# Only 9% of rows were corrupted when this was last run if we ignore missing
# middle names.
len(
    df_wic.drop(columns='middle_name')
    .compare(df_wic_noisy.drop(columns='middle_name'))
) / len(df_wic)

0.08688783570300158