#  Simulated Data – Not CUI

In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 40)

# from vivarium_research_prl.noise import corruption, fake_names, noisify
from vivarium_research_prl.find_kids import datasets, noisify_data
# import vivarium_research_prl.find_kids as find_kids

from splink.duckdb.duckdb_linker import DuckDBLinker
# import splink.duckdb.duckdb_comparison_library as cl

# # For viewing waterfall charts and precision-recall curve
# import altair as alt
# alt.renderers.enable('mimetype')
# alt.renderers.enable('html')

# # For viewing the comparison viewer dashboard
# from IPython.display import IFrame

!date
!whoami
!uname -a
!pwd

Fri 27 Jan 2023 10:34:07 AM PST
ndbs
Linux int-slurm-sarchive-p0012 5.4.0-88-generic #99-Ubuntu SMP Thu Sep 23 17:29:00 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/linkage/wic_case_study


In [2]:
%load_ext autoreload
%autoreload 2

# Goal: Link simulated WIC data to simulated decennial census data to identify kids under 5 that were missed in the census but enrolled in WIC

### Note

We will do the linkage using [Splink](https://github.com/moj-analytical-services/splink), but any record linkage software could be used in its place.

# Step 0 – Generate simulated data (Pseudopseudopeople)

This section includes the steps corresponding to what would eventually be handled by our **Pseudopeople package** for simulated data, which is still a work in progress.

## Load state table of 50,000 simulants from Florida

In [3]:
project_output_dir = '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'
output_subdir = 'results/special_last_names/florida/2022_10_14_10_49_32/population_table/'
output_dir = f'{project_output_dir}/{output_subdir}'

!ls -l $output_dir


total 32224
-rw-rw-r-- 1 albrja   IHME-Simulationscience 12622072 Oct 20 23:08 decennial_census.hdf
-rwxrwxrwx 1 beatrixh IHME-Simulationscience 20364830 Nov 14 16:42 state_table.hdf


In [4]:
state_table_path = f'{output_dir}/state_table.hdf'
df_state_table = pd.read_hdf(state_table_path, 'ymd_2020_4_1')
print(f'{df_state_table.shape=}')

df_state_table.shape=(50000, 27)


## Set a random seed and create a random number generator

In [5]:
sq = np.random.SeedSequence(66624024798819663709061712465147975287)
print(sq.entropy)
rng = np.random.default_rng(sq)
rng

66624024798819663709061712465147975287


Generator(PCG64) at 0x7FEE76867740

## Generate census and WIC data from state table

- The census data will have some people missing at random

- The WIC data include kids under 5 (but not women) sampled from the state table according to age-specific coverage rates of WIC

In [6]:
df_census_raw = datasets.generate_census_data(
    df_state_table, overall_frac=0.95, kid_frac=0.90, random_state=rng)
df_wic_raw = datasets.generate_wic_data(df_state_table, rng)
print(f"{df_census_raw.shape=}")
print(f"{df_wic_raw.shape=}")

df_census_raw.shape=(47529, 10)
df_wic_raw.shape=(633, 10)


## Add noise to data

In [7]:
# Allow setting random seed for reproducibility during testing
random_state = rng
df_census_noisy = noisify_data.add_noise_to_census(df_census_raw, random_state)
df_wic_noisy = noisify_data.add_noise_to_wic(df_wic_raw, random_state)

In [8]:
df_census_noisy

Unnamed: 0,first_name,middle_initial,last_name,date_of_birth,age,sex,race_ethnicity,relation_to_household_head,address,zipcode
0,Margaret,J,Clark,1951-07-27,68.0,Female,Black,Reference person,"1344 winoka rd brooksville, fl",34601
1,Jeffrey,V,Littlejohn,1967-05-03,52.0,Male,Black,Reference person,"927 23rd st clearwater, fl",34698
2,Briana,A,Jackson,2006-09-07,13.0,Female,Black,Biological child,"927 23rd st clearwater, fl",34698
3,Benjamin,D,Cox,1998-10-21,21.0,Male,Black,Stepchild,"927 23rd st clearwater, fl",34698
4,Willie,,Tucker,1947-10-09,72.0,Male,White,Reference person,"8904 167th place fleming island, fl",32003
...,...,...,...,...,...,...,...,...,...,...
49994,Marcus,S,Roman,1988-07-08,31.0,Male,Multiracial or Other,Institutionalized GQ pop,"2210 henn hyde rd ne hollywood, fl",33021
49996,Nathaniel,J,Campbell,1941-01-08,79.0,Male,White,Institutionalized GQ pop,"2210 henn hyde rd ne hollywood, fl",33021
49997,Christian,C,Rosales,1983-12-16,36.0,Male,Latino,Institutionalized GQ pop,"701 haber rd vero beach, fl",32968
49998,Phillip,J,Morton,1985-06-11,34.0,Male,White,Institutionalized GQ pop,"114 s frnt st fort myers, fl",33919


In [9]:
df_wic_noisy

Unnamed: 0,first_name,middle_name,last_name,date_of_birth,sex,race_ethnicity,address,zipcode,household_id,wic_id
82,Sadie,Katia,Tidwell,2017-10-15,Female,Black,"w 4th st north port, fl",34287,48,1
83,Liliana,Addisyn,Marshall,2019-12-03,Female,Black,"w 4th st north port, fl",34287,48,2
174,Holly,Emma,Yount,2019-05-17,M,White,"7944 se 62nd ave unincorporated, fl",32824,88,3
306,Emilee,Guadalupe,Haskew,2019-12-30,Female,Latino,"749 mi ridge ests destin, fl",32541,150,4
323,Gunner,Liam,Parkinson,2020-03-03,Male,White,"600 n maranantha rd hialeah, fl",33016,157,5
...,...,...,...,...,...,...,...,...,...,...
48269,Kaylee,Trinity,Hill,2017-10-20,Female,Black,"98 melanie dr pembroke pines, fl",33026,20380,629
48351,Lev,,Dove,2018-10-18,Male,Black,"671 john muir road spring hill, fl",34610,20422,630
48442,Frederick,Cameron,Rodriguez,2019-06-04,Male,Latino,"5765 heards forest dr crestview, fl",32539,20452,631
48456,Liam,Emmett,Sardone,2017-01-08,Male,White,"107 brown ave st. petersburg, fl",33704,20458,632


## Save data

For the real Pseudopeople package, the output would presumably be a downloaded file. But here I'm just going to preprocess the noised data in memory, and then save.

In [10]:
# Don't actually need to save anything

# Step 1 - Preprocessing (prepare data for linking)

**Note:** Eventually, this step will need to include all the data cleaning, data alignment, and conforming the data to required software specifications.

For the present datasets, the only steps necessary are to conform the columns to Splink's requirements:

- Ensure WIC and Census data have identical column names

  - If a column appeared in one dataset but not the other, I added an empty column with the same name to the other dataset

- Add a `unique_id` column to both datasets

## See how many kids show up in WIC but not census and vice versa

Use our labeled data to check how many missing kids I'll actually be looking for.

In [11]:
wic_not_census = df_wic_raw.index.difference(df_census_raw.index)
census_not_wic = df_census_raw.loc[df_census_raw.age<5].index.difference(df_wic_raw.index)
print(len(wic_not_census))
print(len(census_not_wic))

64
1672


## View columns in the two datasets because I need to conform them

In [12]:
df_census_noisy.dtypes

first_name                     object
middle_initial                 object
last_name                      object
date_of_birth                  object
age                           float64
sex                            object
race_ethnicity                 object
relation_to_household_head     object
address                        object
zipcode                        object
dtype: object

In [13]:
df_wic_noisy.dtypes

first_name        object
middle_name       object
last_name         object
date_of_birth     object
sex               object
race_ethnicity    object
address           object
zipcode           object
household_id       int64
wic_id             int64
dtype: object

In [14]:
df_census_noisy.relation_to_household_head.unique()

array(['Reference person', 'Biological child', 'Stepchild',
       'Opp-sex spouse', 'Grandchild', 'Other nonrelative',
       'Parent-in-law', 'Roommate', 'Child-in-law', 'Opp-sex partner',
       'Sibling', 'Parent', 'Adopted child', 'Same-sex partner',
       'Other relative', 'Same-sex spouse', 'Foster child',
       'Noninstitutionalized GQ pop', 'Institutionalized GQ pop'],
      dtype=object)

## Define functions to process the above "original" datasets for linking via Splink (or whatever other linking software we'll be using)

[Necessary conditions for Splink data](https://moj-analytical-services.github.io/splink/demos/01_Prerequisites.html):

* Dataframes must have the same column names
* Each data set needs a column with a unique identifier within that dataset (Splink assumes the column name is "unique_id" by default, but this can be changed if desired)
* Null values should be represented by actual `NaN`s, not, e.g., empty strings or something
* Data should be "cleaned," whatever that means for this particular data

To avoid losing data in either dataset, I am:

* Adding columns missing from one data set to the the other dataset, just filled with NaNs (unless there is something more intelligent I could do, like filling in age from birthdate). For example, I add an empty `household_id` column to the census data even though this won't be used directly for linking (though I may use it in a post-processing step, e.g. to identify a kid who was missing from census but whose sibling matched).
* Renaming `middle_name` and `middle_initial` to just `middle`, rather than only keeping the middle initial in WIC, so as to not lose information by dropping the rest of the name in the WIC data.

### Note:

In practice, if you *know* a column won't be used in linking (e.g. the `household_id` column, it would probably be more efficient to drop it for the linking step, then join the linked data back with the original data for the post-processing steps. (I kind of like the RecordLinkage strategy where you just keep the original datasets, and you can add columns if you want, and then during the blocking/indexing step, it just creates a MultiIndex containing the pairs you want to compare, and you can specify arbitrary comparison functions on whatever columns are available...)

### Questions:

* What about something like `relation_to_household_head`, which is not present in WIC, but could be useful for blocking/filtering since I'm only looking for kids?

In [15]:
def prepare_census_data(df_census_orig):
    df_census = (
        df_census_orig
        .assign(
            unique_id=range(1,1+len(df_census_orig)),
            household_id=np.nan # Need to either drop this from WIC or add it to census
        )
        # Rename middle name/initial columns to have the same column name
        .rename(columns={'middle_initial': 'middle'})
    )
    return df_census

def prepare_wic_data(df_wic_orig):
    df_wic = (
        df_wic_raw
        # Need to either drop these from census or add to WIC
        .assign(
            # I could try actually computing the age based on DOB and census date instead...
            age=np.nan,
            relation_to_household_head=np.nan,
        )
        # To avoid losing data, just rename column instead of keeping only middle initial 
        .rename(columns={'middle_name': 'middle', 'wic_id': 'unique_id'})
        # Seems like I should not avoid 
    )
    return df_wic

In [16]:
df_census = prepare_census_data(df_census_noisy)
df_census

Unnamed: 0,first_name,middle,last_name,date_of_birth,age,sex,race_ethnicity,relation_to_household_head,address,zipcode,unique_id,household_id
0,Margaret,J,Clark,1951-07-27,68.0,Female,Black,Reference person,"1344 winoka rd brooksville, fl",34601,1,
1,Jeffrey,V,Littlejohn,1967-05-03,52.0,Male,Black,Reference person,"927 23rd st clearwater, fl",34698,2,
2,Briana,A,Jackson,2006-09-07,13.0,Female,Black,Biological child,"927 23rd st clearwater, fl",34698,3,
3,Benjamin,D,Cox,1998-10-21,21.0,Male,Black,Stepchild,"927 23rd st clearwater, fl",34698,4,
4,Willie,,Tucker,1947-10-09,72.0,Male,White,Reference person,"8904 167th place fleming island, fl",32003,5,
...,...,...,...,...,...,...,...,...,...,...,...,...
49994,Marcus,S,Roman,1988-07-08,31.0,Male,Multiracial or Other,Institutionalized GQ pop,"2210 henn hyde rd ne hollywood, fl",33021,47525,
49996,Nathaniel,J,Campbell,1941-01-08,79.0,Male,White,Institutionalized GQ pop,"2210 henn hyde rd ne hollywood, fl",33021,47526,
49997,Christian,C,Rosales,1983-12-16,36.0,Male,Latino,Institutionalized GQ pop,"701 haber rd vero beach, fl",32968,47527,
49998,Phillip,J,Morton,1985-06-11,34.0,Male,White,Institutionalized GQ pop,"114 s frnt st fort myers, fl",33919,47528,


In [17]:
df_wic = prepare_wic_data(df_wic_noisy)
df_wic

Unnamed: 0,first_name,middle,last_name,date_of_birth,sex,race_ethnicity,address,zipcode,household_id,unique_id,age,relation_to_household_head
82,Sadie,Katia,Tidwell,2017-10-15,Female,Black,"w 4th st north port, fl",34287,48,1,,
83,Liliana,Addisyn,Marshall,2019-12-03,Female,Black,"w 4th st north port, fl",34287,48,2,,
174,Holly,Emma,Yount,2019-05-17,Female,White,"7944 se 62nd ave unincorporated, fl",32824,88,3,,
306,Emilee,Guadalupe,Haskew,2019-12-30,Female,Latino,"749 mi ridge ests destin, fl",32541,150,4,,
323,Gunner,Liam,Parkinson,2020-03-03,Male,White,"600 n maranantha rd hialeah, fl",33016,157,5,,
...,...,...,...,...,...,...,...,...,...,...,...,...
48269,Kaylee,Trinity,Hill,2017-10-20,Female,Black,"98 melanie dr pembroke pines, fl",33026,20380,629,,
48351,Lev,Thomas,Dove,2018-10-18,Male,Black,"671 john muir road spring hill, fl",34610,20422,630,,
48442,Frederick,Cameron,Rodriguez,2019-06-04,Male,Latino,"5765 heards forest dr crestview, fl",32539,20452,631,,
48456,Liam,Emmett,Sardone,2017-01-08,Male,White,"107 brown ave st. petersburg, fl",33704,20458,632,,


## Test whether data is properly formatted for Splink

**Note:** `profile_columns` complains if the sets of column names in the two datasets are different.

In [18]:
# Minimal settings needed to call .profile_columns()
initial_settings = {'link_type': 'link_only'}
linker = DuckDBLinker([df_census, df_wic], initial_settings)

In [19]:
linker.profile_columns(['first_name', 'middle', 'last_name'])

In [20]:
linker.profile_columns(['address', 'zipcode'])

In [21]:
linker.profile_columns(['relation_to_household_head', 'race_ethnicity', 'age'])

## Save prepared data

In [22]:
data_dir = 'data'
df_census.to_csv(f'{data_dir}/prepared_2020_census_20221014.csv')
df_wic.to_csv(f'{data_dir}/prepared_wic_20221014.csv')

In [23]:
!ls -l $data_dir

total 5496
-rw-rw-r-- 1 ndbs IHME-users 5535621 Jan 27 10:34 prepared_2020_census_20221014.csv
-rw-rw-r-- 1 ndbs IHME-users   66364 Jan 27 10:34 prepared_wic_20221014.csv
-rw-rw-r-- 1 ndbs IHME-users    7613 Jan 20 16:21 saved_model_from_wic_census_20221014.json


### Simulated Data – Disclosure NOT Prohibited: NOT Title 13