#  Simulated Data – Not CUI

In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
# Set pandas' display.max_rows option to 40.
pd.set_option('display.max_rows', 40)

# NOTE(review): the vivarium_research_prl imports below are disabled; none of
# the visible cells reference these names.
# from vivarium_research_prl.noise import corruption, fake_names, noisify
# from vivarium_research_prl.find_kids import datasets, noisify_data
# import vivarium_research_prl.find_kids as find_kids

from splink.duckdb.duckdb_linker import DuckDBLinker
import splink.duckdb.duckdb_comparison_library as cl

# For viewing waterfall charts and precision-recall curve
import altair as alt
# NOTE(review): two renderers are enabled back to back; the second call
# ('html') presumably supersedes the first ('mimetype') -- confirm whether
# both calls are needed.
alt.renderers.enable('mimetype')
alt.renderers.enable('html')

# For viewing the comparison viewer dashboard
from IPython.display import IFrame

# Record run provenance: timestamp, user, host/kernel details, working directory.
!date
!whoami
!uname -a
!pwd

Fri 27 Jan 2023 11:03:31 AM PST
ndbs
Linux int-slurm-sarchive-p0012 5.4.0-88-generic #99-Ubuntu SMP Thu Sep 23 17:29:00 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/linkage/wic_case_study


In [2]:
# Load the autoreload extension in mode 2 so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Steps 0-1

## Read in simulated WIC and decennial census data that has been noised and prepared for Splink

In [3]:
# List the contents of the data directory before loading anything from it.
!ls -l data

total 5496
-rw-rw-r-- 1 ndbs IHME-users 5535621 Jan 27 10:34 prepared_2020_census_20221014.csv
-rw-rw-r-- 1 ndbs IHME-users   66364 Jan 27 10:34 prepared_wic_20221014.csv
-rw-rw-r-- 1 ndbs IHME-users    7613 Jan 20 16:21 saved_model_from_wic_census_20221014.json


In [4]:
# Load the prepared census and WIC extracts from the data directory.
# The first CSV column becomes the index; zipcodes are read as strings so
# they are not parsed as integers.
data_dir = 'data'
dtypes = {'zipcode': str}
df_census = pd.read_csv(f'{data_dir}/prepared_2020_census_20221014.csv', index_col=0, dtype=dtypes)
df_wic = pd.read_csv(f'{data_dir}/prepared_wic_20221014.csv', index_col=0, dtype=dtypes)

# Report (rows, columns) for each table, one per line.
print(*(frame.shape for frame in (df_census, df_wic)), sep='\n')

(47529, 12)
(633, 12)


## View the data we'll be linking

In [5]:
# Column names and dtypes of the census table.
df_census.dtypes

first_name                     object
middle                         object
last_name                      object
date_of_birth                  object
age                           float64
sex                            object
race_ethnicity                 object
relation_to_household_head     object
address                        object
zipcode                        object
unique_id                       int64
household_id                  float64
dtype: object

In [6]:
# Column names and dtypes of the WIC table, for comparison with the census table.
df_wic.dtypes

first_name                     object
middle                         object
last_name                      object
date_of_birth                  object
sex                            object
race_ethnicity                 object
address                        object
zipcode                        object
household_id                    int64
unique_id                       int64
age                           float64
relation_to_household_head    float64
dtype: object

In [7]:
# Display the census table (the view is truncated; display.max_rows is set to 40 in the first cell).
df_census

Unnamed: 0,first_name,middle,last_name,date_of_birth,age,sex,race_ethnicity,relation_to_household_head,address,zipcode,unique_id,household_id
0,Margaret,J,Clark,1951-07-27,68.0,Female,Black,Reference person,"1344 winoka rd brooksville, fl",34601,1,
1,Jeffrey,V,Littlejohn,1967-05-03,52.0,Male,Black,Reference person,"927 23rd st clearwater, fl",34698,2,
2,Briana,A,Jackson,2006-09-07,13.0,Female,Black,Biological child,"927 23rd st clearwater, fl",34698,3,
3,Benjamin,D,Cox,1998-10-21,21.0,Male,Black,Stepchild,"927 23rd st clearwater, fl",34698,4,
4,Willie,,Tucker,1947-10-09,72.0,Male,White,Reference person,"8904 167th place fleming island, fl",32003,5,
...,...,...,...,...,...,...,...,...,...,...,...,...
49994,Marcus,S,Roman,1988-07-08,31.0,Male,Multiracial or Other,Institutionalized GQ pop,"2210 henn hyde rd ne hollywood, fl",33021,47525,
49996,Nathaniel,J,Campbell,1941-01-08,79.0,Male,White,Institutionalized GQ pop,"2210 henn hyde rd ne hollywood, fl",33021,47526,
49997,Christian,C,Rosales,1983-12-16,36.0,Male,Latino,Institutionalized GQ pop,"701 haber rd vero beach, fl",32968,47527,
49998,Phillip,J,Morton,1985-06-11,34.0,Male,White,Institutionalized GQ pop,"114 s frnt st fort myers, fl",33919,47528,


In [8]:
# Display the WIC table (the view is truncated; display.max_rows is set to 40 in the first cell).
df_wic

Unnamed: 0,first_name,middle,last_name,date_of_birth,sex,race_ethnicity,address,zipcode,household_id,unique_id,age,relation_to_household_head
82,Sadie,Katia,Tidwell,2017-10-15,Female,Black,"w 4th st north port, fl",34287,48,1,,
83,Liliana,Addisyn,Marshall,2019-12-03,Female,Black,"w 4th st north port, fl",34287,48,2,,
174,Holly,Emma,Yount,2019-05-17,Female,White,"7944 se 62nd ave unincorporated, fl",32824,88,3,,
306,Emilee,Guadalupe,Haskew,2019-12-30,Female,Latino,"749 mi ridge ests destin, fl",32541,150,4,,
323,Gunner,Liam,Parkinson,2020-03-03,Male,White,"600 n maranantha rd hialeah, fl",33016,157,5,,
...,...,...,...,...,...,...,...,...,...,...,...,...
48269,Kaylee,Trinity,Hill,2017-10-20,Female,Black,"98 melanie dr pembroke pines, fl",33026,20380,629,,
48351,Lev,Thomas,Dove,2018-10-18,Male,Black,"671 john muir road spring hill, fl",34610,20422,630,,
48442,Frederick,Cameron,Rodriguez,2019-06-04,Male,Latino,"5765 heards forest dr crestview, fl",32539,20452,631,,
48456,Liam,Emmett,Sardone,2017-01-08,Male,White,"107 brown ave st. petersburg, fl",33704,20458,632,,


# Step 2 - Blocking/Indexing/Cascading

## Step 2a - Cascade strategy

- For round 1 of the cascade, match only to under-5-year-olds in the census

- For rounds 2+, we'll pull out everyone who linked, and link the remaining kids in WIC to the remaining people in census, in case a child's reported age was inaccurate

In [9]:
# Round 1 of the cascade: keep only census records with age below 5.
# NOTE(review): age is a float column; any row with a missing age compares as
# False and is left out of this subset -- confirm that is intended.
under5 = df_census['age'].lt(5)
df_census_u5 = df_census[under5]
df_census_u5.shape

(2243, 12)

## Step 2b - Set up blocking rules for Splink for round 1 of the cascade

### Create a settings dictionary with minimal required settings, and create a linker from it

We initialize with the minimal required settings to evaluate potential blocking rules with the `count_num_comparisons_from_blocking_rule` and `cumulative_num_comparisons_from_blocking_rules_chart` functions.

In [10]:
# The link type alone is enough for the linker to count comparisons.
initial_settings = dict(link_type='link_only')
# Link WIC against the round-1 (under-5) census subset.
linker = DuckDBLinker([df_wic, df_census_u5], initial_settings)
# NOTE(review): _settings_dict is a private attribute, used here only to
# inspect the settings the linker currently holds.
linker._settings_dict

{'link_type': 'link_only', 'sql_dialect': 'duckdb'}

## Create some blocking rules and count comparisons using the linker created above

Order of rules doesn't matter. All that matters is:

1. Is (almost) every true match captured by one of the rules?
2. Is the total number of comparisons computationally feasible?

In [11]:
# Candidate blocking rules for the prediction step. Each rule is a SQL
# condition on the left (l) and right (r) records.
prediction_blocking_rules = [
    # Same first initial AND same last initial.
    ' and '.join([
        'substr(l.first_name, 1,1) = substr(r.first_name, 1,1)',
        'substr(l.last_name, 1,1) = substr(r.last_name, 1,1)',
    ]),
    'l.date_of_birth = r.date_of_birth',
    'l.zipcode = r.zipcode',
    'l.address = r.address',
    'l.last_name = r.last_name',
    'l.first_name = r.first_name',
    # Let's first see how well we do without these very loose rules:
#     'l.sex = r.sex and substr(l.zipcode, 1,3) = substr(r.zipcode, 1,3)',
#     'substr(l.zipcode, 1,3) = substr(r.zipcode, 1,3)', # Results in > 91_000 comparisons
]

# Upper bound for reference: every WIC record paired with every under-5 census record.
# NOTE(review): 'Cartesion' is a typo for 'Cartesian'; the label is left as-is
# so that it still matches the recorded output.
print('Cartesion product of input:', len(df_census_u5) * len(df_wic), '\n')

# Comparisons produced by each rule on its own.
for rule in prediction_blocking_rules:
    count = linker.count_num_comparisons_from_blocking_rule(rule)
    print(f"Number of comparisons generated by '{rule}': {count:,.0f}")

# Chart of the cumulative comparison count across the rules in list order.
linker.cumulative_num_comparisons_from_blocking_rules_chart(prediction_blocking_rules)

Cartesion product of input: 1419819 

Number of comparisons generated by 'substr(l.first_name, 1,1) = substr(r.first_name, 1,1) and substr(l.last_name, 1,1) = substr(r.last_name, 1,1)': 6,074
Number of comparisons generated by 'l.date_of_birth = r.date_of_birth': 1,302
Number of comparisons generated by 'l.zipcode = r.zipcode': 3,610
Number of comparisons generated by 'l.address = r.address': 821
Number of comparisons generated by 'l.last_name = r.last_name': 1,658
Number of comparisons generated by 'l.first_name = r.first_name': 2,966


## Try to estimate how the blocking strategy will scale

Since we have a very small dataset, try to estimate how many comparisons the above blocking rules would generate in the full census-scale data.

In [12]:
# Back-of-envelope scale-up: roughly 8 billion comparisons for the rules above.
# The factors, in order, are:
#   1. 13_000 comparisons expressed as a share of (census size)**2,
#   2. the WIC-to-census size ratio, assumed to hold at full scale,
#   3. an assumed full census of 330 million records, squared.
# NOTE(review): 13_000 is presumably the cumulative comparison count for the
# blocking rules above -- confirm where the figure comes from.
# NOTE(review): those comparisons were counted between df_wic and
# df_census_u5, whose pair space is len(df_wic) * len(df_census_u5) rather
# than len(df_census)**2 -- confirm this denominator is the intended one.
(
    13_000 / len(df_census)**2
    * (len(df_wic) / len(df_census))
    * 330e6**2
)

8346394559.734445

In [13]:
# Same back-of-envelope scale-up with zip3 blocking included: roughly
# 59 billion comparisons.
# The factors, in order, are:
#   1. 92_000 comparisons expressed as a share of (census size)**2,
#   2. the WIC-to-census size ratio, assumed to hold at full scale,
#   3. an assumed full census of 330 million records, squared.
# NOTE(review): the comparisons were counted between df_wic and df_census_u5,
# whose pair space is len(df_wic) * len(df_census_u5) rather than
# len(df_census)**2 -- confirm this denominator is the intended one.
(
    92_000 / len(df_census)**2
    * (len(df_wic) / len(df_census))
    * 330e6**2
)

59066792268.88992

# Step 2.5 - Model selection and training

We'll be using Splink to run the Fellegi-Sunter algorithm on our datasets, so we need to specify settings for our `Linker` object and estimate the `m` and `u` probabilities.

## Step 2.5a - Define comparisons we want to use to determine links

Here's an example of how to specify column comparisons for Splink

```python
# Example comparisons from Robin Linacre via Abie 
"comparisons": [
        levenshtein("first_name", 2, term_frequency_adjustments=True),
        levenshtein("last_name", 2, term_frequency_adjustments=True),
        levenshtein("dob", 2, term_frequency_adjustments=True),
        exact_match("sex"),
        levenshtein("zip", 2, term_frequency_adjustments=True),
        exact_match("ssn"),
    ],
```


In [14]:
# Column comparisons for the simulated WIC / decennial census linkage:
# Levenshtein-based comparisons for both name columns, date of birth and
# zipcode, and an exact-match comparison for sex.
# date_of_birth passes 1 as an extra argument (presumably its distance
# threshold -- confirm); the other Levenshtein comparisons use the defaults.
comparisons = [
    *(cl.levenshtein_at_thresholds(name_column) for name_column in ("first_name", "last_name")),
    cl.levenshtein_at_thresholds("date_of_birth", 1),
    cl.exact_match("sex"),
    cl.levenshtein_at_thresholds("zipcode"),
]

## Step 2.5b - Training/parameter estimation

Now we need to estimate the `m` and `u` probabilities for our F-S model.

## Step 2.5b(1) - Estimate probability two random records match, for use in estimating `m` probabilities with EM algorithm

As a first approximation, we expect every record in WIC to match exactly one record in the census (in round 1, "census" means the under-5 subset we are linking against), so we can estimate this as

$$
\frac{\text{number of matches}}{\text{number of pairs}} \approx
\frac{\text{# records in WIC}}{(\text{# records in WIC}) \cdot (\text{# records in census})} = \frac{1}{\text{# records in census}}.
$$

In [15]:
# Prior match probability for round 1: assumes each WIC record matches exactly
# one record in the under-5 census subset, i.e. 1 / (number of under-5 records).
probability_two_random_records_match = 1/len(df_census_u5)
probability_two_random_records_match

0.00044583147570218456

### Create a settings dictionary using parameters defined above, and reinitialize the linker with them

Namely, we need the prediction blocking rules, comparisons, and the probability two random records match before we can initialize the settings (an idiosyncrasy of Splink).

In [16]:
# Full model settings, assembled from the comparisons, prior match
# probability and prediction blocking rules defined in earlier cells.
settings = dict(
    link_type="link_only",
    comparisons=comparisons,
    probability_two_random_records_match=probability_two_random_records_match,
    blocking_rules_to_generate_predictions=prediction_blocking_rules,
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
)

# Replace the minimal settings the linker was created with.
linker.initialise_settings(settings)

## Step 2.5b(2) - Use random sampling to estimate `u` probabilities for the comparisons defined above

This requires the comparisons to be defined in the linker's settings.

In [17]:
%%time
# target_rows is set to the size of the full Cartesian product of the WIC
# table and the under-5 census subset.
linker.estimate_u_using_random_sampling(target_rows=len(df_wic)*len(df_census_u5))

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - last_name (no m values are trained).
    - date_of_birth (no m values are trained).
    - sex (no m values are trained).
    - zipcode (no m values are trained).


CPU times: user 2.94 s, sys: 3.33 ms, total: 2.95 s
Wall time: 1.5 s


In [18]:
# Only the u probabilities have been estimated so far; the m probabilities
# shown in this chart still have their default values.
linker.m_u_parameters_chart()

## Step 2.5b(3) - Define training blocking rules for EM algorithm, and estimate `m` probabilities

According to the Splink documentation for the training blocking rules:

> It does not matter if this blocking rule excludes some true matches - it just needs to generate examples of matches and non matches. The expectation maximisation algorithm seems to work best when the pairwise record comparisons are a mix of anywhere between around 0.1% and 99.9% true matches.

Thus the criteria for choosing a list of training blocking rules are:

1. Each rule must produce a small enough block for the EM algorithm to be computationally tractable.

1. Each rule must contain both matches and non-matches, ideally with a mix of between 0.1% and 99.9% true matches.

1. Every column must have a blocking rule in which it does *not* appear, because probabilities for the blocked columns can't be estimated.

In [19]:
# Blocking rules used only for EM training. Two rules are used so that each
# blocked column (first_name, zipcode) is absent from the other rule and can
# have its m probabilities estimated in that session.
training_blocking_rules = [
    "l.first_name = r.first_name",
    "l.zipcode = r.zipcode",
]

# Count the comparisons each training rule generates, to check block size.
for rule in training_blocking_rules:
    count = linker.count_num_comparisons_from_blocking_rule(rule)
    print(f"Number of comparisons generated by '{rule}': {count:,.0f}")

Number of comparisons generated by 'l.first_name = r.first_name': 2,966
Number of comparisons generated by 'l.zipcode = r.zipcode': 3,610


In [20]:
%%time
training_sessions = {}
for rule in training_blocking_rules:
    training_sessions[rule] = linker.estimate_parameters_using_expectation_maximisation(rule)
    


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name

Parameter estimates will be made for the following comparison(s):
    - last_name
    - date_of_birth
    - sex
    - zipcode

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name

Iteration 1: Largest change in params was 0.0479 in probability_two_random_records_match
Iteration 2: Largest change in params was 0.00039 in the m_probability of last_name, level `Exact match`
Iteration 3: Largest change in params was 1.94e-05 in the m_probability of last_name, level `Exact match`

EM converged after 3 iterations

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.zipcode = r.zipcode

Parameter estimates will be made for the followi

CPU times: user 372 ms, sys: 15.2 ms, total: 387 ms
Wall time: 362 ms


## View the estimated `m` and `u` parameters as a chart

In [21]:
# After the EM sessions above, both the m and the u probabilities have been
# estimated for every comparison.
linker.m_u_parameters_chart()

## View match weights chart

In [22]:
# Chart the match weights of the trained model.
linker.match_weights_chart()

## View settings object as dictionary to see estimated model parameters

In [23]:
# Dump the trained settings, including the estimated m and u probabilities.
# NOTE(review): _settings_obj is a private attribute of the linker, used here
# for inspection only.
linker._settings_obj.as_dict()

{'link_type': 'link_only',
 'comparisons': [{'output_column_name': 'first_name',
   'comparison_levels': [{'sql_condition': '"first_name_l" IS NULL OR "first_name_r" IS NULL',
     'label_for_charts': 'Null',
     'is_null_level': True},
    {'sql_condition': '"first_name_l" = "first_name_r"',
     'label_for_charts': 'Exact match',
     'm_probability': 0.9747169496617627,
     'u_probability': 0.0026250185013741807},
    {'sql_condition': 'levenshtein("first_name_l", "first_name_r") <= 1',
     'label_for_charts': 'levenshtein <= 1',
     'm_probability': 0.008944600683756722,
     'u_probability': 0.001121691156198033},
    {'sql_condition': 'levenshtein("first_name_l", "first_name_r") <= 2',
     'label_for_charts': 'levenshtein <= 2',
     'm_probability': 0.0017851557202681523,
     'u_probability': 0.005457542611204142},
    {'sql_condition': 'ELSE',
     'label_for_charts': 'All other comparisons',
     'm_probability': 0.014553293934212464,
     'u_probability': 0.990795747731

# Save trained model

It looks like this saves the above dictionary as a `.json` file.

In [24]:
# Write the trained model settings to JSON in the data directory.
# overwrite=True: a file with this name already exists there (see the earlier
# directory listing), so it is replaced.
linker.save_settings_to_json(
    f"./{data_dir}/saved_model_from_wic_census_20221014.json",
    overwrite=True
)

In [25]:
# List the data directory again to confirm the saved-model file was rewritten.
!ls -l data/

total 5496
-rw-rw-r-- 1 ndbs IHME-users 5535621 Jan 27 10:34 prepared_2020_census_20221014.csv
-rw-rw-r-- 1 ndbs IHME-users   66364 Jan 27 10:34 prepared_wic_20221014.csv
-rw-rw-r-- 1 ndbs IHME-users    7610 Jan 27 11:03 saved_model_from_wic_census_20221014.json


### Simulated Data – Disclosure NOT Prohibited: NOT Title 13