In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 40)

# from vivarium_research_prl.noise import corruption, fake_names, noisify
# from vivarium_research_prl.find_kids import datasets, noisify_data
# import vivarium_research_prl.find_kids as find_kids

from splink.duckdb.duckdb_linker import DuckDBLinker
import splink.duckdb.duckdb_comparison_library as cl

# For viewing waterfall charts and precision-recall curve
import altair as alt
alt.renderers.enable('mimetype')
alt.renderers.enable('html')

# For viewing the comparison viewer dashboard
from IPython.display import IFrame

!date
!whoami
!uname -a
!pwd

Fri 20 Jan 2023 02:13:50 PM PST
ndbs
Linux int-slurm-sarchive-p0012 5.4.0-88-generic #99-Ubuntu SMP Thu Sep 23 17:29:00 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/linkage/wic_case_study


In [2]:
%load_ext autoreload
%autoreload 2

# Read in saved data

In [3]:
!ls -l data

total 5484
-rw-rw-r-- 1 ndbs IHME-users 5535621 Jan 11 15:47 prepared_2020_census_20221014.csv
-rw-rw-r-- 1 ndbs IHME-users   66364 Jan 11 15:47 prepared_wic_20221014.csv


In [4]:
data_dir = 'data'
dtypes = {'zipcode': str} # Make sure zipcodes are str not int
df_census = pd.read_csv(
    f'{data_dir}/prepared_2020_census_20221014.csv',
    dtype=dtypes,
    index_col=0,
)
df_wic = pd.read_csv(
    f'{data_dir}/prepared_wic_20221014.csv',
    dtype=dtypes,
    index_col=0,
)
print(df_census.shape, df_wic.shape, sep='\n')

(47529, 12)
(633, 12)


In [5]:
df_census.dtypes

first_name                     object
middle                         object
last_name                      object
date_of_birth                  object
age                           float64
sex                            object
race_ethnicity                 object
relation_to_household_head     object
address                        object
zipcode                        object
unique_id                       int64
household_id                  float64
dtype: object

In [6]:
df_wic.dtypes

first_name                     object
middle                         object
last_name                      object
date_of_birth                  object
sex                            object
race_ethnicity                 object
address                        object
zipcode                        object
household_id                    int64
unique_id                       int64
age                           float64
relation_to_household_head    float64
dtype: object

In [7]:
df_census

Unnamed: 0,first_name,middle,last_name,date_of_birth,age,sex,race_ethnicity,relation_to_household_head,address,zipcode,unique_id,household_id
0,Margaret,J,Clark,1951-07-27,68.0,Female,Black,Reference person,"1344 winoka rd brooksville, fl",34601,1,
1,Jeffrey,V,Littlejohn,1967-05-03,52.0,Male,Black,Reference person,"927 23rd st clearwater, fl",34698,2,
2,Briana,A,Jackson,2006-09-07,13.0,Female,Black,Biological child,"927 23rd st clearwater, fl",34698,3,
3,Benjamin,D,Cox,1998-10-21,21.0,Male,Black,Stepchild,"927 23rd st clearwater, fl",34698,4,
4,Willie,,Tucker,1947-10-09,72.0,Male,White,Reference person,"8904 167th place fleming island, fl",32003,5,
...,...,...,...,...,...,...,...,...,...,...,...,...
49994,Marcus,S,Roman,1988-07-08,31.0,Male,Multiracial or Other,Institutionalized GQ pop,"2210 henn hyde rd ne hollywood, fl",33021,47525,
49996,Nathaniel,J,Campbell,1941-01-08,79.0,Male,White,Institutionalized GQ pop,"2210 henn hyde rd ne hollywood, fl",33021,47526,
49997,Christian,C,Rosales,1983-12-16,36.0,Male,Latino,Institutionalized GQ pop,"701 haber rd vero beach, fl",32968,47527,
49998,Phillip,J,Morton,1985-06-11,34.0,Male,White,Institutionalized GQ pop,"114 s frnt st fort myers, fl",33919,47528,


In [8]:
df_wic

Unnamed: 0,first_name,middle,last_name,date_of_birth,sex,race_ethnicity,address,zipcode,household_id,unique_id,age,relation_to_household_head
82,Sadie,Katia,Tidwell,2017-10-15,Female,Black,"w 4th st north port, fl",34287,48,1,,
83,Liliana,Addisyn,Marshall,2019-12-03,Female,Black,"w 4th st north port, fl",34287,48,2,,
174,Holly,Emma,Yount,2019-05-17,Female,White,"7944 se 62nd ave unincorporated, fl",32824,88,3,,
306,Emilee,Guadalupe,Haskew,2019-12-30,Female,Latino,"749 mi ridge ests destin, fl",32541,150,4,,
323,Gunner,Liam,Parkinson,2020-03-03,Male,White,"600 n maranantha rd hialeah, fl",33016,157,5,,
...,...,...,...,...,...,...,...,...,...,...,...,...
48269,Kaylee,Trinity,Hill,2017-10-20,Female,Black,"98 melanie dr pembroke pines, fl",33026,20380,629,,
48351,Lev,Thomas,Dove,2018-10-18,Male,Black,"671 john muir road spring hill, fl",34610,20422,630,,
48442,Frederick,Cameron,Rodriguez,2019-06-04,Male,Latino,"5765 heards forest dr crestview, fl",32539,20452,631,,
48456,Liam,Emmett,Sardone,2017-01-08,Male,White,"107 brown ave st. petersburg, fl",33704,20458,632,,


# For round 1, match only to under-5-year olds in census

In [9]:
under5 = df_census.age<5
df_census_u5 = df_census.loc[under5]
df_census_u5.shape

(2243, 12)

# Create a settings dictionary with minimal required settings, and create a linker from it

We initialize with the minimal required settings to evaluate potential blocking rules with the `count_num_comparisons_from_blocking_rule` and `cumulative_num_comparisons_from_blocking_rules_chart` functions.

In [10]:
# Minimal settings needed to count comparisons
settings = {'link_type': 'link_only'}
linker = DuckDBLinker([df_census_u5, df_wic], settings)
linker._settings_dict

{'link_type': 'link_only', 'sql_dialect': 'duckdb'}

# Create some blocking rules and count comparisons using the linker created above

Order of rules doesn't matter. All that matters is:

1. Is (almost) every true match captured by one of the rules?
2. Is the total number of comparisons computationally feasible?

In [11]:
prediction_blocking_rules = [
    (
        'substr(l.first_name, 1,1) = substr(r.first_name, 1,1)'
        ' and substr(l.last_name, 1,1) = substr(r.last_name, 1,1)'
    ),
    'l.date_of_birth = r.date_of_birth',
    'l.zipcode = r.zipcode',
    'l.address = r.address',
    'l.last_name = r.last_name',
    'l.first_name = r.first_name',
    # Let's first see how well we do without these very loose rules:
#     'l.sex = r.sex and substr(l.zipcode, 1,3) = substr(r.zipcode, 1,3)',
#     'substr(l.zipcode, 1,3) = substr(r.zipcode, 1,3)',
]

print('Cartesion product of input:', len(df_census_u5) * len(df_wic), '\n')

for rule in prediction_blocking_rules:
    count = linker.count_num_comparisons_from_blocking_rule(rule)
    print(f"Number of comparisons generated by '{rule}': {count:,.0f}")

linker.cumulative_num_comparisons_from_blocking_rules_chart(prediction_blocking_rules)

Cartesion product of input: 1419819 

Number of comparisons generated by 'substr(l.first_name, 1,1) = substr(r.first_name, 1,1) and substr(l.last_name, 1,1) = substr(r.last_name, 1,1)': 6,074
Number of comparisons generated by 'l.date_of_birth = r.date_of_birth': 1,302
Number of comparisons generated by 'l.zipcode = r.zipcode': 3,610
Number of comparisons generated by 'l.address = r.address': 821
Number of comparisons generated by 'l.last_name = r.last_name': 1,658
Number of comparisons generated by 'l.first_name = r.first_name': 2,966


# Try to estimate how the blocking strategy will scale

In [12]:
# Looks like maybe 8 billion comparisons would be generated with the above rules
(
    # Fraction of comparisons should stay about the same when scaling up census
    (13_000 / len(df_census)**2) 
    # Ratio of WIC to census should stay about the same when scaling up census
    * (len(df_wic)/len(df_census))
    # Estimate census size of 330 million
    * (330e6)**2
)

8346394559.734445

In [13]:
# Looks like maybe 59 billion comparisons would be generated
# if we include zip3 blocking
(
    # Fraction of comparisons should stay about the same when scaling up census
    (92_000 / len(df_census)**2) 
    # Ratio of WIC to census should stay about the same when scaling up census
    * (len(df_wic)/len(df_census))
    # Estimate census size of 330 million
    * (330e6)**2
)

59066792268.88992

# Define comparisons we want to use, add them to settings dictionary, and reinitialize the linker's settings

In [14]:
comparisons = [
        cl.levenshtein_at_thresholds("first_name"),
        cl.levenshtein_at_thresholds("last_name"),
#         c1.exact_match("last_name"),
        cl.levenshtein_at_thresholds("date_of_birth", 1),
        cl.exact_match("sex"),
        cl.levenshtein_at_thresholds("zipcode"),
    ]
settings["comparisons"] = comparisons

linker.initialise_settings(settings)
linker._settings_dict

{'link_type': 'link_only',
 'sql_dialect': 'duckdb',
 'comparisons': [<Comparison Exact match vs. levenshtein at thresholds 1, 2 vs. anything else with 4 levels at 0x7efc5aa825e0>,
  <Comparison Exact match vs. levenshtein at thresholds 1, 2 vs. anything else with 4 levels at 0x7efc5aa657c0>,
  <Comparison Exact match vs. levenshtein at threshold 1 vs. anything else with 3 levels at 0x7efc5aaa2340>,
  <Comparison Exact match vs. anything else with 2 levels at 0x7efc5aaa2a30>,
  <Comparison Exact match vs. levenshtein at thresholds 1, 2 vs. anything else with 4 levels at 0x7efc5aaa2280>]}

# Use random sampiling stimate `u` probabilities for the comparisons defined above

In [15]:
%%time
linker.estimate_u_using_random_sampling(target_rows=len(df_wic)*len(df_census_u5))

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - last_name (no m values are trained).
    - date_of_birth (no m values are trained).
    - sex (no m values are trained).
    - zipcode (no m values are trained).


CPU times: user 2.84 s, sys: 0 ns, total: 2.84 s
Wall time: 1.45 s


In [16]:
linker.m_u_parameters_chart()

# Estimate probability two random records match, for use in estimating `m` probabilities with EM algorithm

As a first approximation, we expect every record in WIC to match one record in the census, so we can estimate this as

$$
\frac{\text{number of matches}}{\text{number of pairs}} \approx
\frac{\text{# records in WIC}}{(\text{# records in census}) \cdot (\text{# records in WIC})} = \frac{1}{\text{# records in census}}.
$$

## Then add this to the settings dictionary and update the linker

In [17]:
probability_two_random_records_match = 1/len(df_census_u5)

settings['probability_two_random_records_match'] = probability_two_random_records_match
linker.initialise_settings(settings)
linker._settings_dict

{'link_type': 'link_only',
 'sql_dialect': 'duckdb',
 'comparisons': [<Comparison Exact match vs. levenshtein at thresholds 1, 2 vs. anything else with 4 levels at 0x7efc5aa825e0>,
  <Comparison Exact match vs. levenshtein at thresholds 1, 2 vs. anything else with 4 levels at 0x7efc5aa657c0>,
  <Comparison Exact match vs. levenshtein at threshold 1 vs. anything else with 3 levels at 0x7efc5aaa2340>,
  <Comparison Exact match vs. anything else with 2 levels at 0x7efc5aaa2a30>,
  <Comparison Exact match vs. levenshtein at thresholds 1, 2 vs. anything else with 4 levels at 0x7efc5aaa2280>],
 'probability_two_random_records_match': 0.00044583147570218456}

In [18]:
# Oops, reinitializing the settings deleted our estimated u probabilities...
linker.m_u_parameters_chart()

# Define training blocking rules for EM algorithm, and estimate `m` values

According to the Splink documentation for the training blocking rules:

> It does not matter if this blocking rule excludes some true matches - it just needs to generate examples of matches and non matches.

In [19]:
# training_blocking_rules = [
#     "l.first_name = r.first_name and l.last_name = r.last_name and l.zipcode = r.zipcode",
#     "l.first_name = r.first_name and l.date_of_birth = r.date_of_birth and l.zipcode = r.zipcode",
#     "l.last_name = r.last_name and l.date_of_birth = r.date_of_birth and l.zipcode = r.zipcode",
#     "l.last_name = r.last_name and l.date_of_birth = r.date_of_birth and l.first_name = r.first_name"
# ]

training_blocking_rules = [
    "l.first_name = r.first_name",
    "l.zipcode = r.zipcode",
]

for rule in training_blocking_rules:
    count = linker.count_num_comparisons_from_blocking_rule(rule)
    print(f"Number of comparisons generated by '{rule}': {count:,.0f}")

# linker.cumulative_num_comparisons_from_blocking_rules_chart(training_blocking_rules)

Number of comparisons generated by 'l.first_name = r.first_name': 2,966
Number of comparisons generated by 'l.zipcode = r.zipcode': 3,610


In [20]:
%%time
training_sessions = {}
for rule in training_blocking_rules:
    training_sessions[rule] = linker.estimate_parameters_using_expectation_maximisation(rule)
    


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name

Parameter estimates will be made for the following comparison(s):
    - last_name
    - date_of_birth
    - sex
    - zipcode

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name

Iteration 1: Largest change in params was 0.19 in the m_probability of last_name, level `All other comparisons`
Iteration 2: Largest change in params was 0.593 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.151 in probability_two_random_records_match
Iteration 4: Largest change in params was 0.00475 in probability_two_random_records_match
Iteration 5: Largest change in params was 0.00485 in the m_probability of sex, level `All other comparisons`
Iteration 6: Largest change in params was 0.00417 in probability_two_random_records_match
Iteration 7: Largest change in para

CPU times: user 1.8 s, sys: 62.5 ms, total: 1.86 s
Wall time: 1.72 s


In [21]:
linker.m_u_parameters_chart()

In [22]:
# "comparisons": [
#         levenshtein("first_name", 2, term_frequency_adjustments=True),
#         levenshtein("last_name", 2, term_frequency_adjustments=True),
#         levenshtein("dob", 2, term_frequency_adjustments=True),
#         exact_match("sex"),
#         levenshtein("zip", 2, term_frequency_adjustments=True),
#         exact_match("ssn"),
#     ],

In [23]:
linker._settings_dict

{'link_type': 'link_only',
 'sql_dialect': 'duckdb',
 'comparisons': [<Comparison Exact match vs. levenshtein at thresholds 1, 2 vs. anything else with 4 levels at 0x7efc5aa825e0>,
  <Comparison Exact match vs. levenshtein at thresholds 1, 2 vs. anything else with 4 levels at 0x7efc5aa657c0>,
  <Comparison Exact match vs. levenshtein at threshold 1 vs. anything else with 3 levels at 0x7efc5aaa2340>,
  <Comparison Exact match vs. anything else with 2 levels at 0x7efc5aaa2a30>,
  <Comparison Exact match vs. levenshtein at thresholds 1, 2 vs. anything else with 4 levels at 0x7efc5aaa2280>],
 'probability_two_random_records_match': 0.00044583147570218456}

In [24]:
1/len(df_census_u5)

0.00044583147570218456

In [25]:
print(linker._settings_obj.human_readable_description)

SUMMARY OF LINKING MODEL
------------------------
The similarity of pairwise record comparison in your model will be assessed as follows:

Comparison of "first_name"
Description: 'Exact match vs. levenshtein at thresholds 1, 2 vs. anything else'
Comparison levels:
    - 'Null' with SQL rule: "first_name_l" IS NULL OR "first_name_r" IS NULL
    - 'Exact match' with SQL rule: "first_name_l" = "first_name_r"
    - 'levenshtein <= 1' with SQL rule: levenshtein("first_name_l", "first_name_r") <= 1
    - 'levenshtein <= 2' with SQL rule: levenshtein("first_name_l", "first_name_r") <= 2
    - 'All other comparisons' with SQL rule: ELSE

Comparison of "last_name"
Description: 'Exact match vs. levenshtein at thresholds 1, 2 vs. anything else'
Comparison levels:
    - 'Null' with SQL rule: "last_name_l" IS NULL OR "last_name_r" IS NULL
    - 'Exact match' with SQL rule: "last_name_l" = "last_name_r"
    - 'levenshtein <= 1' with SQL rule: levenshtein("last_name_l", "last_name_r") <= 1
    - 'lev

In [26]:
print(linker._settings_obj_.salting_required)

False


# Create a settings dictionary

In [27]:
settings = {
    "link_type": "link_only",
    "comparisons": comparisons,
    "blocking_rules_to_generate_predictions": prediction_blocking_rules,
#     "probability_two_random_records_match": 1/len(df_census_u5),
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
}

linker.initialise_settings(settings)