In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
# Limit how many rows pandas shows when displaying a DataFrame
pd.set_option('display.max_rows', 40)

from splink.duckdb.duckdb_linker import DuckDBLinker
import splink.duckdb.duckdb_comparison_library as cl

# For viewing waterfall charts
import altair as alt
# NOTE(review): the 'html' renderer enabled a few lines below becomes the
# active one, so this call presumably has no lasting effect -- confirm before
# removing it.
alt.renderers.enable('mimetype')
# Note: As far as I saw, the Splink documentation didn't ever specify that
# I needed to enable the 'html' renderer, but the waterfall chart and
# precision-recall curve did not display until I added this line (thanks to Zeb)
alt.renderers.enable('html')

# For embedding the comparison viewer and cluster studio dashboards inline
from IPython.display import IFrame

# Record when, by whom, on which machine, and from which directory this
# notebook was run
!date
!whoami
!uname -a
!pwd

Thu 17 Nov 2022 12:58:12 PM PST
ndbs
Linux int-slurm-sarchive-p0006 5.4.0-88-generic #99-Ubuntu SMP Thu Sep 23 17:29:00 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/linkage/demo_notebooks


# Goal: Complete the prediction and linking steps of the Splink tutorial

# Load previously saved census test data

In [2]:
# List the files currently in the test-data directory
!ls -l splink_test_data/

total 4176
-rw-rw-r-- 1 ndbs Domain Users 1059095 Nov 17 12:57 census_2020_test_sample.csv
-rw-rw-r-- 1 ndbs Domain Users  958764 Nov 17 12:57 census_2030_test_sample.csv
-rw-rw-r-- 1 ndbs Domain Users 1110805 Nov 14 15:01 cluster_studio.html
-rw-rw-r-- 1 ndbs Domain Users 1109719 Nov 14 15:01 comparison_viewer.html
-rw-rw-r-- 1 ndbs Domain Users    5804 Nov 17 12:57 saved_model_from_census_test.json


In [3]:
# Directory holding the saved census samples (and the saved model)
test_data_path = 'splink_test_data'

# Census years to load; one CSV sample exists per year
years = [2020, 2030]

# Read each year's sample into a DataFrame, keyed by year. The first CSV
# column is used as the index.
census = {}
for year in years:
    census[year] = pd.read_csv(
        f"{test_data_path}/census_{year}_test_sample.csv",
        index_col=0,
    )

# Report (rows, columns) for each loaded sample
for year, frame in census.items():
    print(year, frame.shape)

2020 (9495, 11)
2030 (8539, 11)


In [4]:
census[2020]

Unnamed: 0,first_name,last_name,age,date_of_birth,address,zipcode,relation_to_household_head,sex,race_ethnicity,middle_initial,unique_id
0,Margaret,Clark,68,1951-07-27,"1344 winoka rd brooksville, fl",34601,Reference person,Female,Black,J,0
1,Jeffrey,Littlejohn,52,1967-05-03,"927 23rd st clearwater, fl",34698,Reference person,Male,Black,V,1
2,Briana,Jackson,13,2006-09-07,"927 23rd st clearwater, fl",34698,Biological child,Female,Black,A,2
3,Benjamin,Cox,21,1998-10-21,"927 23rd st clearwater, fl",34698,Stepchild,Male,Black,D,3
4,Willie,Tucker,72,1947-10-09,"8904 167th place fleming island, fl",32003,Reference person,Male,White,J,4
...,...,...,...,...,...,...,...,...,...,...,...
9995,Jordy,Thomas,9,2011-03-27,"8 bainridge raod unincorporated, fl",33446,Biological child,Male,White,N,9995
9997,Carl,Saunders,72,1948-01-24,"16901 sw 66 st fort myers, fl",33916,Reference person,Male,White,M,9997
9998,Julie,Hasapis,60,1959-07-23,"16901 sw 66 st fort myers, fl",33916,Opp-sex spouse,Female,White,K,9998
9999,Jason,Cains,23,1996-10-15,"137 belle terre blvd vero beach, fl",32967,Reference person,Male,White,E,9999


# Load saved model

[Loading saved model](https://moj-analytical-services.github.io/splink/demos/04_Predicting_results.html#load-estimated-model-from-previous-tutorial) from Splink documentation:

```python
linker = DuckDBLinker(df) # The demo was for de-duplication, so only one df
linker.load_settings_from_json("./demo_settings/saved_model_from_demo.json")
```

In [5]:
# Link the 2020 and 2030 census samples (one input frame per year), then
# restore the model settings that were saved to JSON earlier
saved_model_path = f"{test_data_path}/saved_model_from_census_test.json"
linker = DuckDBLinker([census[census_year] for census_year in (2020, 2030)])
linker.load_settings_from_json(saved_model_path)
linker

<splink.duckdb.duckdb_linker.DuckDBLinker at 0x7fc0a19f1c70>

# Predicting which records match

[04 Predicting results](https://moj-analytical-services.github.io/splink/demos/04_Predicting_results.html)
(Predicting which records match)

> We use `linker.predict()` to run the model.
>
> Under the hood this will:
>
> Generate all pairwise record comparisons that match at least one of the blocking_rules_to_generate_predictions
>
> Use the rules specified in the Comparisons to evaluate the similarity of the input data
>
> Use the estimated match weights, applying term frequency adjustments where requested to produce the final match_weight and match_probability scores
>
> Optionally, a `threshold_match_probability` or `threshold_match_weight` can be provided, which will drop any row where the predicted score is below the threshold.

In [6]:
%%time
df_predictions = linker.predict(threshold_match_probability=0.2)
df_predictions.as_pandas_dataframe(limit=5)


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'last_name':
    m values not fully trained


CPU times: user 3.89 s, sys: 0 ns, total: 3.89 s
Wall time: 2.02 s


Unnamed: 0,match_weight,match_probability,source_dataset_l,unique_id_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,bf_first_name,...,sex_r,gamma_sex,bf_sex,age_l,age_r,gamma_age,bf_age,zipcode_l,zipcode_r,match_key
0,2.504696,0.850194,_a,0,_b,0,Margaret,Margaret,3,267.641005,...,Female,1,1.995687,68,70,0,0.170119,34601,34601,0
1,2.504696,0.850194,_a,1,_b,1,Jeffrey,Jeffrey,3,267.641005,...,Male,1,1.995687,52,53,0,0.170119,34698,33428,0
2,11.224322,0.999582,_a,3,_b,3,Benjamin,Benjamin,3,267.641005,...,Male,1,1.995687,21,31,2,71.716997,34698,32164,0
3,2.504696,0.850194,_a,4,_b,4,Willie,Willie,3,267.641005,...,Male,1,1.995687,72,78,0,0.170119,32003,32003,0
4,11.224322,0.999582,_a,5,_b,5,Frank,Frank,3,267.641005,...,Male,1,1.995687,80,90,2,71.716997,32218,32218,0


In [7]:
df_predictions.as_pandas_dataframe().shape

(8248, 25)

# Cluster the predictions

https://moj-analytical-services.github.io/splink/demos/04_Predicting_results.html#clustering

> Often, an alternative representation of this result is more useful, where each row is an input record, and where records link, they are assigned to the same cluster.
>
> The algorithm that converts between the pairwise results and the clusters is called connected components, and it is included in Splink. 

In [8]:
%%time
clusters = linker.cluster_pairwise_predictions_at_threshold(
    df_predictions, threshold_match_probability=0.5
)
clusters.as_pandas_dataframe(limit=10)

Completed iteration 1, root rows count 0


CPU times: user 333 ms, sys: 0 ns, total: 333 ms
Wall time: 216 ms


Unnamed: 0,cluster_id,source_dataset,first_name,last_name,age,date_of_birth,address,zipcode,relation_to_household_head,sex,race_ethnicity,middle_initial,unique_id
0,_a-__-1303,_a,Mary,Smith,71,1948-08-31,"16263 119th pl ne lk land, fl",33810,Reference person,Female,Black,M,6778
1,_a-__-2010,_b,William,Schmidt,77,1953-04-12,"5047-5059 cherrybark ln se palm bay, fl",32907,Reference person,Male,White,L,2010
2,_a-__-6200,_b,Kelly,Smith,47,1982-11-23,"5785 north tegner ro apt 107 l sunrise, fl",33351,Reference person,Female,White,M,6200
3,_a-__-1004,_a,Jennifer,Wallace,47,1973-02-13,"4475 13th st spring hill, fl",34606,Other nonrelative,Female,White,R,1004
4,_a-__-1080,_a,Trent,Wallen,20,2000-03-04,"10615 nevada avenue jacksonville, fl",32224,Biological child,Male,White,M,1080
5,_a-__-1153,_a,Ramona,Defoy,63,1956-06-27,"313 w commerce st jacksonville, fl",32205,Reference person,Female,White,D,1153
6,_a-__-1173,_a,Michael,Lower,13,2007-03-30,"1556 spring st lakeland, fl",33813,Biological child,Male,White,D,1173
7,_a-__-1305,_a,Mildred,Benton Ivers,81,1939-01-01,"352 plms ave orlando, fl",32811,Reference person,Female,White,J,1305
8,_a-__-143,_a,Anthony,Miller,49,1970-07-31,"8828 upper 89th street cir s orlando, fl",32807,Reference person,Male,Black,M,143
9,_a-__-1464,_a,Katelyn,Firth,14,2005-06-22,"6808 harrowdale r riverview, fl",33578,Biological child,Female,White,R,1464


In [9]:
clusters.as_pandas_dataframe().shape

(18034, 13)

# Plot a waterfall chart

https://moj-analytical-services.github.io/splink/demos/05_Visualising_predictions.html

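Well, that's annoying: at first the chart didn't display, for some arcane reason. (See the note in the import cell at the top of the notebook: it only appeared once Altair's `'html'` renderer was enabled.)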

In [10]:
# Take the first five predictions as record dicts and draw a waterfall chart
# for them
records_to_view = df_predictions.as_record_dict(limit=5)
linker.waterfall_chart(
    records_to_view,
    filter_nulls=False,
)

# Make a comparison viewer dashboard

https://moj-analytical-services.github.io/splink/demos/05_Visualising_predictions.html

In [11]:
# Output location for the comparison viewer dashboard HTML.
# NOTE(review): "filpath" looks like a typo for "filepath"; the name is kept
# unchanged in case other cells reference it.
comparison_viewer_filpath = f"{test_data_path}/comparison_viewer.html"

# Build the dashboard from the pairwise predictions (overwrite=True is passed
# because a file from an earlier run may already exist at this path)
linker.comparison_viewer_dashboard(
    df_predictions,
    comparison_viewer_filpath,
    overwrite=True,
)

# The generated HTML file can be opened in a browser, or embedded inline in
# the notebook as follows
IFrame(src=comparison_viewer_filpath, width="100%", height=1200)

# Cluster studio dashboard

https://moj-analytical-services.github.io/splink/demos/05_Visualising_predictions.html#cluster-studio-dashboard

In [12]:
# Output location for the cluster studio dashboard HTML
cluster_studio_filepath = f"{test_data_path}/cluster_studio.html"

# Build the dashboard from the pairwise predictions and the clusters derived
# from them (overwrite=True is passed because a file from an earlier run may
# already exist at this path)
linker.cluster_studio_dashboard(
    df_predictions,
    clusters,
    cluster_studio_filepath,
    sampling_method="by_cluster_size",
    overwrite=True,
)

# The generated HTML file can be opened in a browser, or embedded inline in
# the notebook as follows
IFrame(src=cluster_studio_filepath, width="100%", height=1200)

# Precision-Recall curve

In [13]:
# Build a precision-recall chart, passing "unique_id" as the labels column.
# The chart object is kept in `x` because the following cells inspect it
# (its type, renderers, schema path and spec).
x = linker.precision_recall_chart_from_labels_column("unique_id")
x


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'last_name':
    m values not fully trained


In [26]:
type(x)

splink.charts.VegaliteNoValidate

In [34]:
x.renderers

RendererRegistry(active='mimetype', registered=['colab', 'default', 'html', 'json', 'jupyterlab', 'kaggle', 'mimetype', 'nteract', 'png', 'svg', 'zeppelin'])

In [35]:
x.schema_path

('altair.vegalite.v4.display', 'schema/vega-lite-schema.json')

In [36]:
x.spec

{'$schema': 'https://vega.github.io/schema/vega-lite/v4.8.1.json',
 'title': 'Precision-recall curve',
 'data': {'values': [{'truth_threshold': -763.8546525297719,
    'match_probability': 1.1398225928877631e-230,
    'row_count': 2956.0,
    'P': 828.0,
    'N': 2128.0,
    'TP': 828.0,
    'TN': 0.0,
    'FP': 2128.0,
    'FN': 0.0,
    'P_rate': 0.0,
    'N_rate': 0.7198917269706726,
    'TP_rate': 1.0,
    'TN_rate': 0.0,
    'FP_rate': 1.0,
    'FN_rate': 0.0,
    'precision': 0.280108243227005,
    'recall': 1.0,
    'F1': 0.43763214349746704},
   {'truth_threshold': -759.4443790256178,
    'match_probability': 2.4236051958218802e-229,
    'row_count': 2956.0,
    'P': 828.0,
    'N': 2128.0,
    'TP': 828.0,
    'TN': 5.0,
    'FP': 2123.0,
    'FN': 0.0,
    'P_rate': 0.0,
    'N_rate': 0.7198917269706726,
    'TP_rate': 1.0,
    'TN_rate': 0.0023496239446103573,
    'FP_rate': 0.9976503849029541,
    'FN_rate': 0.0,
    'precision': 0.28058284521102905,
    'recall': 1.0,
    