In [37]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 40)

from splink.duckdb.duckdb_linker import DuckDBLinker
import splink.duckdb.duckdb_comparison_library as cl

# For viewing waterfall charts
import altair as alt
# Note: As far as I saw, the Splink documentation didn't ever specify that
# I needed to enable the 'html' renderer, but the waterfall chart and
# precision-recall curve did not display until I added this line (thanks to Zeb).
# The earlier enable('mimetype') call was dropped: each enable() call replaces
# the active renderer, so only this last one ever took effect.
alt.renderers.enable('html')

# For viewing the comparison viewer dashboard
from IPython.display import IFrame

# Record when, by whom, on which machine and from which directory this run was made.
# NOTE(review): the captured output exposes the username, hostname and home
# path -- consider clearing it before sharing the notebook.
!date
!whoami
!uname -a
!pwd

Mon 14 Nov 2022 03:00:56 PM PST
ndbs
Linux int-slurm-sarchive-p0001 5.4.0-88-generic #99-Ubuntu SMP Thu Sep 23 17:29:00 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/homes/ndbs/notebooks


In [38]:
# The 'html' renderer is already enabled in the setup cell; this repeat call is
# redundant and mainly serves to echo which renderer is active.
alt.renderers.enable('html')

RendererRegistry.enable('html')

# Goal: Complete the prediction and linking steps of the Splink tutorial

# Load previously saved census test data

In [39]:
# List the saved census samples, model settings and dashboard files
!ls -l splink_test_data/

total 3488
-rw-rw-r-- 1 ndbs Domain Users  104935 Nov 10 16:44 census_2020_test_sample.csv
-rw-rw-r-- 1 ndbs Domain Users   95387 Nov 10 16:44 census_2030_test_sample.csv
-rw-rw-r-- 1 ndbs Domain Users 1110805 Nov 10 22:19 cluster_studio.html
-rw-rw-r-- 1 ndbs Domain Users 1109719 Nov 10 22:18 comparison_viewer.html
-rw-rw-r-- 1 ndbs Domain Users    5771 Nov  9 17:31 saved_model_from_census_test.json
-rw-rw-r-- 1 ndbs Domain Users 1109719 Nov 10 21:54 scv.html


In [40]:
# Directory holding the saved census samples and the saved model settings
test_data_path = 'splink_test_data'

# Census years for which a test sample was saved
years = [2020, 2030]

# Read each year's sample into a DataFrame keyed by year; the first CSV column
# is used as the index
census = {}
for year in years:
    sample_csv = f"{test_data_path}/census_{year}_test_sample.csv"
    census[year] = pd.read_csv(sample_csv, index_col=0)

# Report (rows, columns) for every sample as a quick sanity check
for year, frame in census.items():
    print(year, frame.shape)

2020 (960, 11)
2030 (865, 11)


In [41]:
# Preview the 2020 sample; pandas truncates the display because
# 'display.max_rows' was set to 40 in the setup cell
census[2020]

Unnamed: 0,first_name,last_name,age,date_of_birth,address,zipcode,relation_to_household_head,sex,race_ethnicity,middle_initial,unique_id
0,Margaret,Clark,68,1951-07-27,"1344 winoka rd brooksville, fl",34601,Reference person,Female,Black,J,0
1,Jeffrey,Littlejohn,52,1967-05-03,"927 23rd st clearwater, fl",34698,Reference person,Male,Black,V,1
2,Briana,Jackson,13,2006-09-07,"927 23rd st clearwater, fl",34698,Biological child,Female,Black,A,2
3,Benjamin,Cox,21,1998-10-21,"927 23rd st clearwater, fl",34698,Stepchild,Male,Black,D,3
4,Willie,Tucker,72,1947-10-09,"8904 167th place fleming island, fl",32003,Reference person,Male,White,J,4
...,...,...,...,...,...,...,...,...,...,...,...
996,Cassandra,Stevens,56,1963-08-14,"4475 13th st spring hill, fl",34606,Reference person,Female,White,S,996
997,James,Hansen,64,1955-08-13,"4475 13th st spring hill, fl",34606,Opp-sex spouse,Male,White,R,997
998,Deanna,Miller,28,1991-05-03,"4475 13th st spring hill, fl",34606,Biological child,Female,White,R,998
999,Denzel,Heaney,26,1993-05-06,"4475 13th st spring hill, fl",34606,Biological child,Male,White,N,999


# Load saved model

[Loading saved model](https://moj-analytical-services.github.io/splink/demos/04_Predicting_results.html#load-estimated-model-from-previous-tutorial) from Splink documentation:

```python
linker = DuckDBLinker(df) # The demo was for de-duplication, so only one df
linker.load_settings_from_json("./demo_settings/saved_model_from_demo.json")
```

In [42]:
# Pass both census samples as a list (the docs example above passes a single
# frame because it de-duplicates), then load the model settings saved as JSON
linker = DuckDBLinker([census[2020], census[2030]])
linker.load_settings_from_json(f"{test_data_path}/saved_model_from_census_test.json")
linker

<splink.duckdb.duckdb_linker.DuckDBLinker at 0x7f2c4ff45d60>

# Predicting which records match

[04 Predicting results](https://moj-analytical-services.github.io/splink/demos/04_Predicting_results.html)
(Predicting which records match)

> We use `linker.predict()` to run the model.
>
> Under the hood this will:
>
> - Generate all pairwise record comparisons that match at least one of the `blocking_rules_to_generate_predictions`
> - Use the rules specified in the `Comparisons` to evaluate the similarity of the input data
> - Use the estimated match weights, applying term frequency adjustments where requested, to produce the final `match_weight` and `match_probability` scores
>
> Optionally, a `threshold_match_probability` or `threshold_match_weight` can be provided, which will drop any row where the predicted score is below the threshold.

In [43]:
%%time
# Score the candidate pairs, dropping any whose match probability is below 0.2
df_predictions = linker.predict(threshold_match_probability=0.2)
# Preview only the first 5 scored pairs
df_predictions.as_pandas_dataframe(limit=5)


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'last_name':
    m values not fully trained


CPU times: user 109 ms, sys: 24.7 ms, total: 133 ms
Wall time: 118 ms


Unnamed: 0,match_weight,match_probability,source_dataset_l,unique_id_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,bf_first_name,...,sex_r,gamma_sex,bf_sex,age_l,age_r,gamma_age,bf_age,zipcode_l,zipcode_r,match_key
0,4.323437,0.952428,_a,0,_b,0,Margaret,Margaret,3,130.301965,...,Female,1,2.0,68,70,0,0.151487,34601.0,34601.0,0
1,4.323437,0.952428,_a,1,_b,1,Jeffrey,Jeffrey,3,130.301965,...,Male,1,2.0,52,53,0,0.151487,34698.0,33428.0,0
2,13.212943,0.999895,_a,2,_b,2,Briana,Briana,3,130.301965,...,Female,1,2.0,13,23,2,71.842787,34698.0,33157.0,0
3,13.212943,0.999895,_a,3,_b,3,Benjamin,Benjamin,3,130.301965,...,Male,1,2.0,21,31,2,71.842787,34698.0,32164.0,0
4,4.323437,0.952428,_a,4,_b,4,Willie,Willie,3,130.301965,...,Male,1,2.0,72,78,0,0.151487,32003.0,32003.0,0


In [44]:
# Materialise the full predictions table (no limit) to count the scored pairs
# that survived the 0.2 probability threshold
df_predictions.as_pandas_dataframe().shape

(832, 25)

# Cluster the predictions

https://moj-analytical-services.github.io/splink/demos/04_Predicting_results.html#clustering

> Often, an alternative representation of this result is more useful, where each row is an input record, and where records link, they are assigned to the same cluster.
>
> The algorithm that converts between the pairwise results and the clusters is called connected components, and it is included in Splink. 

In [45]:
%%time
# Convert the pairwise predictions into clusters, linking only pairs with match
# probability of at least 0.5 (stricter than the 0.2 cut-off passed to predict())
clusters = linker.cluster_pairwise_predictions_at_threshold(
    df_predictions, threshold_match_probability=0.5
)
# Preview only the first 10 clustered records
clusters.as_pandas_dataframe(limit=10)

Completed iteration 1, root rows count 0


CPU times: user 23.5 ms, sys: 29 ms, total: 52.4 ms
Wall time: 38.2 ms


Unnamed: 0,cluster_id,source_dataset,first_name,last_name,age,date_of_birth,address,zipcode,relation_to_household_head,sex,race_ethnicity,middle_initial,unique_id
0,_a-__-88,_a,Gary,Nolte,71,1948-11-20,"907 santorini ln santa rosa beach, fl",32459.0,Reference person,Male,White,K,932
1,_a-__-253,_b,Louis,Perry,78,1951-11-08,"108-49 52 avenue franjo, fl",33193.0,Reference person,Male,Black,J,253
2,_a-__-253,_b,Louis,Perry,71,1958-09-20,523 foundry street northwest flat 575 pompano ...,33062.0,Reference person,Male,White,J,509
3,_a-__-88,_b,Gary,Nolt,80,1949-08-13,"10780 rivercrest dr kissimmee, fl",34744.0,Reference person,Male,White,G,88
4,_a-__-88,_b,Gary,Nolte,81,1948-11-20,"907 santorini ln santa rosa beach, fl",32459.0,Reference person,Male,White,K,932
5,_a-__-0,_a,Margaret,Clark,68,1951-07-27,"1344 winoka rd brooksville, fl",34601.0,Reference person,Female,Black,J,0
6,_a-__-1,_a,Jeffrey,Littlejohn,52,1967-05-03,"927 23rd st clearwater, fl",34698.0,Reference person,Male,Black,V,1
7,_a-__-10,_a,Kenneth,Truebenbach,71,1949-01-05,"136 southwood ave north pt, fl",33953.0,Reference person,Male,White,A,10
8,_a-__-100,_a,Ruth,Susan,86,1933-04-03,"926 forestwood dr cocoa, fl",32926.0,Reference person,Female,White,M,100
9,_a-__-1000,_a,Michael,Moran,9,2010-07-23,"4475 13th st spring hill, fl",34606.0,Grandchild,Male,White,L,1000


In [46]:
# The cluster table has one row per input record: with these samples that is
# 960 + 865 = 1825 rows
clusters.as_pandas_dataframe().shape

(1825, 13)

# Plot a waterfall chart

https://moj-analytical-services.github.io/splink/demos/05_Visualising_predictions.html

Note: the waterfall chart did not render at first. Enabling Altair's `html` renderer (see the note in the first cell) fixed it.

In [47]:
# Take the first 5 scored pairs as record dicts and draw a waterfall chart for them
records_to_view  = df_predictions.as_record_dict(limit=5)
linker.waterfall_chart(records_to_view, filter_nulls=False)

# Make a comparison viewer dashboard

https://moj-analytical-services.github.io/splink/demos/05_Visualising_predictions.html

In [48]:
# NOTE(review): "filpath" is a typo for "filepath" (cf. cluster_studio_filepath);
# the name is left unchanged in case other cells reference it
comparison_viewer_filpath = f"{test_data_path}/comparison_viewer.html"

# Write the dashboard to the HTML file above, overwriting any existing copy
linker.comparison_viewer_dashboard(
    df_predictions, comparison_viewer_filpath, overwrite=True
)

# You can view the comparison_viewer.html file in your browser, or inline in a notebook as follows
IFrame(
    src=comparison_viewer_filpath, width="100%", height=1200
)

# Cluster studio dashboard

https://moj-analytical-services.github.io/splink/demos/05_Visualising_predictions.html#cluster-studio-dashboard

In [49]:
cluster_studio_filepath = f"{test_data_path}/cluster_studio.html"

# Write the dashboard from the pairwise predictions and their clusters to the
# HTML file above, overwriting any existing copy
linker.cluster_studio_dashboard(
    df_predictions,
    clusters,
    cluster_studio_filepath,
    sampling_method="by_cluster_size",
    overwrite=True)

# You can view the cluster_studio.html file in your browser, or inline in a notebook as follows
IFrame(
    src=cluster_studio_filepath, width="100%", height=1200
)

# Precision-Recall curve

In [50]:
# Use "unique_id" as the ground-truth label column.
# NOTE(review): in the outputs above the same person appears to carry the same
# unique_id in both census samples, which is what makes it usable as a label --
# confirm against how the test data was generated.
# `x` is a vague name for the chart, but the cells below refer to it.
x = linker.precision_recall_chart_from_labels_column("unique_id")
x


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'last_name':
    m values not fully trained


In [26]:
# Debugging leftover: inspect the chart object's type. Its execution count (26)
# predates the cells above (37-50), so this output comes from an earlier run.
type(x)

splink.charts.VegaliteNoValidate

In [34]:
# Debugging leftover from an earlier run (execution count 34): at that point
# the active renderer was still 'mimetype', not 'html'.
x.renderers

RendererRegistry(active='mimetype', registered=['colab', 'default', 'html', 'json', 'jupyterlab', 'kaggle', 'mimetype', 'nteract', 'png', 'svg', 'zeppelin'])

In [35]:
# Debugging leftover from an earlier run (execution count 35): show the chart
# object's schema_path attribute.
x.schema_path

('altair.vegalite.v4.display', 'schema/vega-lite-schema.json')

In [36]:
# Debugging leftover from an earlier run (execution count 36): dump the raw
# chart spec (title plus the per-threshold precision/recall data values).
# The output is very long -- consider clearing it before sharing.
x.spec

{'$schema': 'https://vega.github.io/schema/vega-lite/v4.8.1.json',
 'title': 'Precision-recall curve',
 'data': {'values': [{'truth_threshold': -763.8546525297719,
    'match_probability': 1.1398225928877631e-230,
    'row_count': 2956.0,
    'P': 828.0,
    'N': 2128.0,
    'TP': 828.0,
    'TN': 0.0,
    'FP': 2128.0,
    'FN': 0.0,
    'P_rate': 0.0,
    'N_rate': 0.7198917269706726,
    'TP_rate': 1.0,
    'TN_rate': 0.0,
    'FP_rate': 1.0,
    'FN_rate': 0.0,
    'precision': 0.280108243227005,
    'recall': 1.0,
    'F1': 0.43763214349746704},
   {'truth_threshold': -759.4443790256178,
    'match_probability': 2.4236051958218802e-229,
    'row_count': 2956.0,
    'P': 828.0,
    'N': 2128.0,
    'TP': 828.0,
    'TN': 5.0,
    'FP': 2123.0,
    'FN': 0.0,
    'P_rate': 0.0,
    'N_rate': 0.7198917269706726,
    'TP_rate': 1.0,
    'TN_rate': 0.0023496239446103573,
    'FP_rate': 0.9976503849029541,
    'FN_rate': 0.0,
    'precision': 0.28058284521102905,
    'recall': 1.0,
    