In [1]:
from splink import splink_datasets

In [3]:
df = splink_datasets.fake_1000

# Split the single demo dataset into two separate datasets which can be linked
# together. The sample is seeded so that the split -- and therefore every
# estimate and output further down -- is the same on each Restart & Run All.
SPLIT_SEED = 1
df_l = df.sample(frac=0.5, random_state=SPLIT_SEED)
df_r = df.drop(df_l.index)  # all rows whose index label was not sampled into df_l

df_l.head(2)

Unnamed: 0,unique_id,first_name,surname,dob,city,email,cluster
129,129,Matilda,Barker,1990-03-08,Reading,m.b@bell-brown.com,36
763,763,Arabella,Martin,1984-04-14,London,amartin@navarro.com,194


In [4]:
import splink.comparison_library as cl

from splink import DuckDBAPI, Linker, SettingsCreator, block_on

# Blocking rules used when generating predictions: one on first name,
# one on surname.
prediction_blocking_rules = [block_on("first_name"), block_on("surname")]

# One comparison per column that the model scores.
column_comparisons = [
    cl.NameComparison("first_name"),
    cl.NameComparison("surname"),
    # dob is held as a string; unparseable dates are treated as null.
    cl.DateOfBirthComparison("dob", input_is_string=True, invalid_dates_as_null=True),
    cl.ExactMatch("city").configure(term_frequency_adjustments=True),
    cl.EmailComparison("email"),
]

settings = SettingsCreator(
    link_type="link_only",
    blocking_rules_to_generate_predictions=prediction_blocking_rules,
    comparisons=column_comparisons,
)

# The aliases label the two input frames, in the same order as the list.
linker = Linker(
    [df_l, df_r],
    settings,
    db_api=DuckDBAPI(),
    input_table_aliases=["df_left", "df_right"],
)

In [6]:
from splink.exploratory import completeness_chart

# Columns whose completeness is charted for each input table.
profiled_columns = ["first_name", "surname", "dob", "city", "email"]

completeness_chart(
    [df_l, df_r],
    cols=profiled_columns,
    db_api=DuckDBAPI(),
    table_names_for_chart=["df_left", "df_right"],
)


In [7]:
# Count of missing values per column in the left-hand dataset.
df_l.isna().sum()

unique_id       0
first_name     89
surname        87
dob             0
city          100
email          99
cluster         0
dtype: int64

In [8]:
# Each rule is a SQL condition over a left (l) and right (r) record, or a
# block_on() rule; together they are passed to the prior estimate below.
same_first_name_close_dob = "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1"
same_surname_close_dob = "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1"
same_first_name_close_surname = "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2"

deterministic_rules = [
    same_first_name_close_dob,
    same_surname_close_dob,
    same_first_name_close_surname,
    block_on("email"),
]

# NOTE(review): recall=0.7 is presumably the assumed share of true matches
# that the rules above capture -- confirm against the splink documentation.
linker.training.estimate_probability_two_random_records_match(
    deterministic_rules,
    recall=0.7,
)

Probability two random records match is estimated to be  0.00321.
This means that amongst all possible pairwise record comparisons, one in 311.94 are expected to match.  With 250,000 total possible comparisons, we expect a total of around 801.43 matching pairs


In [9]:
# Estimate the u probabilities from randomly sampled record pairs; the seed
# is fixed so the sample is repeatable.
linker.training.estimate_u_using_random_sampling(
    max_pairs=1e6,
    seed=1,
)

You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.
----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - surname (no m values are trained).
    - dob (no m values are trained).
    - city (no m values are trained).
    - email (no m values are trained).


In [10]:
# Run one EM training session per blocking column. A comparison cannot be
# trained in the session that blocks on its own column, so several sessions
# with different blocking columns are used.
run_em_session = linker.training.estimate_parameters_using_expectation_maximisation

session_dob = run_em_session(block_on("dob"))
session_email = run_em_session(block_on("email"))
session_first_name = run_em_session(block_on("first_name"))


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l."dob" = r."dob"

Parameter estimates will be made for the following comparison(s):
    - first_name
    - surname
    - city
    - email

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - dob

Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value

Iteration 1: Largest change in params was -0.435 in the m_probability of surname, level `Exact match on surname`
Iteration 2: Largest change in params was 0.113 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.0387 in probability_two_random_records_match
Iteration 4: Largest change in params was 0.0138 in probability_two_random_records_match
Iteration 5: Largest change in params was 0.00569 in probability_two_random_records_match
Iteration 6: Largest change in params was 0.00256 in probab

In [11]:
# Score candidate pairs, with the match-probability threshold set to 0.9.
results = linker.inference.predict(
    threshold_match_probability=0.9,
)

Blocking time: 0.02 seconds
Predict time: 0.40 seconds


In [15]:
# Convert the prediction results to a pandas DataFrame for inspection.
# NOTE(review): `a` is an uninformative name; consider renaming it (e.g.
# `predictions_df`) together with every later cell that reads it.
a = results.as_pandas_dataframe()

In [16]:
# Summary statistics of the match probabilities of the predicted pairs.
a["match_probability"].describe()

count    428.000000
mean       0.992857
std        0.015899
min        0.913469
25%        0.996057
50%        0.999724
75%        0.999992
max        1.000000
Name: match_probability, dtype: float64