In [4]:
from faker import Faker
import random
import pandas as pd
fake = Faker()

In [5]:
import splink
print(splink.__version__)

4.0.6


In [6]:
def generate_custom_population(size=10000, seed=None):
    if seed is not None:
        random.seed(seed)
        fake = Faker()
        Faker.seed(seed)
    else:
        fake = Faker()
    
    return pd.DataFrame({
        'name': [fake.name() for _ in range(size)],
        'age': [random.randint(18, 90) for _ in range(size)],
        'state': [fake.state() for _ in range(size)],
        'gender': [random.choice(['Male', 'Female']) for _ in range(size)],
        'address': [fake.address() for _ in range(size)],
    })

In [7]:
population1 = generate_custom_population(10000, seed=1)
population2 = generate_custom_population(10000, seed=2)

In [8]:
import duckdb
print(duckdb.__version__)

1.1.3


In [9]:
import splink
print(splink.__file__)

c:\Users\dcjc9\anaconda3\lib\site-packages\splink\__init__.py


In [12]:
import duckdb
import pandas as pd

# Connect to an in-memory DuckDB database
conn = duckdb.connect(database=':memory:')

# Load your dataframes into DuckDB
conn.register('population1', population1)
conn.register('population2', population2)

<duckdb.duckdb.DuckDBPyConnection at 0x2a7b0b17830>

In [13]:
# Deduplicate population1 based on columns like name, age, state, etc.
deduplicated_population1 = conn.execute('''
    SELECT DISTINCT name, age, state, gender, address
    FROM population1
''').fetchdf()

# Deduplicate population2 similarly
deduplicated_population2 = conn.execute('''
    SELECT DISTINCT name, age, state, gender, address
    FROM population2
''').fetchdf()

In [8]:
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets

db_api = DuckDBAPI()

df = splink_datasets.fake_1000

settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        cl.NameComparison("first_name"),
        cl.JaroAtThresholds("surname"),
        cl.DateOfBirthComparison(
            "dob",
            input_is_string=True,
        ),
        cl.ExactMatch("city").configure(term_frequency_adjustments=True),
        cl.EmailComparison("email"),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name", "dob"),
        block_on("surname"),
    ]
)

linker = Linker(df, settings, db_api)

linker.training.estimate_probability_two_random_records_match(
    [block_on("first_name", "surname")],
    recall=0.7,
)

linker.training.estimate_u_using_random_sampling(max_pairs=1e6)

linker.training.estimate_parameters_using_expectation_maximisation(
    block_on("first_name", "surname")
)

linker.training.estimate_parameters_using_expectation_maximisation(block_on("email"))

pairwise_predictions = linker.inference.predict(threshold_match_weight=-5)

clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
    pairwise_predictions, 0.95
)

df_clusters = clusters.as_pandas_dataframe()

Probability two random records match is estimated to be  0.000821.
This means that amongst all possible pairwise record comparisons, one in 1,218.29 are expected to match.  With 499,500 total possible comparisons, we expect a total of around 410.00 matching pairs
You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.
----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - surname (no m values are trained).
    - dob (no m values are trained).
    - city (no m values are trained).
    - email (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."first_nam

In [14]:
pairwise_predictions = linker.predict()

clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
    pairwise_predictions, 0.95
)

df_clusters = clusters.as_pandas_dataframe()

AttributeError: 'Linker' object has no attribute 'predict'

In [13]:
# Select one record per cluster (e.g., the first record in each cluster)
deduplicated_df = (
    df_clusters
    .sort_values("match_probability", ascending=False)  # Sort by match probability
    .drop_duplicates(subset=["cluster_id"])  # Keep one record per cluster
)

print(deduplicated_df.head())

KeyError: 'match_probability'

In [41]:
df[df['unique_id'] == 24]

Unnamed: 0,unique_id,first_name,surname,dob,city,email,cluster
24,24,Thoas,Green,1974-10-05,London,thomas.green@clark.org,10


In [47]:
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets

# Initialize the DuckDB API
db_api = DuckDBAPI()

# Load your two datasets (replace these with actual datasets)
df1 = splink_datasets.fake_1000  # Replace with actual first dataset
df2 = splink_datasets.fake_1000  # Replace with actual second dataset

# Create settings for deduplication
settings = SettingsCreator(
    link_type="link_only",  # Use "link_only" for linking between two datasets
    comparisons=[
        cl.NameComparison("first_name"),
        cl.JaroAtThresholds("surname"),
        cl.DateOfBirthComparison("dob", input_is_string=True),
        cl.ExactMatch("city").configure(term_frequency_adjustments=True),
        cl.EmailComparison("email"),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name", "dob"),
        block_on("surname"),
    ]
)

# Create the linker with the two datasets
linker = Linker([df1, df2], settings, db_api)

# Estimate probability and other parameters for matching
linker.training.estimate_probability_two_random_records_match(
    [block_on("first_name", "surname")],
    recall=0.7,
)

linker.training.estimate_u_using_random_sampling(max_pairs=1e6)

# Estimate parameters using EM
linker.training.estimate_parameters_using_expectation_maximisation(block_on("first_name", "surname"))
linker.training.estimate_parameters_using_expectation_maximisation(block_on("email"))

# Make pairwise predictions based on a threshold
pairwise_predictions = linker.inference.predict(threshold_match_weight=-5)

# Cluster the pairwise predictions based on a threshold
clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
    pairwise_predictions, 0.95
)

# Convert clusters to a DataFrame and display the first 5 rows
df_clusters = clusters.as_pandas_dataframe(limit=5)

# Display the clusters (or you can perform further analysis)
print(df_clusters)

Probability two random records match is estimated to be  0.0018.
This means that amongst all possible pairwise record comparisons, one in 555.56 are expected to match.  With 1,000,000 total possible comparisons, we expect a total of around 1,800.00 matching pairs
You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.
----- Estimating u probabilities using random sampling -----


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - surname (no m values are trained).
    - dob (no m values are trained).
    - city (no m values are trained).
    - email (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."first_name" = r."first_name") AND (l."surname" = r."surname")

Parameter estimates will be made for the following comparison(s):
    - dob
    - city
    - email

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name
    - surname

Level Jaro-Winkler >0.88 on username on comparison email not observed in dataset, unable to train m value

Iteration 1: Largest change in params was -0.199 in the m_probability of dob, level `Exact match on date of birth`
Iteration 2: Largest change in params was 0.0212 in prob

                      cluster_id  unique_id first_name    surname         dob  \
0   __splink__input_table_0-__-4          4      Grace       None  1997-04-26   
1  __splink__input_table_0-__-10          9       Evie       Dean  2015-03-03   
2  __splink__input_table_0-__-14         14     Oliver  Griffiths  1991-10-26   
3  __splink__input_table_0-__-19         19       Rowe      Caleb  1992-12-20   
4  __splink__input_table_0-__-25         25    Gabriel     Thomas  1977-09-13   

         city                            email  cluster  \
0        Hull          grace.kelly52@jones.com        1   
1  Pootsmruth        evihd56@earris-bailey.net        3   
2      Lunton  o.griffiths90@reyes-coleman.com        5   
3    Lvpreool              calebr@thompson.org        8   
4      London         gabriel.t54@nichols.info       11   

            source_dataset  
0  __splink__input_table_0  
1  __splink__input_table_0  
2  __splink__input_table_0  
3  __splink__input_table_0  
4  __splink__

In [50]:
# Make pairwise predictions based on a threshold
pairwise_predictions = linker.inference.predict(threshold_match_weight=-5)

# Convert pairwise predictions to a DataFrame
df_matches = pairwise_predictions.as_pandas_dataframe()

# Inspect the matches
print(df_matches.head())

# Filter matches with high probability
df_high_prob_matches = df_matches[df_matches["match_probability"] > 0.9]

# Display high-probability matches
print(df_high_prob_matches.head())

Blocking time: 0.02 seconds
Predict time: 0.24 seconds

You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'email':
    m values not fully trained


   match_weight  match_probability         source_dataset_l  \
0     23.454689            1.00000  __splink__input_table_0   
1     28.818160            1.00000  __splink__input_table_0   
2     32.625514            1.00000  __splink__input_table_0   
3     25.190886            1.00000  __splink__input_table_0   
4     16.668011            0.99999  __splink__input_table_0   

          source_dataset_r  unique_id_l  unique_id_r first_name_l  \
0  __splink__input_table_1            4            4        Grace   
1  __splink__input_table_1           14           14       Oliver   
2  __splink__input_table_1           19           19         Rowe   
3  __splink__input_table_1           25           25      Gabriel   
4  __splink__input_table_1           26           30       Thomas   

  first_name_r  gamma_first_name  surname_l  ...       dob_l       dob_r  \
0        Grace                 4       None  ...  1997-04-26  1997-04-26   
1       Oliver                 4  Griffiths  ...  1991

In [52]:
df_high_prob_matches

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,first_name_l,first_name_r,gamma_first_name,surname_l,...,dob_l,dob_r,gamma_dob,city_l,city_r,gamma_city,email_l,email_r,gamma_email,match_key
0,23.454689,1.000000,__splink__input_table_0,__splink__input_table_1,4,4,Grace,Grace,4,,...,1997-04-26,1997-04-26,5,Hull,Hull,1,grace.kelly52@jones.com,grace.kelly52@jones.com,4,0
1,28.818160,1.000000,__splink__input_table_0,__splink__input_table_1,14,14,Oliver,Oliver,4,Griffiths,...,1991-10-26,1991-10-26,5,Lunton,Lunton,1,o.griffiths90@reyes-coleman.com,o.griffiths90@reyes-coleman.com,4,0
2,32.625514,1.000000,__splink__input_table_0,__splink__input_table_1,19,19,Rowe,Rowe,4,Caleb,...,1992-12-20,1992-12-20,5,Lvpreool,Lvpreool,1,calebr@thompson.org,calebr@thompson.org,4,0
3,25.190886,1.000000,__splink__input_table_0,__splink__input_table_1,25,25,Gabriel,Gabriel,4,Thomas,...,1977-09-13,1977-09-13,5,London,London,1,gabriel.t54@nichols.info,gabriel.t54@nichols.info,4,0
4,16.668011,0.999990,__splink__input_table_0,__splink__input_table_1,26,30,Thomas,Thomas,4,Gabriel,...,1976-09-15,1976-09-15,5,Loodon,London,0,gabriel.t54@nnichls.info,gabriel.t54@nlchois.info,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2576,3.454238,0.916391,__splink__input_table_0,__splink__input_table_1,659,661,Charlie,Cahlrae,1,Hall,...,2018-06-26,2019-07-26,1,Birmingham,Birminghram,0,charlieh@sandoval-sanders.info,charlieh@sandoval-sanders.info,4,1
2577,9.751534,0.998841,__splink__input_table_0,__splink__input_table_1,950,952,George,George,4,Davies,...,2005-07-23,2004-06-23,1,,Middlesbrough,-1,gdavies72@conner-rose.com,gdavies72@conner-rose.com,4,1
2578,10.181235,0.999139,__splink__input_table_0,__splink__input_table_1,140,139,James,James,4,Campbell,...,2012-11-06,2011-10-10,1,London,London,1,jc@herring.info,joc@hrning.irfo,2,1
2579,14.017473,0.999940,__splink__input_table_0,__splink__input_table_1,138,139,,James,-1,Campbell,...,2011-10-10,2011-10-10,5,London,London,1,jc@herring.info,joc@hrning.irfo,2,1


In [55]:
import pseudopeople

TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'

In [5]:
import pseudopeople as psp

TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'