<a href="https://colab.research.google.com/github/jacobmorrier/fast-er/blob/main/example/Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install fast-er-link pyreadr

Collecting fast-er-link
  Downloading fast_er_link-0.2.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyreadr
  Downloading pyreadr-0.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading fast_er_link-0.2.0-py3-none-any.whl (20 kB)
Downloading pyreadr-0.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (411 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.7/411.7 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyreadr, fast-er-link
Successfully installed fast-er-link-0.2.0 pyreadr-0.5.3


In [2]:
import faster
import numpy as np
import pyreadr

In [4]:
# Download the example datasets and load them as data frames.
def load_rdata_frame(url, path, name):
    """Download the R data file at `url` to `path` and return the object `name`.

    The path returned by `pyreadr.download_file` is handed to `pyreadr.read_r`.
    The object name is passed through `use_objects` as a list; the previous
    code passed the file path in that position, which is not a list of
    object names.
    """
    local_path = pyreadr.download_file(url, path)
    return pyreadr.read_r(local_path, use_objects=[name])[name]


# Relative destinations keep the notebook runnable outside Colab; on Colab the
# working directory is /content, so the files end up in the same location.
url_A = "https://github.com/jacobmorrier/fast-er/blob/main/example/df_A_100000.Rdata?raw=true"
path_A = "df_A.Rdata"
df_A = load_rdata_frame(url_A, path_A, 'df_A')

url_B = "https://github.com/jacobmorrier/fast-er/blob/main/example/df_B_100000.Rdata?raw=true"
path_B = "df_B.Rdata"
df_B = load_rdata_frame(url_B, path_B, 'df_B')

# Convert house numbers to integer-formatted strings in both frames.
# NOTE(review): presumably the column is numeric in the R data and must be a
# string for the fuzzy comparison below -- confirm.
df_A['house_number'] = df_A['house_number'].astype(int).astype(str)
df_B['house_number'] = df_B['house_number'].astype(int).astype(str)

In [12]:
# Count the `ncid` values that appear in both data frames.
shared_ncids = np.intersect1d(df_A['ncid'], df_B['ncid'])
print('Number of common observations: ' + format(len(shared_ncids), ','))

Number of common observations: 50,000


In [13]:
%%time
# Calculation of String Similarity Metrics
comp = faster.Comparison(df_A, df_B,
                         Vars_Fuzzy_A=['last_name', 'first_name', 'house_number', 'street_name'],
                         Vars_Fuzzy_B=['last_name', 'first_name', 'house_number', 'street_name'],
                         Vars_Exact_A=['birth_year'], Vars_Exact_B=['birth_year'])

comp.fit()

# Estimation of the Fellegi-Sunter Model
est = faster.Estimation(4, 1, comp.Counts)

est.fit()

# Linkage
link = faster.Linkage(df_A, df_B, comp.Indices, est.Ksi)

linked_df = link.transform()

Convergence successfully achieved.
CPU times: user 22.8 s, sys: 1.61 s, total: 24.4 s
Wall time: 28.4 s


In [14]:
%%time
merged_df = df_A.merge(df_B, on = ['last_name', 'first_name', 'house_number', 'street_name', 'birth_year'])

CPU times: user 158 ms, sys: 2.91 ms, total: 161 ms
Wall time: 163 ms


In [15]:
# A link is counted as correct when both sides carry the same `ncid`.
is_correct_link = linked_df['ncid_A'] == linked_df['ncid_B']
n_shared_ncids = len(np.intersect1d(df_A['ncid'], df_B['ncid']))

# Precision: share of returned links that are correct.
precision = is_correct_link.mean()
# Recall: correct links divided by the number of `ncid` values present in both frames.
recall = is_correct_link.sum() / n_shared_ncids

print('Probabilistic Record Linkage:')
print('-----------------------------')
print('Precision:', format(precision, '.4f'))
print('Recall:', format(recall, '.4f'))

Probabilistic Record Linkage:
-----------------------------
Precision: 0.9802
Recall: 0.9983


In [19]:
# A merged row is counted as correct when both sides carry the same `ncid`
# (`ncid_x` / `ncid_y` are the suffixed columns produced by the merge).
is_correct_merge = merged_df['ncid_x'] == merged_df['ncid_y']
n_shared_ncids = len(np.intersect1d(df_A['ncid'], df_B['ncid']))

# Precision: share of merged rows that are correct.
precision = is_correct_merge.mean()
# Recall: correct rows divided by the number of `ncid` values present in both frames.
recall = is_correct_merge.sum() / n_shared_ncids

print('\'Hard\' Merge:')
print('-------------')
print('Precision:', format(precision, '.4f'))
print('Recall:', format(recall, '.4f'))

'Hard' Merge:
-------------
Precision: 1.0000
Recall: 0.3645
