In [417]:
import itertools
import time

import recordlinker
from recordlinker.blocking import BinaryEncoder, Blocker, Comparer
from recordlinker.metrics import normalized_l1

import numpy as np
import pandas as pd

import sklearn
from sklearn import metrics

from pyjarowinkler import distance

import multiprocessing as mp

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and pandas warnings emitted by later
# cells -- consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [412]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before code is executed and edits to them are picked up without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [5]:
# Directory holding the labelled Iowa 1915/1940 pair files.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
DATA_DIR = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/recordlinker/recordlinker/data'

# Load the labelled pairs, keep one row per (uid1915, hhid) pair, and tag each
# set with its ground-truth label (1 = match, 0 = non-match).
iowa_matches = (
    pd.read_csv(DATA_DIR + '/iowa_matches.csv')
    .drop_duplicates(subset=['uid1915', 'hhid'])
    .assign(match=1)
)
iowa_nonmatches = (
    pd.read_csv(DATA_DIR + '/iowa_nonmatches.csv')
    .drop_duplicates(subset=['uid1915', 'hhid'])
    .assign(match=0)
)

# Stack both sets; the original row labels of each file are kept (no ignore_index).
iowa = pd.concat([iowa_matches, iowa_nonmatches])

total_matches = len(iowa_matches['uid-hhid'])
# "Exact" here means the 1915 and 1940 last names are identical strings.
exact_matches = np.sum(iowa_matches['lname1915'] == iowa_matches['lname1940'])
print('Number of total matches: {}'.format(total_matches))
print('Number of exact matches: {}'.format(exact_matches))

# One row per distinct person on each side. Built as non-inplace chains so the
# derived frames are never mutated in place (in-place edits on a column
# selection trigger pandas' SettingWithCopyWarning).
# reset_index() without drop=True keeps the previous row labels in an 'index'
# column, exactly as the earlier in-place reset did.
names_1915 = (
    iowa[['lname1915', 'uid1915', 'yob1915', 'fname1915']]
    .drop_duplicates(subset=['uid1915'])
    .reset_index()
)
names_1940 = (
    iowa[['lname1940', 'hhid', 'yob1940', 'fname1940']]
    .drop_duplicates(subset=['hhid'])
    .reset_index()
)

Number of total matches: 4320
Number of exact matches: 3240


In [97]:
# Get indices of matches
# Build one id -> row-label lookup per side and map through it, instead of
# re-scanning the whole names frame with a boolean mask for every row of `iowa`.
uid_to_indexA = pd.Series(names_1915.index.to_numpy(),
                          index=names_1915['uid1915'].to_numpy())
hhid_to_indexB = pd.Series(names_1940.index.to_numpy(),
                           index=names_1940['hhid'].to_numpy())

# keep='first' mirrors the previous `.index[0]` choice should an id ever repeat,
# and guarantees the unique index that Series.map requires of its mapper.
uid_to_indexA = uid_to_indexA[~uid_to_indexA.index.duplicated(keep='first')]
hhid_to_indexB = hhid_to_indexB[~hhid_to_indexB.index.duplicated(keep='first')]

iowa['indexA'] = iowa['uid1915'].map(uid_to_indexA)
iowa['indexB'] = iowa['hhid'].map(hhid_to_indexB)

# An id absent from the lookup maps to NaN; fail loudly rather than carry it on
# (the previous per-row lookup raised in that situation as well).
assert iowa[['indexA', 'indexB']].notna().all().all(), \
    'uid1915/hhid value without a row in names_1915/names_1940'

In [98]:
# Row labels of the pairs whose ground-truth label is "match".
is_match = iowa['match'] == 1
indexA = iowa.loc[is_match, 'indexA']
indexB = iowa.loc[is_match, 'indexB']

## Create comparison matrix

In [142]:
# Encoder model used for blocking on last name.
# NOTE(review): absolute, machine-specific path.
model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/lstm_letter_8_iowa_last/encoder.h5'

# Block the 1915 records (A side) against the 1940 records (B side) using the
# autoencoder applied to the last-name column of each side.
blocker = Blocker(dfA=names_1915, dfB=names_1940)
blocks = blocker.block(
    autoencoder_col='lname1915',
    autoencoder_colB='lname1940',
    autoencoder_model_path=model_path,
)

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (8,)
Finished blocking with autoencoder in 11.7590 s


In [143]:
# Report blocking statistics, passing the row labels of the known true matches
# so the blocker can relate them to the blocks it produced.
blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)

Num Blocks: 203
Original Comparisons Needed: 449,404,991
Total Comparisons 4,391,383 : 0.98% of original
Avg Block Size: 21,632.43
Max Block Size: 330,624
Min Block Size: 1
Balance Score (1=even sizes): 0.000003
Num Matches Found 3672 Out Of 4320 (85.00%)
Num blocks containing matches 187, (92.12%)


In [441]:
# Encoder model used for the pairwise autoencoder comparison feature.
# NOTE(review): absolute, machine-specific path.
compare_model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/lstm_letter_192_iowa_last/encoder.h5'

comparer = Comparer(blocker)

# Last-name features: autoencoder comparison and Jaro-Winkler.
comparer.compare_autoencoder(
    colA='lname1915',
    colB='lname1940',
    model_path=compare_model_path,
)
comparer.compare_jarowinkler(
    colA='lname1915',
    colB='lname1940',
)

# Combine the two last-name features; this must run after both exist.
comparer.compare_product(
    a='autoencoder',
    b='jarowinkler',
)

# First-name Jaro-Winkler, stored under its own column name.
comparer.compare_jarowinkler(
    colA='fname1915',
    colB='fname1940',
    colname='jarowinkler-first',
)

Finished computing autoencoder feature in 125.471208 s


In [442]:
# Preview the first rows of the pairwise comparison features.
comparer.features.head()

Unnamed: 0,indexA,indexB,autoencoder,jarowinkler,product-autoencoder-jarowinkler,jarowinkler-first
0,1194,1182,1.0,1.0,1.0,1.0
1,1194,4470,0.5,0.52,0.26,0.46
2,1194,7406,0.546875,0.55,0.300781,0.61
3,1194,17520,1.0,1.0,1.0,0.56
4,1194,22496,0.526042,0.55,0.289323,0.53


In [443]:
# Per-feature cut-offs passed to the discretizer for the binary feature table.
binary_thresholds = {
    'autoencoder': 0.88,
    'jarowinkler': 0.88,
    'jarowinkler-first': 0.88,
    'product-autoencoder-jarowinkler': 0.8,
}

features = comparer.discretize(binary_thresholds, binary=True)
features.head()

Unnamed: 0,indexA,indexB,autoencoder,jarowinkler,product-autoencoder-jarowinkler,jarowinkler-first
0,1194,1182,1,1,1,1
1,1194,4470,0,0,0,0
2,1194,7406,0,0,0,0
3,1194,17520,1,1,1,0
4,1194,22496,0,0,0,0


####  Preparing CSVs for linkage using R fastLink

In [None]:
# Denote 2 for exact match, 1 for close match, 0 for no match
fastlink_thresholds = {
    'autoencoder': 0.8,
    'jarowinkler': 0.88,
    'product-autoencoder-jarowinkler': 0.8,
}

fastlink_features = comparer.discretize(fastlink_thresholds, binary=False)
fastlink_features.head()

In [439]:
# # Jaro Winkler only 
# jw_table = pd.pivot_table(fastlink_features, index='jarowinkler', values='indexA', aggfunc='count')
# jw_table.reset_index(inplace=True)
# jw_table.columns = ['gamma.1', 'counts']
# jw_table

# # Autoencoder only 
# auto_table = pd.pivot_table(fastlink_features, index='autoencoder', values='indexA', aggfunc='count')
# auto_table.reset_index(inplace=True)
# auto_table.columns = ['gamma.1', 'counts']
# auto_table

# # Product only 
# product_table = pd.pivot_table(fastlink_features, index='product-autoencoder-jarowinkler', values='indexA', aggfunc='count')
# product_table.reset_index(inplace=True)
# product_table.columns = ['gamma.1', 'counts']
# product_table

# # JW and Autoencoder 
# jw_auto_table = pd.pivot_table(fastlink_features, 
#                                index=['jarowinkler', 'autoencoder'], 
#                                values='indexA', 
#                                aggfunc='count')

# jw_auto_table.reset_index(inplace=True)
# jw_auto_table.columns = ['gamma.1', 'gamma.2', 'counts']
# jw_auto_table

## Metrics for linking 

1. False Negative 
2. False Positive 
3. Accuracy 
4. F1 
5. Confusion matrix 
6. Linkage time

In [394]:
import recordlinkage
import time 

In [445]:
# Candidate pairs indexed by (indexB, indexA), the pair order used for the
# true-match index below.
linkage_features = features.set_index(keys=['indexB', 'indexA'])

# Ground-truth links as a MultiIndex of (indexB, indexA) tuples.
match_pairs = list(zip(indexB, indexA))
true_matches = pd.MultiIndex.from_tuples(match_pairs)

In [462]:
# Jaro Winkler first and last
# Fit the K-means classifier on the last- and first-name Jaro-Winkler features
# and time the fit. perf_counter() is the clock intended for measuring elapsed
# intervals (monotonic, highest available resolution).
start_time = time.perf_counter()
jw_ecm = recordlinkage.KMeansClassifier()
jw_prediction = jw_ecm.fit_predict(linkage_features[['jarowinkler', 'jarowinkler-first']])
# '{:.4f}' prints four decimals; the earlier '{:4f}' only set a minimum field width of 4.
print('Prediction Time {:.4f} s'.format(time.perf_counter() - start_time))

Prediction Time 1.727759 s


In [465]:
# Confusion matrix for the Jaro-Winkler prediction.
# NOTE(review): true_matches is built from every labelled match, while the total
# passed here is the number of rows in linkage_features -- confirm this is the
# intended total if some true matches are not among the candidate pairs.
recordlinkage.confusion_matrix(
    true_matches,
    jw_prediction,
    len(linkage_features),
)

array([[   3283,    1037],
       [  16588, 4370475]])

In [474]:
# F-score, precision and recall of the Jaro-Winkler prediction, printed in that order.
for metric_label, metric_fn in (
    ('F: ', recordlinkage.fscore),
    ('Precision: ', recordlinkage.precision),
    ('Recall: ', recordlinkage.recall),
):
    print(metric_label, metric_fn(true_matches, jw_prediction))

F:  0.27142325658302674
Precision:  0.16521564088369986
Recall:  0.7599537037037037


In [482]:
# Autoencoder
# Fit the K-means classifier on the autoencoder feature alone and time the fit.
# perf_counter() is the clock intended for measuring elapsed intervals.
start_time = time.perf_counter()
auto_ecm = recordlinkage.KMeansClassifier()
auto_prediction = auto_ecm.fit_predict(linkage_features[['autoencoder']])
# '{:.4f}' prints four decimals; the earlier '{:4f}' only set a minimum field width of 4.
print('Prediction Time {:.4f} s'.format(time.perf_counter() - start_time))

Prediction Time 1.292093 s


In [483]:
# Confusion matrix for the autoencoder-only prediction.
recordlinkage.confusion_matrix(
    true_matches,
    auto_prediction,
    len(linkage_features),
)

array([[   3395,     925],
       [ 554387, 3832676]])

In [478]:
# F-score, precision and recall of the autoencoder-only prediction, printed in that order.
for metric_label, metric_fn in (
    ('F: ', recordlinkage.fscore),
    ('Precision: ', recordlinkage.precision),
    ('Recall: ', recordlinkage.recall),
):
    print(metric_label, metric_fn(true_matches, auto_prediction))

F:  0.012079658140337518
Precision:  0.0060866073125342875
Recall:  0.7858796296296297


In [484]:
# Jaro-Winkler (last and first name) plus autoencoder
# Fit the K-means classifier on the three features and time the fit.
# perf_counter() is the clock intended for measuring elapsed intervals.
start_time = time.perf_counter()
jw_auto_ecm = recordlinkage.KMeansClassifier()
jw_auto_prediction = jw_auto_ecm.fit_predict(
    linkage_features[['jarowinkler', 'jarowinkler-first', 'autoencoder']]
)
# '{:.4f}' prints four decimals; the earlier '{:4f}' only set a minimum field width of 4.
print('Prediction Time {:.4f} s'.format(time.perf_counter() - start_time))

Prediction Time 2.396527 s


In [485]:
# Confusion matrix for the Jaro-Winkler + autoencoder prediction.
# NOTE(review): the recorded output of this cell is identical to the
# autoencoder-only confusion matrix -- worth confirming that the added
# Jaro-Winkler features actually change the clustering.
recordlinkage.confusion_matrix(
    true_matches,
    jw_auto_prediction,
    len(linkage_features),
)

array([[   3395,     925],
       [ 554387, 3832676]])

In [486]:
# F-score, precision and recall of the Jaro-Winkler + autoencoder prediction, printed in that order.
for metric_label, metric_fn in (
    ('F: ', recordlinkage.fscore),
    ('Precision: ', recordlinkage.precision),
    ('Recall: ', recordlinkage.recall),
):
    print(metric_label, metric_fn(true_matches, jw_auto_prediction))

F:  0.012079658140337518
Precision:  0.0060866073125342875
Recall:  0.7858796296296297


In [487]:
# All four features
# Fit the K-means classifier on every comparison feature and time the fit.
# perf_counter() is the clock intended for measuring elapsed intervals.
start_time = time.perf_counter()
all_ecm = recordlinkage.KMeansClassifier()
all_prediction = all_ecm.fit_predict(
    linkage_features[['autoencoder', 'jarowinkler', 'jarowinkler-first',
                      'product-autoencoder-jarowinkler']]
)
# '{:.4f}' prints four decimals; the earlier '{:4f}' only set a minimum field width of 4.
print('Prediction Time {:.4f} s'.format(time.perf_counter() - start_time))

Prediction Time 2.726355 s


In [489]:
# Confusion matrix for the prediction that uses all four features.
recordlinkage.confusion_matrix(
    true_matches,
    all_prediction,
    len(linkage_features),
)

array([[   3483,     837],
       [ 594370, 3792693]])

In [491]:
# F-score, precision and recall of the all-features prediction, printed in that order.
for metric_label, metric_fn in (
    ('F: ', recordlinkage.fscore),
    ('Precision: ', recordlinkage.precision),
    ('Recall: ', recordlinkage.recall),
):
    print(metric_label, metric_fn(true_matches, all_prediction))

F:  0.011568104182685043
Precision:  0.005825846821877619
Recall:  0.80625
