In [205]:
import itertools
import time

import recordlinker
from recordlinker.blocking import BinaryEncoder, Blocker, Comparer
from recordlinker.metrics import normalized_l1

import numpy as np
import pandas as pd

import sklearn
from sklearn import metrics

from pyjarowinkler import distance

import multiprocessing as mp

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Blanket filter: with no category or module given, every warning from every
# library is suppressed for the rest of the kernel session.
# NOTE(review): this also hides any pandas warnings raised by later cells --
# consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

In [4]:
# Load (or reload, if already loaded) IPython's autoreload extension.
%reload_ext autoreload
# Mode 2: reload all imported modules before each cell runs, so edits to local
# packages are picked up without restarting the kernel.
%autoreload 2

In [5]:
# Load the labelled 1915 -> 1940 Iowa record pairs and derive one de-duplicated
# name table per census year.
#
# NOTE(review): DATA_DIR is an absolute, machine-specific location. Point it at
# your local copy of the data (ideally relative to the repository) before running.
DATA_DIR = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/recordlinker/recordlinker/data'
PAIR_KEYS = ['uid1915', 'hhid']  # one row per (1915 record, 1940 record) pair

# `match` is the ground-truth label: 1 for linked pairs, 0 for non-links.
# Non-mutating chains keep the cell idempotent when it is re-run.
iowa_matches = (
    pd.read_csv(DATA_DIR + '/iowa_matches.csv')
    .drop_duplicates(subset=PAIR_KEYS)
    .assign(match=1)
)
iowa_nonmatches = (
    pd.read_csv(DATA_DIR + '/iowa_nonmatches.csv')
    .drop_duplicates(subset=PAIR_KEYS)
    .assign(match=0)
)

# Row labels from the two source frames are kept as-is (no ignore_index), so
# labels may repeat in `iowa`.
iowa = pd.concat([iowa_matches, iowa_nonmatches])

total_matches = len(iowa_matches['uid-hhid'])
# A pair counts as "exact" when the surname is spelled identically in both years.
exact_matches = (iowa_matches['lname1915'] == iowa_matches['lname1940']).sum()
print('Number of total matches: {}'.format(total_matches))
print('Number of exact matches: {}'.format(exact_matches))

# One row per unique record on each side. Chaining avoids calling in-place
# methods on a column-slice of `iowa`, which can raise SettingWithCopyWarning.
# reset_index() (without drop) keeps the old row labels in an `index` column and
# gives each table a fresh 0..n-1 index.
names_1915 = (
    iowa[['lname1915', 'uid1915', 'yob1915', 'fname1915']]
    .drop_duplicates(subset=['uid1915'])
    .reset_index()
)
names_1940 = (
    iowa[['lname1940', 'hhid', 'yob1940', 'fname1940']]
    .drop_duplicates(subset=['hhid'])
    .reset_index()
)

Number of total matches: 4320
Number of exact matches: 3240


In [97]:
# Get indices of matches
# For every labelled pair, look up the index label of its 1915 record in
# names_1915 (indexA) and of its 1940 record in names_1940 (indexB).
# Each key appears once in its name table (duplicates were dropped on uid1915 /
# hhid), so a key -> index lookup Series gives the same result as scanning the
# table for every row, without the quadratic cost.
index_by_uid1915 = pd.Series(names_1915.index, index=names_1915['uid1915'].to_numpy())
index_by_hhid = pd.Series(names_1940.index, index=names_1940['hhid'].to_numpy())

iowa['indexA'] = iowa['uid1915'].map(index_by_uid1915)
iowa['indexB'] = iowa['hhid'].map(index_by_hhid)

In [98]:
# Restrict to the rows labelled as true matches (match == 1) and pull out the
# index columns for each side in a single .loc selection.
indexA = iowa.loc[iowa['match'] == 1, 'indexA']
indexB = iowa.loc[iowa['match'] == 1, 'indexB']

## Create comparison matrix

In [142]:
# Saved encoder handed to the blocker below.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/lstm_letter_8_iowa_last/encoder.h5'

# Block the 1915 name table (dfA) against the 1940 name table (dfB) on the
# surname columns, using the encoder at model_path.
# NOTE(review): presumably the encoder's output for each surname decides which
# block a record falls into -- confirm in recordlinker.blocking.
blocker = Blocker(dfA=names_1915, dfB=names_1940)
blocks = blocker.block(autoencoder_col='lname1915',
                       autoencoder_colB='lname1940', 
                       autoencoder_model_path=model_path)

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (8,)
Finished blocking with autoencoder in 11.7590 s


In [143]:
# Summarise the blocking result against the labelled matches: the recorded output
# reports block counts and sizes, the comparisons remaining, and how many of the
# known match pairs (indexA, indexB) were found.
blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)

Num Blocks: 203
Original Comparisons Needed: 449,404,991
Total Comparisons 4,391,383 : 0.98% of original
Avg Block Size: 21,632.43
Max Block Size: 330,624
Min Block Size: 1
Balance Score (1=even sizes): 0.000003
Num Matches Found 3672 Out Of 4320 (85.00%)
Num blocks containing matches 187, (92.12%)


In [206]:
# Encoder used for the pairwise comparison feature.
compare_model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/lstm_letter_192_iowa_last/encoder.h5'

# Both features compare the 1915 surname column with the 1940 surname column.
surname_cols = {'colA': 'lname1915', 'colB': 'lname1940'}

comparer = Comparer(blocker)
comparer.compare_autoencoder(**surname_cols, model_path=compare_model_path)
comparer.compare_jarowinkler(**surname_cols)

# Preview the resulting feature table.
comparer.features.head()

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (192,)
Finished computing autoencoder feature in 65.483876 s
Finished computing autoencoder feature in 132.162353 s


Unnamed: 0,indexA,indexB,autoencoder,jarowinkler
0,1194,1182,1.0,1.0
1,1194,4470,0.5,0.52
2,1194,7406,0.546875,0.55
3,1194,17520,1.0,1.0
4,1194,22496,0.526042,0.55


In [211]:
# Per-feature cut-offs handed to binarize; both features use the same value.
feature_thresholds = {'autoencoder': 0.84, 'jarowinkler': 0.84}
binarized_features = comparer.binarize(feature_thresholds)

In [212]:
# Preview the first rows of the thresholded feature table.
binarized_features.head()

Unnamed: 0,indexA,indexB,autoencoder,jarowinkler
0,1194,1182,1,1
1,1194,4470,0,0
2,1194,7406,0,0
3,1194,17520,1,1
4,1194,22496,0,0


## Metrics for linking 

1. False Negative 
2. False Positive 
3. Accuracy 
4. F1 
5. Confusion matrix 
6. Linkage time