In [226]:
import itertools
import time

import recordlinker
from recordlinker.blocking import BinaryEncoder, Blocker, Linker
from recordlinker.metrics import normalized_l1

import numpy as np
import pandas as pd

import sklearn
from sklearn import metrics

from pyjarowinkler import distance

import warnings
warnings.filterwarnings('ignore')

In [200]:
%reload_ext autoreload
%autoreload 2

In [201]:
iowa_matches = pd.read_csv(
    '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/recordlinker/recordlinker/data/iowa_matches.csv')
iowa_matches.drop_duplicates(subset=['uid1915', 'hhid'], inplace=True)
iowa_nonmatches = pd.read_csv(
    '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/recordlinker/recordlinker/data/iowa_nonmatches.csv')
iowa_nonmatches.drop_duplicates(subset=['uid1915', 'hhid'], inplace=True)

iowa_matches['match'] = 1
iowa_nonmatches['match'] = 0

iowa = pd.concat([iowa_matches, iowa_nonmatches])
total_matches = len(iowa_matches['uid-hhid'])
exact_matches = np.sum(iowa_matches['lname1915'] == iowa_matches['lname1940'])
print('Number of total matches: {}'.format(total_matches))
print('Number of exact matches: {}'.format(exact_matches))

names_1915 = iowa[['lname1915', 'uid1915', 'yob1915', 'fname1915']]
names_1915.drop_duplicates(subset=['uid1915'], inplace=True)
names_1940 = iowa[['lname1940', 'hhid', 'yob1940', 'fname1940']]
names_1940.drop_duplicates(subset=['hhid'], inplace=True)
names_1915.reset_index(inplace=True)
names_1940.reset_index(inplace=True)


Number of total matches: 4320
Number of exact matches: 3240


## Create comparison matrix

In [202]:
model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/lstm_letter_8_iowa_last/encoder.h5'

blocker = Blocker(dfA=names_1915, dfB=names_1940)
blocks = blocker.block(autoencoder_col='lname1915',
                       autoencoder_colB='lname1940', 
                       autoencoder_model_path=model_path)

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (8,)


In [203]:
blocker.compute_block_metrics()

Num Blocks: 203
Original Comparisons Needed: 449,404,991
Total Comparisons 4,391,383 : 0.98% of original
Avg Block Size: 21,632.43
Max Block Size: 330,624
Min Block Size: 1
Balance Score (1=even sizes): 0.000003


In [227]:
linker = Linker(blocker)
comparisons = linker.compare(cols=['lname1915', 'lname1940'], 
                            jaro=True, 
                            autoencoder=False)

Checkpoint 1.327187
Checkpoint 1.576893
Checkpoint 42.995814


In [278]:
A = names_1915['lname1915'].iloc[comparisons['indexA']]
B = names_1940['lname1940'].iloc[comparisons['indexB']]

In [307]:
def enc_dist(args):
    a,b = args
    vecA = blocker.encoder.encode(blocker.preprocess(names_1915.iloc[a], 'lname1915'))
    vecB = blocker.encoder.encode(blocker.preprocess(names_1940.iloc[b], 'lname1940'))
    return normalized_l1(vecA, vecB)

def autoencoder_dist(args):
    return np.reshape([enc_dist(arg) for arg in args],-1)

In [None]:
# start_time = time.time()
# p = Pool(processes=8)
# items = list(zip(comparisons['indexA'],comparisons['indexB']))[:10]
# # print(time.time() - start_time)
# batch_size = 2
# # #batch_size = len(items) // 8
# items_split = [items[i:i+batch_size] for i in range(0, len(items), batch_size)]
# # print(time.time() - start_time)
# results = p.map(autoencoder_dist, items)
# print(time.time()-start_time)

Process ForkPoolWorker-1092:
Process ForkPoolWorker-1091:
Process ForkPoolWorker-1090:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in ru

KeyboardInterrupt: 

KeyboardInterrupt
KeyboardInterrupt
  File "/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt


In [322]:
start_time = time.time()
items = list(zip(comparisons['indexA'],comparisons['indexB']))[:1000]
print(time.time()-start_time)
autoencoder_dist(items)
print(time.time()-start_time)

1.4534540176391602
8.600399017333984


## Metrics for linking 

1. False Negative 
2. False Positive 
3. Accuracy 
4. F1 
5. Confusion matrix 
6. Linkage time

In [143]:
comparisons['jaro-winkler'] = pd.Series(results)