In [9]:
import recordlinker

from recordlinker.preprocess import create_training_set
from recordlinker.blocking import BinaryEncoder

import numpy as np
import pandas as pd

import sklearn
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [4]:
%reload_ext autoreload
%autoreload 2

Load the data and mark matches and nonmatches

In [14]:
iowa_matches = pd.read_csv('/Users/kailinlu/Desktop/QMSSWork/RecordLinking/recordlinker/recordlinker/data/iowa_matches.csv')
iowa_nonmatches = pd.read_csv('/Users/kailinlu/Desktop/QMSSWork/RecordLinking/recordlinker/recordlinker/data/iowa_nonmatches.csv')

iowa_matches['match'] = 1
iowa_nonmatches['match'] = 0

iowa = pd.concat([iowa_matches, iowa_nonmatches])
total_matches = len(iowa_matches['uid-hhid'])
print('Number of total matches: {}'.format(total_matches))

Number of total matches: 4320


Extract unique IDs for 1915 names and unique IDs for 1940 names. 

There are 6881 unique 1915 people and 65939 unique 1940 people. We want to match the correct uid-hhid pairs as denoted in the iowa dataframe. 

In [10]:
names_1915 = iowa[['lname1915', 'uid1915']]
names_1915.drop_duplicates(inplace=True)
names_1940 = iowa[['lname1940', 'hhid']]
names_1940.drop_duplicates(inplace=True)

In [11]:
names_1915.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6881 entries, 0 to 74895
Data columns (total 2 columns):
lname1915    6881 non-null object
uid1915      6881 non-null object
dtypes: object(2)
memory usage: 161.3+ KB


In [12]:
names_1940.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65939 entries, 0 to 75022
Data columns (total 2 columns):
lname1940    65939 non-null object
hhid         65939 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


### Blocking

We will test the quality of the blocks from the LSTM model with 2, 4, 8, and 16 latent variables.

Metrics: 

1. Number of blocks 
2. Average size, max size, and min size of each block 
3. % of all matches found within the same block 

In [187]:
def _block_autoencoder(dfA, dfB, 
                       autoencoder_model_path,
                       autoencoder_col,
                       autoencoder_colB=None,
                       embed_type='letters'):
    
    if autoencoder_colB is None: 
        autoencoder_colB = autoencoder_col
    assert all(isinstance(name, str) for name in dfA[autoencoder_col])
    assert all(isinstance(name, str) for name in dfB[autoencoder_colB])

    encoder = BinaryEncoder(autoencoder_model_path)
    input_dim = encoder.input_dim
    if len(input_dim) == 3:
        train_data = create_training_set(dfA,
                                               autoencoder_col,
                                               max_length=input_dim[1],
                                               embed_type='letters',
                                               normalize=False,
                                               categorical=True)
        match_data = create_training_set(dfB,
                                               autoencoder_colB,
                                               max_length=input_dim[1],
                                               embed_type='letters',
                                               normalize=False,
                                               categorical=True)
    else:
        train_data = create_training_set(dfA,
                                               autoencoder_col,
                                               max_length=input_dim[1],
                                               embed_type=embed_type,
                                               normalize=True,
                                               categorical=False)
        match_data = create_training_set(dfB,
                                               autoencoder_colB,
                                               max_length=input_dim[1],
                                               embed_type=embed_type,
                                               normalize=True,
                                               categorical=False)

    train_encoded = encoder.calculate_and_encode(train_data, split=True)
    match_encoded = encoder.binary_encode(match_data, split=True)

    unique_blocks =  [self.stringify(vec) for vec in np.unique(train_encoded, axis=0)]
    blocks = dict.fromkeys(unique_blocks)
    for k, v in blocks.items():
        blocks[k] = {'A':[], 'B':[]}

    for i, vec in enumerate(train_encoded):
        key = self.stringify(vec)
        blocks[key]['A'].append(i)

    for i, vec in enumerate(match_encoded):
        key = self.stringify(vec)
        if key in blocks.keys():
            blocks[key]['B'].append(i)
    return blocks

In [188]:
model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/lstm_letter_2_iowa_last/encoder.h5'

blocks = _block_autoencoder(dfA=names_1915, dfB=names_1940,
                            autoencoder_model_path=model_path,
                            autoencoder_col='lname1915',
                            autoencoder_colB='lname1940',
                            embed_type='letters')

AttributeError: 'BinaryEncoder' object has no attribute 'calculate_and_encode'

In [189]:
dir(BinaryEncoder)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'binary_encode']