In [31]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import recordlinker.model
from recordlinker import preprocess
%matplotlib inline

import warnings
# NOTE(review): the second positional argument of filterwarnings is `message`, a regex
# matched against the start of the warning text. As written, this only hides warnings
# whose message begins with "info" (case-insensitive). If the goal was to silence all
# warnings, or info-level log output, this call does not do that -- TODO confirm intent.
warnings.filterwarnings('ignore', 'info')

In [3]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each
# cell executes, so edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Folder holding the labelled CSV files. The default is the original local checkout;
# set the RECORDLINKER_DATA_DIR environment variable to run this on another machine.
DATA_DIR = os.environ.get(
    'RECORDLINKER_DATA_DIR',
    '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/recordlinker/recordlinker/data')

# Iowa record pairs: one file of matches and one of non-matches (per the file names).
iowa_matches = pd.read_csv(os.path.join(DATA_DIR, 'iowa_matches.csv'))
iowa_nonmatches = pd.read_csv(os.path.join(DATA_DIR, 'iowa_nonmatches.csv'))

In [5]:
# Preview the first rows of the Iowa matches to check the columns that were loaded.
iowa_matches.head()

Unnamed: 0,uid1915,fname1915,lname1915,fullname1915,yob1915,hhid,fname1940,lname1940,fullname1940,yob1940,uid-hhid
0,uid0910071227,donald d,cutler,donald d cutler,1911,19067,donald dean,cutler,donald dean cutler,1911,uid0910071227-19067
1,uid0063131339,homer,taylor,homer taylor,1912,71505,homer ellis,taylor,homer ellis taylor,1912,uid0063131339-71505
2,uid0044088276,earl,stearnes,earl stearnes,1899,109708,earl,stearns,earl stearns,1900,uid0044088276-109708
3,uid0067053130,theodore,hornaday,theodore hornaday,1904,108304,theodore i,harnaday,theodore i harnaday,1904,uid0067053130-108304
4,uid0066046148,jack r,turner,jack r turner,1907,105092,jack r,turner,jack r turner,1907,uid0066046148-105092


In [6]:
# Folder holding the labelled CSV files. The default is the original local checkout;
# set the RECORDLINKER_DATA_DIR environment variable to run this on another machine.
DATA_DIR = os.environ.get(
    'RECORDLINKER_DATA_DIR',
    '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/recordlinker/recordlinker/data')

# Union Army matched name pairs (per the file name).
union_matches = pd.read_csv(os.path.join(DATA_DIR, 'unionarmy_matches.csv'))

In [7]:
# Preview the first rows of the Union Army matches to check the columns that were loaded.
union_matches.head()

Unnamed: 0,recidnum,recname1,recname2,last1,first1,last2,first2
0,100501001,anson charles h,anson charles h,anson,charles h,anson,charles h
1,100501002,allsheskey theodore f,allsheskey theodore f,allsheskey,theodore f,allsheskey,theodore f
2,100501003,bill charles w,bill c w,bill,charles w,bill,c w
3,100501004,bradley george a,bradley george a,bradley,george a,bradley,george a
4,100501005,bunitt william n,burritt william n,bunitt,william n,burritt,william n


### String Embedding Example

In [42]:
from recordlinker.preprocess import embed_letters, embed_shingles, disembed_letters, disembed_shingles

name = 'kailin lu'
max_length = 12

# Each row: (printed label, embedding function, its inverse, extra keyword arguments).
# For every variant we print the embedded vector followed by the string recovered
# by running the inverse on that embedding.
demos = [
    ('Embed Letters: \n', embed_letters, disembed_letters, {}),
    ('Embed Letters Normalized: \n', embed_letters, disembed_letters, {'normalize': True}),
    ('Embed 2-Shingles: \n', embed_shingles, disembed_shingles, {}),
    ('Embed 2-Shingles Normalized: \n', embed_shingles, disembed_shingles, {'normalize': True}),
]

for label, embed_fn, disembed_fn, extra in demos:
    print(label,
          embed_fn(name, max_length, **extra),
          disembed_fn(embed_fn(name, max_length, **extra)))

Embed Letters: 
 [11  1  9 12  9 14 27 12 21  0  0  0] kailin lu
Embed Letters Normalized: 
 [0.40740741 0.03703704 0.33333333 0.44444444 0.33333333 0.51851852
 1.         0.44444444 0.77777778 0.         0.         0.        ] kailin lu
Embed 2-Shingles: 
 [261   8 219 295 221 364 688 306   0   0   0   0] kailin lu
Embed 2-Shingles Normalized: 
 [0.35753425 0.0109589  0.3        0.40410959 0.30273973 0.49863014
 0.94246575 0.41917808 0.         0.         0.         0.        ] kailin lu


## Train and save autoencoders

### Dense

In [71]:
# Hyperparameters for the dense VAE trained on letter embeddings.
ORIG_LENGTH = 12         # max_length for the embedding and orig_dim (input width) of the VAE
BATCH_SIZE = 32
ENCODE_DIM = [256, 128]  # encoder layer widths (enc_0, enc_1 in the model summary below)
DECODE_DIM = [128, 256]
LR = 1e-4
EPOCHS = 301
LATENT_DIM = [16, 24]    # one model is trained per latent size

# Root folder for saved models. The default is the original local location; set the
# RECORDLINKER_MODEL_DIR environment variable to save somewhere else.
MODEL_DIR = os.environ.get('RECORDLINKER_MODEL_DIR',
                           '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models')

# Embed letters: the 1915 and 1940 last names of each matched Iowa pair are turned
# into normalized fixed-length vectors and passed to train() together.
namesA = preprocess.embed(iowa_matches['lname1915'],
                          max_length=ORIG_LENGTH,
                          embed_type='letters',
                          normalize=True)
namesB = preprocess.embed(iowa_matches['lname1940'],
                          max_length=ORIG_LENGTH,
                          embed_type='letters',
                          normalize=True)

for latent_dim in LATENT_DIM:
    # The trailing '' keeps the trailing path separator the original save_path had.
    save_path = os.path.join(MODEL_DIR, 'dense_letter_{}'.format(latent_dim), '')
    run_id = 'dense_{}'.format(latent_dim)
    vae = recordlinker.model.VAE(batch_size=BATCH_SIZE,
                                 orig_dim=ORIG_LENGTH,
                                 latent_dim=latent_dim,
                                 encode_dim=ENCODE_DIM,
                                 decode_dim=DECODE_DIM,
                                 lr=LR)

    # model/encoder/decoder are rebound on every iteration, so after the loop they
    # refer to the last latent size only.
    model, encoder, decoder = vae.train(namesA, namesB,
                                        epochs=EPOCHS,
                                        run_id=run_id,
                                        save_path=save_path,
                                        optimizer='adam',
                                        tensorboard=True,
                                        earlystop=True,
                                        earlystop_patience=10,
                                        reconstruct=True,
                                        reconstruct_display=10)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12)           0                                            
__________________________________________________________________________________________________
enc_0 (Dense)                   (None, 256)          3328        input_1[0][0]                    
__________________________________________________________________________________________________
enc_1 (Dense)                   (None, 128)          32896       enc_0[0][0]                      
__________________________________________________________________________________________________
mu (Dense)                      (None, 16)           2064        enc_1[0][0]                      
__________________________________________________________________________________________________
log_sigma 

Epoch 37/301
Epoch 38/301
Epoch 39/301
Epoch 40/301
Epoch 41/301
Sample Reconstructions:
{'Orig': 'patrick', 'Pred:': 'hcsnkgb     '}
{'Orig': 'larson', 'Pred:': 'kcupmoa     '}
{'Orig': 'dolan', 'Pred:': 'hnpciaba    '}
{'Orig': 'dannenfeldt', 'Pred:': 'dgrqemkmgcaa'}
{'Orig': 'pingree', 'Pred:': 'nioeqec     '}
Epoch 42/301
Epoch 43/301
Epoch 44/301
Epoch 45/301
Epoch 46/301
Epoch 47/301
Epoch 48/301
Epoch 49/301
Epoch 50/301
Epoch 51/301
Sample Reconstructions:
{'Orig': 'swanson', 'Pred:': 'rudmqoncbaa '}
{'Orig': 'nelson', 'Pred:': 'mfnrmmb     '}
{'Orig': 'coughlin', 'Pred:': 'fluhejflca  '}
{'Orig': 'jansen', 'Pred:': 'icprela     '}
{'Orig': 'baker', 'Pred:': 'ecpfmad     '}
Epoch 52/301
Epoch 53/301
Epoch 54/301
Epoch 55/301
Epoch 56/301
Epoch 57/301
Epoch 58/301
Epoch 59/301
Epoch 60/301
Epoch 61/301
Sample Reconstructions:
{'Orig': 'schlueter', 'Pred:': 'lgkjugufgaa '}
{'Orig': 'bloodgood', 'Pred:': 'dhonegkiebaa'}
{'Orig': 'solnar', 'Pred:': 'snmndn      '}
{'Orig': 'dorey',

Train on 3456 samples, validate on 864 samples
Epoch 1/301
Sample Reconstructions:
{'Orig': 'jacobs', 'Pred:': 'ifmmeneba   '}
{'Orig': 'casey', 'Pred:': 'jjrgngca    '}
{'Orig': 'haben', 'Pred:': 'kmljjfdba   '}
{'Orig': 'mehaffey', 'Pred:': 'kkllkkjgeccc'}
{'Orig': 'lacy', 'Pred:': 'jiincfca    '}
Epoch 2/301
Epoch 3/301
Epoch 4/301
Epoch 5/301
Epoch 6/301
Epoch 7/301
Epoch 8/301
Epoch 9/301
Epoch 10/301
Epoch 11/301
Sample Reconstructions:
{'Orig': 'mueller', 'Pred:': 'jtellekbba  '}
{'Orig': 'kopel', 'Pred:': 'koqejbaa    '}
{'Orig': 'eisland', 'Pred:': 'ejsldkba    '}
{'Orig': 'turner', 'Pred:': 'suqngoabaa  '}
{'Orig': 'riesberg', 'Pred:': 'oifqeekfeca '}
Epoch 12/301
Epoch 13/301
Epoch 14/301
Epoch 15/301
Epoch 16/301
Epoch 17/301
Epoch 18/301
Epoch 19/301
Epoch 20/301
Epoch 21/301
Sample Reconstructions:
{'Orig': 'luce', 'Pred:': 'ltgecba     '}
{'Orig': 'cooper', 'Pred:': 'dppmgqb     '}
{'Orig': 'fitzgerald', 'Pred:': 'fjrvjgoedca '}
{'Orig': 'tenny', 'Pred:': 'sgomxba     '}

In [30]:
# Hyperparameters for the dense VAE trained on shingle embeddings.
ORIG_LENGTH = 12         # max_length for the embedding and orig_dim (input width) of the VAE
BATCH_SIZE = 32
ENCODE_DIM = [128, 128]  # encoder layer widths (enc_0, enc_1 in the model summary below)
DECODE_DIM = [128, 128]
LR = 5e-4
EPOCHS = 301
EMBED_TYPE = 'shingles'
LATENT_DIM = [2, 4, 8, 16, 24]  # one model is trained per latent size

# Root folder for saved models. The default is the original local location; set the
# RECORDLINKER_MODEL_DIR environment variable to save somewhere else.
MODEL_DIR = os.environ.get('RECORDLINKER_MODEL_DIR',
                           '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models')

# Embed shingles (not letters): the 1915 and 1940 last names of each matched Iowa pair
# are turned into normalized fixed-length vectors and passed to train() together.
namesA = preprocess.embed(iowa_matches['lname1915'],
                          max_length=ORIG_LENGTH,
                          embed_type=EMBED_TYPE,
                          normalize=True)
namesB = preprocess.embed(iowa_matches['lname1940'],
                          max_length=ORIG_LENGTH,
                          embed_type=EMBED_TYPE,
                          normalize=True)

for latent_dim in LATENT_DIM:
    # NOTE(review): the folder name says "union_first", but this cell trains on the
    # Iowa last-name columns -- TODO confirm the intended dataset or rename the folder.
    # The trailing '' keeps the trailing path separator the original save_path had.
    save_path = os.path.join(MODEL_DIR,
                             'dense_shingle_{}_union_first'.format(latent_dim), '')
    # NOTE(review): this run_id pattern is identical to the one used for the
    # letter-embedding runs; if run_id names the TensorBoard run, the two experiments
    # may be hard to tell apart -- TODO confirm.
    run_id = 'dense_{}'.format(latent_dim)
    vae = recordlinker.model.VAE(batch_size=BATCH_SIZE,
                                 orig_dim=ORIG_LENGTH,
                                 latent_dim=latent_dim,
                                 encode_dim=ENCODE_DIM,
                                 decode_dim=DECODE_DIM,
                                 lr=LR)

    # model/encoder/decoder are rebound on every iteration, so after the loop they
    # refer to the last latent size only; the log below shows the encoder and decoder
    # of each run being written under save_path.
    model, encoder, decoder = vae.train(namesA, namesB,
                                        epochs=EPOCHS,
                                        run_id=run_id,
                                        save_path=save_path,
                                        optimizer='adam',
                                        tensorboard=True,
                                        earlystop=True,
                                        earlystop_patience=15,
                                        reconstruct=True,
                                        reconstruct_type='s',  # presumably selects shingle decoding -- TODO confirm
                                        reconstruct_display=10)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12)           0                                            
__________________________________________________________________________________________________
enc_0 (Dense)                   (None, 128)          1664        input_1[0][0]                    
__________________________________________________________________________________________________
enc_1 (Dense)                   (None, 128)          16512       enc_0[0][0]                      
__________________________________________________________________________________________________
mu (Dense)                      (None, 2)            258         enc_1[0][0]                      
__________________________________________________________________________________________________
log_sigma 

Epoch 36/301
Epoch 37/301
Epoch 38/301
Epoch 39/301
Epoch 40/301
Epoch 41/301
Sample Reconstructions:
{'Orig': 'ingebritson', 'Pred:': 'lqgmnsqnfcbaj'}
{'Orig': 'platt', 'Pred:': 'nrgoicaaaaaab'}
{'Orig': 'sproston', 'Pred:': 'kmkmmplgdbaag'}
{'Orig': 'platt', 'Pred:': 'nrgoicaaaaaab'}
{'Orig': 'hopkins', 'Pred:': 'kknmkidbaaaad'}
Epoch 42/301
Epoch 43/301
Epoch 44/301
Epoch 45/301
Saved encoder in: /Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/dense_shingle_2_union_first/encoder.h5
Saved decoder in: /Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/dense_shingle_2_union_first/decoder.h5
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12)           0                                            
________________________________________________________________________________________________

Epoch 3/301
Epoch 4/301
Epoch 5/301
Epoch 6/301
Epoch 7/301
Epoch 8/301
Epoch 9/301
Epoch 10/301
Epoch 11/301
Sample Reconstructions:
{'Orig': 'kothenbentel', 'Pred:': 'jntgjkfcbaaaj'}
{'Orig': 'stoffers', 'Pred:': 'nsqwmedbbaaam'}
{'Orig': 'mairet', 'Pred:': 'kdkqebaaaaaad'}
{'Orig': 'glackemeyer', 'Pred:': 'kkffkngebaaak'}
{'Orig': 'morr', 'Pred:': 'ioydcbaaaaaad'}
Epoch 12/301
Epoch 13/301
Epoch 14/301
Epoch 15/301
Epoch 16/301
Epoch 17/301
Epoch 18/301
Epoch 19/301
Epoch 20/301
Epoch 21/301
Sample Reconstructions:
{'Orig': 'mulford', 'Pred:': 'lslgllgebaaah'}
{'Orig': 'daniels', 'Pred:': 'hcqfgibaaaaad'}
{'Orig': 'mayer', 'Pred:': 'hdxecbaaaaaab'}
{'Orig': 'hughes', 'Pred:': 'ltifebaaaaaac'}
{'Orig': 'inman', 'Pred:': 'jmnccbaaaaaab'}
Epoch 22/301
Epoch 23/301
Epoch 24/301
Epoch 25/301
Epoch 26/301
Epoch 27/301
Epoch 28/301
Epoch 29/301
Epoch 30/301
Epoch 31/301
Sample Reconstructions:
{'Orig': 'moeller', 'Pred:': 'nohwmdcaaaaae'}
{'Orig': 'lafler', 'Pred:': 'jdgjebaaaaaab'}
{'Orig

Epoch 3/301
Epoch 4/301
Epoch 5/301
Epoch 6/301
Epoch 7/301
Epoch 8/301
Epoch 9/301
Epoch 10/301
Epoch 11/301
Sample Reconstructions:
{'Orig': 'mcclain', 'Pred:': 'iwfhddbbaaaae'}
{'Orig': 'menkel', 'Pred:': 'leohdbaaaaaab'}
{'Orig': 'messink', 'Pred:': 'iezpkhcaaaaac'}
{'Orig': 'muto', 'Pred:': 'lttcbbaaaaaab'}
{'Orig': 'lund', 'Pred:': 'ktncbbaaaaaab'}
Epoch 12/301
Epoch 13/301
Epoch 14/301
Epoch 15/301
Epoch 16/301
Epoch 17/301
Epoch 18/301
Epoch 19/301
Epoch 20/301
Epoch 21/301
Sample Reconstructions:
{'Orig': 'carter', 'Pred:': 'edrsecaaaaaab'}
{'Orig': 'lynch', 'Pred:': 'lxlebaaaaaaac'}
{'Orig': 'stutsman', 'Pred:': 'qtstthdbaaaaf'}
{'Orig': 'lunde', 'Pred:': 'lulecaaaaaaac'}
{'Orig': 'emery', 'Pred:': 'flfqcbaaaaaab'}
Epoch 22/301
Epoch 23/301
Epoch 24/301
Epoch 25/301
Epoch 26/301
Epoch 27/301
Epoch 28/301
Epoch 29/301
Epoch 30/301
Epoch 31/301
Sample Reconstructions:
{'Orig': 'caviness', 'Pred:': 'hdvlgnldcaaae'}
{'Orig': 'belding', 'Pred:': 'cekekmaaaaaac'}
{'Orig': 'stepanek

Epoch 53/301
Epoch 54/301
Epoch 55/301
Epoch 56/301
Epoch 57/301
Epoch 58/301
Epoch 59/301
Epoch 60/301
Epoch 61/301
Sample Reconstructions:
{'Orig': 'murphy', 'Pred:': 'lrtngaaaaab'}
{'Orig': 'menkel', 'Pred:': 'ldnidaaaa'}
{'Orig': 'stubbs', 'Pred:': 'rruxdaaaaaab'}
{'Orig': 'dodd', 'Pred:': 'djybbaaaa'}
{'Orig': 'cunningham', 'Pred:': 'erzlhodbaaaac'}
Epoch 62/301
Saved encoder in: /Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/dense_shingle_24_union_first/encoder.h5
Saved decoder in: /Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/dense_shingle_24_union_first/decoder.h5


### LSTM

In [22]:
# Train the LSTM VAE on one-hot encoded last names.
ORIG_LENGTH = 12   # max_length for the embedding; number of timesteps fed to the LSTM
classes = 28       # size of the per-character vector (orig_dim of the model)
LR = 5e-4
BATCH_SIZE = 32
EPOCHS = 350
LATENT_DIM = [4]   # one model is trained per latent size

# Root folder for saved models. The default is the original local location; set the
# RECORDLINKER_MODEL_DIR environment variable to save somewhere else.
MODEL_DIR = os.environ.get('RECORDLINKER_MODEL_DIR',
                           '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models')

# One hot encoding of names: unnormalized letter codes with categorical=True, giving
# inputs of shape (ORIG_LENGTH, classes) per name as shown in the model summary below.
namesA = preprocess.embed(iowa_matches['lname1915'],
                          max_length=ORIG_LENGTH,
                          embed_type='letters',
                          normalize=False,
                          categorical=True)

namesB = preprocess.embed(iowa_matches['lname1940'],
                          max_length=ORIG_LENGTH,
                          embed_type='letters',
                          normalize=False,
                          categorical=True)

for latent_dim in LATENT_DIM:
    # The trailing '' keeps the trailing path separator the original save_path had.
    save_path = os.path.join(
        MODEL_DIR,
        'lstm_letter_{}_iowa_last_smaller_decoder'.format(latent_dim), '')
    run_id = 'lstm_{}'.format(latent_dim)
    lstm_vae = recordlinker.model.LSTMVAE(batch_size=BATCH_SIZE,
                                          timesteps=ORIG_LENGTH,
                                          orig_dim=classes,
                                          latent_dim=latent_dim,
                                          encode_dim=[128, 64],
                                          decode_dim=[64],
                                          lr=LR)
    # The returned objects are rebound on every iteration, so after the loop they
    # refer to the last latent size only.
    model_lstm, model_encoder, model_decoder = lstm_vae.train(namesA, namesB,
                                                              epochs=EPOCHS,
                                                              run_id=run_id,
                                                              save_path=save_path,
                                                              earlystop=True,
                                                              earlystop_patience=6,
                                                              tensorboard=True,
                                                              reconstruct=True,
                                                              reconstruct_display=10)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12, 28)       0                                            
__________________________________________________________________________________________________
enc_0 (LSTM)                    (None, 12, 128)      80384       input_1[0][0]                    
__________________________________________________________________________________________________
enc_1 (LSTM)                    (None, 12, 64)       49408       enc_0[0][0]                      
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 768)          0           enc_1[0][0]                      
__________________________________________________________________________________________________
mu (Dense)

Epoch 83/350
Epoch 84/350
Epoch 85/350
Epoch 86/350
Epoch 87/350
Epoch 88/350
Epoch 89/350
Epoch 90/350
Epoch 91/350
Sample Reconstructions:
{'Orig': 'dougherty', 'Pred:': 'mereearey'}
{'Orig': 'schaeffer', 'Pred:': 'maneehee'}
{'Orig': 'strueber', 'Pred:': 'seeersey'}
{'Orig': 'swartzendrub', 'Pred:': 'sstttennnbub'}
{'Orig': 'stampher', 'Pred:': 'meeenler'}
Epoch 92/350
Epoch 93/350
Epoch 94/350
Epoch 95/350
Epoch 96/350
Epoch 97/350
Epoch 98/350
Epoch 99/350
Epoch 100/350
Epoch 101/350
Sample Reconstructions:
{'Orig': 'housley', 'Pred:': 'mornle'}
{'Orig': 'hines', 'Pred:': 'manes'}
{'Orig': 'eckardt', 'Pred:': 'mallart'}
{'Orig': 'rettenmaier', 'Pred:': 'stetenaaeer'}
{'Orig': 'kracht', 'Pred:': 'merrtt'}
Epoch 102/350
Epoch 103/350
Epoch 104/350
Epoch 105/350
Epoch 106/350
Epoch 107/350
Epoch 108/350
Epoch 109/350
Saved encoder in: /Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/lstm_letter_4_iowa_last_smaller_decoder/encoder.h5
Saved decoder in: /Users/kailinlu/Desktop/QMSS