In [1]:
import recordlinker
from recordlinker.blocking import BinaryEncoder, Blocker

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [26]:
# Autoreload modules
%reload_ext autoreload
%autoreload 2

# Memory profiler
%load_ext memory_profiler
%load_ext line_profiler

### Load the data and mark matches and nonmatches

#### Iowa 

In [3]:
iowa_matches = pd.read_csv('/Users/kailinlu/Desktop/QMSSWork/RecordLinking/recordlinker/recordlinker/data/iowa_matches.csv')
iowa_matches.drop_duplicates(subset=['uid1915', 'hhid'], inplace=True)
iowa_nonmatches = pd.read_csv('/Users/kailinlu/Desktop/QMSSWork/RecordLinking/recordlinker/recordlinker/data/iowa_nonmatches.csv')
iowa_nonmatches.drop_duplicates(subset=['uid1915', 'hhid'], inplace=True)

iowa_matches['match'] = 1
iowa_nonmatches['match'] = 0

iowa = pd.concat([iowa_matches, iowa_nonmatches])
total_matches = len(iowa_matches['uid-hhid'])
exact_matches = np.sum(iowa_matches['lname1915'] == iowa_matches['lname1940'])
print('Number of total matches: {}'.format(total_matches))
print('Number of exact matches: {}'.format(exact_matches))

Number of total matches: 4320
Number of exact matches: 3240


Extract unique IDs for 1915 names and unique IDs for 1940 names. 

There are 6881 unique 1915 people and 65939 unique 1940 people. We want to match the correct uid-hhid pairs as denoted in the iowa dataframe. 

In [4]:
names_1915 = iowa[['lname1915', 'uid1915', 'yob1915', 'fname1915']]
names_1915.drop_duplicates(subset=['uid1915'], inplace=True)
names_1940 = iowa[['lname1940', 'hhid', 'yob1940', 'fname1940']]
names_1940.drop_duplicates(subset=['hhid'], inplace=True)
names_1915.reset_index(inplace=True)
names_1940.reset_index(inplace=True)

In [5]:
names_1915.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6881 entries, 0 to 6880
Data columns (total 5 columns):
index        6881 non-null int64
lname1915    6881 non-null object
uid1915      6881 non-null object
yob1915      6881 non-null int64
fname1915    6881 non-null object
dtypes: int64(2), object(3)
memory usage: 268.9+ KB


In [6]:
names_1940.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65311 entries, 0 to 65310
Data columns (total 5 columns):
index        65311 non-null int64
lname1940    65311 non-null object
hhid         65311 non-null int64
yob1940      65311 non-null int64
fname1940    65311 non-null object
dtypes: int64(3), object(2)
memory usage: 2.5+ MB


In [10]:
# Get indices of matches 
iowa['indexA'] = iowa['uid1915'].apply(lambda x: names_1915[names_1915['uid1915']==x].index[0]) 
iowa['indexB'] = iowa['hhid'].apply(lambda x: names_1940[names_1940['hhid']==x].index[0]) 

In [11]:
indexA = iowa[iowa['match']==1]['indexA']
indexB = iowa[iowa['match']==1]['indexB']

#### Union army

In [11]:
union_matches = pd.read_csv('/Users/kailinlu/Desktop/QMSSWork/RecordLinking/recordlinker/recordlinker/data/unionarmy_matches.csv')
union_nonmatches = pd.read_csv('/Users/kailinlu/Desktop/QMSSWork/RecordLinking/recordlinker/recordlinker/data/unionarmy_nonmatches.csv')

In [12]:
union_matches['match'] = 1
union_nonmatches['match'] = 0

union = pd.concat([union_matches, union_nonmatches])
union['recidnum_x'] = union['recidnum_x'].fillna(union['recidnum'])
union['recidnum_y'] = union['recidnum_y'].fillna(union['recidnum'])
total_union_matches = len(union_matches['recidnum'])
exact_union_matches = np.sum(union_matches['first1'] == union_matches['first2'])
print('Number of total matches: {}'.format(total_union_matches))
print('Number of exact matches: {}'.format(exact_union_matches))

Number of total matches: 44098
Number of exact matches: 33045


In [13]:
union.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110242 entries, 0 to 66143
Data columns (total 10 columns):
first1        110242 non-null object
first2        110242 non-null object
last1         110242 non-null object
last2         110242 non-null object
match         110242 non-null int64
recidnum      44098 non-null float64
recidnum_x    110242 non-null float64
recidnum_y    110242 non-null float64
recname1      110242 non-null object
recname2      110242 non-null object
dtypes: float64(3), int64(1), object(6)
memory usage: 9.3+ MB


In [14]:
# Get unique namesA and namesB from the union dataset 
union_namesA = union[['first1', 'last1', 'recname1', 'recidnum_x']]
union_namesA.drop_duplicates(subset=['recidnum_x'], inplace=True)
union_namesB = union[['first2', 'last2', 'recname2', 'recidnum_y']]
union_namesB.drop_duplicates(subset=['recidnum_y'], inplace=True)

In [15]:
# Get indices of matches 
union['indexA'] = union['recidnum_x'].apply(lambda x: union_namesA[union_namesA['recidnum_x']==x].index[0]) 
union['indexB'] = union['recidnum_y'].apply(lambda x: union_namesB[union_namesB['recidnum_y']==x].index[0])

union_indexA = union[union['match']==1]['indexA']
union_indexB = union[union['match']==1]['indexB']

# Blocking

We will test the quality of the blocks from the Dense and LSTM models with 2, 4, 8, and 16 latent variables.

Metrics: 

1. Number of blocks 
2. Average size, max size, and min size of each block 
3. Balance of block sizes: % pairs in smallest block / %pairs in largest block
4. % of all matches contained in at least one block
5. Number and % of blocks which contain at least 1 match

### Records to check without blocking

#### Iowa dataset - Autoencoder

In [16]:
# Example with no blocking done - metrics will take a long time to compute
blocker = Blocker(dfA=names_1915, dfB=names_1940)
blocks = blocker.block() 
blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)

Num Blocks: 1
Original Comparisons Needed: 449,404,991
Total Comparisons 449,404,991 : 100.00% of original
Avg Block Size: 449,404,991.00
Max Block Size: 449,404,991
Min Block Size: 449,404,991
Balance Score (1=even sizes): 1.000000
Num Matches Found 4320 Out Of 4320 (100.00%)
Num blocks containing matches 1, (100.00%)


#### Union dataset - Autoencoder

In [17]:
blocker = Blocker(dfA=union_namesA, dfB=union_namesB)
blocks = blocker.block() 
blocker.compute_block_metrics(match_indexA=union_indexA, match_indexB=union_indexB)

Num Blocks: 1
Original Comparisons Needed: 1,944,633,604
Total Comparisons 1,944,633,604 : 100.00% of original
Avg Block Size: 1,944,633,604.00
Max Block Size: 1,944,633,604
Min Block Size: 1,944,633,604
Balance Score (1=even sizes): 1.000000
Num Matches Found 44098 Out Of 44098 (100.00%)
Num blocks containing matches 1, (100.00%)


### Blocking With Binary Encoding

#### Iowa

In [18]:
# LSTM Encoding
latent_dims = [2,4,8,16,24]

for dim in latent_dims:
    print('Block with {} Encoding Units: \n'.format(dim))
    model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/lstm_letter_{}_iowa_last/encoder.h5'.format(dim)
    blocker = Blocker(dfA=names_1915,
                      dfB=names_1940)

    blocks = blocker.block(autoencoder_col='lname1915',
                           autoencoder_colB='lname1940', 
                           autoencoder_model_path=model_path)
    blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)
    print('\n')

Block with 2 Encoding Units: 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (2,)
Num Blocks: 4
Original Comparisons Needed: 449,404,991
Total Comparisons 221,733,332 : 49.34% of original
Avg Block Size: 55,433,333.00
Max Block Size: 112,619,884
Min Block Size: 4,470
Balance Score (1=even sizes): 0.000040
Num Matches Found 4163 Out Of 4320 (96.37%)
Num blocks containing matches 4, (100.00%)


Block with 4 Encoding Units: 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (4,)
Num Blocks: 15
Original Comparisons Needed: 449,404,991
Total Comparisons 113,379,503 : 25.23% of original
Avg Block Size: 7,558,633.53
Max Block Size: 44,538,417
Min Block Size: 1
Balance Score (1=even sizes): 0.000000
Num Matches Found 4015 Out Of 4320 (92.94%)
Num blocks containing matches 14, (93.33%)


Block with 8 Encoding Units: 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (8,)
Num Blocks: 203
Original Comparison

In [19]:
# Does a more complex encoder relative to the decoder help balance matches? - Does not seem so 
# LSTM Encoder-Heavy models 
latent_dims = [2,4]

for dim in latent_dims:
    print('Block with {} Encoding Units: \n'.format(dim))
    model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/lstm_letter_{}_iowa_last_smaller_decoder/encoder.h5'.format(dim)
    blocker = Blocker(dfA=names_1915,
                      dfB=names_1940)

    blocks = blocker.block(autoencoder_col='lname1915',
                                     autoencoder_colB='lname1940', 
                                     autoencoder_model_path=model_path)
    blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)
    print('\n')

Block with 2 Encoding Units: 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (2,)
Num Blocks: 4
Original Comparisons Needed: 449404991
Total Comparisons 114,288,704, 25.43% of original
Avg Block Size: 28,572,176.00
Max Block Size: 36,422,676
Min Block Size: 19,912,400
Balance Score (1=even sizes): 0.546703
Num Matches Found 4083 Out Of 4320 (94.51%)
Num blocks containing matches 4, (100.00%)


Block with 4 Encoding Units: 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (4,)
Num Blocks: 16
Original Comparisons Needed: 449404991
Total Comparisons 52,556,221, 11.69% of original
Avg Block Size: 3,284,763.81
Max Block Size: 11,758,920
Min Block Size: 2,140
Balance Score (1=even sizes): 0.000182
Num Matches Found 3993 Out Of 4320 (92.43%)
Num blocks containing matches 16, (100.00%)




In [20]:
# Dense Letter Encoding
latent_dims = [2,4,8,16,24]
for dim in latent_dims:
    print('Block with {} Encoding Units: \n'.format(dim))
    model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/dense_letter_{}_iowa_last/encoder.h5'.format(dim)
    blocker = Blocker(dfA=names_1915,
                      dfB=names_1940)

    blocks = blocker.block(autoencoder_col='lname1915',
                                     autoencoder_colB='lname1940', 
                                     autoencoder_model_path=model_path)
    blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)
    print('\n')

Block with 2 Encoding Units: 

Loaded Model with input shape (None, 12)
Median mu has been set with size (2,)
Num Blocks: 4
Original Comparisons Needed: 449,404,991
Total Comparisons 120,696,353 : 26.86% of original
Avg Block Size: 30,174,088.25
Max Block Size: 48,571,974
Min Block Size: 13,723,448
Balance Score (1=even sizes): 0.282538
Num Matches Found 3987 Out Of 4320 (92.29%)
Num blocks containing matches 4, (100.00%)


Block with 4 Encoding Units: 

Loaded Model with input shape (None, 12)
Median mu has been set with size (4,)
Num Blocks: 15
Original Comparisons Needed: 449,404,991
Total Comparisons 38,404,323 : 8.55% of original
Avg Block Size: 2,560,288.20
Max Block Size: 7,044,708
Min Block Size: 15
Balance Score (1=even sizes): 0.000002
Num Matches Found 3707 Out Of 4320 (85.81%)
Num blocks containing matches 15, (100.00%)


Block with 8 Encoding Units: 

Loaded Model with input shape (None, 12)
Median mu has been set with size (8,)
Num Blocks: 151
Original Comparisons Needed:

In [21]:
# Dense Letter Encoding with more complex encoder
latent_dims = [2,4,8]
for dim in latent_dims:
    print('Block with {} Encoding Units: \n'.format(dim))
    model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/dense_letter_{}_iowa_last_smaller_decoder/encoder.h5'.format(dim)
    blocker = Blocker(dfA=names_1915,
                      dfB=names_1940)

    blocks = blocker.block(autoencoder_col='lname1915',
                           autoencoder_colB='lname1940', 
                           autoencoder_model_path=model_path)
    blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)
    print('\n')

Block with 2 Encoding Units: 

Loaded Model with input shape (None, 12)
Median mu has been set with size (2,)
Num Blocks: 4
Original Comparisons Needed: 449404991
Total Comparisons 123,255,073, 27.43% of original
Avg Block Size: 30,813,768.25
Max Block Size: 51,533,307
Min Block Size: 11,298,976
Balance Score (1=even sizes): 0.219256
Num Matches Found 3989 Out Of 4320 (92.34%)
Num blocks containing matches 4, (100.00%)


Block with 4 Encoding Units: 

Loaded Model with input shape (None, 12)
Median mu has been set with size (4,)
Num Blocks: 16
Original Comparisons Needed: 449404991
Total Comparisons 34,618,546, 7.70% of original
Avg Block Size: 2,163,659.12
Max Block Size: 9,823,753
Min Block Size: 294,084
Balance Score (1=even sizes): 0.029936
Num Matches Found 3707 Out Of 4320 (85.81%)
Num blocks containing matches 16, (100.00%)


Block with 8 Encoding Units: 

Loaded Model with input shape (None, 12)
Median mu has been set with size (8,)
Num Blocks: 220
Original Comparisons Needed: 

In [22]:
# Dense Shingle Encoding
latent_dims = [2,4,8,16,24]

for dim in latent_dims:
    print('Block with {} Encoding Units: \n'.format(dim))
    model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/dense_shingle_{}_iowa_last/encoder.h5'.format(dim)
    blocker = Blocker(dfA=names_1915,
                      dfB=names_1940)

    blocks = blocker.block(autoencoder_col='lname1915',
                                     autoencoder_colB='lname1940', 
                                     autoencoder_model_path=model_path, 
                                     autoencoder_embedtype='shingles')
    blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)
    print('\n')

Block with 2 Encoding Units: 

Loaded Model with input shape (None, 12)
Median mu has been set with size (2,)
Num Blocks: 4
Original Comparisons Needed: 449404991
Total Comparisons 113,496,917, 25.25% of original
Avg Block Size: 28,374,229.25
Max Block Size: 35,521,395
Min Block Size: 21,001,661
Balance Score (1=even sizes): 0.591240
Num Matches Found 3926 Out Of 4320 (90.88%)
Num blocks containing matches 4, (100.00%)


Block with 4 Encoding Units: 

Loaded Model with input shape (None, 12)
Median mu has been set with size (4,)
Num Blocks: 14
Original Comparisons Needed: 449404991
Total Comparisons 42,665,714, 9.49% of original
Avg Block Size: 3,047,551.00
Max Block Size: 10,514,088
Min Block Size: 4,128
Balance Score (1=even sizes): 0.000393
Num Matches Found 3749 Out Of 4320 (86.78%)
Num blocks containing matches 14, (100.00%)


Block with 8 Encoding Units: 

Loaded Model with input shape (None, 12)
Median mu has been set with size (8,)
Num Blocks: 103
Original Comparisons Needed: 4

#### Union Army

In [31]:
# Dense letter encoding 
# latent_dims = [2,4,8,16,24]
# for dim in latent_dims:
#     print('Block with {} Encoding Units: \n'.format(dim))
#     model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/dense_letter_{}_union_first/encoder.h5'.format(dim)
    
#     blocker = Blocker(dfA=union_namesA,
#                       dfB=union_namesB)

#     blocks = blocker.block(autoencoder_col='first1',
#                                      autoencoder_colB='first2', 
#                                      autoencoder_model_path=model_path)
#     blocker.compute_block_metrics(match_indexA=union_indexA, match_indexB=union_indexB)
#     print('\n')

In [24]:
# Dense shingle encoding 
latent_dims = [2,4,8,16,24]

for dim in latent_dims:
    print('Block with {} Encoding Units: \n'.format(dim))
    model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/dense_shingle_{}_union_first/encoder.h5'.format(dim)
    
    blocker = Blocker(dfA=union_namesA,
                      dfB=union_namesB)

    blocks = blocker.block(autoencoder_col='first1',
                                     autoencoder_colB='first2', 
                                     autoencoder_model_path=model_path, 
                                     autoencoder_embedtype='shingles')
    blocker.compute_block_metrics(match_indexA=union_indexA, match_indexB=union_indexB)
    print('\n')

Block with 2 Encoding Units: 

Loaded Model with input shape (None, 12)
Median mu has been set with size (2,)
Num Blocks: 4
Original Comparisons Needed: 1944633604
Total Comparisons 579,922,683, 29.82% of original
Avg Block Size: 144,980,670.75
Max Block Size: 255,782,922
Min Block Size: 33,867,996
Balance Score (1=even sizes): 0.132409
Num Matches Found 38283 Out Of 44098 (86.81%)
Num blocks containing matches 4, (100.00%)


Block with 4 Encoding Units: 

Loaded Model with input shape (None, 12)
Median mu has been set with size (4,)
Num Blocks: 16
Original Comparisons Needed: 1944633604
Total Comparisons 149,086,137, 7.67% of original
Avg Block Size: 9,317,883.56
Max Block Size: 29,089,970
Min Block Size: 932,715
Balance Score (1=even sizes): 0.032063
Num Matches Found 36552 Out Of 44098 (82.89%)
Num blocks containing matches 16, (100.00%)


Block with 8 Encoding Units: 

Loaded Model with input shape (None, 12)
Median mu has been set with size (8,)
Num Blocks: 103
Original Comparison

In [25]:
# LSTM encoding 
latent_dims = [4,8,16,24]

for dim in latent_dims:
    print('Block with {} Encoding Units: \n'.format(dim))
    model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/lstm_letter_{}_union_first/encoder.h5'.format(dim)
    
    blocker = Blocker(dfA=union_namesA,
                      dfB=union_namesB)

    blocks = blocker.block(autoencoder_col='first1',
                                     autoencoder_colB='first2', 
                                     autoencoder_model_path=model_path)
    blocker.compute_block_metrics(match_indexA=union_indexA, match_indexB=union_indexB)
    print('\n')

Block with 4 Encoding Units: 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (4,)
Num Blocks: 16
Original Comparisons Needed: 1944633604
Total Comparisons 198,615,459, 10.21% of original
Avg Block Size: 12,413,466.19
Max Block Size: 55,790,238
Min Block Size: 1,584
Balance Score (1=even sizes): 0.000028
Num Matches Found 38496 Out Of 44098 (87.30%)
Num blocks containing matches 16, (100.00%)


Block with 8 Encoding Units: 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (8,)
Num Blocks: 227
Original Comparisons Needed: 1944633604
Total Comparisons 36,401,880, 1.87% of original
Avg Block Size: 160,360.70
Max Block Size: 12,624,675
Min Block Size: 1
Balance Score (1=even sizes): 0.000000
Num Matches Found 35756 Out Of 44098 (81.08%)
Num blocks containing matches 221, (97.36%)


Block with 16 Encoding Units: 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (16,)
Num Blocks: 3003
Original Comparis

### Blocking with Timeframe

In [25]:
blocker = Blocker(dfA=names_1915,
                  dfB=names_1940)

blocks = blocker.block(timeframe_col='yob1915',
                       timeframe_colB='yob1940',
                       timeframe_range=2)
blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)

Num Blocks: 15
Original Comparisons Needed: 449,404,991
Total Comparisons 127,937,823 : 28.47% of original
Avg Block Size: 8,529,188.20
Max Block Size: 10,058,491
Min Block Size: 5,730,654
Balance Score (1=even sizes): 0.569733
Num Matches Found 4207 Out Of 4320 (97.38%)
Num blocks containing matches 15, (100.00%)


In [29]:
blocker = Blocker(dfA=names_1915,
                  dfB=names_1940)

blocks = blocker.block(timeframe_col='yob1915',
                     timeframe_colB='yob1940',
                     timeframe_range=1)
blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)

Num Blocks: 1
Original Comparisons Needed: 449,404,991
Total Comparisons 449,404,991 : 100.00% of original
Avg Block Size: 449,404,991.00
Max Block Size: 449,404,991
Min Block Size: 449,404,991
Balance Score (1=even sizes): 1.000000
Num Matches Found 4320 Out Of 4320 (100.00%)
Num blocks containing matches 1, (100.00%)


In [78]:
blocker = Blocker(dfA=names_1915,
                  dfB=names_1940)

blocks = blocker.block(timeframe_col='yob1915',
                       timeframe_colB='yob1940',
                       timeframe_range=0)
blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)

Num Blocks: 15
Original Comparisons Needed: 449404991
Total Comparisons 26,261,907, 5.84% of original
Avg Block Size: 1,750,793.80
Max Block Size: 2,058,255
Min Block Size: 1,233,408
Balance Score (1=even sizes): 0.599249
Num Matches Found 2631 Out Of 4320 (60.90%)
Num blocks containing matches 15, (100.00%)


### Blocking with Autoencoder and Timeframe

In [99]:
blocker = Blocker(dfA=names_1915,
                  dfB=names_1940)

latent_dim = [2,4,8,16,24]
for dim in latent_dim:
    print('LSTM encoder with {} latent dim within two years \n'.format(dim))
    model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/lstm_letter_{}_iowa_last/encoder.h5'.format(dim)
    blocks = blocker.block(autoencoder_col='lname1915', 
                           autoencoder_colB='lname1940', 
                           autoencoder_model_path=model_path, 
                           timeframe_col='yob1915', 
                           timeframe_colB='yob1940',
                           timeframe_range=2)
    blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)
    print('\n')

LSTM encoder with 2 latent dim within two years 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (2,)
Finished blocking with autoencoder in 14.5841 s
Finished blocking on timerange in 22.131389 s
Num Blocks: 56
Original Comparisons Needed: 449,404,991
Total Comparisons 63,128,942 : 14.05% of original
Avg Block Size: 1,127,302.54
Max Block Size: 2,649,052
Min Block Size: 25
Balance Score (1=even sizes): 0.000009
Num Matches Found 4064 Out Of 4320 (94.07%)
Num blocks containing matches 51, (91.07%)


LSTM encoder with 4 latent dim within two years 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (4,)
Finished blocking with autoencoder in 14.1891 s
Finished blocking on timerange in 21.263545 s
Num Blocks: 148
Original Comparisons Needed: 449,404,991
Total Comparisons 95,383,679 : 21.22% of original
Avg Block Size: 467,567.05
Max Block Size: 2,649,052
Min Block Size: 1
Balance Score (1=even sizes): 0.000000
Num Matches Found 391

In [100]:
blocker = Blocker(dfA=names_1915,
                  dfB=names_1940)

latent_dim = [2,4,8,16,24]
for dim in latent_dim:
    print('LSTM encoder with {} latent dim within one year \n'.format(dim))
    model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/lstm_letter_{}_iowa_last/encoder.h5'.format(dim)
    blocks = blocker.block(autoencoder_col='lname1915', 
                           autoencoder_colB='lname1940', 
                           autoencoder_model_path=model_path, 
                           timeframe_col='yob1915', 
                           timeframe_colB='yob1940',
                           timeframe_range=1)
    blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)
    print('\n')

LSTM encoder with 2 latent dim within one year 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (2,)
Finished blocking with autoencoder in 14.7475 s
Finished blocking on timerange in 20.471843 s
Num Blocks: 56
Original Comparisons Needed: 449,404,991
Total Comparisons 38,481,420 : 8.56% of original
Avg Block Size: 687,168.21
Max Block Size: 1,653,456
Min Block Size: 12
Balance Score (1=even sizes): 0.000007
Num Matches Found 3829 Out Of 4320 (88.63%)
Num blocks containing matches 51, (91.07%)


LSTM encoder with 4 latent dim within one year 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (4,)
Finished blocking with autoencoder in 16.0071 s
Finished blocking on timerange in 21.107264 s
Num Blocks: 147
Original Comparisons Needed: 449,404,991
Total Comparisons 58,136,403 : 12.94% of original
Avg Block Size: 286,386.22
Max Block Size: 1,653,456
Min Block Size: 2
Balance Score (1=even sizes): 0.000001
Num Matches Found 3690 Out

In [101]:
blocker = Blocker(dfA=names_1915,
                  dfB=names_1940)

latent_dim = [2,4,8,16,24]
for dim in latent_dim:
    print('LSTM encoder with {} latent dim within same year \n'.format(dim))
    model_path = '/Users/kailinlu/Desktop/QMSSWork/RecordLinking/models/lstm_letter_{}_iowa_last/encoder.h5'.format(dim)
    blocks = blocker.block(autoencoder_col='lname1915', 
                           autoencoder_colB='lname1940', 
                           autoencoder_model_path=model_path, 
                           timeframe_col='yob1915', 
                           timeframe_colB='yob1940',
                           timeframe_range=0)
    blocker.compute_block_metrics(match_indexA=indexA, match_indexB=indexB)
    print('\n')

LSTM encoder with 2 latent dim within same year 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (2,)
Finished blocking with autoencoder in 15.6038 s
Finished blocking on timerange in 21.431666 s
Num Blocks: 56
Original Comparisons Needed: 449,404,991
Total Comparisons 12,961,343 : 2.88% of original
Avg Block Size: 231,452.55
Max Block Size: 546,749
Min Block Size: 3
Balance Score (1=even sizes): 0.000005
Num Matches Found 2541 Out Of 4320 (58.82%)
Num blocks containing matches 48, (85.71%)


LSTM encoder with 4 latent dim within same year 

Loaded Model with input shape (None, 12, 28)
Median mu has been set with size (4,)
Finished blocking with autoencoder in 16.2535 s
Finished blocking on timerange in 20.701992 s
Num Blocks: 146
Original Comparisons Needed: 449,404,991
Total Comparisons 19,582,864 : 4.36% of original
Avg Block Size: 96,944.87
Max Block Size: 546,749
Min Block Size: 1
Balance Score (1=even sizes): 0.000002
Num Matches Found 2446 Out Of 4