# **PVS-like case study: sample data**

In [1]:
import re
import pandas as pd, numpy as np

# Load data

See code in `generate_simulated_data` directory for how we generated the files to link

In [2]:
reference_file = pd.read_parquet('reference_file_sample.parquet')
census_2030 = pd.read_parquet('census_2030_sample.parquet')

# Pre-process data

Not much needed here because the datasets are already so tidy and similar to each other.

In [3]:
# Use NaN for all forms of missingness, including empty string
reference_file = reference_file.fillna(np.nan).replace('', np.nan)
census_2030 = census_2030.fillna(np.nan).replace('', np.nan)

In [4]:
# We want to compare mailing address with physical address
reference_file = reference_file.rename(columns=lambda c: c.replace('mailing_address_', ''))

In [5]:
# My working theory: the purpose of the "geokey" is because address parts violate conditional independence
get_geokey = lambda x: (x.street_number + ' ' + x.street_name + ' ' + x.unit_number.fillna('') + ' ' + x.city + ' ' + x.state.astype(str) + ' ' + x.zipcode).str.strip().str.split().str.join(' ')
reference_file = reference_file.assign(geokey=get_geokey)
census_2030 = census_2030.assign(geokey=get_geokey)

In [6]:
# Add columns used to "cut the database": ZIP3 and a grouping of first and last initial
reference_file = reference_file.assign(zip3=lambda x: x.zipcode.str[:3])
census_2030 = census_2030.assign(zip3=lambda x: x.zipcode.str[:3])

# Page 20 of the NORC report: "Name-cuts are defined by combinations of the first characters of the first and last names. The twenty letter groupings
# for the first character are: A-or-blank, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, and U-Z."
initial_cut = lambda x: x.fillna('A').str[0].replace('A', 'A-or-blank').replace(['U', 'V', 'W', 'X', 'Y', 'Z'], 'U-Z')
reference_file = reference_file.assign(first_initial_cut=lambda x: initial_cut(x.first_name), last_initial_cut=lambda x: initial_cut(x.last_name))
census_2030 = census_2030.assign(first_initial_cut=lambda x: initial_cut(x.first_name), last_initial_cut=lambda x: initial_cut(x.last_name))

# Data, ready to link

Note: I have not yet introduced alternate names and dates of birth here.

In [7]:
reference_file

Unnamed: 0,record_id,pik,ssn,date_of_birth,first_name,middle_initial,last_name,date_of_death,street_number,street_name,unit_number,po_box,city,state,zipcode,geokey,zip3,first_initial_cut,last_initial_cut
0,0,0,001-02-4588,08/08/2008,Isabella,G,Windom,,685,emerson st,,,Anytown,US,00000,685 emerson st Anytown US 00000,000,I,U-Z
1,1,1,001-15-8330,05/04/1976,Gerald,J,Beckham,,5010,south doctor martin luther king jr dr,,,Anytown,US,00000,5010 south doctor martin luther king jr dr Any...,000,G,B
2,2,2,001-16-0077,02/07/1970,Jerald,J,Alvarez,,,,,,,,,,,J,A-or-blank
3,3,3,001-17-9511,11/20/1966,Teresa,A,Togni,,150,s sheldon rd,,,Anytown,US,00000,150 s sheldon rd Anytown US 00000,000,T,T
4,4,4,001-25-8258,06/29/2026,Bethany,G,Tenorio,,,,,,,,,,,B,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18521,18521,18521,976-30-9537,06/12/1976,Aron,C,Frausto Ferretiz,,,,,,,,,,,A-or-blank,F
18522,18522,18522,978-78-6109,05/22/1963,Claude,M,Page,,,,,,,,,,,C,P
18523,18523,18523,979-44-7835,08/01/1979,Thomas,A,Martinez-Puentes,,,,,,,,,,,T,M
18524,18524,18524,998-22-9577,04/17/2002,Jeffery,P,Shaw,,,,,,,,,,,J,S


In [8]:
census_2030

Unnamed: 0,record_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relation_to_reference_person,sex,race_ethnicity,geokey,zip3,first_initial_cut,last_initial_cut
0,0,John,E,Mcueever,86,06/29/1942,147-153,browning ave,,Anytown,US,00000,Reference person,Male,Black,147-153 browning ave Anytown US 00000,000,J,M
1,1,Sharon,T,Schmidt,69,10/50/1960,109,stqllion sr,,Anytown,US,00000,Reference person,Female,White,109 stqllion sr Anytown US 00000,000,S,S
2,2,Gail,K,Durand,77,01/03/1953,2115,cannon dr,,Anytown,US,00000,Reference person,Female,Multiracial or Other,2115 cannon dr Anytown US 00000,000,G,D
3,3,John,J,Williams,81,11/24/1948,146,delaware av,,Anytown,US,00000,Reference person,Male,White,146 delaware av Anytown US 00000,000,J,U-Z
4,4,Child,L,Wukliamz,81,09/27/1948,146,delaware av,,Anytown,US,00000,Opp-sex spouse,Female,White,146 delaware av Anytown US 00000,000,C,U-Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11048,11048,Chloe,A,Maryknez-Alvarez,21,07/12/2008,207,harrison st,,Anytown,US,00000,Biological child,Female,Latino,207 harrison st Anytown US 00000,000,C,M
11049,11049,Zachary,E,Martinez-Alvarez,18,06/29/2011,207,harrison st,,Anytown,US,00000,Biological child,Male,,207 harrison st Anytown US 00000,000,U-Z,M
11050,11050,Madeline,A,Martinez-Alvarez,16,08/12/2013,207,harrison st,,Anytown,US,00000,Biological child,Female,Latino,207 harrison st Anytown US 00000,000,M,M
11051,11051,Naomi,A,Martinez-Aldarez,1,11/01/2028,207,harrison st,,Anytown,US,00000,Grandchild,Female,Latino,207 harrison st Anytown US 00000,000,N,M


# Implement PVS-like matching with `fastLink`

## Estimate parameters (lambda, m, u) once for both modules

In reality these parameters are not estimated from the data.
It is unclear to me whether they are actually the same for both modules or even for different passes of the same module.

In [9]:
common_cols = [c for c in reference_file.columns if c in census_2030.columns]
common_cols

['record_id',
 'date_of_birth',
 'first_name',
 'middle_initial',
 'last_name',
 'street_number',
 'street_name',
 'unit_number',
 'city',
 'state',
 'zipcode',
 'geokey',
 'zip3',
 'first_initial_cut',
 'last_initial_cut']

In [10]:
import sys, pathlib
import os
os.environ["R_HOME"] = str(pathlib.Path(sys.executable).parent.parent / 'lib/R')

In [11]:
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

pandas2ri.activate()

fastLink = importr('fastLink')

In [12]:
%%time

# From the fastLink README:
# ## Run the algorithm on the random samples
# rs.out <- fastLink(
#   dfA = dfA.s, dfB = dfB.s, 
#   varnames = c("firstname", "middlename", "lastname", "housenum", "streetname", "city", "birthyear"),
#   stringdist.match = c("firstname", "middlename", "lastname", "streetname", "city"),
#   partial.match = c("firstname", "lastname", "streetname"),
#   estimate.only = TRUE
# )

from rpy2 import robjects as ro

COMPARISON_COLUMNS = ["first_name", "middle_initial", "last_name", "date_of_birth", "geokey"]

prep_for_fastLink = lambda df: df[COMPARISON_COLUMNS].astype(str).fillna(ro.NA_Character).reset_index().rename(columns={'index': 'python_index'})

em_object = fastLink.fastLink(
    dfA = prep_for_fastLink(reference_file),
    dfB = prep_for_fastLink(census_2030),
    varnames = ro.StrVector(COMPARISON_COLUMNS),
    stringdist_match = ro.StrVector(["first_name", "last_name", "geokey"]),
    partial_match = ro.StrVector(["first_name", "last_name", "geokey"]),
    # Just run EM, don't link
    estimate_only = True,
)


fastLink(): Fast Probabilistic Record Linkage

If you set return.all to FALSE, you will not be able to calculate a confusion table as a summary statistic.
Calculating matches for each variable.
Getting counts for parameter estimation.
    Parallelizing calculation using OpenMP. 2 threads out of 10 are used.
Running the EM algorithm.
CPU times: user 1min 18s, sys: 6.22 s, total: 1min 24s
Wall time: 1min 20s


In [13]:
PROBABILITY_THRESHOLD = 0.85

## Implement matching

In [14]:
base = importr('base')

# Calculate this once to save time -- mapping from record_id to index of each dataframe
reference_file_index_of_ids = reference_file.reset_index().set_index('record_id')['index']
census_index_of_ids = census_2030.reset_index().set_index('record_id')['index']

# TODO: Have this function output more charts and diagnostics
def matching_pass_no_blocking():
    # fastLink really doesn't work well with blocking -- it requires you to call it separately
    # for each block, and with restrictive blocks, this is much, much slower than not using blocking
    # at all.
    census_to_match = census_2030[census_2030.pik.isnull()]
    
    # If we had wanted to do blocking, we would have done it something like this:
    # # fastLink's blocking (blockData) doesn't support blocking on multiple columns at once(!),
    # # so we implement our own blocking
    # # Technically we could have done it with some hacky approach involving appending the columns,
    # # but fastLink still requires you to make a separate linking call for each block anyway
    # census_2030_groups = census_2030[census_2030.pik.isnull()].groupby(blocking_cols, as_index=False)
    # reference_file_groups = reference_file.groupby(blocking_cols, as_index=False)
    
    # print(f'{census_2030_groups.ngroups} blocks')
    
    # varnames = ro.StrVector(COMPARISON_COLUMNS)
    # stringdist_match = ro.StrVector(["first_name", "last_name", "geokey"])
    # partial_match = ro.StrVector(["first_name", "last_name", "geokey"])

    # potential_links = []
    # for index, (key, census_2030_block) in enumerate(census_2030_groups):
    
    # try:
    #     reference_file_block = reference_file_groups.get_group(key)
    # except KeyError:
    #     # Nothing in the reference file for this block; so that implies there are no
    #     # matches to find
    #     continue

    # if len(reference_file_block) == 1:
    #     # HACK -- fastLink seems to not work at all if dfB is only one row
    #     reference_file_block = pd.concat([reference_file_block, pd.DataFrame(np.nan, index=[-1], columns=reference_file_block.columns)])

    # Then the rest of this logic until we print the number of potential links would be inside the loop

    with (ro.default_converter + pandas2ri.converter).context():
        conversion = ro.conversion.get_conversion()
        census_2030_r = conversion.py2rpy(prep_for_fastLink(census_to_match))
        reference_file_r = conversion.py2rpy(prep_for_fastLink(reference_file))

    fastLink_result = fastLink.fastLink(
        dfA=census_2030_r,
        dfB=reference_file_r,
        varnames=ro.StrVector(COMPARISON_COLUMNS),
        stringdist_match=ro.StrVector(["first_name", "last_name", "geokey"]),
        partial_match=ro.StrVector(["first_name", "last_name", "geokey"]),
        em_obj=em_object,
        threshold_match=PROBABILITY_THRESHOLD,
    )

    census_2030_matches_r_indices = fastLink_result.rx2('matches').rx2('inds.a')
    reference_file_matches_r_indices = fastLink_result.rx2('matches').rx2('inds.b')

    census_2030_matches = pd.Index(census_2030_r.rx(census_2030_matches_r_indices, 'python_index'))
    reference_file_matches = pd.Index(reference_file_r.rx(reference_file_matches_r_indices, 'python_index'))

    potential_links = (
        census_to_match.loc[census_2030_matches].reset_index(drop=True).add_suffix('_census_2030')
        .join(
            reference_file.loc[reference_file_matches].reset_index(drop=True).add_suffix('_reference_file')
        )
    )

    print(f'{len(potential_links)} links above threshold')

    # Post-processing: deal with multiple matches
    # According to the report, a record is considered not linkable if it has multiple matches above the threshold
    # I represent "not linkable" here with a PIK of -1 (different from NaN, which means yet-to-be-linked)
    potential_links = potential_links.merge(reference_file[['record_id', 'pik']], left_on='record_id_reference_file', right_on='record_id', how='left').drop(columns=['record_id'])
    print(f'{potential_links.record_id_census_2030.nunique()} input records have a match')
    census_records_with_multiple_potential_piks = potential_links.groupby('record_id_census_2030').pik.nunique().pipe(lambda c: c[c > 1]).index
    if len(census_records_with_multiple_potential_piks) > 0:
        print(f'{len(census_records_with_multiple_potential_piks)} input records matched to multiple PIKs, marking as unlinkable')

    potential_links.loc[potential_links.record_id_census_2030.isin(census_records_with_multiple_potential_piks), 'pik'] = -1

    assert (potential_links.groupby('record_id_census_2030').pik.nunique() == 1).all()
    links = potential_links.groupby('record_id_census_2030').pik.first().reset_index()
    census_2030.loc[census_index_of_ids.loc[links.record_id_census_2030], 'pik'] = links.pik.values

    print(f'Matched {len(links)} records; {census_2030.pik.isnull().mean():.2%} still eligible to match')
    
    # Diagnostic showing the predicted values for each combination of column similarity values
    # Not trivial to do this with fastLink when there is blocking, so we skip it
    
    return None, links

# Just do all the matching (single pass)

As noted above, fastLink doesn't work well with blocking.
I wasn't able to get reasonable performance from doing realistic NameSearch and GeoSearch passes.

In [15]:
census_2030['pik'] = np.nan

In [16]:
%%time

all_combos, pik_pairs = matching_pass_no_blocking()


fastLink(): Fast Probabilistic Record Linkage

If you set return.all to FALSE, you will not be able to calculate a confusion table as a summary statistic.
Calculating matches for each variable.
Getting counts for parameter estimation.
    Parallelizing calculation using OpenMP. 2 threads out of 10 are used.
Imputing matching probabilities using provided EM object.
Getting the indices of estimated matches.
    Parallelizing calculation using OpenMP. 2 threads out of 10 are used.
Deduping the estimated matches.
Getting the match patterns for each estimated match.
9719 links above threshold
9719 input records have a match
Matched 9719 records; 12.07% still eligible to match
CPU times: user 1min 20s, sys: 10.1 s, total: 1min 30s
Wall time: 1min 23s


In [17]:
pik_pairs

Unnamed: 0,record_id_census_2030,pik
0,0,6829
1,2,6400
2,3,18337
3,5,15686
4,6,2021
...,...,...
9714,11048,17071
9715,11049,12689
9716,11050,10874
9717,11051,10825


# Post-processing multiple matches

In [18]:
# Sentinel value represents matching to more than one PIK
census_2030[census_2030.pik == -1]

Unnamed: 0,record_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relation_to_reference_person,sex,race_ethnicity,geokey,zip3,first_initial_cut,last_initial_cut,pik


In [19]:
census_2030.loc[census_2030.pik == -1, 'pik'] = np.nan

# Resulting PIKs

In [20]:
census_2030

Unnamed: 0,record_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relation_to_reference_person,sex,race_ethnicity,geokey,zip3,first_initial_cut,last_initial_cut,pik
0,0,John,E,Mcueever,86,06/29/1942,147-153,browning ave,,Anytown,US,00000,Reference person,Male,Black,147-153 browning ave Anytown US 00000,000,J,M,6829.0
1,1,Sharon,T,Schmidt,69,10/50/1960,109,stqllion sr,,Anytown,US,00000,Reference person,Female,White,109 stqllion sr Anytown US 00000,000,S,S,
2,2,Gail,K,Durand,77,01/03/1953,2115,cannon dr,,Anytown,US,00000,Reference person,Female,Multiracial or Other,2115 cannon dr Anytown US 00000,000,G,D,6400.0
3,3,John,J,Williams,81,11/24/1948,146,delaware av,,Anytown,US,00000,Reference person,Male,White,146 delaware av Anytown US 00000,000,J,U-Z,18337.0
4,4,Child,L,Wukliamz,81,09/27/1948,146,delaware av,,Anytown,US,00000,Opp-sex spouse,Female,White,146 delaware av Anytown US 00000,000,C,U-Z,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11048,11048,Chloe,A,Maryknez-Alvarez,21,07/12/2008,207,harrison st,,Anytown,US,00000,Biological child,Female,Latino,207 harrison st Anytown US 00000,000,C,M,17071.0
11049,11049,Zachary,E,Martinez-Alvarez,18,06/29/2011,207,harrison st,,Anytown,US,00000,Biological child,Male,,207 harrison st Anytown US 00000,000,U-Z,M,12689.0
11050,11050,Madeline,A,Martinez-Alvarez,16,08/12/2013,207,harrison st,,Anytown,US,00000,Biological child,Female,Latino,207 harrison st Anytown US 00000,000,M,M,10874.0
11051,11051,Naomi,A,Martinez-Aldarez,1,11/01/2028,207,harrison st,,Anytown,US,00000,Grandchild,Female,Latino,207 harrison st Anytown US 00000,000,N,M,10825.0


In [21]:
census_2030.pik.notnull().mean()

0.8793087849452638

In [22]:
census_2030_ground_truth = pd.read_parquet('census_2030_ground_truth_sample.parquet').set_index('record_id').simulant_id
reference_file_ground_truth = pd.read_parquet('reference_file_ground_truth_sample.parquet').set_index('record_id').simulant_id

In [23]:
# Not possible to be PIKed, since they are truly not in the reference file
(~census_2030_ground_truth.isin(reference_file_ground_truth)).mean()

0.04641273862299828

In [24]:
census_2030.pik.notnull().mean() / census_2030_ground_truth.isin(reference_file_ground_truth).mean()

0.9221062618595827

In [25]:
# Multiple Census rows assigned the same PIK, indicating the model thinks they are duplicates in Census
census_2030.pik.value_counts().value_counts()

count
1    9717
2       1
Name: count, dtype: int64

In [26]:
# However, in this version of pseudopeople, there are no actual duplicates in Census
assert not census_2030_ground_truth.duplicated().any()

In [27]:
# Interesting: in pseudopeople, sometimes siblings are assigned the same (common) first name, making them almost identical.
# The only giveaway is their age and DOB.
# Presumably, this tends not to happen in real life.
duplicate_piks = census_2030.pik.value_counts()[census_2030.pik.value_counts() > 1].index
census_2030[census_2030.pik.isin(duplicate_piks)].sort_values('pik')

Unnamed: 0,record_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relation_to_reference_person,sex,race_ethnicity,geokey,zip3,first_initial_cut,last_initial_cut,pik
5070,5070,Sean,C,Renobato Torres,54,01/20/1976,19,deland ct,,Anytown,US,0,Reference person,Male,Latino,19 deland ct Anytown US 00000,0,S,R,18280.0
5072,5072,Jkdeljn,C,Renobato Torres,25,06/28/2004,19,deland ct,,Anytown,US,0,Biological child,Female,Latino,19 deland ct Anytown US 00000,0,J,R,18280.0


## PIK accuracy

In [28]:
pik_simulant_id = census_2030.pik.map(reference_file_ground_truth)
pik_simulant_id

0          0_923
1            NaN
2         0_6176
3        0_13972
4            NaN
          ...   
11048    0_22741
11049    0_22742
11050    0_22743
11051    0_23271
11052    0_16724
Name: pik, Length: 11053, dtype: object

In [29]:
(pik_simulant_id[pik_simulant_id.notnull()] == census_2030_ground_truth[pik_simulant_id.notnull()]).mean()

0.9880646157012039

In [30]:
errors = census_2030[census_2030.pik.notnull() & (pik_simulant_id != census_2030_ground_truth)]
confused_for = reference_file.set_index('record_id').loc[errors.pik].reset_index().set_index(errors.index)
errors[common_cols].compare(confused_for[common_cols], keep_shape=True, keep_equal=True)

Unnamed: 0_level_0,record_id,record_id,date_of_birth,date_of_birth,first_name,first_name,middle_initial,middle_initial,last_name,last_name,...,zipcode,zipcode,geokey,geokey,zip3,zip3,first_initial_cut,first_initial_cut,last_initial_cut,last_initial_cut
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,...,self,other,self,other,self,other,self,other,self,other
150,150,11736,08/71/1966,,Ray,Rose,M,M,Thomas,Thomason,...,,,,,,,R,R,T,T
224,224,6959,07/30/2010,09/20/2007,Gabriela,Rabriella,L,L,Thomas,Hart,...,00000,00000,8370 chervil ct Anytown US 00000,8370 chervil ct Anytown US 00000,000,000,G,R,T,H
279,279,4339,11/07/1975,04/26/2024,Calvin,Jayleen,J,J,Lady Of Houde,Lady Of House,...,00000,,,,000,,C,J,L,L
407,407,10007,12/02/1962,,Morrid,Pedro,C,C,Hill,Hill,...,,,,,,,M,P,H,H
659,659,5737,03/08/9954,11/09/1953,Anthony,Anthony,M,M,Fuomer,Carty,...,00000,,,,000,,A-or-blank,A-or-blank,F,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10491,10491,8811,06/13/1967,10/29/1961,Gary,Gary,B,B,Flannigan,Willis,...,00000,,,,000,,G,G,F,U-Z
10643,10643,10370,05/26/1989,11/05/1991,Ashley,Ashley,L,L,Torres-Lazo,Elwell,...,00000,,,,000,,A-or-blank,A-or-blank,T,E
10731,10731,10440,12/24/1389,03/03/2021,Tanya,F,A,A,Diep,Diem,...,00000,,,,000,,T,F,D,D
10797,10797,2759,03/24/1979,09/16/1965,Kevin,Kevin,T,R,Herrera,Herrera,...,00000,00000,5096 e 22nd st Anytown US 00000,5096 e 22nd st Anytown US 00000,000,000,K,K,H,H


In [31]:
census_2030.to_parquet('census_2030_with_piks_sample.parquet')

In [32]:
# Convert this notebook to a Python script
! ./convert_notebook.sh pvs_like_case_study_sample_data_r

[NbConvertApp] Converting notebook pvs_like_case_study_sample_data_r.ipynb to python
