# **PVS-like case study: sample data**

In [1]:
import pseudopeople
from pathlib import Path
import pandas as pd, numpy as np

# Load simulated data to link

Generated with latest version of `pseudopeople` package.

## Load simulated data

Imagined scenario: PIKing the 2030 census.

One way to do this:
* Use (cumulative) SSA Numident up to that time.
* Link it (deterministically, using SSN) to taxes to get the most recent address for each person.
  * Likely would use 1040 here, but I'll use W2 for now.
* Link probabilistically to the census data.

In [2]:
!pip freeze | grep pseudopeople

pseudopeople==0.3.0


In [3]:
# pseudopeople_input_path = Path('/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/v1.3_compressed_hdf/united_states_of_america/2023_04_04_09_18_48/final_results/2023_04_04_16_31_24')
# seeds_to_use = []
pseudopeople_input_path = Path('/ihme/scratch/users/zmbc/vivarium_results/united_states_of_america/2023_04_04_16_18_30/final_results/2023_04_04_17_30_18/')

In [4]:
! ls -lh $pseudopeople_input_path

total 32K
drwxrwxr-x 2 zmbc Domain Users 512 Apr  4 17:33 decennial_census_observer
drwxrwxr-x 2 zmbc Domain Users 512 Apr  4 17:33 household_survey_observer_acs
drwxrwxr-x 2 zmbc Domain Users 512 Apr  4 17:34 household_survey_observer_cps
drwxrwxr-x 2 zmbc Domain Users 512 Apr  4 17:34 social_security_observer
drwxrwxr-x 2 zmbc Domain Users 512 Apr  4 17:34 tax_1040_observer
drwxrwxr-x 2 zmbc Domain Users 512 Apr  4 17:34 tax_dependents_observer
drwxrwxr-x 2 zmbc Domain Users 512 Apr  4 17:34 tax_w2_observer
drwxrwxr-x 2 zmbc Domain Users 512 Apr  4 17:34 wic_observer


In [5]:
! ls -lh $pseudopeople_input_path/social_security_observer

total 448K
-rwxr-xr-x 1 zmbc Domain Users 443K Apr  4 17:34 social_security_observer.hdf


In [6]:
%%time

ssa = pseudopeople.generate_social_security(pseudopeople_input_path / 'social_security_observer/social_security_observer.hdf', configuration='case_study_config.yaml')
# We could set a date cutoff here, but since we are linking the 2030 census, it would be right around the time our sim ends anyway.
# Also, setting a date cutoff would drop those with missing event_date, which is probably correct (but we should revisit levels of noise
# in this data).
ssa

CPU times: user 595 ms, sys: 33.9 ms, total: 629 ms
Wall time: 646 ms


Unnamed: 0,date_of_birth,middle_initial,first_name,last_name,simulant_id,ssn,event_date,event_type
0,1922-02-23 00:00:00,D,Margaret,Obar,0_16801,786-77-6454,1922-02-23 00:00:00,creation
1,1922-07-17 00:00:00,A,Edna,Austin,0_18069,688-88-6377,1922-07-17 00:00:00,creation
3,1922-10-15 00:00:00,D,Josephine,Keath,0_7245,665-25-7858,1922-10-15 00:00:00,creation
7,1923-08-11 00:00:00,D,Joqn,Menchaca-Silva,0_6068,102-60-0838,1923-08-11 00:00:00,creation
8,1923-08-21 00:00:00,M,Mary,Moore,0_4393,494-11-1947,1923-08-21 00:00:00,creation
...,...,...,...,...,...,...,...,...
24633,1955-08-07 00:00:00,E,Thomas,Green,0_9164,104-19-7348,2030-05-15 00:00:00,death
24634,1956-02-12 00:00:00,D,Patricia,Theiss,0_19164,874-32-8890,2030-05-15 00:00:00,death
24635,1959-05-27 00:00:00,K,Rhonda,Mcgraw,0_16742,864-04-0094,2030-05-15 00:00:00,death
24636,,J,Gerald,Disalvk,0_18457,564-95-5073,2030-05-15 00:00:00,death


In [7]:
%%time

w2_1099 = pseudopeople.generate_taxes_w2_and_1099(pseudopeople_input_path / 'tax_w2_observer/tax_w2_observer.hdf', configuration='case_study_config.yaml')
w2_1099

CPU times: user 5.72 s, sys: 218 ms, total: 5.94 s
Wall time: 5.97 s


Unnamed: 0,mailing_address_unit_number,employer_street_number,middle_initial,income,last_name,tax_form,mailing_address_city,mailing_address_state,mailing_address_zipcode,employer_zipcode,...,simulant_id,employer_unit_number,employer_state,ssn,tax_year,employer_id,employer_name,employer_city,mailing_address_street_number,employer_street_name
0,,e,M,5889.171749,Shipp,W2,Anytown,US,,99999,...,0_4,,US,828-99-4653,2020,87,Transformation Center 3 Emergency Veterinary H...,Anytown,1312,ince dr
1,,,N,46085.84201,Shupp,W2,Anytown,US,99999,99999,...,0_5,,US,885-38-0858,2020,12,Jj Rubys Salon Studios,Anytown,1349,stoney crk cir
2,,65,C,26513.522956,Holt,W2,Anytown,US,99999,99999,...,0_2464,,US,584-19-7087,2020,57,ReStore and Yogurt Parlour Salon,Anytown,46,s west ave
3,,2329,S,20326.460457,Holt,W2,Anytown,US,99469,99999,...,0_3558,,US,813-32-2963,2020,36,Council of Taos Suby Repair,Anytown,9112,greenwood wy
4,,309,S,4719.617448,Holt,W2,Anytown,,99999,99999,...,0_3558,,US,813-32-2963,2020,90,Northwell Health Center Inc,Anytown,9112,n 52nd st
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93300,,9641,C,5324.699057,Rubio,W2,Anytown,US,99999,99999,...,0_12671,,US,796-80-2013,2029,50,The Church of America,Anytown,2569,s cr 15 rd
93301,,4135,C,16390.413647,Rubio,W2,Anytown,US,99999,99999,...,0_12671,,US,796-80-2013,2029,100,Twin Boro Auto Repair Service (USPS),Anytown,2569,yarmont way
93302,,4946,K,13313.581513,Hughes,W2,Anytown,US,99999,99999,...,0_18073,,US,105-49-3492,2029,30,Warrensburg,Anytown,21,w bluebird pl
93303,,9641,K,18809.325345,Hughes,W2,Anytown,US,99899,99999,...,0_18073,,US,105-49-3492,2029,50,The Church of America,Anytown,21,


In [8]:
%%time

census = pseudopeople.generate_decennial_census(pseudopeople_input_path / 'decennial_census_observer/decennial_census_observer.hdf', configuration='case_study_config.yaml')
census

CPU times: user 870 ms, sys: 16.7 ms, total: 887 ms
Wall time: 910 ms


Unnamed: 0,street_number,middle_initial,last_name,sex,guardian_1,year,housing_type,city,race_ethnicity,relation_to_household_head,age,unit_number,date_of_birth,first_name,street_name,simulant_id,guardian_2,zipcode,state
0,1312,M,Shipp,Female,0_-1,2020,Standard,Anytown,Black,Reference person,76,,1944-03-23 00:00:00,Maureen,cohmonweaigh avnje,0_4,0_-1,99699,US
1,1312,N,Shupp,Male,0_-1,2020,Standard,Anytown,Black,Opp-sex spouse,69,,1951-03-05 00:00:00,Richard,commonwealth avnue,0_5,0_-1,99999,US
2,46,C,Holt,Female,0_-1,2020,Standard,Anytown,Black,Reference person,66,,1953-08-03 00:00:00,Darlene,bancroft st,0_2464,0_-1,99999,US
3,9112,S,Holt,Female,0_-1,2020,Standard,Anytown,Black,Reference person,42,,1977-05-22 00:00:00,Tabatha,janis ne,0_3558,0_-1,99999,US
4,9112,M,Holt,Female,0_-1,2020,Standard,Anytown,Black,Biological child,25,,1994-12-16 00:00:00,Alice,janis ne,0_3559,0_-1,99999,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19949,5700,S,Wasser,Male,0_-1,2030,Standard,Anytown,White,Reference person,66,,1963-09-21 00:00:00,John,n cr 400 e,0_17995,0_-1,99999,US
19950,21,K,Hughes,Male,0_-1,2030,Standard,Anytown,Black,Reference person,77,,1952-10-15 00:00:00,Clyde,mammoth springs dr,0_18073,0_-1,92999,US
19951,8728,DC,Walker,Female,0_14366,2030,Standard,Anytown,Black,Reference person,2,,2027-11-11 00:00:00,Stephanie,,0_22588,0_-1,99999,US
19952,1900,D,Foster,Male,0_-1,2030,Standard,Anytown,White,Reference person,23,,2006-10-10 00:00:00,Austin,ranch loop,0_22759,0_-1,99999,US


In [9]:
census.year.value_counts(dropna=False)

year
2020    9979
2030    9975
Name: count, dtype: int64

In [10]:
census_2030 = census[census.year == 2030]
census_2030

Unnamed: 0,street_number,middle_initial,last_name,sex,guardian_1,year,housing_type,city,race_ethnicity,relation_to_household_head,age,unit_number,date_of_birth,first_name,street_name,simulant_id,guardian_2,zipcode,state
19,46,C,Holt,Female,0_-1,2030,Standard,Anytown,Black,Reference person,76,,1953-08-03 00:00:00,Darlene,bancroft st,0_2464,0_-1,99999,US
20,4732,C,Hoyt,Male,0_6065,2030,Standard,,Latino,Other nonrelative,12,,2018-03-13 00:00:00,Matteo,klauber ave,0_6066,0_-1,99999,US
21,5413,J,Ash,Female,0_-1,2030,Standard,Anytown,Black,Reference person,80,,1949-05-08 00:00:00,Sherry,kiely blvd,0_16850,0_-1,99998,US
22,4738,C,Langenfeld,Female,0_-1,2030,Standard,Anytown,White,Reference person,69,,1960-12-06 00:00:00,Denise,klauber ave,0_17160,0_-1,99999,US
23,12031,C,Shepherd,Female,0_21129,2030,Standard,Anytown,Black,Reference person,4,,2045-06-05 01:20:00,Sophia,east oakton drive,0_21752,0_-1,99999,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19949,5700,S,Wasser,Male,0_-1,2030,Standard,Anytown,White,Reference person,66,,1963-09-21 00:00:00,John,n cr 400 e,0_17995,0_-1,99999,US
19950,21,K,Hughes,Male,0_-1,2030,Standard,Anytown,Black,Reference person,77,,1952-10-15 00:00:00,Clyde,mammoth springs dr,0_18073,0_-1,92999,US
19951,8728,DC,Walker,Female,0_14366,2030,Standard,Anytown,Black,Reference person,2,,2027-11-11 00:00:00,Stephanie,,0_22588,0_-1,99999,US
19952,1900,D,Foster,Male,0_-1,2030,Standard,Anytown,White,Reference person,23,,2006-10-10 00:00:00,Austin,ranch loop,0_22759,0_-1,99999,US


## Deterministically get a recent address

In [11]:
# Within each year, we do not have a date field for W2. So we don't know which order jobs happened in.
# We take the address associated with the most income in the most recent year. Note that part or all
# of this address may be missing.
w2_1099.groupby(['tax_year', 'ssn']).mailing_address_street_name.nunique(dropna=False).sort_values()

tax_year  ssn        
2020      001-16-0077    1
2026      603-44-0253    1
          603-65-5538    1
          603-76-1835    1
          603-81-9368    1
                        ..
2021      857-06-7979    3
2029      703-41-9284    4
2023      755-15-6662    4
2020      590-53-1851    4
2028      452-98-1553    4
Name: mailing_address_street_name, Length: 67275, dtype: int64

In [12]:
w2_1099.ssn.nunique()

13649

In [13]:
# We could consider adding a year cutoff here (e.g. to be recent it needs to be at least in 2025).
# But I think for linkage, the more information, the better.
recent_addresses = (
    w2_1099.assign(income=lambda x: x.income.fillna(0).astype(float)).sort_values(['tax_year', 'income'], ascending=False).groupby('ssn').first().filter(like='mailing_address')
)
recent_addresses

Unnamed: 0_level_0,mailing_address_unit_number,mailing_address_city,mailing_address_state,mailing_address_zipcode,mailing_address_street_name,mailing_address_po_box,mailing_address_street_number
ssn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
001-02-4588,,Anytown,US,99999,sunburst terrace,0,106
001-16-0077,,Anytown,US,99999,narst mill rd,0,
001-17-9511,,Anytown,US,99999,capt t ln,0,14107
001-30-0491,,Anytown,US,99999,wst euclid avenue,0,233
001-33-5249,no 50,Anytown,US,99999,devonshire wy,0,606
...,...,...,...,...,...,...,...
968-92-4966,,Anytown,US,99999,37th xtteeh ter,0,3919
973-37-8383,,Anytown,US,09999,w mary ave,0,23134
975-45-6490,,Anytown,US,99989,padato csoutjh circle,0,2358
975-68-7318,,Anytown,US,99999,ebony street,0,3475


In [14]:
# Everyone with any taxes has a most recent address.
assert set(recent_addresses.index) == set(w2_1099.ssn.dropna().unique())

In [15]:
# Some people won't have a recent address, if they don't have any taxes.
ssa[~ssa.ssn.isin(recent_addresses.index)]

Unnamed: 0,date_of_birth,middle_initial,first_name,last_name,simulant_id,ssn,event_date,event_type
9,1923-09-05 00:00:00,A,Edith,Campbell,0_19817,103-21-8846,1923-09-05 00:00:00,creation
59,1925-11-12 00:00:00,D,Margaret,Sumner,0_5785,407-81-9766,1925-11-12 00:00:00,creation
103,1927-10-30 00:00:00,A,Dorothy,Kjlles,0_7824,723-87-9412,1927-10-30 00:00:00,death
113,1928-05-30 00:00:00,H,Peggy,Biel,0_15226,107-61-9780,1928-05-30 00:00:00,creation
124,1928-11-30 00:00:00,R,Beverly,Harding,0_15069,462-33-7739,1928-11-30 00:00:00,creation
...,...,...,...,...,...,...,...,...
24575,2030-03-31 00:00:00,M,Alani,Andrade,0_23415,193-80-5960,2030-03-20 00:00:00,creation
24576,2030-04-01 00:00:00,A,Olivia,Moles,0_23419,742-47-6307,2030-03-20 00:00:00,creation
24577,2030-04-02 00:00:00,M,Ava,Lint,0_23426,424-75-4102,2030-03-20 00:00:00,creation
24596,1979-08-17 00:00:00,C,William,Becker,0_13937,218-25-7331,2030-04-17 00:00:00,death


## Create a fake Numident file

In [16]:
# The probable real-life approach would be: take date of birth from the (first) creation event, date of death (if any)
# from the (last) death event, name from the most recent event of any kind.
# We don't want to throw out events with a missing/invalid date, so we'll fill them with the value *least* likely to be chosen
# (early if taking the latest, late if taking the earliest).
fill_dates = lambda df, fill_type: pd.to_datetime(df.event_date, errors='coerce').fillna(pd.Timestamp('2100-01-01' if fill_type == 'latest' else '1900-01-01'))

date_of_birth = (
    ssa[ssa.event_type == 'creation']
        .assign(event_date_for_sort=lambda df: fill_dates(df, 'latest'))
        .sort_values('event_date_for_sort')
        .groupby('ssn')
        .date_of_birth.first()
)
date_of_death = (
    ssa[ssa.event_type == 'death']
        .assign(event_date_for_sort=lambda df: fill_dates(df, 'earliest'))
        .sort_values('event_date_for_sort')
        .groupby('ssn')
        .event_date.last()
        .rename('date_of_death')
)
name = (
    ssa
        .assign(event_date_for_sort=lambda df: fill_dates(df, 'earliest'))
        .sort_values('event_date_for_sort')
        .groupby('ssn')
        .last()[['first_name', 'middle_initial', 'last_name']]
)

# What to do about ground truth here? This simple linkage could already be introducing errors, since SSN is not without noise!
# For now, I'll take the most common ground truth.
simulant_id = ssa.groupby('ssn').simulant_id.agg(lambda x: pd.Series.mode(x)[0])

fake_numident = pd.DataFrame(simulant_id).join(date_of_birth, how='left').join(name, how='left').join(date_of_death, how='left').reset_index()
fake_numident

Unnamed: 0,ssn,simulant_id,date_of_birth,first_name,middle_initial,last_name,date_of_death
0,000-73-0165,0_20663,2022-03-15 00:00:00,Alexander,M,Wood,
1,001-02-4588,0_2861,2009-12-16 00:00:00,Jamaya,S,Pingol,
2,001-16-0077,0_18263,1970-10-21 00:00:00,Brenda,L,Kiefer,
3,001-17-9511,0_7457,1967-04-22 00:00:00,Robert,G,Ortiz,
4,001-30-0491,0_4183,1947-11-23 00:00:00,Ruth,H,Blair,
...,...,...,...,...,...,...,...
15281,899-97-5729,0_2944,2020-03-04 00:00:00,Atlas,A,Palafox-Gutierrez,
15282,934-29-6471,0_3037,1981-12-31 00:00:00,Charlene,X,Griffith,
15283,938-11-1538,0_11244,1980-01-19 00:00:00,Tracy,M,Schweich,
15284,955-16-6917,0_16732,1972-01-16 00:00:00,Elizabeth,M,Xie,


In [17]:
# Most people have not died
fake_numident.date_of_death.isnull().mean()

0.9162632474159361

## Create a composite reference file for linking

In [18]:
reference_file = (
    # Exclude those who have died before the census
    fake_numident[~(pd.to_datetime(fake_numident.date_of_death, errors='coerce') <= pd.Timestamp('2020-04-01'))]
        .merge(recent_addresses, on='ssn', how='left')
)
reference_file

Unnamed: 0,ssn,simulant_id,date_of_birth,first_name,middle_initial,last_name,date_of_death,mailing_address_unit_number,mailing_address_city,mailing_address_state,mailing_address_zipcode,mailing_address_street_name,mailing_address_po_box,mailing_address_street_number
0,000-73-0165,0_20663,2022-03-15 00:00:00,Alexander,M,Wood,,,,,,,,
1,001-02-4588,0_2861,2009-12-16 00:00:00,Jamaya,S,Pingol,,,Anytown,US,99999,sunburst terrace,0,106
2,001-16-0077,0_18263,1970-10-21 00:00:00,Brenda,L,Kiefer,,,Anytown,US,99999,narst mill rd,0,
3,001-17-9511,0_7457,1967-04-22 00:00:00,Robert,G,Ortiz,,,Anytown,US,99999,capt t ln,0,14107
4,001-30-0491,0_4183,1947-11-23 00:00:00,Ruth,H,Blair,,,Anytown,US,99999,wst euclid avenue,0,233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15217,899-97-5729,0_2944,2020-03-04 00:00:00,Atlas,A,Palafox-Gutierrez,,,,,,,,
15218,934-29-6471,0_3037,1981-12-31 00:00:00,Charlene,X,Griffith,,,,,,,,
15219,938-11-1538,0_11244,1980-01-19 00:00:00,Tracy,M,Schweich,,,,,,,,
15220,955-16-6917,0_16732,1972-01-16 00:00:00,Elizabeth,M,Xie,,,,,,,,


# Pre-process the data

Not much needed here because the datasets are already so tidy and similar to each other.

In [19]:
# Add a unique record ID
reference_file = reference_file.reset_index(drop=True).reset_index().rename(columns={'index': 'record_id'})
census_2030 = census_2030.reset_index(drop=True).reset_index().rename(columns={'index': 'record_id'})

# Remove ground truth
reference_file_ground_truth = reference_file.pop('simulant_id')
census_2030_ground_truth = census_2030.pop('simulant_id')

In [20]:
# Use true missingness instead of empty string
reference_file = reference_file.replace('', np.nan)
census_2030 = census_2030.replace('', np.nan)

In [21]:
# We want to compare mailing address with physical address
reference_file = reference_file.rename(columns=lambda c: c.replace('mailing_address_', ''))

In [22]:
# Purely for ease of use, order the columns nicely
reference_file_columns_order = [
    'record_id',
    'ssn',
    'first_name', 'middle_initial', 'last_name',
    'date_of_birth',
    'date_of_death',
    'street_number', 'street_name', 'unit_number', 'city', 'state', 'zipcode', 'po_box',
]
assert set(reference_file_columns_order) == set(reference_file.columns)
reference_file = reference_file[reference_file_columns_order]

In [23]:
census_columns_order = [
    'record_id',
    'first_name', 'middle_initial', 'last_name',
    'sex', 'race_ethnicity', 'age', 'date_of_birth',
    'housing_type', 'relation_to_household_head',
    'street_number', 'street_name', 'unit_number', 'city', 'state', 'zipcode',
]
assert (set(census_columns_order) | {'guardian_1', 'guardian_2', 'year'}) == set(census_2030.columns)
census_2030 = census_2030[census_columns_order]

In [24]:
# My working theory: the purpose of the "geokey" is because address parts violate conditional independence
get_geokey = lambda x: (x.street_number + ' ' + x.street_name + ' ' + x.unit_number.fillna('') + ' ' + x.city + ' ' + x.state + ' ' + x.zipcode).str.strip().str.split().str.join(' ')
reference_file = reference_file.assign(geokey=get_geokey)
census_2030 = census_2030.assign(geokey=get_geokey)

In [25]:
# Add columns used to "cut the database": ZIP3 and a grouping of first and last initial
reference_file = reference_file.assign(zip3=lambda x: x.zipcode.str[:3])
census_2030 = census_2030.assign(zip3=lambda x: x.zipcode.str[:3])

# Page 20 of the NORC report: "Name-cuts are defined by combinations of the first characters of the first and last names. The twenty letter groupings
# for the first character are: A-or-blank, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, and U-Z."
initial_cut = lambda x: x.fillna('A').str[0].replace('A', 'A-or-blank').replace(['U', 'V', 'W', 'X', 'Y', 'Z'], 'U-Z')
reference_file = reference_file.assign(first_initial_cut=lambda x: initial_cut(x.first_name), last_initial_cut=lambda x: initial_cut(x.last_name))
census_2030 = census_2030.assign(first_initial_cut=lambda x: initial_cut(x.first_name), last_initial_cut=lambda x: initial_cut(x.last_name))

# Data to link

Note: I am unclear on how this works with alternate names and addresses. Should there be duplicate rows in the reference file?

In [26]:
reference_file

Unnamed: 0,record_id,ssn,first_name,middle_initial,last_name,date_of_birth,date_of_death,street_number,street_name,unit_number,city,state,zipcode,po_box,geokey,zip3,first_initial_cut,last_initial_cut
0,0,000-73-0165,Alexander,M,Wood,2022-03-15 00:00:00,,,,,,,,,,,A-or-blank,U-Z
1,1,001-02-4588,Jamaya,S,Pingol,2009-12-16 00:00:00,,106,sunburst terrace,,Anytown,US,99999,0,106 sunburst terrace Anytown US 99999,999,J,P
2,2,001-16-0077,Brenda,L,Kiefer,1970-10-21 00:00:00,,,narst mill rd,,Anytown,US,99999,0,,999,B,K
3,3,001-17-9511,Robert,G,Ortiz,1967-04-22 00:00:00,,14107,capt t ln,,Anytown,US,99999,0,14107 capt t ln Anytown US 99999,999,R,O
4,4,001-30-0491,Ruth,H,Blair,1947-11-23 00:00:00,,233,wst euclid avenue,,Anytown,US,99999,0,233 wst euclid avenue Anytown US 99999,999,R,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15217,15217,899-97-5729,Atlas,A,Palafox-Gutierrez,2020-03-04 00:00:00,,,,,,,,,,,A-or-blank,P
15218,15218,934-29-6471,Charlene,X,Griffith,1981-12-31 00:00:00,,,,,,,,,,,C,G
15219,15219,938-11-1538,Tracy,M,Schweich,1980-01-19 00:00:00,,,,,,,,,,,T,S
15220,15220,955-16-6917,Elizabeth,M,Xie,1972-01-16 00:00:00,,,,,,,,,,,E,U-Z


In [27]:
census_2030

Unnamed: 0,record_id,first_name,middle_initial,last_name,sex,race_ethnicity,age,date_of_birth,housing_type,relation_to_household_head,street_number,street_name,unit_number,city,state,zipcode,geokey,zip3,first_initial_cut,last_initial_cut
0,0,Darlene,C,Holt,Female,Black,76,1953-08-03 00:00:00,Standard,Reference person,46,bancroft st,,Anytown,US,99999,46 bancroft st Anytown US 99999,999,D,H
1,1,Matteo,C,Hoyt,Male,Latino,12,2018-03-13 00:00:00,Standard,Other nonrelative,4732,klauber ave,,,US,99999,,999,M,H
2,2,Sherry,J,Ash,Female,Black,80,1949-05-08 00:00:00,Standard,Reference person,5413,kiely blvd,,Anytown,US,99998,5413 kiely blvd Anytown US 99998,999,S,A-or-blank
3,3,Denise,C,Langenfeld,Female,White,69,1960-12-06 00:00:00,Standard,Reference person,4738,klauber ave,,Anytown,US,99999,4738 klauber ave Anytown US 99999,999,D,L
4,4,Sophia,C,Shepherd,Female,Black,4,2045-06-05 01:20:00,Standard,Reference person,12031,east oakton drive,,Anytown,US,99999,12031 east oakton drive Anytown US 99999,999,S,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9970,9970,John,S,Wasser,Male,White,66,1963-09-21 00:00:00,Standard,Reference person,5700,n cr 400 e,,Anytown,US,99999,5700 n cr 400 e Anytown US 99999,999,J,U-Z
9971,9971,Clyde,K,Hughes,Male,Black,77,1952-10-15 00:00:00,Standard,Reference person,21,mammoth springs dr,,Anytown,US,92999,21 mammoth springs dr Anytown US 92999,929,C,H
9972,9972,Stephanie,DC,Walker,Female,Black,2,2027-11-11 00:00:00,Standard,Reference person,8728,,,Anytown,US,99999,,999,S,U-Z
9973,9973,Austin,D,Foster,Male,White,23,2006-10-10 00:00:00,Standard,Reference person,1900,ranch loop,,Anytown,US,99999,1900 ranch loop Anytown US 99999,999,A-or-blank,F


In [28]:
%store reference_file census_2030 reference_file census_2030 reference_file_ground_truth census_2030_ground_truth

Stored 'reference_file' (DataFrame)
Stored 'census_2030' (DataFrame)
Stored 'reference_file' (DataFrame)
Stored 'census_2030' (DataFrame)
Stored 'reference_file_ground_truth' (Series)
Stored 'census_2030_ground_truth' (Series)


# Implement PVS-like matching with `splink`

## Estimate parameters (lambda, m, u) once for both modules

In reality these parameters are not estimated from the data.
It is unclear to me whether they are actually the same for both modules or even for different passes of the same module.

In [29]:
# temp: reload
%store -r reference_file census_2030

import pandas as pd, numpy as np

In [30]:
common_cols = [c for c in reference_file.columns if c in census_2030.columns]
common_cols

['record_id',
 'first_name',
 'middle_initial',
 'last_name',
 'date_of_birth',
 'street_number',
 'street_name',
 'unit_number',
 'city',
 'state',
 'zipcode',
 'geokey',
 'zip3',
 'first_initial_cut',
 'last_initial_cut']

In [31]:
def prep_table_for_splink(df):
    return (
        df[common_cols]
            .assign(date_of_birth=lambda x: x.date_of_birth.astype(str))
            .rename(columns={'record_id': 'unique_id'})
    )

tables_for_splink = [prep_table_for_splink(reference_file), prep_table_for_splink(census_2030)]

In [32]:
[len(t) for t in tables_for_splink]

[15222, 9975]

In [33]:
# estimate_probability_two_random_records_match did not seem to give me a reasonable estimate
# we estimate that around 90% of the census are present in the reference file
probability_two_random_records_match = (0.90 * len(census_2030)) / (len(reference_file) * len(census_2030))
probability_two_random_records_match

5.912495072920773e-05

In [34]:
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb.duckdb_comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
)

settings = {
    "link_type": "link_only",
    "comparisons": [
        levenshtein_at_thresholds("first_name", 2, term_frequency_adjustments=True),
        exact_match("middle_initial"),
        levenshtein_at_thresholds("last_name", 2, term_frequency_adjustments=True),
        # For some reason, this makes everything crash!?
        # levenshtein_at_thresholds("date_of_birth", 1),
        exact_match("date_of_birth"),
        levenshtein_at_thresholds("geokey", 5),
    ],
    "probability_two_random_records_match": probability_two_random_records_match
}

linker = DuckDBLinker(
    tables_for_splink,
    settings,
    input_table_aliases=["reference_file", "census_2030"]
)

# NOTE: This is not reproducible!
linker.estimate_u_using_random_sampling(max_pairs=1e5)

blocking_rule_for_training = "l.first_name = r.first_name and l.last_name = r.last_name"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)

blocking_rule_for_training = "l.geokey = r.geokey"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - middle_initial (no m values are trained).
    - last_name (no m values are trained).
    - date_of_birth (no m values are trained).
    - geokey (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name and l.last_name = r.last_name

Parameter estimates will be made for the following comparison(s):
    - middle_initial
    - date_of_birth
    - geokey

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name
    - last_name

Iteration 1: Largest change in params was 0.438 in the m_probability of geokey, level `Levenshtein <= 5`
Iteration 2: Largest change in params was 0.00283 in probability_two_rando

<EMTrainingSession, blocking on l.geokey = r.geokey, deactivating comparisons geokey>

In [35]:
linker.match_weights_chart()

In [36]:
# NOTE: First name looks very wrong. I have not yet figured out why this is.
linker.m_u_parameters_chart()

In [37]:
splink_settings = linker._settings_obj.as_dict()

In [38]:
PROBABILITY_THRESHOLD = 0.85

In [39]:
%store splink_settings PROBABILITY_THRESHOLD

Stored 'splink_settings' (dict)
Stored 'PROBABILITY_THRESHOLD' (float)


## Implement matching passes

In [40]:
# Calculate this once to save time -- mapping from record_id to index of Census dataframe
census_index_of_ids = census_2030.reset_index().set_index('record_id')['index']

# TODO: Have this function output more charts and diagnostics
def pvs_matching_pass(blocking_cols):
    tables_for_splink = [prep_table_for_splink(reference_file), prep_table_for_splink(census_2030[census_2030.pik.isnull()])]

    blocking_rule_parts = [f"l.{col} = r.{col}" for col in blocking_cols]
    blocking_rule = " and ".join(blocking_rule_parts)
    linker = DuckDBLinker(
        tables_for_splink,
        {**splink_settings, **{
            "blocking_rules_to_generate_predictions": [blocking_rule],
        }},
        input_table_aliases=["reference_file", "census_2030"]
    )

    all_predictions = linker.predict().as_pandas_dataframe()
    all_combos = all_predictions.groupby(list(all_predictions.filter(like='gamma_').columns)).match_probability.agg(['mean', 'count']).sort_values('mean')

    potential_links = linker.predict(threshold_match_probability=PROBABILITY_THRESHOLD).as_pandas_dataframe()
    print(f'{len(potential_links)} links above threshold')

    # Post-processing: deal with multiple matches
    # According to the report, it is frequently the case that the post-processing rule doesn't assign *any* matches when there are multiple
    # So I'm replicating that feature with a very simple algorithm
    pik_pairs = potential_links.sort_values('match_weight', ascending=False).groupby(['unique_id_l']).first()
    runner_up_pairs = potential_links.sort_values('match_weight', ascending=False).groupby(['unique_id_l']).nth(2)
    pik_pairs = pik_pairs.join(runner_up_pairs.match_weight.rename('runner_up_match_weight'), how='left')
    pairs_to_keep = ~(pik_pairs.runner_up_match_weight > pik_pairs.match_weight + 0.5)
    print(f'{len(pairs_to_keep)} matches remain after dealing with multiple matches')
    pik_pairs = pik_pairs[pairs_to_keep]
    # Make pik_pairs index into the census_2030 dataframe
    pik_pairs = pik_pairs.set_index(pik_pairs.index.map(census_index_of_ids))
    
    census_2030.loc[pik_pairs.index, 'pik'] = pik_pairs.unique_id_r
    print(f'Matched {len(pik_pairs)} records; {census_2030.pik.isnull().mean():.2%} still unmatched')
    
    return all_combos, pik_pairs

# GeoSearch

> There are six passes through GeoSearch defined currently for an ACS PVS run. These passes use the first
  three digits of an address ZIP code (ZIP3) as a database “cutting” strategy...
>
> The GeoSearch matching
  variables include name and DOB, but also several variables derived from the Geokey (street name, house
  number, etc).

[(source)](https://www.norc.org/PDFs/May%202011%20Personal%20Validation%20and%20Entity%20Resolution%20Conference/PVS%20Assessment%20Report%20FINAL%20JULY%202011.pdf)

In [41]:
# temp: reload everything
%store -r

import pandas as pd, numpy as np
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb.duckdb_comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
)

In [42]:
def geosearch_pass(blocking_cols):
    return pvs_matching_pass(["zip3"] + blocking_cols)

## Pass 1: block on full name and entire address

In [43]:
census_2030['pik'] = np.nan

In [44]:
all_combos, pik_pairs = geosearch_pass(["first_name", "middle_initial", "last_name", "geokey"])

2348 links above threshold
2348 matches remain after dealing with multiple matches
Matched 2348 records; 76.46% still unmatched


### Look at diagnostics

In [45]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,2,0,2,0.999959,144
2,1,2,1,2,1.0,2204


In [46]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,gamma_last_name,date_of_birth_l,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,37.044501,1.0,census_2030,reference_file,10774,Patrica,Patrica,2,M,M,...,2,1964-11-28 00:00:00,1964-11-28 00:00:00,1,427 e 72 st Anytown US 99999,427 e 72 st Anytown US 99999,2,999,999,
8,30.497606,1.0,census_2030,reference_file,12112,Bruce,Bruce,2,D,D,...,2,1950-07-18 00:00:00,1950-07-18 00:00:00,1,8126 lerer lane Anytown US 99999,8126 lerer lane Anytown US 99999,2,999,999,
9,36.366429,1.0,census_2030,reference_file,5363,Ronda,Ronda,2,L,L,...,2,1968-10-20 00:00:00,1968-10-20 00:00:00,1,1707 tyler street Anytown US 99999,1707 tyler street Anytown US 99999,2,999,999,
12,29.353106,1.0,census_2030,reference_file,5544,Nathaniel,Nathaniel,2,D,D,...,2,2009-03-11 00:00:00,2009-03-11 00:00:00,1,627 south champlain avenue Anytown US 99999,627 south champlain avenue Anytown US 99999,2,999,999,
13,31.450549,1.0,census_2030,reference_file,2443,Candace,Candace,2,K,K,...,2,1983-05-30 00:00:00,1983-05-30 00:00:00,1,189-36 eagles nest tr Anytown US 99999,189-36 eagles nest tr Anytown US 99999,2,999,999,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9951,33.906997,1.0,census_2030,reference_file,15090,Austin,Austin,2,J,J,...,2,1997-07-17 00:00:00,1997-07-17 00:00:00,1,2780 cr 64 Anytown US 99999,2780 cr 64 Anytown US 99999,2,999,999,
9952,31.196504,1.0,census_2030,reference_file,4138,Lori,Lori,2,H,H,...,2,1983-12-31 00:00:00,1983-12-31 00:00:00,1,2569 ocean avenue Anytown US 99999,2569 ocean avenue Anytown US 99999,2,999,999,
9954,32.196504,1.0,census_2030,reference_file,471,Alana,Alana,2,K,K,...,2,2002-09-20 00:00:00,2002-09-20 00:00:00,1,2569 ocean avenue Anytown US 99999,2569 ocean avenue Anytown US 99999,2,999,999,
9955,29.892723,1.0,census_2030,reference_file,13481,Emily,Emily,2,C,C,...,2,2004-01-12 00:00:00,2004-01-12 00:00:00,1,2569 ocean avenue Anytown US 99999,2569 ocean avenue Anytown US 99999,2,999,999,


## Pass 2: Block on first name and entire address

In [47]:
all_combos, pik_pairs = geosearch_pass(["first_name", "geokey"])

649 links above threshold
649 matches remain after dealing with multiple matches
Matched 649 records; 69.95% still unmatched


### Look at diagnostics

In [48]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,0,0,2,0.004315,42
2,-1,0,0,2,0.03247,1
2,1,0,0,2,0.17168,12
2,0,-1,0,2,0.212447,2
2,0,1,0,2,0.909688,1
2,0,0,1,2,0.977448,1
2,1,-1,0,2,0.994209,2
2,1,1,0,2,0.999247,10
2,0,2,0,2,0.999327,18
2,1,0,1,2,0.999909,15


In [49]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,gamma_last_name,date_of_birth_l,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32,21.205690,1.000000,census_2030,reference_file,7056,Jessica,Jessica,2,H,J,...,2,2003-12-02 00:00:00,2003-12-02 00:00:00,1,5701 princeton avenue Anytown US 99999,5701 princeton avenue Anytown US 99999,2,999,999,
50,12.480045,0.999825,census_2030,reference_file,6872,Caroline,Caroline,2,S,S,...,1,1573-01-12 00:00:00,1973-01-12 00:00:00,0,5321 harlow court Anytown US 99999,5321 harlow court Anytown US 99999,2,999,999,
77,24.623168,1.000000,census_2030,reference_file,1511,Ashley,Ashley,2,Y,J,...,2,1991-04-01 00:00:00,1991-04-01 00:00:00,1,2 w altadena ave Anytown US 99999,2 w altadena ave Anytown US 99999,2,999,999,
124,22.939986,1.000000,census_2030,reference_file,561,Misty,Misty,2,M,M,...,-1,1977-08-26 00:00:00,1977-08-26 00:00:00,1,670 blooming vlly ct Anytown US 99999,670 blooming vlly ct Anytown US 99999,2,999,999,
147,25.205678,1.000000,census_2030,reference_file,11280,Amanda,Amanda,2,J,J,...,1,1982-08-03 00:00:00,1982-08-03 00:00:00,1,9660 yellow lantana ln Anytown US 99999,9660 yellow lantana ln Anytown US 99999,2,999,999,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9925,26.444465,1.000000,census_2030,reference_file,13589,Alyssa,Alyssa,2,C,C,...,1,1997-11-17 00:00:00,1997-11-17 00:00:00,1,9236 ederle st e Anytown US 99999,9236 ederle st e Anytown US 99999,2,999,999,
9946,5.437698,0.977448,census_2030,reference_file,12959,John,John,2,S,W,...,0,1961-12-20 00:00:00,1961-12-20 00:00:00,1,19 weiser court Anytown US 99999,19 weiser court Anytown US 99999,2,999,999,
9961,28.208130,1.000000,census_2030,reference_file,5569,Austin,Austin,2,K,M,...,2,1994-12-15 00:00:00,1994-12-15 00:00:00,1,3527 fairview dr Anytown US 99999,3527 fairview dr Anytown US 99999,2,999,999,
9968,23.759670,1.000000,census_2030,reference_file,1665,William,William,2,M,H,...,2,1994-11-01 00:00:00,1994-11-01 00:00:00,1,102 stonehenge cv Anytown US 99999,102 stonehenge cv Anytown US 99999,2,999,999,


## Pass 3: Block on full name and street address

In [50]:
all_combos, pik_pairs = geosearch_pass(["first_name", "middle_initial", "last_name", "street_number", "street_name"])

240 links above threshold
240 matches remain after dealing with multiple matches
Matched 240 records; 67.55% still unmatched


### Look at diagnostics

In [51]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,2,0,1,0.999979,19
2,1,2,1,-1,0.999998,37
2,1,2,1,1,1.0,184


In [52]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,street_number_l,street_number_r,street_name_l,street_name_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,18.218441,0.999997,census_2030,reference_file,12363,Patrick,Patrick,2,M,M,...,427 e 72 st Anytown US 99998,427 e 72 st Anytown US 99999,1,999,999,427,427,e 72 st,e 72 st,
28,32.220996,1.000000,census_2030,reference_file,9915,Bailey,Bailey,2,K,K,...,20704 12th st ne Anytown US 99919,20704 12th st ne Anytown US 99999,1,999,999,20704,20704,12th st ne,12th st ne,
44,32.636033,1.000000,census_2030,reference_file,5032,Arianna,Arianna,2,K,K,...,515 w market st Anytown US 99969,515 w market st Anytown US 99999,1,999,999,515,515,w market st,w market st,
85,32.899068,1.000000,census_2030,reference_file,5204,Samantha,Samantha,2,T,T,...,15195 n franklin st Anytown US 99999,15195 n franklin st Anytown NV 99999,1,999,999,15195,15195,n franklin st,n franklin st,
109,24.561008,1.000000,census_2030,reference_file,5459,Bernice,Bernice,2,A,A,...,,415 wst wind circel Anytown US 99999,-1,999,999,415,415,wst wind circel,wst wind circel,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9677,31.150607,1.000000,census_2030,reference_file,13443,Ronald,Ronald,2,M,M,...,1 w 93rd ave Anytown PA 99999,1 w 93rd ave Anytown US 99999,1,999,999,1,1,w 93rd ave,w 93rd ave,
9699,16.654540,0.999990,census_2030,reference_file,8134,Ann,Ann,2,B,B,...,35908 maple st Anytown US 99992,35908 maple st Anytown US 99999,1,999,999,35908,35908,maple st,maple st,
9729,32.988335,1.000000,census_2030,reference_file,10565,Julie,Julie,2,J,J,...,5055 dalebrook d Anytown US 99999,5055 dalebrook d Anytown US 99994,1,999,999,5055,5055,dalebrook d,dalebrook d,
9947,32.988335,1.000000,census_2030,reference_file,10194,Anna,Anna,2,K,K,...,252 audubon cir ap 525 Anytown US 99999,252 audubon cir ap 525 Anytown US 99993,1,999,999,252,252,audubon cir,audubon cir,


## Pass 4: Block on first name and street address

In [53]:
all_combos, pik_pairs = geosearch_pass(["first_name", "street_number", "street_name"])

82 links above threshold
82 matches remain after dealing with multiple matches
Matched 82 records; 66.73% still unmatched


### Look at diagnostics

In [54]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,0,0,1,0.003771,1
2,0,0,0,2,0.006695,10
2,1,-1,0,-1,0.021405,1
2,1,0,0,2,0.092779,3
2,1,0,0,1,0.153355,1
2,1,1,0,-1,0.607342,1
2,1,-1,0,1,0.99496,1
2,1,1,0,1,0.999429,1
2,0,2,0,1,0.999459,1
2,-1,2,0,1,0.999864,1


In [55]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,street_number_l,street_number_r,street_name_l,street_name_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
78,24.345525,1.000000,census_2030,reference_file,4797,Nicole,Nicole,2,J,J,...,304 church lane Anytown US 99919,304 church lane Anytown US 99999,1,999,999,304,304,church lane,church lane,
243,27.404418,1.000000,census_2030,reference_file,280,Albert,Albert,2,K,K,...,213 harlan dr Anytown US 99999,213 harlan dr Anytown US 99993,1,999,999,213,213,harlan dr,harlan dr,
351,30.115345,1.000000,census_2030,reference_file,6695,Brandy,Brandy,2,J,,...,11801 e st Anytown US 99999,11801 e st Anytown US 99993,1,999,999,11801,11801,e st,e st,
487,27.404851,1.000000,census_2030,reference_file,10375,Jeremy,Jeremy,2,S,,...,1522 nw pumpkin rdge rd Anytown US 99989,1522 nw pumpkin rdge rd Anytown US 99999,1,999,999,1522,1522,nw pumpkin rdge rd,nw pumpkin rdge rd,
708,23.464088,1.000000,census_2030,reference_file,11138,Jennifer,Jennifer,2,L,P,...,624 waco ln unit no 82 Anytown US 99999,624 waco ln unit no 02 Anytown US 99999,1,999,999,624,624,waco ln,waco ln,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9698,25.218476,1.000000,census_2030,reference_file,9047,Michael,Michael,2,F,R,...,35908 maple st Anytown US 99949,35908 maple st Anytown US 99999,1,999,999,35908,35908,maple st,maple st,
9799,20.110708,0.999999,census_2030,reference_file,6573,Casey,Casey,2,,R,...,,2515 soo marie ave Anytown US 99999,-1,999,999,2515,2515,soo marie ave,soo marie ave,
9810,25.716362,1.000000,census_2030,reference_file,13507,Alexandra,Alexandra,2,K,K,...,301 west dr Anytown US 99994,301 west dr Anytown US 99999,1,999,999,301,301,west dr,west dr,
9824,29.959943,1.000000,census_2030,reference_file,3501,Spencer,Spencer,2,A,X,...,1161 northeast fremont stree Anytkwn US 99999,1161 northeast fremont stree Anytown US 99999,1,999,999,1161,1161,northeast fremont stree,northeast fremont stree,


## Pass 5: Block on first and last name

In [56]:
all_combos, pik_pairs = geosearch_pass(["first_name", "last_name"])

1873 links above threshold
1871 matches remain after dealing with multiple matches
Matched 1871 records; 47.97% still unmatched


### Look at diagnostics

In [57]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,2,0,0,0.00581,44
2,-1,2,0,0,0.017611,1
2,1,2,0,0,0.192619,20
2,0,2,0,-1,0.235477,12
2,1,2,0,-1,0.953909,30
2,0,2,0,1,0.997632,10
2,0,2,1,0,0.999036,4
2,-1,2,0,1,0.999584,1
2,1,2,0,1,0.999967,86
2,1,2,1,0,0.999968,59


In [58]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,gamma_last_name,date_of_birth_l,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,32.134075,1.000000,census_2030,reference_file,9891,Darlene,Darlene,2,C,C,...,2,1953-08-03 00:00:00,1953-08-03 00:00:00,1,46 bancroft st Anytown US 99999,46 bancroft st Antgown US 99929,1,999,999,
2,33.057454,1.000000,census_2030,reference_file,2995,Sherry,Sherry,2,J,J,...,2,1949-05-08 00:00:00,1949-05-08 00:00:00,1,5413 kiely blvd Anytown US 99998,5413 kiely blvd Anytown US 99999,1,999,999,
11,31.456003,1.000000,census_2030,reference_file,3474,Priscilla,Priscilla,2,E,E,...,2,1981-03-11 00:00:00,1981-03-11 00:00:00,1,627 south champlain avenue Anytown US 99999,647 south champlain avenue Anytown US 99999,1,999,999,
18,29.170601,1.000000,census_2030,reference_file,2455,Allison,Allison,2,K,K,...,2,1987-01-03 00:00:00,1987-01-03 00:00:00,1,6270 strand cir Anytown US 99999,6250 strand cir Anytown US 99999,1,999,999,
19,17.073137,0.999993,census_2030,reference_file,13645,Jose,Jose,2,J,J,...,2,2016-12-16 00:00:00,2016-12-16 00:00:00,1,6250 strand cir Anytown US 99999,29396 mason rd Anytown US 99999,0,999,999,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9944,28.822677,1.000000,census_2030,reference_file,12427,Jason,Jason,2,J,J,...,2,1974-02-11 00:00:00,1974-02-11 00:00:00,1,8925 st ignatius ln Anytown US 99999,8925 sft ignatius ln Anytown US 99999,1,999,999,
9945,24.103443,1.000000,census_2030,reference_file,12040,Terri,Terri,2,A,A,...,2,1961-10-21 00:00:00,1961-10-21 00:00:00,1,7846 hancock street Anytown US 99999,,-1,999,999,
9948,33.178469,1.000000,census_2030,reference_file,12562,Kathy,Kathy,2,T,T,...,2,1969-06-12 00:00:00,1969-06-12 00:00:00,1,568 lower wetumpka rd Anytown US 99999,562 lower wetumpka rd Anytown US 99999,1,999,999,
9950,33.745510,1.000000,census_2030,reference_file,685,Sheila,Sheila,2,S,S,...,2,1967-06-23 00:00:00,1967-06-23 00:00:00,1,2002 n sunshine cir Anytown US 99996,2302 n sunshine cir Anytown US 99999,1,999,999,


In [59]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,2,0,0,0.00581,44
2,-1,2,0,0,0.017611,1
2,1,2,0,0,0.192619,20
2,0,2,0,-1,0.235477,12
2,1,2,0,-1,0.953909,30
2,0,2,0,1,0.997632,10
2,0,2,1,0,0.999036,4
2,-1,2,0,1,0.999584,1
2,1,2,0,1,0.999967,86
2,1,2,1,0,0.999968,59


In [60]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,gamma_last_name,date_of_birth_l,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,32.134075,1.000000,census_2030,reference_file,9891,Darlene,Darlene,2,C,C,...,2,1953-08-03 00:00:00,1953-08-03 00:00:00,1,46 bancroft st Anytown US 99999,46 bancroft st Antgown US 99929,1,999,999,
2,33.057454,1.000000,census_2030,reference_file,2995,Sherry,Sherry,2,J,J,...,2,1949-05-08 00:00:00,1949-05-08 00:00:00,1,5413 kiely blvd Anytown US 99998,5413 kiely blvd Anytown US 99999,1,999,999,
11,31.456003,1.000000,census_2030,reference_file,3474,Priscilla,Priscilla,2,E,E,...,2,1981-03-11 00:00:00,1981-03-11 00:00:00,1,627 south champlain avenue Anytown US 99999,647 south champlain avenue Anytown US 99999,1,999,999,
18,29.170601,1.000000,census_2030,reference_file,2455,Allison,Allison,2,K,K,...,2,1987-01-03 00:00:00,1987-01-03 00:00:00,1,6270 strand cir Anytown US 99999,6250 strand cir Anytown US 99999,1,999,999,
19,17.073137,0.999993,census_2030,reference_file,13645,Jose,Jose,2,J,J,...,2,2016-12-16 00:00:00,2016-12-16 00:00:00,1,6250 strand cir Anytown US 99999,29396 mason rd Anytown US 99999,0,999,999,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9944,28.822677,1.000000,census_2030,reference_file,12427,Jason,Jason,2,J,J,...,2,1974-02-11 00:00:00,1974-02-11 00:00:00,1,8925 st ignatius ln Anytown US 99999,8925 sft ignatius ln Anytown US 99999,1,999,999,
9945,24.103443,1.000000,census_2030,reference_file,12040,Terri,Terri,2,A,A,...,2,1961-10-21 00:00:00,1961-10-21 00:00:00,1,7846 hancock street Anytown US 99999,,-1,999,999,
9948,33.178469,1.000000,census_2030,reference_file,12562,Kathy,Kathy,2,T,T,...,2,1969-06-12 00:00:00,1969-06-12 00:00:00,1,568 lower wetumpka rd Anytown US 99999,562 lower wetumpka rd Anytown US 99999,1,999,999,
9950,33.745510,1.000000,census_2030,reference_file,685,Sheila,Sheila,2,S,S,...,2,1967-06-23 00:00:00,1967-06-23 00:00:00,1,2002 n sunshine cir Anytown US 99996,2302 n sunshine cir Anytown US 99999,1,999,999,


# NameSearch

>    The NameSearch module, by contrast, does not use any geographic variables for matching. Only the
>    Name and DOB are used to match. There are four NameSearch passes defined for the ACS. All passes
>    use the first characters of the First and Last names to define cuts...

In [61]:
# temp: reload everything
%store -r

import pandas as pd, numpy as np
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb.duckdb_comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
)

In [62]:
def namesearch_pass(blocking_cols):
    return pvs_matching_pass(["first_initial_cut", "last_initial_cut"] + blocking_cols)

## Pass 1: Block on full name and DOB

In [63]:
all_combos, pik_pairs = namesearch_pass(["first_name", "middle_initial", "last_name", "date_of_birth"])

2218 links above threshold
2218 matches remain after dealing with multiple matches
Matched 2218 records; 25.73% still unmatched


### Look at diagnostics

In [64]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,2,1,0,0.999982,17
2,1,2,1,-1,0.999999,1552
2,1,2,1,1,1.0,649


In [65]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,last_initial_cut_l,last_initial_cut_r,first_initial_cut_l,first_initial_cut_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,23.520758,1.000000,census_2030,reference_file,11083,Denise,Denise,2,C,C,...,1960-12-06 00:00:00,1,4738 klauber ave Anytown US 99999,,-1,L,L,D,D,
20,24.562578,1.000000,census_2030,reference_file,11384,Carmen,Carmen,2,J,J,...,1990-10-26 00:00:00,1,817 wtrs st Anytown US 99999,,-1,B,B,C,C,
25,17.005959,0.999992,census_2030,reference_file,6996,Sophia,Sophia,2,E,E,...,2020-05-05 00:00:00,1,6250 strand cir Anytown US 99999,,-1,A-or-blank,A-or-blank,S,S,
27,19.956554,0.999999,census_2030,reference_file,1490,David,David,2,O,O,...,1999-02-26 00:00:00,1,20704 12th st ne Anytown US 99999,,-1,K,K,D,D,
37,18.810050,0.999998,census_2030,reference_file,4135,Odin,Odin,2,K,K,...,2025-11-06 00:00:00,1,5701 princeton avenue Anytown US 99999,,-1,J,J,O,O,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9960,21.525920,1.000000,census_2030,reference_file,779,Isabella,Isabella,2,B,B,...,2022-01-08 00:00:00,1,,,-1,R,R,I,I,
9963,30.982252,1.000000,census_2030,reference_file,5755,Stephen,Stephen,2,J,J,...,1982-03-01 00:00:00,1,17448 ledwich ave Anytown US 99999,17048 ledwich ave Anytown US 99499,1,L,L,S,S,
9965,24.895153,1.000000,census_2030,reference_file,12446,Grady,Grady,2,K,K,...,2016-03-14 00:00:00,1,17048 ledwich ave Anytown US 99999,,-1,L,L,G,G,
9971,32.110357,1.000000,census_2030,reference_file,1704,Clyde,Clyde,2,K,K,...,1952-10-15 00:00:00,1,21 mammoth springs dr Anytown US 92999,21 mammoth springs dr Anytown US 99999,1,H,H,C,C,


## Pass 2: Block on first name and DOB

In [66]:
all_combos, pik_pairs = namesearch_pass(["first_name", "date_of_birth"])

624 links above threshold
624 matches remain after dealing with multiple matches
Matched 624 records; 19.48% still unmatched


### Look at diagnostics

In [67]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,0,1,0,0.00132,1
2,0,0,1,-1,0.398,2
2,1,0,1,-1,0.952217,4
2,0,1,1,0,0.980982,2
2,0,2,1,0,0.98919,1
2,0,0,1,1,0.997564,3
2,1,-1,1,-1,0.99907,2
2,1,1,1,0,0.99937,9
2,0,1,1,-1,0.999545,16
2,-1,0,1,1,0.999601,1


In [68]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,last_initial_cut_l,last_initial_cut_r,first_initial_cut_l,first_initial_cut_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,12.161360,0.999782,census_2030,reference_file,9340,Kerry,Kerry,2,K,K,...,1974-01-12 00:00:00,1,627 south champlain avenue Anytown US 99999,4231 jeffrey ln unit № 1 Anytown US 99999,0,U-Z,U-Z,K,K,
21,14.058134,0.999941,census_2030,reference_file,8597,Tina,Tina,2,A,A,...,1967-01-19 00:00:00,1,1710 maple hl rd Anytown US 99999,1730 maple hl rd Anytown CO 99999,1,R,R,T,T,
36,17.355165,0.999994,census_2030,reference_file,12516,Elias,Elias,2,A,A,...,2023-02-23 00:00:00,1,27 champions way Anytown US 99999,,-1,B,B,E,E,
97,17.217662,0.999993,census_2030,reference_file,6342,Ellie,Ellie,2,C,C,...,2020-08-18 00:00:00,1,1643 pruneridge ave Anytown US 39999,,-1,P,P,E,E,
100,15.237288,0.999974,census_2030,reference_file,9089,Stephanie,Stephanie,2,O,P,...,1985-03-06 00:00:00,1,12100 south stree Anytown US 99999,,-1,K,K,S,S,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9933,14.337761,0.999952,census_2030,reference_file,6309,Ethan,Ethan,2,W,D,...,2020-06-14 00:00:00,1,8234 freiermuth dr Anytown US 99999,,-1,A-or-blank,A-or-blank,E,E,
9966,24.244324,1.000000,census_2030,reference_file,659,Ryan,Ryan,2,J,J,...,1993-09-13 00:00:00,1,2014 routt Anytown US 99999,2064 routt Anytown US 93999,1,M,M,R,R,
9967,17.055174,0.999993,census_2030,reference_file,13569,Sandra,Sandra,2,B,V,...,1971-07-28 00:00:00,1,,,-1,N,N,S,S,
9972,13.099784,0.999886,census_2030,reference_file,12084,Stephanie,Stephanie,2,DC,C,...,2027-11-11 00:00:00,1,,,-1,U-Z,U-Z,S,S,


## Pass 3: Block on last name and DOB

In [69]:
all_combos, pik_pairs = namesearch_pass(["last_name", "date_of_birth"])

607 links above threshold
606 matches remain after dealing with multiple matches
Matched 606 records; 13.40% still unmatched


### Look at diagnostics

In [70]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,2,1,0,0.989086,1
0,0,2,1,-1,0.995621,1
-1,0,2,1,-1,0.997355,1
-1,1,2,1,0,0.997471,1
1,1,2,1,0,0.998892,7
1,0,2,1,-1,0.999454,21
1,-1,2,1,-1,0.999645,3
0,1,2,1,-1,0.999883,2
-1,1,2,1,-1,0.999925,6
1,1,2,1,-1,0.999979,177


In [71]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,last_initial_cut_l,last_initial_cut_r,first_initial_cut_l,first_initial_cut_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,25.385250,1.000000,census_2030,reference_file,4534,Roxwnnf,Roxanne,1,S,S,...,1952-11-26 00:00:00,1,1641 colfax avenue Anytown US 99999,1641 colfax avenue Anytown US 99999,2,U-Z,U-Z,R,R,
26,14.509474,0.999957,census_2030,reference_file,12764,Bethany,Bethsny,1,G,G,...,2026-06-13 00:00:00,1,6250 strand cir Anytown US 99999,,-1,A-or-blank,A-or-blank,B,B,
43,26.514110,1.000000,census_2030,reference_file,550,Craig,Cgaig,1,J,J,...,1985-02-15 00:00:00,1,515 w market st Anytown US 99999,815 w market st Anytown US 99999,1,L,L,C,C,
51,26.385250,1.000000,census_2030,reference_file,9278,Hamnxah,Hannah,1,A,A,...,1987-08-28 00:00:00,1,4125 greaves avenue Anytown US 99999,4125 greaves avenue Anytown US 99999,2,P,P,H,H,
62,26.777144,1.000000,census_2030,reference_file,13642,Clzkre,Claire,1,M,M,...,2009-11-26 00:00:00,1,4125 greaves avenue Anytown US 99999,4125 greaves avenue Anytown US 99939,1,N,N,C,C,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9815,17.187545,0.999993,census_2030,reference_file,4314,Ejioy,Emily,1,J,J,...,2026-05-05 00:00:00,1,301 west dr Anytown US 59999,,-1,P,P,E,E,
9828,26.970212,1.000000,census_2030,reference_file,6180,Bemiya,Benita,1,A,A,...,1970-09-11 00:00:00,1,1308 sw 8th ave Anytown US 99999,1308 sw 8th ave Anytown US 99999,2,D,D,B,B,
9847,21.725347,1.000000,census_2030,reference_file,2481,Phulip,Puiljp,0,W,W,...,1985-07-24 00:00:00,1,22 e perry pky Anytown US 99999,22 e perry pky Anytown US 99999,2,C,C,P,P,
9902,28.099072,1.000000,census_2030,reference_file,6171,Patricia,Patrlcia,1,C,C,...,1946-08-15 00:00:00,1,11938 olive st Anytown MD 99999,11938 olive st Anytown US 99999,1,U-Z,U-Z,P,P,


## Pass 4: Block on DOB

In [72]:
all_combos, pik_pairs = namesearch_pass(["date_of_birth"])

46 links above threshold
46 matches remain after dealing with multiple matches
Matched 46 records; 12.94% still unmatched


### Look at diagnostics

In [73]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,1,0,1.1e-05,1
-1,0,0,1,0,2.1e-05,1
0,1,0,1,0,0.000338,1
0,0,0,1,-1,0.000565,2
-1,1,0,1,0,0.000654,1
0,0,-1,1,-1,0.08094,1
2,0,0,1,-1,0.3886,2
1,0,1,1,-1,0.984245,1
1,1,1,1,-1,0.999488,11
0,1,-1,1,1,0.999529,1


In [74]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,last_initial_cut_l,last_initial_cut_r,first_initial_cut_l,first_initial_cut_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
166,20.521058,0.999999,census_2030,reference_file,6078,Betjany,Bethany,1,M,M,...,1985-11-13 00:00:00,1,4737 se thornton dr Anytown US 99999,4736 se thornton dr Anytown US 99099,1,P,P,B,B,
674,10.931459,0.999488,census_2030,reference_file,12728,Wendy,Wemndy,1,C,C,...,1981-03-18 00:00:00,1,,762 webster st Anytown US 99999,-1,L,L,U-Z,U-Z,
1091,10.931459,0.999488,census_2030,reference_file,4877,Cye,Cyd,1,D,D,...,1960-01-31 00:00:00,1,35 trammell dr Anytown US 99999,,-1,A-or-blank,A-or-blank,C,C,
1123,20.521058,0.999999,census_2030,reference_file,1877,Jeremy,Jrremy,1,T,T,...,1968-07-07 00:00:00,1,25392 greenview Anytown US 99299,25392 greenview Anytown US 99999,1,B,B,J,J,
1419,20.521058,0.999999,census_2030,reference_file,8060,Brottanu,Brittany,1,E,E,...,1980-12-23 00:00:00,1,1070 23rd streer Anytown US 99999,1070 23rd street Anytown US 99999,1,H,H,B,B,
1820,20.521058,0.999999,census_2030,reference_file,2899,Jeef,Jeff,1,S,S,...,1993-12-03 00:00:00,1,166-8 n woodlawn age Anytown US 99999,166-8 n woodlawn ave Anytown US 99999,1,P,P,J,J,
1843,20.97716,1.0,census_2030,reference_file,12717,Christian,Chriatian,1,C,C,...,1993-07-04 00:00:00,1,9803 s county rd 850 e rd e Anytown US 99999,9803 s county rd 850 e rd e Anytown US 99999,2,G,G,C,C,
2001,20.521058,0.999999,census_2030,reference_file,12470,Raiden,Raisen,1,J,J,...,2025-10-19 00:00:00,1,66 larimar ave Anytown US 99979,64 larimar ave Anytown US 99999,1,U-Z,U-Z,R,R,
2302,20.521058,0.999999,census_2030,reference_file,11755,Ezekiel,Ezekjel,1,L,L,...,2006-05-12 00:00:00,1,2548 village sth dr Anytown US 39999,2548 village sth dr Anytown US 99999,1,J,J,E,E,
2513,16.010829,0.999985,census_2030,reference_file,7053,Marfya,Martha,1,X,C,...,1962-06-20 00:00:00,1,120 7th street rd Anytown US 99999,120 7th street rd Anytown US 99999,2,S,S,M,M,


# Resulting PIKs

In [75]:
census_2030

Unnamed: 0,record_id,first_name,middle_initial,last_name,sex,race_ethnicity,age,date_of_birth,housing_type,relation_to_household_head,...,street_name,unit_number,city,state,zipcode,geokey,zip3,first_initial_cut,last_initial_cut,pik
0,0,Darlene,C,Holt,Female,Black,76,1953-08-03 00:00:00,Standard,Reference person,...,bancroft st,,Anytown,US,99999,46 bancroft st Anytown US 99999,999,D,H,9891.0
1,1,Matteo,C,Hoyt,Male,Latino,12,2018-03-13 00:00:00,Standard,Other nonrelative,...,klauber ave,,,US,99999,,999,M,H,
2,2,Sherry,J,Ash,Female,Black,80,1949-05-08 00:00:00,Standard,Reference person,...,kiely blvd,,Anytown,US,99998,5413 kiely blvd Anytown US 99998,999,S,A-or-blank,2995.0
3,3,Denise,C,Langenfeld,Female,White,69,1960-12-06 00:00:00,Standard,Reference person,...,klauber ave,,Anytown,US,99999,4738 klauber ave Anytown US 99999,999,D,L,11083.0
4,4,Sophia,C,Shepherd,Female,Black,4,2045-06-05 01:20:00,Standard,Reference person,...,east oakton drive,,Anytown,US,99999,12031 east oakton drive Anytown US 99999,999,S,S,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9970,9970,John,S,Wasser,Male,White,66,1963-09-21 00:00:00,Standard,Reference person,...,n cr 400 e,,Anytown,US,99999,5700 n cr 400 e Anytown US 99999,999,J,U-Z,384.0
9971,9971,Clyde,K,Hughes,Male,Black,77,1952-10-15 00:00:00,Standard,Reference person,...,mammoth springs dr,,Anytown,US,92999,21 mammoth springs dr Anytown US 92999,929,C,H,1704.0
9972,9972,Stephanie,DC,Walker,Female,Black,2,2027-11-11 00:00:00,Standard,Reference person,...,,,Anytown,US,99999,,999,S,U-Z,12084.0
9973,9973,Austin,D,Foster,Male,White,23,2006-10-10 00:00:00,Standard,Reference person,...,ranch loop,,Anytown,US,99999,1900 ranch loop Anytown US 99999,999,A-or-blank,F,4234.0


In [76]:
census_2030.pik.notnull().mean()

0.8705764411027569

In [77]:
# Multiple Census rows assigned the same PIK, indicating duplicates in Census
census_2030.pik.value_counts().value_counts()

count
1    8670
2       7
Name: count, dtype: int64

In [78]:
duplicate_piks = census_2030.pik.value_counts()[census_2030.pik.value_counts() > 1].index

In [79]:
census_2030[census_2030.pik.isin(duplicate_piks)].sort_values('pik')

Unnamed: 0,record_id,first_name,middle_initial,last_name,sex,race_ethnicity,age,date_of_birth,housing_type,relation_to_household_head,...,street_name,unit_number,city,state,zipcode,geokey,zip3,first_initial_cut,last_initial_cut,pik
7834,7834,Jason,J,Rojas,Male,Latino,54,1975-07-11 00:00:00,Standard,Reference person,...,tibbles street,,Anytown,US,99999,755 tibbles street Anytown US 99999,999,J,R,2470.0
9453,9453,Joseph,J,Rojas,Male,Latino,54,1975-07-11 00:00:00,Standard,Reference person,...,cottonwood drive,1 f unit 101,Anytown,US,69999,25260 cottonwood drive 1 f unit 101 Anytown US...,699,J,R,2470.0
793,793,Liam,C,Maldonado,Male,Latino,12,2017-07-09 00:00:00,Standard,Other nonrelative,...,n palm ave,,Anytown,US,99999,458 n palm ave Anytown US 99999,999,L,M,6586.0
794,794,Liam,C,Maldonado,Male,Latino,11,2018-07-26 00:00:00,Standard,Other nonrelative,...,n palm ave,,Anytown,US,99999,458 n palm ave Anytown US 99999,999,L,M,6586.0
6957,6957,Joseph,C,Hoffman,Male,White,52,1977-12-02 00:00:00,Standard,Reference person,...,charles street,,Anytown,US,99999,1303 charles street Anytown US 99999,999,J,H,8166.0
6959,6959,Joseph,C,Hoffman,Male,Latino,12,2017-07-10 00:00:00,Standard,Adopted child,...,charles street,,Anytown,US,99999,1303 charles street Anytown US 99999,999,J,H,8166.0
899,899,Noah,J,Slade,Male,White,7,2022-05-14 00:00:00,Standard,Other nonrelative,...,perry ridge ct,,Anytown,US,99999,,999,N,S,11112.0
8943,8943,Noah,J,Slade,Male,Black,27,2002-09-10 00:00:00,Standard,Reference person,...,tarheel ln,,Anytown,US,99999,342 tarheel ln Anytown US 99999,999,N,S,11112.0
1974,1974,Thomas,G,George,Male,Asian,42,1987-12-03 00:00:00,Standard,Reference person,...,palarm creek,,Anytown,US,99999,150 palarm creek Anytown US 99999,999,T,G,11778.0
1976,1976,Thomas,I,George,Male,Asian,57,1973-03-14 00:00:00,Standard,Other relative,...,palarm frwek,,Anytown,US,99999,150 palarm frwek Anytown US 99999,999,T,G,11778.0


## PIK accuracy

In [80]:
pik_simulant_id = census_2030.pik.map(reference_file_ground_truth)
pik_simulant_id

0        0_2464
1           NaN
2       0_16850
3       0_17160
4           NaN
         ...   
9970    0_17995
9971    0_18073
9972    0_22588
9973    0_22759
9974    0_22760
Name: pik, Length: 9975, dtype: object

In [81]:
(pik_simulant_id[pik_simulant_id.notnull()] == census_2030_ground_truth[pik_simulant_id.notnull()]).mean()

0.9987333026255182

In [82]:
errors = census_2030[census_2030.pik.notnull() & (pik_simulant_id != census_2030_ground_truth)]
confused_for = reference_file.set_index('record_id').loc[errors.pik].reset_index().set_index(errors.index)
errors[common_cols].compare(confused_for[common_cols], keep_shape=True, keep_equal=True)

Unnamed: 0_level_0,record_id,record_id,first_name,first_name,middle_initial,middle_initial,last_name,last_name,date_of_birth,date_of_birth,...,zipcode,zipcode,geokey,geokey,zip3,zip3,first_initial_cut,first_initial_cut,last_initial_cut,last_initial_cut
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,...,self,other,self,other,self,other,self,other,self,other
793,793,6586,Liam,Liam,C,C,Maldonado,Maldonado,2017-07-09 00:00:00,2018-07-26 00:00:00,...,99999,99999,458 n palm ave Anytown US 99999,458 n palm ave Anytown US 99999,999,999,L,L,M,M
899,899,11112,Noah,Noah,J,J,Slade,Slade,2022-05-14 00:00:00,2002-09-10 00:00:00,...,99999,99999,,342 tarheel ln Anytown US 99999,999,999,N,N,S,S
1974,1974,11778,Thomas,Thomas,G,I,George,George,1987-12-03 00:00:00,1973-03-14 00:00:00,...,99999,99999,150 palarm creek Anytown US 99999,180 palarm creek Anytown US 99999,999,999,T,T,G,G
2582,2582,12398,Thomas,Thomas,E,D,Epperson,Epperson,1973-06-06 00:00:00,2000-09-06 00:00:00,...,99999,99999,44-02 so monroe stre Anytown US 99999,44-02 so monroe stre Anytown US 99999,999,999,T,T,E,E
3042,3042,10349,Michael,Michael,D,D,Willis,Williams,1982-08-20 00:00:00,1964-06-04 00:00:00,...,99999,99999,501 lake stream dr Anytown US 99999,501 lake stream dr Anytown US 99999,999,999,M,M,U-Z,U-Z
3773,3773,14133,Steven,Steven,A,HB,Gamez,Gamez,2013-02-22 00:00:00,1979-06-05 00:00:00,...,99999,99999,5881 elliot Anytown US 99999,5880 elliot Anytown US 99999,999,999,S,S,G,G
4564,4564,15069,Diane,Diane,M,M,Matthews,Matthews,1953-08-31 00:00:00,1978-09-23 00:00:00,...,99999,99999,,4482 graceland ave Anytown US 99999,999,999,D,D,M,M
6201,6201,11061,Thomas,Thomas,F,C,Smith,Sjmith,1984-06-21 00:00:00,1985-09-08 00:00:00,...,99999,99999,1106 rockwood ln Anytown US 99999,1106 rockwood ln Anytown US 99999,999,999,T,T,S,S
6959,6959,8166,Joseph,Joseph,C,C,Hoffman,Hoffman,2017-07-10 00:00:00,1977-12-02 00:00:00,...,99999,99999,1303 charles street Anytown US 99999,1303 charles street Anytown US 99999,999,999,J,J,H,H
8704,8704,14624,Michael,Michael,M,M,Herrod,Herrod,1985-10-10 00:00:00,1990-06-17 00:00:00,...,99999,99909,56775 french rd Anytown US 99999,56775 french rd Anytown US 99909,999,999,M,M,H,H
