# **PVS-like case study: sample data**

In [1]:
import pseudopeople
import pandas as pd, numpy as np

# Generate simulated data to link

Using v0.6.4 of the `pseudopeople` package.

In [2]:
!pip freeze | grep pseudopeople

pseudopeople==0.6.4


## Load simulated data

Imagined scenario: PIKing the 2030 census.

One way to do this:
* Use (cumulative) SSA Numident up to that time.
* Link it (deterministically, using SSN) to taxes to get the most recent address for each person.
  * Likely would use 1040 here, but I'll use W2 for now.
* Link probabilistically to the census data.

In [3]:
default_configuration = pseudopeople.get_config()

In [4]:
def column_noise_value(dataset, column, noise_type, default_value):
    if dataset in ('decennial_census', 'taxes_w2_and_1099', 'social_security'):
        if noise_type == "make_typos":
            if column == "middle_initial":
                return {"cell_probability": 0.05, "token_probability": 1}
            elif column in ("first_name", "last_name", "street_name"):
                return {"cell_probability": 0.1, "token_probability": 0.1}
        elif noise_type == "write_wrong_digits":
            return {"cell_probability": 0.1, "token_probability": 0.1}

    return default_value

def row_noise_value(dataset, noise_type, default_value):
    return default_value

In [5]:
custom_configuration = {
    dataset: {
        noise_category: (
            ({
                column: {
                    noise_type: column_noise_value(dataset, column, noise_type, noise_type_config)
                    for noise_type, noise_type_config in column_config.items()
                }
                for column, column_config in noise_category_config.items()
            }
            if noise_category == "column_noise" else
            {
                noise_type: row_noise_value(dataset, noise_type, noise_type_config)
                for noise_type, noise_type_config in noise_category_config.items()
            })
        )
        for noise_category, noise_category_config in dataset_config.items()
    }
    for dataset, dataset_config in default_configuration.items()
}

In [6]:
%%time

# Here I've figured that there would be some delay in getting the Numident -- so by Census processing time
# for the 2030 Census, only the SSA by the end of 2029 would be available.
# Note that with pseudopeople's current design it is only possible to set a cutoff at the end of a calendar year.
ssa = pseudopeople.generate_social_security(year=2029, config=custom_configuration)
ssa

                                                               

CPU times: user 1.01 s, sys: 94.2 ms, total: 1.1 s
Wall time: 1.13 s




Unnamed: 0,simulant_id,first_name,middle_initial,last_name,date_of_birth,ssn,event_type,event_date
0,0_19979,Mary,M,Pierce,12/04/1919,786-77-6454,creation,19191204
1,0_6846,Peter,M,Mundell,06/07/1921,688-88-6377,creation,19210607
2,0_19941,Anna,H,Causey,03/07/1922,665-25-7858,creation,12220307
3,0_19825,Gertrude,M,Osornia,05/11/1922,875-10-2359,creation,19220511
4,0_19806,Edna,A,Hunter,05/25/1922,420-19-3737,creation,19220525
...,...,...,...,...,...,...,...,...
20027,0_23620,Mila,M,Saldana,01/09/2030,133-85-8593,creation,20291218
20028,0_23629,Luna,N,Bonnell,01/09/2030,422-69-9071,creation,20291218
20029,0_23630,Charlotte,A,May,01/10/2030,826-03-0946,creation,20291218
20030,0_23624,Liam,C,Vanover,01/12/2030,778-37-9317,creation,20291218


In [7]:
%%time

# Consider the last few years of taxes -- 2029 taxes would be filed a couple months before Census day 2030
w2_1099 = pd.concat([
    pseudopeople.generate_taxes_w2_and_1099(year=year, config=custom_configuration).assign(tax_year=year) for year in (2025, 2026, 2027, 2028, 2029)
], ignore_index=True)
w2_1099

                                                               

CPU times: user 6.95 s, sys: 446 ms, total: 7.4 s
Wall time: 6.68 s




Unnamed: 0,simulant_id,first_name,middle_initial,last_name,age,date_of_birth,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,...,employer_id,employer_name,employer_street_number,employer_street_name,employer_unit_number,employer_city,employer_state,employer_zipcode,tax_form,tax_year
0,0_5,Michelle,M,Ticas,44,08/10/1981,1312,commonwealth avnue,,,...,12,Jj Rubys Salon Studios,1300,windsor lane,,Anytown,US,00000,W2,2025
1,0_5,Michelle,M,Ticas,44,08/10/1981,1312,commonwealth avnue,,,...,60,Freeway Insurance Agency,1105,largess ln,,Anytown,US,00000,W2,2025
2,0_5623,Gloria,A,Quintana,52,07/23/1973,,,,14011.0,...,46,Nashville City Properties,411,sthe 20th avenue,,Anytown,US,00000,W2,2025
3,0_7252,Tamara,D,Sosa,48,05/15/1977,15,julian dr,,,...,69,Rancho Vistoso Trails Mental Health,4056,goliad st,,Anytown,US,00000,W2,2025
4,0_7252,Tamara,D,Sosa,48,05/15/1977,15,julian dr,,,...,84,New Era Home,222,w hemlock st,,Anytown,US,00000,W2,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54141,0_3456,Amanda,M,Mitchell,49,02/15/1980,3,goodland avnue,,,...,84,New Era Home,222,w hemlock st,,Anytown,US,00000,W2,2029
54142,0_3457,Steven,R,Mitchell,49,03/13/1980,3,goodland avnue,,,...,75,France,2506,mccullough lane,,Anytown,US,00000,W2,2029
54143,0_19046,Delbert,D,Hawkins,89,03/15/1940,3,goodland avnue,,,...,43,Ram Fashion Nail,20308,hancock str,,Anytown,US,00000,W2,2029
54144,0_19046,Delbert,D,Hawkins,89,03/15/1940,3,goodland avnue,,,...,53,A Car Title Loans,6100,e ball rd,,Anytown,US,00000,W2,2029


In [8]:
%%time

census_2030 = pseudopeople.generate_decennial_census(year=2030, config=custom_configuration)
census_2030

                                                               

CPU times: user 881 ms, sys: 46.8 ms, total: 928 ms
Wall time: 907 ms




Unnamed: 0,simulant_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relation_to_reference_person,sex,race_ethnicity
0,0_923,John,E,Mcueever,86,06/29/1942,147-153,browning ave,,Anytown,US,00000,Reference person,Male,Black
1,0_2641,Sharon,T,Schmidt,69,10/50/1960,109,stqllion sr,,Anytown,US,00000,Reference person,Female,White
2,0_6176,Gail,K,Durand,77,01/03/1953,2115,cannon dr,,Anytown,US,00000,Reference person,Female,Multiracial or Other
3,0_13972,John,J,Williams,81,11/24/1948,146,delaware av,,Anytown,US,00000,Reference person,Male,White
4,0_13973,Child,L,Wukliamz,81,09/27/1948,146,delaware av,,Anytown,US,00000,Opp-sex spouse,Female,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11048,0_22741,Chloe,A,Maryknez-Alvarez,21,07/12/2008,207,harrison st,,Anytown,US,00000,Biological child,Female,Latino
11049,0_22742,Zachary,E,Martinez-Alvarez,18,06/29/2011,207,harrison st,,Anytown,US,00000,Biological child,Male,
11050,0_22743,Madeline,A,Martinez-Alvarez,16,08/12/2013,207,harrison st,,Anytown,US,00000,Biological child,Female,Latino
11051,0_23271,Naomi,A,Martinez-Aldarez,1,11/01/2028,207,harrison st,,Anytown,US,00000,Grandchild,Female,Latino


## Deterministically get a recent address

In [9]:
# Within each year, we do not have a date field for W2. So we don't know which order jobs happened in.
# We take the address associated with the most income in the most recent year. Note that part or all
# of this address may be missing.
w2_1099.groupby(['tax_year', 'ssn']).mailing_address_street_name.nunique(dropna=False).sort_values()

tax_year  ssn        
2025      000-74-9102    1
2028      308-42-7924    1
          308-48-4345    1
          308-78-1837    1
          309-02-7977    1
                        ..
2025      681-62-4798    2
          325-59-2336    2
2029      298-41-5379    2
2027      413-65-8440    2
2029      385-02-7530    3
Name: mailing_address_street_name, Length: 40697, dtype: int64

In [10]:
w2_1099.ssn.nunique()

14796

In [11]:
recent_addresses = (
    w2_1099.assign(income=lambda x: x.income.fillna(0).astype(float)).sort_values(['tax_year', 'income'], ascending=False).groupby('ssn').first().filter(like='mailing_address')
)
recent_addresses

Unnamed: 0_level_0,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,mailing_address_state,mailing_address_zipcode
ssn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000-74-9102,222,white road,,,Anytown,US,00000
000-87-0907,2732,bindl dr,,,Anytown,US,00000
001-02-4588,685,emerson st,,,Anytown,US,00000
001-15-8330,5010,south doctor martin luther king jr dr,,,Anytown,US,00000
001-17-9511,150,s sheldon rd,,,Anytown,US,00000
...,...,...,...,...,...,...,...
994-37-3653,16695,14th ave nw,,,Anytown,US,00000
994-55-0008,2014,routt,,,Anytown,US,00000
995-60-2964,328,mobeetie st,,,Anytown,US,00000
997-63-6760,10900,s k st,,,Anytown,US,00000


In [12]:
# Everyone with any taxes has a most recent address.
assert set(recent_addresses.index) == set(w2_1099.ssn.dropna().unique())

In [13]:
# Some people won't have a recent address, if they don't have any taxes.
ssa[~ssa.ssn.isin(recent_addresses.index)]

Unnamed: 0,simulant_id,first_name,middle_initial,last_name,date_of_birth,ssn,event_type,event_date
0,0_19979,Mary,M,Pierce,12/04/1919,786-77-6454,creation,19191204
1,0_6846,Peter,M,Mundell,06/07/1921,688-88-6377,creation,19210607
2,0_19941,Anna,H,Causey,03/07/1922,665-25-7858,creation,12220307
3,0_19825,Gertrude,M,Osornia,05/11/1922,875-10-2359,creation,19220511
4,0_19806,Edna,A,Hunter,05/25/1922,420-19-3737,creation,19220525
...,...,...,...,...,...,...,...,...
20027,0_23620,Mila,M,Saldana,01/09/2030,133-85-8593,creation,20291218
20028,0_23629,Luna,N,Bonnell,01/09/2030,422-69-9071,creation,20291218
20029,0_23630,Charlotte,A,May,01/10/2030,826-03-0946,creation,20291218
20030,0_23624,Liam,C,Vanover,01/12/2030,778-37-9317,creation,20291218


## Create a fake Numident file

In [14]:
# The probable real-life approach would be: take date of birth from the (first) creation event, date of death (if any)
# from the (last) death event, name from the most recent event of any kind.
# We don't want to throw out events with a missing/invalid date, so we'll fill them with the value *least* likely to be chosen
# (early if taking the latest, late if taking the earliest).
fill_dates = lambda df, fill_type: pd.to_datetime(df.event_date, errors='coerce').fillna(pd.Timestamp('2100-01-01' if fill_type == 'latest' else '1900-01-01'))

date_of_birth = (
    ssa[ssa.event_type == 'creation']
        .assign(event_date_for_sort=lambda df: fill_dates(df, 'latest'))
        .sort_values('event_date_for_sort')
        .groupby('ssn')
        .date_of_birth.first()
)
date_of_death = (
    ssa[ssa.event_type == 'death']
        .assign(event_date_for_sort=lambda df: fill_dates(df, 'earliest'))
        .sort_values('event_date_for_sort')
        .groupby('ssn')
        .event_date.last()
        .rename('date_of_death')
)
name = (
    ssa
        .assign(event_date_for_sort=lambda df: fill_dates(df, 'earliest'))
        .sort_values('event_date_for_sort')
        .groupby('ssn')
        .last()[['first_name', 'middle_initial', 'last_name']]
)

# What to do about ground truth here? This simple linkage could already be introducing errors, since SSN is not without noise!
# For now, I'll take the most common ground truth.
simulant_id = ssa.groupby('ssn').simulant_id.agg(lambda x: pd.Series.mode(x)[0])

fake_numident = pd.DataFrame(simulant_id).join(date_of_birth, how='left').join(name, how='left').join(date_of_death, how='left').reset_index()
fake_numident

Unnamed: 0,ssn,simulant_id,date_of_birth,first_name,middle_initial,last_name,date_of_death
0,001-02-4588,0_13602,08/08/2008,Isabella,G,Windom,
1,001-15-8330,0_16514,05/04/1976,Gerald,J,Beckham,
2,001-16-0077,0_13906,02/07/1970,Jerald,J,Alvarez,
3,001-17-9511,0_13442,11/20/1966,Teresa,A,Togni,
4,001-25-8258,0_22495,06/29/2026,Bethany,G,Tenorio,
...,...,...,...,...,...,...,...
18769,976-30-9537,0_4258,06/12/1976,Aron,C,Frausto Ferretiz,
18770,978-78-6109,0_19947,05/22/1963,Claude,M,Page,
18771,979-44-7835,0_20792,08/01/1979,Thomas,A,Martinez-Puentes,
18772,998-22-9577,0_9017,04/17/2002,Jeffery,P,Shaw,


In [15]:
# Most people have not died
fake_numident.date_of_death.isnull().mean()

0.9172792159369341

## Create a composite reference file for linking

In [16]:
reference_file = (
    # Exclude those who have died before the census
    fake_numident[~(pd.to_datetime(fake_numident.date_of_death, errors='coerce') <= pd.Timestamp('2020-04-01'))]
        .merge(recent_addresses, on='ssn', how='left')
)
reference_file

Unnamed: 0,ssn,simulant_id,date_of_birth,first_name,middle_initial,last_name,date_of_death,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,mailing_address_state,mailing_address_zipcode
0,001-02-4588,0_13602,08/08/2008,Isabella,G,Windom,,685,emerson st,,,Anytown,US,00000
1,001-15-8330,0_16514,05/04/1976,Gerald,J,Beckham,,5010,south doctor martin luther king jr dr,,,Anytown,US,00000
2,001-16-0077,0_13906,02/07/1970,Jerald,J,Alvarez,,,,,,,,
3,001-17-9511,0_13442,11/20/1966,Teresa,A,Togni,,150,s sheldon rd,,,Anytown,US,00000
4,001-25-8258,0_22495,06/29/2026,Bethany,G,Tenorio,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18521,976-30-9537,0_4258,06/12/1976,Aron,C,Frausto Ferretiz,,,,,,,,
18522,978-78-6109,0_19947,05/22/1963,Claude,M,Page,,,,,,,,
18523,979-44-7835,0_20792,08/01/1979,Thomas,A,Martinez-Puentes,,,,,,,,
18524,998-22-9577,0_9017,04/17/2002,Jeffery,P,Shaw,,,,,,,,


# Pre-process the data

Not much needed here because the datasets are already so tidy and similar to each other.

In [17]:
# Add a unique record ID
reference_file = reference_file.reset_index(drop=True).reset_index().rename(columns={'index': 'record_id'})
census_2030 = census_2030.reset_index(drop=True).reset_index().rename(columns={'index': 'record_id'})

# Remove ground truth
reference_file_ground_truth = reference_file.pop('simulant_id')
census_2030_ground_truth = census_2030.pop('simulant_id')

In [18]:
# Use true missingness instead of empty string
reference_file = reference_file.replace('', np.nan)
census_2030 = census_2030.replace('', np.nan)

In [19]:
# We want to compare mailing address with physical address
reference_file = reference_file.rename(columns=lambda c: c.replace('mailing_address_', ''))

In [20]:
# Purely for ease of use, order the columns nicely
reference_file_columns_order = [
    'record_id',
    'ssn',
    'first_name', 'middle_initial', 'last_name',
    'date_of_birth',
    'date_of_death',
    'street_number', 'street_name', 'unit_number', 'city', 'state', 'zipcode', 'po_box',
]
assert set(reference_file_columns_order) == set(reference_file.columns)
reference_file = reference_file[reference_file_columns_order]

In [21]:
census_columns_order = [
    'record_id',
    'first_name', 'middle_initial', 'last_name',
    'sex', 'race_ethnicity', 'age', 'date_of_birth',
    'relation_to_reference_person',
    'street_number', 'street_name', 'unit_number', 'city', 'state', 'zipcode',
]
assert set(census_columns_order) == set(census_2030.columns)
census_2030 = census_2030[census_columns_order]

In [22]:
# My working theory: the purpose of the "geokey" is because address parts violate conditional independence
get_geokey = lambda x: (x.street_number + ' ' + x.street_name + ' ' + x.unit_number.fillna('') + ' ' + x.city + ' ' + x.state.astype(str) + ' ' + x.zipcode).str.strip().str.split().str.join(' ')
reference_file = reference_file.assign(geokey=get_geokey)
census_2030 = census_2030.assign(geokey=get_geokey)

In [23]:
# Add columns used to "cut the database": ZIP3 and a grouping of first and last initial
reference_file = reference_file.assign(zip3=lambda x: x.zipcode.str[:3])
census_2030 = census_2030.assign(zip3=lambda x: x.zipcode.str[:3])

# Page 20 of the NORC report: "Name-cuts are defined by combinations of the first characters of the first and last names. The twenty letter groupings
# for the first character are: A-or-blank, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, and U-Z."
initial_cut = lambda x: x.fillna('A').str[0].replace('A', 'A-or-blank').replace(['U', 'V', 'W', 'X', 'Y', 'Z'], 'U-Z')
reference_file = reference_file.assign(first_initial_cut=lambda x: initial_cut(x.first_name), last_initial_cut=lambda x: initial_cut(x.last_name))
census_2030 = census_2030.assign(first_initial_cut=lambda x: initial_cut(x.first_name), last_initial_cut=lambda x: initial_cut(x.last_name))

# Data to link

Note: I have not yet introduced alternate names and dates of birth here.

In [24]:
reference_file

Unnamed: 0,record_id,ssn,first_name,middle_initial,last_name,date_of_birth,date_of_death,street_number,street_name,unit_number,city,state,zipcode,po_box,geokey,zip3,first_initial_cut,last_initial_cut
0,0,001-02-4588,Isabella,G,Windom,08/08/2008,,685,emerson st,,Anytown,US,00000,,685 emerson st Anytown US 00000,000,I,U-Z
1,1,001-15-8330,Gerald,J,Beckham,05/04/1976,,5010,south doctor martin luther king jr dr,,Anytown,US,00000,,5010 south doctor martin luther king jr dr Any...,000,G,B
2,2,001-16-0077,Jerald,J,Alvarez,02/07/1970,,,,,,,,,,,J,A-or-blank
3,3,001-17-9511,Teresa,A,Togni,11/20/1966,,150,s sheldon rd,,Anytown,US,00000,,150 s sheldon rd Anytown US 00000,000,T,T
4,4,001-25-8258,Bethany,G,Tenorio,06/29/2026,,,,,,,,,,,B,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18521,18521,976-30-9537,Aron,C,Frausto Ferretiz,06/12/1976,,,,,,,,,,,A-or-blank,F
18522,18522,978-78-6109,Claude,M,Page,05/22/1963,,,,,,,,,,,C,P
18523,18523,979-44-7835,Thomas,A,Martinez-Puentes,08/01/1979,,,,,,,,,,,T,M
18524,18524,998-22-9577,Jeffery,P,Shaw,04/17/2002,,,,,,,,,,,J,S


In [25]:
census_2030

Unnamed: 0,record_id,first_name,middle_initial,last_name,sex,race_ethnicity,age,date_of_birth,relation_to_reference_person,street_number,street_name,unit_number,city,state,zipcode,geokey,zip3,first_initial_cut,last_initial_cut
0,0,John,E,Mcueever,Male,Black,86,06/29/1942,Reference person,147-153,browning ave,,Anytown,US,00000,147-153 browning ave Anytown US 00000,000,J,M
1,1,Sharon,T,Schmidt,Female,White,69,10/50/1960,Reference person,109,stqllion sr,,Anytown,US,00000,109 stqllion sr Anytown US 00000,000,S,S
2,2,Gail,K,Durand,Female,Multiracial or Other,77,01/03/1953,Reference person,2115,cannon dr,,Anytown,US,00000,2115 cannon dr Anytown US 00000,000,G,D
3,3,John,J,Williams,Male,White,81,11/24/1948,Reference person,146,delaware av,,Anytown,US,00000,146 delaware av Anytown US 00000,000,J,U-Z
4,4,Child,L,Wukliamz,Female,White,81,09/27/1948,Opp-sex spouse,146,delaware av,,Anytown,US,00000,146 delaware av Anytown US 00000,000,C,U-Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11048,11048,Chloe,A,Maryknez-Alvarez,Female,Latino,21,07/12/2008,Biological child,207,harrison st,,Anytown,US,00000,207 harrison st Anytown US 00000,000,C,M
11049,11049,Zachary,E,Martinez-Alvarez,Male,,18,06/29/2011,Biological child,207,harrison st,,Anytown,US,00000,207 harrison st Anytown US 00000,000,U-Z,M
11050,11050,Madeline,A,Martinez-Alvarez,Female,Latino,16,08/12/2013,Biological child,207,harrison st,,Anytown,US,00000,207 harrison st Anytown US 00000,000,M,M
11051,11051,Naomi,A,Martinez-Aldarez,Female,Latino,1,11/01/2028,Grandchild,207,harrison st,,Anytown,US,00000,207 harrison st Anytown US 00000,000,N,M


In [26]:
# Save these variables; this means that if you restart the kernel, you don't need to run this first part of the notebook again.
%store reference_file census_2030 reference_file census_2030 reference_file_ground_truth census_2030_ground_truth

Stored 'reference_file' (DataFrame)
Stored 'census_2030' (DataFrame)
Stored 'reference_file' (DataFrame)
Stored 'census_2030' (DataFrame)
Stored 'reference_file_ground_truth' (Series)
Stored 'census_2030_ground_truth' (Series)


# Implement PVS-like matching with `splink`

## Estimate parameters (lambda, m, u) once for both modules

In reality these parameters are not estimated from the data.
It is unclear to me whether they are actually the same for both modules or even for different passes of the same module.

In [27]:
# Reload saved variables; you can start the notebook from here if you have *ever* run the part above.
%store -r reference_file census_2030

import pandas as pd, numpy as np

In [28]:
common_cols = [c for c in reference_file.columns if c in census_2030.columns]
common_cols

['record_id',
 'first_name',
 'middle_initial',
 'last_name',
 'date_of_birth',
 'street_number',
 'street_name',
 'unit_number',
 'city',
 'state',
 'zipcode',
 'geokey',
 'zip3',
 'first_initial_cut',
 'last_initial_cut']

In [29]:
def prep_table_for_splink(df):
    return (
        df[common_cols]
            .assign(date_of_birth=lambda x: x.date_of_birth.astype(str))
            .rename(columns={'record_id': 'unique_id'})
    )

tables_for_splink = [prep_table_for_splink(reference_file), prep_table_for_splink(census_2030)]

In [30]:
[len(t) for t in tables_for_splink]

[18526, 11053]

In [31]:
# estimate_probability_two_random_records_match did not seem to give me a reasonable estimate
# we estimate that around 90% of the census are present in the reference file
probability_two_random_records_match = (0.90 * len(census_2030)) / (len(reference_file) * len(census_2030))
probability_two_random_records_match

4.858037352909425e-05

In [32]:
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb.duckdb_comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
)

settings = {
    "link_type": "link_only",
    "comparisons": [
        levenshtein_at_thresholds("first_name", 2, term_frequency_adjustments=True),
        exact_match("middle_initial"),
        levenshtein_at_thresholds("last_name", 2, term_frequency_adjustments=True),
        # For some reason, this makes everything crash!?
        # levenshtein_at_thresholds("date_of_birth", 1),
        exact_match("date_of_birth"),
        levenshtein_at_thresholds("geokey", 5),
    ],
    "probability_two_random_records_match": probability_two_random_records_match
}

linker = DuckDBLinker(
    tables_for_splink,
    settings,
    input_table_aliases=["reference_file", "census_2030"]
)

# NOTE: This is not reproducible!
linker.estimate_u_using_random_sampling(max_pairs=1e5)

blocking_rule_for_training = "l.first_name = r.first_name and l.last_name = r.last_name"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)

blocking_rule_for_training = "l.geokey = r.geokey"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - middle_initial (no m values are trained).
    - last_name (no m values are trained).
    - date_of_birth (no m values are trained).
    - geokey (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name and l.last_name = r.last_name

Parameter estimates will be made for the following comparison(s):
    - middle_initial
    - date_of_birth
    - geokey

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name
    - last_name

Iteration 1: Largest change in params was -0.159 in the m_probability of geokey, level `Exact match`
Iteration 2: Largest change in params was 0.0072 in probability_two_random_rec

<EMTrainingSession, blocking on l.geokey = r.geokey, deactivating comparisons geokey>

In [33]:
linker.match_weights_chart()

In [34]:
# NOTE: EM appears to be finding people in the same family instead of the same person!
# See first_name m probabilities.
# For now, I address this by almost always blocking on first name.
# More experimentation needed to get reasonable values here.
linker.m_u_parameters_chart()

In [35]:
splink_settings = linker._settings_obj.as_dict()

In [36]:
PROBABILITY_THRESHOLD = 0.85

In [37]:
# Save these variables; this means that if you restart the kernel, you don't need to run this first part of the notebook again.
%store splink_settings PROBABILITY_THRESHOLD

Stored 'splink_settings' (dict)
Stored 'PROBABILITY_THRESHOLD' (float)


## Implement matching passes

In [38]:
# Calculate this once to save time -- mapping from record_id to index of Census dataframe
census_index_of_ids = census_2030.reset_index().set_index('record_id')['index']

# TODO: Have this function output more charts and diagnostics
def pvs_matching_pass(blocking_cols):
    tables_for_splink = [prep_table_for_splink(reference_file), prep_table_for_splink(census_2030[census_2030.pik.isnull()])]

    blocking_rule_parts = [f"l.{col} = r.{col}" for col in blocking_cols]
    blocking_rule = " and ".join(blocking_rule_parts)
    linker = DuckDBLinker(
        tables_for_splink,
        {**splink_settings, **{
            "blocking_rules_to_generate_predictions": [blocking_rule],
        }},
        input_table_aliases=["reference_file", "census_2030"]
    )

    all_predictions = linker.predict().as_pandas_dataframe()
    all_combos = all_predictions.groupby(list(all_predictions.filter(like='gamma_').columns)).match_probability.agg(['mean', 'count']).sort_values('mean')

    potential_links = linker.predict(threshold_match_probability=PROBABILITY_THRESHOLD).as_pandas_dataframe()
    print(f'{len(potential_links)} links above threshold')

    # Post-processing: deal with multiple matches
    # According to the report, it is frequently the case that the post-processing rule doesn't assign *any* matches when there are multiple
    # So I'm replicating that feature with a very simple algorithm
    pik_pairs = potential_links.sort_values('match_weight', ascending=False).groupby(['unique_id_l']).first()
    runner_up_pairs = potential_links.sort_values('match_weight', ascending=False).groupby(['unique_id_l']).nth(2)
    pik_pairs = pik_pairs.join(runner_up_pairs.match_weight.rename('runner_up_match_weight'), how='left')
    pairs_to_keep = ~(pik_pairs.runner_up_match_weight > pik_pairs.match_weight + 0.5)
    print(f'{len(pairs_to_keep)} matches remain after dealing with multiple matches')
    pik_pairs = pik_pairs[pairs_to_keep]
    # Make pik_pairs index into the census_2030 dataframe
    pik_pairs = pik_pairs.set_index(pik_pairs.index.map(census_index_of_ids))
    
    census_2030.loc[pik_pairs.index, 'pik'] = pik_pairs.unique_id_r
    print(f'Matched {len(pik_pairs)} records; {census_2030.pik.isnull().mean():.2%} still unmatched')
    
    return all_combos, pik_pairs

# GeoSearch

> There are six passes through GeoSearch defined currently for an ACS PVS run. These passes use the first
  three digits of an address ZIP code (ZIP3) as a database “cutting” strategy...
>
> The GeoSearch matching
  variables include name and DOB, but also several variables derived from the Geokey (street name, house
  number, etc).

[(source)](https://www.norc.org/PDFs/May%202011%20Personal%20Validation%20and%20Entity%20Resolution%20Conference/PVS%20Assessment%20Report%20FINAL%20JULY%202011.pdf)

In [39]:
# Reload saved variables; you can start the notebook from here if you have *ever* run the part above.
%store -r

import pandas as pd, numpy as np
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb.duckdb_comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
)

Unable to restore variable 'dr_total_pop', ignoring (use %store -d to forget!)
The error was: <class 'KeyError'>
Unable to restore variable 'dr_predictions', ignoring (use %store -d to forget!)
The error was: <class 'KeyError'>
Unable to restore variable 'diabetes_count', ignoring (use %store -d to forget!)
The error was: <class 'KeyError'>


In [40]:
def geosearch_pass(blocking_cols):
    return pvs_matching_pass(["zip3"] + blocking_cols)

## Pass 1: block on full name and entire address

In [41]:
census_2030['pik'] = np.nan

In [42]:
all_combos, pik_pairs = geosearch_pass(["first_name", "middle_initial", "last_name", "geokey"])

3737 links above threshold
3737 matches remain after dealing with multiple matches
Matched 3737 records; 66.19% still unmatched


### Look at diagnostics

In [43]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,2,0,2,0.999878,582
2,1,2,1,2,1.0,3155


In [44]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,gamma_last_name,date_of_birth_l,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,32.508685,1.000000,census_2030,reference_file,6400,Gail,Gail,2,K,K,...,2,01/03/1953,01/03/1953,1,2115 cannon dr Anytown US 00000,2115 cannon dr Anytown US 00000,2,000,000,
3,21.255344,1.000000,census_2030,reference_file,18337,John,John,2,J,J,...,2,11/24/1948,11/24/1948,1,146 delaware av Anytown US 00000,146 delaware av Anytown US 00000,2,000,000,
6,10.341160,0.999230,census_2030,reference_file,2021,Sarah,Sarah,2,L,L,...,2,06/63/1975,01/03/1975,0,146 delaware av Anytown US 00000,146 delaware av Anytown US 00000,2,000,000,
12,27.776882,1.000000,census_2030,reference_file,10198,Emily,Emily,2,M,M,...,2,08/22/2000,08/22/2000,1,5046 church street Anytown US 00000,5046 church street Anytown US 00000,2,000,000,
18,34.146115,1.000000,census_2030,reference_file,2575,Cathie,Cathie,2,K,K,...,2,04/27/1958,04/27/1958,1,1451 atlanta st Anytown US 00000,1451 atlanta st Anytown US 00000,2,000,000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11035,27.664989,1.000000,census_2030,reference_file,5058,Stephanie,Stephanie,2,V,V,...,2,10/29/1976,10/29/1976,1,344 nrtwest lilywood drive Anytown US 00000,344 nrtwest lilywood drive Anytown US 00000,2,000,000,
11036,32.068113,1.000000,census_2030,reference_file,9325,Savon,Savon,2,T,T,...,2,12/08/1998,12/08/1998,1,344 nrtwest lilywood drive Anytown US 00000,344 nrtwest lilywood drive Anytown US 00000,2,000,000,
11037,15.104121,0.999972,census_2030,reference_file,18078,Jennifer,Jennifer,2,J,J,...,2,11/22/2985,11/22/1985,0,1702 s rainbow dr Anytown US 00000,1702 s rainbow dr Anytown US 00000,2,000,000,
11047,31.316040,1.000000,census_2030,reference_file,17273,Liliana,Liliana,2,G,G,...,2,05/27/1986,05/27/1986,1,207 harrison st Anytown US 00000,207 harrison st Anytown US 00000,2,000,000,


## Pass 2: Block on first name and entire address

In [45]:
all_combos, pik_pairs = geosearch_pass(["first_name", "geokey"])

1112 links above threshold
1110 matches remain after dealing with multiple matches
Matched 1110 records; 56.15% still unmatched


### Look at diagnostics

In [46]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,0,0,2,0.00645,332
2,-1,0,0,2,0.015471,9
2,0,-1,0,2,0.15742,11
2,1,0,0,2,0.257259,71
2,-1,-1,0,2,0.547433,2
2,0,1,0,2,0.874208,7
2,1,-1,0,2,0.893138,21
2,0,0,1,2,0.990617,12
2,0,2,0,2,0.993956,76
2,-1,0,1,2,0.995175,1


In [47]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,gamma_last_name,date_of_birth_l,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,19.098410,0.999998,census_2030,reference_file,6829,John,John,2,E,E,...,1,06/29/1942,06/29/1942,1,147-153 browning ave Anytown US 00000,147-153 browning ave Anytown US 00000,2,000,000,
14,11.299966,0.999604,census_2030,reference_file,907,Cassandra,Cassandra,2,DR,R,...,2,35/12/1980,05/12/1980,0,14165 glendale st Anytown US 00000,14165 glendale st Anytown US 00000,2,000,000,
24,24.993561,1.000000,census_2030,reference_file,1469,Cody,Cody,2,H,J,...,2,06/29/2005,06/29/2005,1,77722 picciola rd Anytown US 00000,77722 picciola rd Anytown US 00000,2,000,000,
26,22.519459,1.000000,census_2030,reference_file,9816,John,John,2,,J,...,2,06/16/1954,06/16/1954,1,6691 matthew dr Anytown US 00000,6691 matthew dr Anytown US 00000,2,000,000,
35,19.181997,0.999998,census_2030,reference_file,12330,Thomas,Thomas,2,J,J,...,1,08/15/1978,08/15/1978,1,8693 gashey dr Anytown US 00000,8693 gashey dr Anytown US 00000,2,000,000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11017,22.382618,1.000000,census_2030,reference_file,5591,Vanessa,Vanessa,2,D,D,...,1,08/27/1998,08/27/1998,1,508 fairway vsta dr Anytown US 00000,508 fairway vsta dr Anytown US 00000,2,000,000,
11019,20.928333,0.999999,census_2030,reference_file,2624,Jordan,Jordan,2,I,O,...,2,11/17/2003,11/17/2003,1,508 fairway vsta dr Anytown US 00000,508 fairway vsta dr Anytown US 00000,2,000,000,
11020,18.255937,0.999997,census_2030,reference_file,10308,Ronnie,Ronnie,2,G,G,...,-1,09/30/1977,09/30/1977,1,100 halesite dv Anytown US 00000,100 halesite dv Anytown US 00000,2,000,000,
11042,14.688792,0.999962,census_2030,reference_file,13168,Elisabeth,Elisabeth,2,C,C,...,0,07/03/1994,07/03/1994,1,2424 willow green ct Anytown US 00000,2424 willow green ct Anytown US 00000,2,000,000,


## Pass 3: Block on full name and street address

In [48]:
all_combos, pik_pairs = geosearch_pass(["first_name", "middle_initial", "last_name", "street_number", "street_name"])

229 links above threshold
229 matches remain after dealing with multiple matches
Matched 229 records; 54.08% still unmatched


### Look at diagnostics

In [49]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,2,0,0,0.770911,8
2,1,2,0,-1,0.993968,5
2,1,2,0,1,0.999915,27
2,1,2,1,0,0.999958,22
2,1,2,1,-1,0.999998,37
2,1,2,1,1,1.0,134


In [50]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,street_name_l,street_name_r,street_number_l,street_number_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60,28.226009,1.000000,census_2030,reference_file,13291,Taylor,Taylor,2,A,A,...,1703 cottonwood dr Anttown US 00000,1703 cottonwood dr Anytown US 00000,1,000,000,cottonwood dr,cottonwood dr,1703,1703,
74,28.682523,1.000000,census_2030,reference_file,8705,Andrea,Andrea,2,J,J,...,838 lock dr Anytown nan 00000,838 lock dr Anytown US 00000,1,000,000,lock dr,lock dr,838,838,
106,11.930748,0.999744,census_2030,reference_file,5087,Jeremy,Jeremy,2,C,C,...,37615 elysian ave Antgown US 00000,37615 elysian ave Anytown US 00000,1,000,000,elysian ave,elysian ave,37615,37615,
132,28.834527,1.000000,census_2030,reference_file,16060,Louise,Louise,2,T,T,...,355 lowry dr no 32 Anytown US 00000,355 lowry dr ni 32 Anytown US 00000,1,000,000,lowry dr,lowry dr,355,355,
169,19.296130,0.999998,census_2030,reference_file,1213,Nicole,Nicole,2,A,A,...,,710 lugano wy Anytown US 00000,-1,000,000,lugano wy,lugano wy,710,710,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10775,29.816825,1.000000,census_2030,reference_file,15909,Joe,Joe,2,J,J,...,61 eustice ro Anytown nan 00000,61 eustice ro Anytown US 00000,1,000,000,eustice ro,eustice ro,61,61,
10890,24.838428,1.000000,census_2030,reference_file,5212,Aaron,Aaron,2,S,S,...,8434 southeast 25th avenue apt 510 Anytown US ...,8434 southeast 25th avenue apt 957 Anytown US ...,1,000,000,southeast 25th avenue,southeast 25th avenue,8434,8434,
10902,21.899015,1.000000,census_2030,reference_file,1881,Trent,Trent,2,C,C,...,2620 sw cochran st Anytown US 00000,,-1,000,000,sw cochran st,sw cochran st,2620,2620,
10923,18.006851,0.999996,census_2030,reference_file,16689,Stephani,Stephani,2,A,A,...,614 liberty st Anytown nan 00000,614 liberty st Anytown US 00000,1,000,000,liberty st,liberty st,614,614,


## Pass 4: Block on first name and street address

In [51]:
all_combos, pik_pairs = geosearch_pass(["first_name", "street_number", "street_name"])

48 links above threshold
48 matches remain after dealing with multiple matches
Matched 48 records; 53.64% still unmatched


### Look at diagnostics

In [52]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,0,0,0,1e-06,1
2,0,0,0,-1,0.00011,3
2,1,0,0,-1,0.0025,1
2,0,0,0,1,0.003946,7
2,-1,0,0,2,0.005915,2
2,0,0,0,2,0.006152,210
2,1,0,0,1,0.071606,3
2,-1,-1,0,2,0.164863,1
2,0,-1,0,2,0.2142,2
2,1,0,0,2,0.265239,47


In [53]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,street_name_l,street_name_r,street_number_l,street_number_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
229,15.316513,0.999975,census_2030,reference_file,7798,Elizabeth,Elizabeth,2,J,J,...,8370 chervil ct Anytown US 00000,8370 chervil ct Anytown US 00002,1,0,0,chervil ct,chervil ct,8370,8370,
688,8.983905,0.998029,census_2030,reference_file,5915,Trevor,Trevor,2,C,C,...,1702 meisner rd Anytown US 00073,1702 meisner rd Anytown US 00000,1,0,0,meisner rd,meisner rd,1702,1702,
745,16.946458,0.999992,census_2030,reference_file,7211,Kent,Kent,2,C,C,...,,1702 meisner rd Anytown US 00000,-1,0,0,meisner rd,meisner rd,1702,1702,
927,20.333077,0.999999,census_2030,reference_file,7243,Nicole,Nicole,2,J,J,...,4805 bowers av nw Anytown nan 00000,4805 bowers av nw Anytown US 00000,1,0,0,bowers av nw,bowers av nw,4805,4805,
969,26.540722,1.0,census_2030,reference_file,14009,Joselyn,Joselyn,2,P,O,...,4258 main st Anytown nan 00000,4258 main st Anytown US 00000,1,0,0,main st,main st,4258,4258,
1155,24.95576,1.0,census_2030,reference_file,18025,Micheal,Micheal,2,D,R,...,2623 n holliston ave Anytown nan 00000,2623 n holliston ave Anytown US 00000,1,0,0,n holliston ave,n holliston ave,2623,2623,
1444,4.890165,0.967378,census_2030,reference_file,7807,Kelly,Kelly,2,,J,...,250 perdido drive Anytown US 00000,250 perdido drive Anytown US 00008,1,0,0,perdido drive,perdido drive,250,250,
2149,17.466581,0.999994,census_2030,reference_file,3656,Benjamin,Benjamin,2,I,J,...,3232 maple grove ln Anytown nan 00000,3232 maple grove ln Anytown US 00000,1,0,0,maple grove ln,maple grove ln,3232,3232,
2207,23.292795,1.0,census_2030,reference_file,14900,Destiny,Destiny,2,N,H,...,3232 maple grove ln Anytown nan 00000,3232 maple grove ln Anytown US 00000,1,0,0,maple grove ln,maple grove ln,3232,3232,
2400,9.456281,0.998578,census_2030,reference_file,16269,Whitney,Whitney,2,D,D,...,5410 arbella lp Anytown US 00000,5410 arbella lp unit 577 Anytown US 00000,0,0,0,arbella lp,arbella lp,5410,5410,


## Pass 5: Block on first and last name

In [54]:
all_combos, pik_pairs = geosearch_pass(["first_name", "last_name"])

1367 links above threshold
1365 matches remain after dealing with multiple matches
Matched 1365 records; 41.29% still unmatched


### Look at diagnostics

In [55]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,2,0,0,0.027092,36
2,0,2,0,-1,0.201983,17
2,1,2,0,0,0.566159,23
2,-1,2,0,-1,0.898382,4
2,1,2,0,-1,0.982222,76
2,0,2,0,1,0.993606,17
2,0,2,1,0,0.997436,7
2,-1,2,1,0,0.999541,8
2,-1,2,0,1,0.999824,2
2,1,2,0,1,0.999863,102


In [56]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,gamma_last_name,date_of_birth_l,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,9.797449,0.998878,census_2030,reference_file,13649,Christian,Christian,2,J,J,...,2,01/30/1986,,0,146 delaware av Anytown US 00000,436 delaware av Anytown US 00030,1,000,000,
11,23.982638,1.000000,census_2030,reference_file,18255,Robert,Robert,2,J,J,...,2,10/25/1981,10/25/1981,1,1137 lackef dr apt # 351 Anytown US 00000,1235 packer dr apt # 351 Anytown US 00000,1,000,000,
23,5.712181,0.981282,census_2030,reference_file,18202,Jacob,Jacob,2,E,W,...,2,04/29/1936,04/29/1996,0,11053 dry creek rd Anytown US 00000,11093 dry creek rd Anytown US 00000,1,000,000,
28,12.002111,0.999756,census_2030,reference_file,13138,Robert,Robert,2,R,R,...,2,08/23/1978,08/23/1977,0,717 pinhooo rd Anytown US 00000,715 pinhook rd Anytown US 00000,1,000,000,
30,26.094439,1.000000,census_2030,reference_file,15889,Aaron,Aaron,2,S,S,...,2,07/27/1982,07/27/1982,1,610 f parid at Anytown US 00000,610 e paris st Anytown US 00000,1,000,000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10977,24.603167,1.000000,census_2030,reference_file,9844,Antionette,Antionette,2,K,K,...,2,12/28/1988,12/28/1988,1,,640 liberty court Anytown US 00000,-1,000,000,
10989,20.959311,1.000000,census_2030,reference_file,596,Whitney,Whitney,2,C,C,...,2,12/26/1992,12/26/1992,1,,3225 chantilly dr Anytown US 00000,-1,000,000,
11003,11.072712,0.999536,census_2030,reference_file,13987,Jacob,Jacob,2,J,J,...,2,03/21/1996,03/21/1996,1,1457 mystic ave Anytown US 00000,13354 caswell st Anytown US 00000,0,000,000,
11015,27.801713,1.000000,census_2030,reference_file,10808,Justin,Justin,2,A,A,...,2,07/28/1991,07/28/1991,1,16623 freewom ln Anytown US 00000,16623 freedom ln Anytown US 00000,1,000,000,


In [57]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,2,0,0,0.027092,36
2,0,2,0,-1,0.201983,17
2,1,2,0,0,0.566159,23
2,-1,2,0,-1,0.898382,4
2,1,2,0,-1,0.982222,76
2,0,2,0,1,0.993606,17
2,0,2,1,0,0.997436,7
2,-1,2,1,0,0.999541,8
2,-1,2,0,1,0.999824,2
2,1,2,0,1,0.999863,102


In [58]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,gamma_last_name,date_of_birth_l,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,9.797449,0.998878,census_2030,reference_file,13649,Christian,Christian,2,J,J,...,2,01/30/1986,,0,146 delaware av Anytown US 00000,436 delaware av Anytown US 00030,1,000,000,
11,23.982638,1.000000,census_2030,reference_file,18255,Robert,Robert,2,J,J,...,2,10/25/1981,10/25/1981,1,1137 lackef dr apt # 351 Anytown US 00000,1235 packer dr apt # 351 Anytown US 00000,1,000,000,
23,5.712181,0.981282,census_2030,reference_file,18202,Jacob,Jacob,2,E,W,...,2,04/29/1936,04/29/1996,0,11053 dry creek rd Anytown US 00000,11093 dry creek rd Anytown US 00000,1,000,000,
28,12.002111,0.999756,census_2030,reference_file,13138,Robert,Robert,2,R,R,...,2,08/23/1978,08/23/1977,0,717 pinhooo rd Anytown US 00000,715 pinhook rd Anytown US 00000,1,000,000,
30,26.094439,1.000000,census_2030,reference_file,15889,Aaron,Aaron,2,S,S,...,2,07/27/1982,07/27/1982,1,610 f parid at Anytown US 00000,610 e paris st Anytown US 00000,1,000,000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10977,24.603167,1.000000,census_2030,reference_file,9844,Antionette,Antionette,2,K,K,...,2,12/28/1988,12/28/1988,1,,640 liberty court Anytown US 00000,-1,000,000,
10989,20.959311,1.000000,census_2030,reference_file,596,Whitney,Whitney,2,C,C,...,2,12/26/1992,12/26/1992,1,,3225 chantilly dr Anytown US 00000,-1,000,000,
11003,11.072712,0.999536,census_2030,reference_file,13987,Jacob,Jacob,2,J,J,...,2,03/21/1996,03/21/1996,1,1457 mystic ave Anytown US 00000,13354 caswell st Anytown US 00000,0,000,000,
11015,27.801713,1.000000,census_2030,reference_file,10808,Justin,Justin,2,A,A,...,2,07/28/1991,07/28/1991,1,16623 freewom ln Anytown US 00000,16623 freedom ln Anytown US 00000,1,000,000,


# NameSearch

>    The NameSearch module, by contrast, does not use any geographic variables for matching. Only the
>    Name and DOB are used to match. There are four NameSearch passes defined for the ACS. All passes
>    use the first characters of the First and Last names to define cuts...

In [59]:
def namesearch_pass(blocking_cols):
    return pvs_matching_pass(["first_initial_cut", "last_initial_cut"] + blocking_cols)

## Pass 1: Block on full name and DOB

In [60]:
all_combos, pik_pairs = namesearch_pass(["first_name", "middle_initial", "last_name", "date_of_birth"])

1577 links above threshold
1577 matches remain after dealing with multiple matches
Matched 1577 records; 27.02% still unmatched


### Look at diagnostics

In [61]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,2,1,0,0.999987,2
2,1,2,1,-1,0.999998,1534
2,1,2,1,1,1.0,41


In [62]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,first_initial_cut_l,first_initial_cut_r,last_initial_cut_l,last_initial_cut_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,17.531255,0.999995,census_2030,reference_file,16268,Natalie,Natalie,2,V,V,...,12/17/2008,1,12828 gwendolyn dr Anytown US 00000,,-1,N,N,C,C,
16,21.747415,1.000000,census_2030,reference_file,6232,Julian,Julian,2,J,J,...,07/26/2024,1,15 julian dr Anytown US 00000,,-1,J,J,U-Z,U-Z,
17,21.561002,1.000000,census_2030,reference_file,2140,Bella,Bella,2,P,P,...,11/22/2025,1,5046 church street Anytown US 00000,,-1,B,B,M,M,
20,20.698506,0.999999,census_2030,reference_file,3060,Dominic,Dominic,2,B,B,...,08/14/2014,1,9635 lambert st Anytown US 00000,,-1,D,D,M,M,
21,19.435471,0.999999,census_2030,reference_file,15724,Dylan,Dylan,2,A,A,...,09/22/2016,1,9635 lambert st Anytown US 00000,,-1,D,D,M,M,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11040,18.759906,0.999998,census_2030,reference_file,13175,Stephen,Stephen,2,B,B,...,12/25/2022,1,315 mcdaniel street sw Anytown US 00000,,-1,S,S,U-Z,U-Z,
11043,19.247844,0.999998,census_2030,reference_file,14961,Alexander,Alexander,2,S,S,...,01/27/2028,1,2424 willow green ct Anytown US 00000,,-1,A-or-blank,A-or-blank,M,M,
11044,23.020434,1.000000,census_2030,reference_file,14948,Molly,Molly,2,A,A,...,10/27/2012,1,228 schuster rd Anytown US 00000,,-1,M,M,M,M,
11045,25.020434,1.000000,census_2030,reference_file,18263,Otto,Otto,2,K,K,...,02/06/2014,1,228 schuster rd Anytown US 00000,,-1,O,O,M,M,


## Pass 2: Block on first name and DOB

In [63]:
all_combos, pik_pairs = namesearch_pass(["first_name", "date_of_birth"])

458 links above threshold
458 matches remain after dealing with multiple matches
Matched 458 records; 22.88% still unmatched


### Look at diagnostics

In [64]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,0,1,0,0.009906,3
2,0,0,1,-1,0.068044,3
2,1,0,1,-1,0.930566,11
2,1,1,1,0,0.994724,6
2,0,1,1,-1,0.996587,18
2,1,-1,1,-1,0.997776,2
2,-1,1,1,-1,0.998418,7
2,1,0,1,1,0.998681,2
2,0,2,1,-1,0.999902,165
2,1,1,1,-1,0.999921,158


In [65]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,first_initial_cut_l,first_initial_cut_r,last_initial_cut_l,last_initial_cut_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
37,17.493069,0.999995,census_2030,reference_file,8650,Nova,Nova,2,E,E,...,03/16/2014,1,850 ashmount avenu Anytown US 00000,,-1,N,N,M,M,
46,21.539612,1.000000,census_2030,reference_file,17694,Randy,Randy,2,M,M,...,04/13/1961,1,19461 fire twe rd Anytown US 00000,19461 fire twr rd Anytown US 00000,1,R,R,H,H,
58,3.622654,0.924912,census_2030,reference_file,12477,Carol,Carol,2,R,R,...,01/26/1946,1,3193 shelton hall rd Anytown US 00000,,-1,C,C,H,H,
101,18.078031,0.999996,census_2030,reference_file,8612,Londyn,Londyn,2,E,E,...,03/12/2012,1,,,-1,L,L,A-or-blank,A-or-blank,
103,21.234554,1.000000,census_2030,reference_file,17414,Cash,Cash,2,,O,...,12/04/2018,1,,,-1,C,C,D,D,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10945,15.119754,0.999972,census_2030,reference_file,3169,Wesley,Wesley,2,JM,M,...,11/03/2022,1,16 interlocken boulwvatd Anytown US 00000,,-1,U-Z,U-Z,S,S,
10959,16.885289,0.999992,census_2030,reference_file,70,Dallas,Dallas,2,QA,A,...,10/04/2022,1,,,-1,D,D,S,S,
10991,15.270677,0.999975,census_2030,reference_file,15699,Paula,Paula,2,J,J,...,09/01/1954,1,120 greenwood ave Anytown US 00000,,-1,P,P,C,C,
10998,17.078031,0.999993,census_2030,reference_file,8331,Angelique,Angelique,2,K,K,...,04/28/1979,1,1035 fairview ave Anytown US 00000,,-1,A-or-blank,A-or-blank,M,M,


## Pass 3: Block on last name and DOB

In [66]:
all_combos, pik_pairs = namesearch_pass(["last_name", "date_of_birth"])

590 links above threshold
590 matches remain after dealing with multiple matches
Matched 590 records; 17.54% still unmatched


### Look at diagnostics

In [67]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,2,1,-1,0.885131,3
-1,0,2,1,-1,0.914052,1
1,0,2,1,-1,0.998195,17
1,1,2,1,0,0.999297,8
0,1,2,1,-1,0.999364,3
-1,1,2,1,-1,0.999813,4
-1,0,2,1,2,0.999917,1
1,1,2,1,-1,0.999929,172
0,0,2,1,2,0.999934,2
-1,0,2,1,1,0.999968,1


In [68]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,first_initial_cut_l,first_initial_cut_r,last_initial_cut_l,last_initial_cut_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,19.353549,0.999999,census_2030,reference_file,15686,Shwlley,Shelley,1,J,J,...,03/02/1973,1,146 delaware av Anytown US 00000,146 delaware av Anytown US 00000,2,S,S,U-Z,U-Z,
7,21.016514,1.000000,census_2030,reference_file,2207,Mirjel,Miguel,1,C,C,...,10/23/2006,1,12828 gwendolyn dr Anytown US 00000,12828 gwendolyn dr Anytown US 00000,2,M,M,C,C,
10,15.248207,0.999974,census_2030,reference_file,8512,Glkria,Gloria,1,A,A,...,07/23/1973,1,2265 erik paul dr Anytown US 00000,,-1,G,G,Q,Q,
42,11.341316,0.999615,census_2030,reference_file,5328,Kailani,Kailano,1,I,I,...,10/23/2023,1,8925 st ignatius ln Anytown US 00000,,-1,K,K,G,G,
115,17.483437,0.999995,census_2030,reference_file,3358,Curtis,C,0,B,B,...,08/27/1984,1,1544 e us 224 Anytown US 00000,1544 e us 224 Anytown US 00000,2,C,C,U-Z,U-Z,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10892,14.248207,0.999949,census_2030,reference_file,2038,Nxncy,Nancy,1,M,M,...,08/06/1951,1,,,-1,N,N,A-or-blank,A-or-blank,
10909,18.213539,0.999997,census_2030,reference_file,6972,,Annie,-1,R,R,...,05/28/1947,1,1715 galloway st Anytown US 00000,1715 galloway st Anytown US 00030,1,A-or-blank,A-or-blank,T,T,
10924,21.835767,1.000000,census_2030,reference_file,8436,,Adria,-1,S,S,...,12/21/1982,1,5105 southwest birch st Anytown US 00000,5105 southwest birch st Anytown US 00000,2,A-or-blank,A-or-blank,G,G,
10988,23.786603,1.000000,census_2030,reference_file,1263,Dougkas,Douglas,1,A,A,...,08/13/1990,1,1632 peterson oxgh Anytown US 00000,1632 peterson path Anytown US 00000,1,D,D,O,O,


## Pass 4: Block on DOB

In [69]:
all_combos, pik_pairs = namesearch_pass(["date_of_birth"])

43 links above threshold
43 matches remain after dealing with multiple matches
Matched 43 records; 17.15% still unmatched


### Look at diagnostics

In [70]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,1,0,1.3e-05,23
-1,0,0,1,0,3.3e-05,3
0,0,0,1,-1,0.000442,34
0,1,0,1,0,0.000498,4
-1,0,0,1,-1,0.001127,1
2,0,0,1,0,0.009645,3
1,0,0,1,-1,0.013248,4
-1,0,1,1,0,0.033726,1
0,-1,-1,1,-1,0.056404,3
2,0,0,1,-1,0.067235,3


In [71]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,first_initial_cut_l,first_initial_cut_r,last_initial_cut_l,last_initial_cut_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
47,17.119517,0.999993,census_2030,reference_file,15004,Richard,Richsrd,1,O,O,...,12/17/1947,1,215 bauman ave Anytown US 00000,215 bauman ave Anytown US 00000,2,R,R,R,R,
406,17.119517,0.999993,census_2030,reference_file,1575,Christopher,Christophwer,1,J,J,...,08/25/1995,1,8370 chervil ct Anytown US 00000,8370 chervil ct Anytown US 00000,2,C,C,H,H,
419,9.128818,0.998217,census_2030,reference_file,8487,Mobty,Monty,1,C,C,...,12/21/1968,1,,8370 chervil ct Anytown US 00000,-1,M,M,R,R,
1313,4.017152,0.941831,census_2030,reference_file,2448,Bebjamln,Benjamin,1,S,S,...,03/31/1996,1,2213 w prodo vista dr Anytown US 00000,2595 mdw gardens cir Anytown US 00000,0,B,B,U-Z,U-Z,
1489,4.203424,0.948514,census_2030,reference_file,15469,Pattrocu,Patrick,0,R,R,...,05/12/2008,1,250 perdido drive Anytown US 00000,,-1,P,P,B,B,
1545,17.119517,0.999993,census_2030,reference_file,12814,Francisco,Francisdo,1,D,D,...,05/29/1999,1,250 perdido drive Anytown US 00000,250 perdido drive Anytown US 00000,2,F,F,R,R,
1692,7.056457,0.992543,census_2030,reference_file,10169,Ryan,Ryzn,1,J,J,...,12/18/1994,1,11 senatorial dr Anytown US 00000,11 senatorial dr Anytown US 00000,2,R,R,G,G,
1698,12.194123,0.999787,census_2030,reference_file,18269,Kauhkin,Kqitlin,0,J,J,...,03/24/2005,1,24200 savin ave Anytown US 00000,24200 savin ave Anytown US 00000,2,K,K,A-or-blank,A-or-blank,
1737,9.128818,0.998217,census_2030,reference_file,7394,Eeved,Eevee,1,A,A,...,03/22/2025,1,11213 virginia ln Anytown US 00000,,-1,E,E,P,P,
2000,17.119517,0.999993,census_2030,reference_file,15301,Timothy,Tinothy,1,E,E,...,09/30/2006,1,3232 maple grove ln Anytown US 00000,3232 maple grove ln Anytown US 00000,2,T,T,S,S,


# Resulting PIKs

In [72]:
census_2030

Unnamed: 0,record_id,first_name,middle_initial,last_name,sex,race_ethnicity,age,date_of_birth,relation_to_reference_person,street_number,street_name,unit_number,city,state,zipcode,geokey,zip3,first_initial_cut,last_initial_cut,pik
0,0,John,E,Mcueever,Male,Black,86,06/29/1942,Reference person,147-153,browning ave,,Anytown,US,00000,147-153 browning ave Anytown US 00000,000,J,M,6829.0
1,1,Sharon,T,Schmidt,Female,White,69,10/50/1960,Reference person,109,stqllion sr,,Anytown,US,00000,109 stqllion sr Anytown US 00000,000,S,S,
2,2,Gail,K,Durand,Female,Multiracial or Other,77,01/03/1953,Reference person,2115,cannon dr,,Anytown,US,00000,2115 cannon dr Anytown US 00000,000,G,D,6400.0
3,3,John,J,Williams,Male,White,81,11/24/1948,Reference person,146,delaware av,,Anytown,US,00000,146 delaware av Anytown US 00000,000,J,U-Z,18337.0
4,4,Child,L,Wukliamz,Female,White,81,09/27/1948,Opp-sex spouse,146,delaware av,,Anytown,US,00000,146 delaware av Anytown US 00000,000,C,U-Z,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11048,11048,Chloe,A,Maryknez-Alvarez,Female,Latino,21,07/12/2008,Biological child,207,harrison st,,Anytown,US,00000,207 harrison st Anytown US 00000,000,C,M,17071.0
11049,11049,Zachary,E,Martinez-Alvarez,Male,,18,06/29/2011,Biological child,207,harrison st,,Anytown,US,00000,207 harrison st Anytown US 00000,000,U-Z,M,12689.0
11050,11050,Madeline,A,Martinez-Alvarez,Female,Latino,16,08/12/2013,Biological child,207,harrison st,,Anytown,US,00000,207 harrison st Anytown US 00000,000,M,M,10874.0
11051,11051,Naomi,A,Martinez-Aldarez,Female,Latino,1,11/01/2028,Grandchild,207,harrison st,,Anytown,US,00000,207 harrison st Anytown US 00000,000,N,M,10825.0


In [73]:
census_2030.pik.notnull().mean()

0.8284628607617841

In [74]:
# Not possible to be PIKed
(~census_2030_ground_truth.isin(reference_file_ground_truth)).mean()

0.04641273862299828

In [75]:
census_2030.pik.notnull().mean() / census_2030_ground_truth.isin(reference_file_ground_truth).mean()

0.8687855787476281

In [76]:
# Multiple Census rows assigned the same PIK, indicating duplicates in Census
census_2030.pik.value_counts().value_counts()

count
1    9151
2       3
Name: count, dtype: int64

In [77]:
duplicate_piks = census_2030.pik.value_counts()[census_2030.pik.value_counts() > 1].index

In [78]:
census_2030[census_2030.pik.isin(duplicate_piks)].sort_values('pik')

Unnamed: 0,record_id,first_name,middle_initial,last_name,sex,race_ethnicity,age,date_of_birth,relation_to_reference_person,street_number,street_name,unit_number,city,state,zipcode,geokey,zip3,first_initial_cut,last_initial_cut,pik
10701,10701,John,A,Bean,Male,Multiracial or Other,19,05/30/2010,Biological child,4837,knappton r,,Anytown,US,0,4837 knappton r Anytown US 00000,0,J,B,4514.0
10702,10702,John,A,Bean,Male,Multiracial or Other,17,12/31/2012,Biological child,4837,knappton r,,Anytown,US,0,4837 knappton r Anytown US 00000,0,J,B,4514.0
8378,8378,Emily,K,Allen,Female,White,18,04/14/2011,Biological child,3076,hanna ave n,,Anytown,US,0,3076 hanna ave n Anytown US 00000,0,E,A-or-blank,4981.0
8379,8379,Emily,K,Allen,Female,White,16,07/30/2013,,3076,hanna ave n,,Anytown,US,0,3076 hanna ave n Anytown US 00000,0,E,A-or-blank,4981.0
2075,2075,Gary,C,Reed,Male,White,89,06/17/1940,Noninstitutionalized GQ pop,3232,maple grove ln,,Anytown,US,0,3232 maple grove ln Anytown US 00000,0,G,R,16419.0
2095,2095,Gwry,C,Lee,Male,Black,85,10/08/1944,Noninstitutionalized GQ pop,3232,maple grove ln,,Anytown,US,0,3232 maple grove ln Anytown US 00000,0,G,L,16419.0


## PIK accuracy

In [79]:
pik_simulant_id = census_2030.pik.map(reference_file_ground_truth)
pik_simulant_id

0          0_923
1            NaN
2         0_6176
3        0_13972
4            NaN
          ...   
11048    0_22741
11049    0_22742
11050    0_22743
11051    0_23271
11052    0_16724
Name: pik, Length: 11053, dtype: object

In [80]:
(pik_simulant_id[pik_simulant_id.notnull()] == census_2030_ground_truth[pik_simulant_id.notnull()]).mean()

0.9992355574969969

In [81]:
errors = census_2030[census_2030.pik.notnull() & (pik_simulant_id != census_2030_ground_truth)]
confused_for = reference_file.set_index('record_id').loc[errors.pik].reset_index().set_index(errors.index)
errors[common_cols].compare(confused_for[common_cols], keep_shape=True, keep_equal=True)

Unnamed: 0_level_0,record_id,record_id,first_name,first_name,middle_initial,middle_initial,last_name,last_name,date_of_birth,date_of_birth,...,zipcode,zipcode,geokey,geokey,zip3,zip3,first_initial_cut,first_initial_cut,last_initial_cut,last_initial_cut
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,...,self,other,self,other,self,other,self,other,self,other
2075,2075,16419,Gary,Gary,C,C,Reed,Lee,06/17/1940,10/08/1944,...,0,0,3232 maple grove ln Anytown US 00000,3232 maple grove ln Anytown US 00000,0,0,G,G,R,L
2290,2290,9957,Bennett,Bennett,J,J,Smith,Smith,05/05/2026,09/27/2000,...,0,0,3232 maple grove ln Anyyown US 00000,,0,0,B,B,S,S
8379,8379,4981,Emily,Emily,K,K,Allen,Allen,07/30/2013,04/14/2011,...,0,0,3076 hanna ave n Anytown US 00000,3076 hanna ave n Anytown US 00000,0,0,E,E,A-or-blank,A-or-blank
9404,9404,4251,Margaret,Margaret,C,C,Moe,Miranda,09/21/1932,09/21/1932,...,0,0,4410 705 707 ivan ave s Anytown US 00000,,0,0,M,M,M,M
10038,10038,8876,John,John,D,D,Dietrick,Dietrick,04/18/1972,06/14/1978,...,0,0,519 s ocean blvd Anytown US 00000,519 s ocean blvd Anytown US 00000,0,0,J,J,D,D
10702,10702,4514,John,John,A,A,Bean,Beaj,12/31/2012,05/30/2010,...,0,0,4837 knappton r Anytown US 00000,4837 knappton r Anytown US 00000,0,0,J,J,B,B
10797,10797,2759,Kevin,Kevin,T,R,Herrera,Herrera,03/24/1979,09/16/1965,...,0,0,5096 e 22nd st Anytown US 00000,5096 e 22nd st Anytown US 00000,0,0,K,K,H,H
