# **PVS-like case study: sample data**

In [1]:
import pseudopeople
import pandas as pd, numpy as np

# Generate simulated data to link

Using the latest version of the `pseudopeople` package.

## Load simulated data

Imagined scenario: PIKing the 2030 census.

One way to do this:
* Use (cumulative) SSA Numident up to that time.
* Link it (deterministically, using SSN) to taxes to get the most recent address for each person.
  * Likely would use 1040 here, but I'll use W2 for now.
* Link probabilistically to the census data.

In [2]:
!pip freeze | grep pseudopeople

pseudopeople==0.4.0


In [3]:
noise_configuration = {
    "decennial_census": {
        "first_name": {
            "typographic": {
                "row_noise_level": 0.1,
                "token_noise_level": 0.1
            }
        },
        "last_name": {
            "typographic": {
                "row_noise_level": 0.1,
                "token_noise_level": 0.1
            }
        },
        "middle_initial": {
            "typographic": {
                "row_noise_level": 0.1,
                "token_noise_level": 1
            }
        },
        "street_name": {
            "typographic": {
                "row_noise_level": 0.1,
                "token_noise_level": 0.1
            }
        },
        "street_number": {
            "numeric_miswriting": {
                "row_noise_level": 0.1,
                "token_noise_level": 0.1
            }
        },
        "unit_number": {
            "numeric_miswriting": {
                "row_noise_level": 0.1,
                "token_noise_level": 0.1
            }
        },
        "zipcode": {
            "numeric_miswriting": {
                "row_noise_level": 0.1,
                "token_noise_level": 0.1
            }
        }
    },
    "taxes_w2_and_1099": {
        "first_name": {
            "typographic": {
                "row_noise_level": 0.1,
                "token_noise_level": 0.1
            }
        },
        "last_name": {
            "typographic": {
                "row_noise_level": 0.1,
                "token_noise_level": 0.1
            }
        },
        "middle_initial": {
            "typographic": {
                "row_noise_level": 0.05,
                "token_noise_level": 1
            }
        },
        "mailing_address_street_name": {
            "typographic": {
                "row_noise_level": 0.1,
                "token_noise_level": 0.1
            }
        },
        "mailing_address_street_number": {
            "numeric_miswriting": {
                "row_noise_level": 0.3,
                "token_noise_level": 0.1
            }
        },
        "mailing_address_unit_number": {
            "numeric_miswriting": {
                "row_noise_level": 0.3,
                "token_noise_level": 0.1
            }
        },
        "mailing_address_zipcode": {
            "numeric_miswriting": {
                "row_noise_level": 0.3,
                "token_noise_level": 0.1
            }
        }
    },
    "social_security": {
        "first_name": {
            "typographic": {
                "row_noise_level": 0.1,
                "token_noise_level": 0.1
            }
        },
        "last_name": {
            "typographic": {
                "row_noise_level": 0.1,
                "token_noise_level": 0.1
            }
        },
        "middle_initial": {
            "typographic": {
                "row_noise_level": 0.05,
                "token_noise_level": 1
            }
        },
        "ssn": {
            "missing_data": {
                "row_noise_level": 0,
                "token_noise_level": 0.1
            },
            "typographic": {
                "row_noise_level": 0.01,
                "token_noise_level": 0.1,
                "include_original_token_level": 0
            },
            "numeric_miswriting": {
                "row_noise_level": 0.02,
                "token_noise_level": 0.1
            }
        }
    }
}

In [4]:
%%time

ssa = pseudopeople.generate_social_security(configuration=noise_configuration)
# We could set a date cutoff here, but since we are linking the 2030 census, it would be right around the time our sim ends anyway.
# Also, setting a date cutoff would drop those with missing event_date, which is probably correct (but we should revisit levels of noise
# in this data).
ssa

CPU times: user 939 ms, sys: 49.7 ms, total: 988 ms
Wall time: 965 ms


Unnamed: 0,simulant_id,first_name,date_of_birth,middle_initial,ssn,event_date,last_name,event_type
1,0_6846,Peter,1921-06-07 00:00:00,M,688-88-6377,1921-06-07 00:00:00,Savino,creation
4,0_19825,Gertrude,1922-05-11 00:00:00,M,875-10-2359,1922-05-11 00:00:00,Delgado,creation
7,0_18581,Margaret,1922-09-15 00:00:00,D,102-60-0838,1922-09-15 00:00:00,Blanchard,creation
9,0_2471,Mary,1922-11-02 00:00:00,M,103-21-8846,1922-11-02 00:00:00,Garman,creation
10,0_3267,Mary,1923-01-23 00:00:00,M,087-88-5614,1923-01-23 00:00:00,Damato,creation
...,...,...,...,...,...,...,...,...
31318,0_7196,Randy,1965-02-27 00:00:00,D,251-26-8762,2041-05-21 00:00:00,Do,death
31319,0_16187,Robert,1967-07-30 00:00:00,D,587-82-9505,2041-05-21 00:00:00,Moore,death
31321,0_14782,Margaret,1982-04-20 00:00:00,M,060-42-4747,2041-05-21 00:00:00,Younes,
31322,0_8538,Jacob,1999-10-09 00:00:00,D,268-41-3957,2041-05-21 00:00:00,Long,death


In [5]:
%%time

w2_1099 = pseudopeople.generate_taxes_w2_and_1099(configuration=noise_configuration)
w2_1099

CPU times: user 6.51 s, sys: 514 ms, total: 7.03 s
Wall time: 6.83 s


Unnamed: 0,simulant_id,mailing_address_street_name,first_name,date_of_birth,employer_city,middle_initial,employer_state,ssn,employer_street_name,mailing_address_street_number,...,age,mailing_address_state,employer_id,employer_name,mailing_address_zipcode,tax_form,employer_zipcode,last_name,mailing_address_po_box,employer_unit_number
0,0_4,commonwealth avnue,Michael,1983-03-13 00:00:00,Anytown,M,US,205-18-7302,ince dr,1312,...,36,US,95,Pikes Creek Campground,,W2,00000,Ticas,0,
1,0_5,commonwealth avnue,Michelle,1981-08-10 00:00:00,Anytown,M,US,722-73-2456,hallmont dr,1312,...,38,US,29,Red's Dairy Queen,00000,W2,00000,Ticas,0,
2,0_5621,,Jeffrey,1970-07-26 00:00:00,Anytown,S,US,871-62-4023,mckenzie hwy,,...,49,US,75,France,00000,W2,00000,Contreras,14011,
3,0_5623,,Gloria,1973-07-23 00:00:00,Anytown,A,US,413-69-0826,west union street,,...,46,US,46,Nashville City Properties,00000,W2,00000,Contreras,14011,
4,0_7251,stateline rd,Joe,1965-03-15 00:00:00,Anytown,S,US,893-51-3798,regatta dr,8776,...,54,,32,Tony's Family Practice Inc,00000,W2,00000,Almedina,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208573,0_9012,,Bria,2001-11-28 00:00:00,Anytown,B,US,311-55-6145,lemoyne ave,,...,39,US,39,Gabriel's Liquor & Acting,00000,W2,00000,Moore,2271,
208574,0_15812,katahdin ave,Nicole,,Anytown,K,US,553-97-1584,s l st,14456,...,39,US,18,H&R Block,00000,W2,00000,Dunton,0,
208575,0_15812,katahdin ave,Nicole,2001-05-20 00:00:00,Anytown,K,US,553-97-1584,sherwood frm road,14456,...,39,US,78,Autism and Bob's Big Louie's,00000,W2,00000,Dunton,0,
208576,0_21712,kingspark dr,Sofia,2006-03-24 00:00:00,Anytown,L,US,885-25-8879,n broadway,8034,...,34,US,6,Dollar Stores,00000,W2,00000,Manning,,


In [6]:
%%time

census = pseudopeople.generate_decennial_census(configuration=noise_configuration)
census

CPU times: user 901 ms, sys: 72 ms, total: 973 ms
Wall time: 938 ms


Unnamed: 0,simulant_id,first_name,date_of_birth,housing_type,city,middle_initial,guardian_2,zipcode,age,year,relation_to_household_head,state,street_number,sex,guardian_1,last_name,street_name,race_ethnicity,unit_number
0,0_2,Melanie,1993-08-05 00:00:00,Standard,Anytown,L,0_-1,00000,26,2020,Reference person,US,10233,Female,0_-1,Herrod,north burgher avenue,White,
1,0_3,Jordan,1993-12-29 00:00:00,Standard,Anytown,C,0_-1,00000,26,2020,Other relative,US,10233,Female,0_-1,Herrod,north burgher avenue,White,
2,0_923,John,1942-06-29 00:00:00,Standard,Anytown,E,0_-1,00000,77,2020,Reference person,US,147-153,Male,0_-1,Davis,browning ave,Black,
3,0_2641,Sharon,1960-10-10 00:00:00,Standard,Anytown,T,0_-1,00000,59,2020,Reference person,US,107,Female,0_-1,Plummer,stallion st,White,
4,0_2801,Ronnie,1946-12-05 00:00:00,Standard,Anytown,A,0_-1,00000,73,2020,Reference person,US,214,Male,0_-1,Yoakum,s vine lane,White,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29570,0_7522,Halle,2014-04-21 00:00:00,Standard,Anytown,R,0_7521,00000,25,2040,Reference person,US,135,Female,0_7520,Carriker,cobblewood drive,White,
29571,0_14524,Keith,1967-05-11 00:00:00,Standard,Anytown,D,0_-1,00000,72,2040,Reference person,US,728,Male,0_-1,Znhtalek,w winchester st,White,
29572,0_14563,Presley,2008-04-02 00:00:00,Standard,Anytown,I,0_14561,00000,31,2040,Other nonrelative,US,728,Female,0_14560,Hill,w winchester st,White,
29573,0_18084,Carol,1971-11-09 00:00:00,Standard,Anytown,M,0_-1,00000,68,2040,Reference person,US,129,,0_-1,Wardell,custer street,White,


In [7]:
census.year.value_counts(dropna=False)

year
2040    9956
2030    9833
2020    9786
Name: count, dtype: int64

In [8]:
census_2030 = census[census.year == 2030]
census_2030

Unnamed: 0,simulant_id,first_name,date_of_birth,housing_type,city,middle_initial,guardian_2,zipcode,age,year,relation_to_household_head,state,street_number,sex,guardian_1,last_name,street_name,race_ethnicity,unit_number
10,0_923,John,1942-06-29 00:00:00,Standard,Anytown,E,0_-1,00000,87,2030,Reference person,US,147-153,Male,0_-1,Davis,browning ave,Black,
11,0_2641,Sharon,1960-10-10 00:00:00,Standard,Anytown,T,0_-1,00000,69,2030,Reference person,US,107,Female,0_-1,Plummer,stallion st,White,
12,0_6176,Gail,1953-01-03 00:00:00,Standard,Anytown,K,0_-1,00000,77,2030,Reference person,US,2115,Female,0_-1,Durand,cannon dr,Multiracial or Other,
13,0_13972,John,1948-11-24 00:00:00,Standard,Anytown,J,0_-1,00000,81,2030,Reference person,US,19802,Male,0_-1,Bartlett,westminster dr,White,
14,0_13973,Linda,1948-09-27 00:00:00,Standard,Anytown,L,0_-1,00000,81,2030,Opp-sex spouse,US,19802,Female,0_-1,Bartlett,westminster dr,White,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29495,0_21929,Noah,2024-09-09 00:00:00,Standard,Anytown,W,0_-1,00000,5,2030,Biological child,US,17902,Male,0_19440,Randall,w pacific ave,White,
29496,0_22511,Xavifer,2026-07-01 00:00:00,Standard,Anytown,E,0_21606,00000,3,2030,Biological child,US,352,Male,0_21607,Hurlbut,wesley chapel rd,White,
29497,0_23096,Teddy,2028-03-27 00:00:00,Standard,Anytown,R,0_-1,00000,2,2030,Biological child,US,17902,Male,0_19440,Randall,w pacific ave,White,
29509,0_21280,Henry,2022-11-18 00:00:00,Standard,Anytown,T,0_-1,00000,7,2030,Reference person,US,2349,Male,0_12801,Sawin,frost view dr,White,apartment 1


## Deterministically get a recent address

In [9]:
# Within each year, we do not have a date field for W2. So we don't know which order jobs happened in.
# We take the address associated with the most income in the most recent year. Note that part or all
# of this address may be missing.
w2_1099.groupby(['tax_year', 'ssn']).mailing_address_street_name.nunique(dropna=False).sort_values()

tax_year  ssn        
2019      001-15-8330    1
2033      704-70-9887    1
          704-78-6904    1
          704-84-8665    1
          705-28-4023    1
                        ..
2028      758-91-8322    3
2037      035-58-6802    3
2019      503-12-2161    3
2023      624-10-8571    3
2031      005-37-3399    3
Name: mailing_address_street_name, Length: 150403, dtype: int64

In [10]:
w2_1099.ssn.nunique()

20861

In [11]:
# We could consider adding a year cutoff here (e.g. to be recent it needs to be at least in 2025).
# But I think for linkage, the more information, the better.
recent_addresses = (
    w2_1099.assign(income=lambda x: x.income.fillna(0).astype(float)).sort_values(['tax_year', 'income'], ascending=False).groupby('ssn').first().filter(like='mailing_address')
)
recent_addresses

Unnamed: 0_level_0,mailing_address_street_name,mailing_address_street_number,mailing_address_unit_number,mailing_address_city,mailing_address_state,mailing_address_zipcode,mailing_address_po_box
ssn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000-67-1218,e parker st,72687,,Anytown,US,00000,0
000-72-73517,east 21st stree,54,,Anytpwn,US,00000,0
001-02-4588,norh e 156th str,19784,,Anytown,US,00000,0
001-04-4288,drew road,213,# 401,Anytown,US,00000,0
001-11-9734,westminster dr,19802,,Anytown,US,00000,0
...,...,...,...,...,...,...,...
981-87-9324,4th st nw,2309,,Anytown,US,00000,0
9841-60-7488,houston st,35701,,Anytown,US,00000,0
988-39-5898,eastbridge dr,17347,,Anytown,US,00000,
988-69-1254,,,,Anytown,US,00000,2938


In [12]:
# Everyone with any taxes has a most recent address.
assert set(recent_addresses.index) == set(w2_1099.ssn.dropna().unique())

In [13]:
# Some people won't have a recent address, if they don't have any taxes.
ssa[~ssa.ssn.isin(recent_addresses.index)]

Unnamed: 0,simulant_id,first_name,date_of_birth,middle_initial,ssn,event_date,last_name,event_type
37,0_5031,Charles,1924-03-01 00:00:00,S,345-07-0188,1924-03-01 00:00:00,Kramer,creation
53,0_13555,Frankie,1924-06-26 00:00:00,M,458-98-9990,1924-06-26 00:00:00,Webb,creation
59,0_10428,Stella,1924-09-08 00:00:00,G,407-81-9766,1924-09-08 00:00:00,Griggs,creation
64,0_9757,Carl,1924-11-29 00:00:00,A,665-31-5960,1924-11-29 00:00:00,Donnelly,creation
65,0_1731,Felix,1924-12-02 00:00:00,I,535-95-8703,1924-12-02 00:00:00,Lawrence,creation
...,...,...,...,...,...,...,...,...
31255,0_27467,Elias,2041-04-11 00:00:00,A,685-89-3952,2041-03-26 00:00:00,Funderburk,creation
31259,0_27455,Eevee,2041-04-16 00:00:00,A,580-16-0393,2041-03-26 00:00:00,Pronti,creation
31261,0_27460,Caden,2041-04-18 00:00:00,M,119-41-3986,2041-03-26 00:00:00,Gogineni,creation
31310,0_11462,Judith,1952-09-01 00:00:00,S,006-77-5279,2041-05-21 00:00:00,Hess,death


## Create a fake Numident file

In [14]:
# The probable real-life approach would be: take date of birth from the (first) creation event, date of death (if any)
# from the (last) death event, name from the most recent event of any kind.
# We don't want to throw out events with a missing/invalid date, so we'll fill them with the value *least* likely to be chosen
# (early if taking the latest, late if taking the earliest).
fill_dates = lambda df, fill_type: pd.to_datetime(df.event_date, errors='coerce').fillna(pd.Timestamp('2100-01-01' if fill_type == 'latest' else '1900-01-01'))

date_of_birth = (
    ssa[ssa.event_type == 'creation']
        .assign(event_date_for_sort=lambda df: fill_dates(df, 'latest'))
        .sort_values('event_date_for_sort')
        .groupby('ssn')
        .date_of_birth.first()
)
date_of_death = (
    ssa[ssa.event_type == 'death']
        .assign(event_date_for_sort=lambda df: fill_dates(df, 'earliest'))
        .sort_values('event_date_for_sort')
        .groupby('ssn')
        .event_date.last()
        .rename('date_of_death')
)
name = (
    ssa
        .assign(event_date_for_sort=lambda df: fill_dates(df, 'earliest'))
        .sort_values('event_date_for_sort')
        .groupby('ssn')
        .last()[['first_name', 'middle_initial', 'last_name']]
)

# What to do about ground truth here? This simple linkage could already be introducing errors, since SSN is not without noise!
# For now, I'll take the most common ground truth.
simulant_id = ssa.groupby('ssn').simulant_id.agg(lambda x: pd.Series.mode(x)[0])

fake_numident = pd.DataFrame(simulant_id).join(date_of_birth, how='left').join(name, how='left').join(date_of_death, how='left').reset_index()
fake_numident

Unnamed: 0,ssn,simulant_id,date_of_birth,first_name,middle_initial,last_name,date_of_death
0,000-65-3626,0_6059,1976-07-22 00:00:00,Raymundo,J,Posey,
1,001-02-4588,0_13602,2008-08-08 00:00:00,Isabella,G,Bryant,
2,001-11-0864,0_26121,2037-02-24 00:00:00,Zamir,C,Brown,
3,001-15-8330,0_16514,1976-05-04 00:00:00,Gerald,J,Underwood,
4,001-16-0077,0_13906,1970-02-07 00:00:00,Jerald,J,Barajas,
...,...,...,...,...,...,...,...
19437,899-97-5729,0_18151,2018-10-26 00:00:00,Logan,M,Dye,
19438,935-09-9271,0_19864,,Billy,D,Cox,2022-07-12 00:00:00
19439,943-64-3550,0_1728,2002-09-06 00:00:00,Yasmin,B,Gutierrez,
19440,975-48-2496,0_1782,1931-01-23 00:00:00,Ralph,M,Hughes,


In [15]:
# Most people have not died
fake_numident.date_of_death.isnull().mean()

0.8367451908239893

## Create a composite reference file for linking

In [16]:
reference_file = (
    # Exclude those who have died before the census
    fake_numident[~(pd.to_datetime(fake_numident.date_of_death, errors='coerce') <= pd.Timestamp('2020-04-01'))]
        .merge(recent_addresses, on='ssn', how='left')
)
reference_file

Unnamed: 0,ssn,simulant_id,date_of_birth,first_name,middle_initial,last_name,date_of_death,mailing_address_street_name,mailing_address_street_number,mailing_address_unit_number,mailing_address_city,mailing_address_state,mailing_address_zipcode,mailing_address_po_box
0,000-65-3626,0_6059,1976-07-22 00:00:00,Raymundo,J,Posey,,,,,,,,
1,001-02-4588,0_13602,2008-08-08 00:00:00,Isabella,G,Bryant,,norh e 156th str,19784,,Anytown,US,00000,0
2,001-11-0864,0_26121,2037-02-24 00:00:00,Zamir,C,Brown,,,,,,,,
3,001-15-8330,0_16514,1976-05-04 00:00:00,Gerald,J,Underwood,,lacombe avenue,323r,,Anytown,US,00000,0
4,001-16-0077,0_13906,1970-02-07 00:00:00,Jerald,J,Barajas,,n league rd,4232,aptmnt 1,Anytown,US,00000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19269,899-97-5729,0_18151,2018-10-26 00:00:00,Logan,M,Dye,,cesar e chavez ave,8301,,Anytown,US,00000,0
19270,935-09-9271,0_19864,,Billy,D,Cox,2022-07-12 00:00:00,,,,,,,
19271,943-64-3550,0_1728,2002-09-06 00:00:00,Yasmin,B,Gutierrez,,,,,,,,
19272,975-48-2496,0_1782,1931-01-23 00:00:00,Ralph,M,Hughes,,,,,,,,


# Pre-process the data

Not much needed here because the datasets are already so tidy and similar to each other.

In [17]:
# Add a unique record ID
reference_file = reference_file.reset_index(drop=True).reset_index().rename(columns={'index': 'record_id'})
census_2030 = census_2030.reset_index(drop=True).reset_index().rename(columns={'index': 'record_id'})

# Remove ground truth
reference_file_ground_truth = reference_file.pop('simulant_id')
census_2030_ground_truth = census_2030.pop('simulant_id')

In [18]:
# Use true missingness instead of empty string
reference_file = reference_file.replace('', np.nan)
census_2030 = census_2030.replace('', np.nan)

In [19]:
# We want to compare mailing address with physical address
reference_file = reference_file.rename(columns=lambda c: c.replace('mailing_address_', ''))

In [20]:
# Purely for ease of use, order the columns nicely
reference_file_columns_order = [
    'record_id',
    'ssn',
    'first_name', 'middle_initial', 'last_name',
    'date_of_birth',
    'date_of_death',
    'street_number', 'street_name', 'unit_number', 'city', 'state', 'zipcode', 'po_box',
]
assert set(reference_file_columns_order) == set(reference_file.columns)
reference_file = reference_file[reference_file_columns_order]

In [21]:
census_columns_order = [
    'record_id',
    'first_name', 'middle_initial', 'last_name',
    'sex', 'race_ethnicity', 'age', 'date_of_birth',
    'housing_type', 'relation_to_household_head',
    'street_number', 'street_name', 'unit_number', 'city', 'state', 'zipcode',
]
assert (set(census_columns_order) | {'guardian_1', 'guardian_2', 'year'}) == set(census_2030.columns)
census_2030 = census_2030[census_columns_order]

In [22]:
# My working theory: the purpose of the "geokey" is because address parts violate conditional independence
get_geokey = lambda x: (x.street_number + ' ' + x.street_name + ' ' + x.unit_number.fillna('') + ' ' + x.city + ' ' + x.state + ' ' + x.zipcode).str.strip().str.split().str.join(' ')
reference_file = reference_file.assign(geokey=get_geokey)
census_2030 = census_2030.assign(geokey=get_geokey)

In [23]:
# Add columns used to "cut the database": ZIP3 and a grouping of first and last initial
reference_file = reference_file.assign(zip3=lambda x: x.zipcode.str[:3])
census_2030 = census_2030.assign(zip3=lambda x: x.zipcode.str[:3])

# Page 20 of the NORC report: "Name-cuts are defined by combinations of the first characters of the first and last names. The twenty letter groupings
# for the first character are: A-or-blank, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, and U-Z."
initial_cut = lambda x: x.fillna('A').str[0].replace('A', 'A-or-blank').replace(['U', 'V', 'W', 'X', 'Y', 'Z'], 'U-Z')
reference_file = reference_file.assign(first_initial_cut=lambda x: initial_cut(x.first_name), last_initial_cut=lambda x: initial_cut(x.last_name))
census_2030 = census_2030.assign(first_initial_cut=lambda x: initial_cut(x.first_name), last_initial_cut=lambda x: initial_cut(x.last_name))

# Data to link

Note: I have not yet introduced alternate names and dates of birth here.

In [24]:
reference_file

Unnamed: 0,record_id,ssn,first_name,middle_initial,last_name,date_of_birth,date_of_death,street_number,street_name,unit_number,city,state,zipcode,po_box,geokey,zip3,first_initial_cut,last_initial_cut
0,0,000-65-3626,Raymundo,J,Posey,1976-07-22 00:00:00,,,,,,,,,,,R,P
1,1,001-02-4588,Isabella,G,Bryant,2008-08-08 00:00:00,,19784,norh e 156th str,,Anytown,US,00000,0,19784 norh e 156th str Anytown US 00000,000,I,B
2,2,001-11-0864,Zamir,C,Brown,2037-02-24 00:00:00,,,,,,,,,,,U-Z,B
3,3,001-15-8330,Gerald,J,Underwood,1976-05-04 00:00:00,,323r,lacombe avenue,,Anytown,US,00000,0,323r lacombe avenue Anytown US 00000,000,G,U-Z
4,4,001-16-0077,Jerald,J,Barajas,1970-02-07 00:00:00,,4232,n league rd,aptmnt 1,Anytown,US,00000,0,4232 n league rd aptmnt 1 Anytown US 00000,000,J,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19269,19269,899-97-5729,Logan,M,Dye,2018-10-26 00:00:00,,8301,cesar e chavez ave,,Anytown,US,00000,0,8301 cesar e chavez ave Anytown US 00000,000,L,D
19270,19270,935-09-9271,Billy,D,Cox,,2022-07-12 00:00:00,,,,,,,,,,B,C
19271,19271,943-64-3550,Yasmin,B,Gutierrez,2002-09-06 00:00:00,,,,,,,,,,,U-Z,G
19272,19272,975-48-2496,Ralph,M,Hughes,1931-01-23 00:00:00,,,,,,,,,,,R,H


In [25]:
census_2030

Unnamed: 0,record_id,first_name,middle_initial,last_name,sex,race_ethnicity,age,date_of_birth,housing_type,relation_to_household_head,street_number,street_name,unit_number,city,state,zipcode,geokey,zip3,first_initial_cut,last_initial_cut
0,0,John,E,Davis,Male,Black,87,1942-06-29 00:00:00,Standard,Reference person,147-153,browning ave,,Anytown,US,00000,147-153 browning ave Anytown US 00000,000,J,D
1,1,Sharon,T,Plummer,Female,White,69,1960-10-10 00:00:00,Standard,Reference person,107,stallion st,,Anytown,US,00000,107 stallion st Anytown US 00000,000,S,P
2,2,Gail,K,Durand,Female,Multiracial or Other,77,1953-01-03 00:00:00,Standard,Reference person,2115,cannon dr,,Anytown,US,00000,2115 cannon dr Anytown US 00000,000,G,D
3,3,John,J,Bartlett,Male,White,81,1948-11-24 00:00:00,Standard,Reference person,19802,westminster dr,,Anytown,US,00000,19802 westminster dr Anytown US 00000,000,J,B
4,4,Linda,L,Bartlett,Female,White,81,1948-09-27 00:00:00,Standard,Opp-sex spouse,19802,westminster dr,,Anytown,US,00000,19802 westminster dr Anytown US 00000,000,L,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9828,9828,Noah,W,Randall,Male,White,5,2024-09-09 00:00:00,Standard,Biological child,17902,w pacific ave,,Anytown,US,00000,17902 w pacific ave Anytown US 00000,000,N,R
9829,9829,Xavifer,E,Hurlbut,Male,White,3,2026-07-01 00:00:00,Standard,Biological child,352,wesley chapel rd,,Anytown,US,00000,352 wesley chapel rd Anytown US 00000,000,U-Z,H
9830,9830,Teddy,R,Randall,Male,White,2,2028-03-27 00:00:00,Standard,Biological child,17902,w pacific ave,,Anytown,US,00000,17902 w pacific ave Anytown US 00000,000,T,R
9831,9831,Henry,T,Sawin,Male,White,7,2022-11-18 00:00:00,Standard,Reference person,2349,frost view dr,apartment 1,Anytown,US,00000,2349 frost view dr apartment 1 Anytown US 00000,000,H,S


In [26]:
# Save these variables; this means that if you restart the kernel, you don't need to run this first part of the notebook again.
%store reference_file census_2030 reference_file census_2030 reference_file_ground_truth census_2030_ground_truth

Stored 'reference_file' (DataFrame)
Stored 'census_2030' (DataFrame)
Stored 'reference_file' (DataFrame)
Stored 'census_2030' (DataFrame)
Stored 'reference_file_ground_truth' (Series)
Stored 'census_2030_ground_truth' (Series)


# Implement PVS-like matching with `splink`

## Estimate parameters (lambda, m, u) once for both modules

In reality these parameters are not estimated from the data.
It is unclear to me whether they are actually the same for both modules or even for different passes of the same module.

In [27]:
# Reload saved variables; you can start the notebook from here if you have *ever* run the part above.
%store -r reference_file census_2030

import pandas as pd, numpy as np

In [28]:
common_cols = [c for c in reference_file.columns if c in census_2030.columns]
common_cols

['record_id',
 'first_name',
 'middle_initial',
 'last_name',
 'date_of_birth',
 'street_number',
 'street_name',
 'unit_number',
 'city',
 'state',
 'zipcode',
 'geokey',
 'zip3',
 'first_initial_cut',
 'last_initial_cut']

In [29]:
def prep_table_for_splink(df):
    return (
        df[common_cols]
            .assign(date_of_birth=lambda x: x.date_of_birth.astype(str))
            .rename(columns={'record_id': 'unique_id'})
    )

tables_for_splink = [prep_table_for_splink(reference_file), prep_table_for_splink(census_2030)]

In [30]:
[len(t) for t in tables_for_splink]

[19274, 9833]

In [31]:
# estimate_probability_two_random_records_match did not seem to give me a reasonable estimate
# we estimate that around 90% of the census are present in the reference file
probability_two_random_records_match = (0.90 * len(census_2030)) / (len(reference_file) * len(census_2030))
probability_two_random_records_match

4.6695029573518736e-05

In [32]:
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb.duckdb_comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
)

settings = {
    "link_type": "link_only",
    "comparisons": [
        levenshtein_at_thresholds("first_name", 2, term_frequency_adjustments=True),
        exact_match("middle_initial"),
        levenshtein_at_thresholds("last_name", 2, term_frequency_adjustments=True),
        # For some reason, this makes everything crash!?
        # levenshtein_at_thresholds("date_of_birth", 1),
        exact_match("date_of_birth"),
        levenshtein_at_thresholds("geokey", 5),
    ],
    "probability_two_random_records_match": probability_two_random_records_match
}

linker = DuckDBLinker(
    tables_for_splink,
    settings,
    input_table_aliases=["reference_file", "census_2030"]
)

# NOTE: This is not reproducible!
linker.estimate_u_using_random_sampling(max_pairs=1e5)

blocking_rule_for_training = "l.first_name = r.first_name and l.last_name = r.last_name"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)

blocking_rule_for_training = "l.geokey = r.geokey"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - middle_initial (no m values are trained).
    - last_name (no m values are trained).
    - date_of_birth (no m values are trained).
    - geokey (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name and l.last_name = r.last_name

Parameter estimates will be made for the following comparison(s):
    - middle_initial
    - date_of_birth
    - geokey

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name
    - last_name

Iteration 1: Largest change in params was -0.341 in the m_probability of geokey, level `Exact match`
Iteration 2: Largest change in params was 0.0116 in probability_two_random_rec

<EMTrainingSession, blocking on l.geokey = r.geokey, deactivating comparisons geokey>

In [33]:
linker.match_weights_chart()

In [34]:
# NOTE: EM appears to be finding people in the same family instead of the same person!
# See first_name m probabilities.
# For now, I address this by almost always blocking on first name.
# More experimentation needed to get reasonable values here.
linker.m_u_parameters_chart()

In [35]:
splink_settings = linker._settings_obj.as_dict()

In [36]:
PROBABILITY_THRESHOLD = 0.85

In [37]:
# Save these variables; this means that if you restart the kernel, you don't need to run this first part of the notebook again.
%store splink_settings PROBABILITY_THRESHOLD

Stored 'splink_settings' (dict)
Stored 'PROBABILITY_THRESHOLD' (float)


## Implement matching passes

In [38]:
# Calculate this once to save time -- mapping from record_id to index of Census dataframe
census_index_of_ids = census_2030.reset_index().set_index('record_id')['index']

# TODO: Have this function output more charts and diagnostics
def pvs_matching_pass(blocking_cols):
    tables_for_splink = [prep_table_for_splink(reference_file), prep_table_for_splink(census_2030[census_2030.pik.isnull()])]

    blocking_rule_parts = [f"l.{col} = r.{col}" for col in blocking_cols]
    blocking_rule = " and ".join(blocking_rule_parts)
    linker = DuckDBLinker(
        tables_for_splink,
        {**splink_settings, **{
            "blocking_rules_to_generate_predictions": [blocking_rule],
        }},
        input_table_aliases=["reference_file", "census_2030"]
    )

    all_predictions = linker.predict().as_pandas_dataframe()
    all_combos = all_predictions.groupby(list(all_predictions.filter(like='gamma_').columns)).match_probability.agg(['mean', 'count']).sort_values('mean')

    potential_links = linker.predict(threshold_match_probability=PROBABILITY_THRESHOLD).as_pandas_dataframe()
    print(f'{len(potential_links)} links above threshold')

    # Post-processing: deal with multiple matches
    # According to the report, it is frequently the case that the post-processing rule doesn't assign *any* matches when there are multiple
    # So I'm replicating that feature with a very simple algorithm
    pik_pairs = potential_links.sort_values('match_weight', ascending=False).groupby(['unique_id_l']).first()
    runner_up_pairs = potential_links.sort_values('match_weight', ascending=False).groupby(['unique_id_l']).nth(2)
    pik_pairs = pik_pairs.join(runner_up_pairs.match_weight.rename('runner_up_match_weight'), how='left')
    pairs_to_keep = ~(pik_pairs.runner_up_match_weight > pik_pairs.match_weight + 0.5)
    print(f'{len(pairs_to_keep)} matches remain after dealing with multiple matches')
    pik_pairs = pik_pairs[pairs_to_keep]
    # Make pik_pairs index into the census_2030 dataframe
    pik_pairs = pik_pairs.set_index(pik_pairs.index.map(census_index_of_ids))
    
    census_2030.loc[pik_pairs.index, 'pik'] = pik_pairs.unique_id_r
    print(f'Matched {len(pik_pairs)} records; {census_2030.pik.isnull().mean():.2%} still unmatched')
    
    return all_combos, pik_pairs

# GeoSearch

> There are six passes through GeoSearch defined currently for an ACS PVS run. These passes use the first
  three digits of an address ZIP code (ZIP3) as a database “cutting” strategy...
>
> The GeoSearch matching
  variables include name and DOB, but also several variables derived from the Geokey (street name, house
  number, etc).

[(source)](https://www.norc.org/PDFs/May%202011%20Personal%20Validation%20and%20Entity%20Resolution%20Conference/PVS%20Assessment%20Report%20FINAL%20JULY%202011.pdf)

In [39]:
# Reload saved variables; you can start the notebook from here if you have *ever* run the part above.
%store -r

import pandas as pd, numpy as np
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb.duckdb_comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
)

In [40]:
def geosearch_pass(blocking_cols):
    return pvs_matching_pass(["zip3"] + blocking_cols)

## Pass 1: block on full name and entire address

In [41]:
census_2030['pik'] = np.nan

In [42]:
all_combos, pik_pairs = geosearch_pass(["first_name", "middle_initial", "last_name", "geokey"])

3855 links above threshold
3854 matches remain after dealing with multiple matches
Matched 3854 records; 60.81% still unmatched


### Look at diagnostics

In [43]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,2,0,2,0.999927,279
2,1,2,1,2,1.0,3576


In [44]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,gamma_last_name,date_of_birth_l,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,23.570036,1.0,census_2030,reference_file,7113,John,John,2,E,E,...,2,1942-06-29 00:00:00,1942-06-29 00:00:00,1,147-153 browning ave Anytown US 00000,147-153 browning ave Anytown US 00000,2,000,000,
1,32.206939,1.0,census_2030,reference_file,2320,Sharon,Sharon,2,T,T,...,2,1960-10-10 00:00:00,1960-10-10 00:00:00,1,107 stallion st Anytown US 00000,107 stallion st Anytown US 00000,2,000,000,
2,34.447947,1.0,census_2030,reference_file,6680,Gail,Gail,2,K,K,...,2,1953-01-03 00:00:00,1953-01-03 00:00:00,1,2115 cannon dr Anytown US 00000,2115 cannon dr Anytown US 00000,2,000,000,
3,27.017494,1.0,census_2030,reference_file,19095,John,John,2,J,J,...,2,1948-11-24 00:00:00,1948-11-24 00:00:00,1,19802 westminster dr Anytown US 00000,19802 westminster dr Anytown US 00000,2,000,000,
5,32.126019,1.0,census_2030,reference_file,16307,Shelley,Shelley,2,J,J,...,2,1973-03-02 00:00:00,1973-03-02 00:00:00,1,19802 westminster dr Anytown US 00000,19802 westminster dr Anytown US 00000,2,000,000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9815,29.846683,1.0,census_2030,reference_file,5638,Lisa,Lisa,2,A,A,...,2,1964-05-09 00:00:00,1964-05-09 00:00:00,1,1790 dogwood way apt 435 Anytown US 00000,1790 dogwood way apt 435 Anytown US 00000,2,000,000,
9816,29.541056,1.0,census_2030,reference_file,8838,Anthony,Anthony,2,R,R,...,2,1964-02-17 00:00:00,1964-02-17 00:00:00,1,1790 dogwood way apt 435 Anytown US 00000,1790 dogwood way apt 435 Anytown US 00000,2,000,000,
9822,32.024736,1.0,census_2030,reference_file,7818,Grace,Grace,2,R,R,...,2,1962-06-24 00:00:00,1962-06-24 00:00:00,1,2405 s 12th st Anytown US 00000,2405 s 12th st Anytown US 00000,2,000,000,
9823,31.737454,1.0,census_2030,reference_file,18624,Carmen,Carmen,2,J,J,...,2,1997-07-29 00:00:00,1997-07-29 00:00:00,1,17902 w pacific ave Anytown US 00000,17902 w pacific ave Anytown US 00000,2,000,000,


## Pass 2: Block on first name and entire address

In [45]:
all_combos, pik_pairs = geosearch_pass(["first_name", "geokey"])

273 links above threshold
273 matches remain after dealing with multiple matches
Matched 273 records; 58.03% still unmatched


### Look at diagnostics

In [46]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,0,0,2,0.005915,117
2,-1,0,0,2,0.010268,2
2,0,-1,0,2,0.077781,1
2,0,1,0,2,0.227558,1
2,1,0,0,2,0.285795,30
2,1,-1,0,2,0.957336,8
2,0,2,0,2,0.991077,3
2,1,1,0,2,0.992999,3
2,-1,2,0,2,0.999109,3
2,-1,0,1,2,0.999574,2


In [47]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,gamma_last_name,date_of_birth_l,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16,16.734658,0.999991,census_2030,reference_file,2732,Cathie,Cathie,2,K,K,...,0,1958-04-27 00:00:00,1958-04-27 00:00:00,1,1451 atlanta st Anytown US 00000,1451 atlanta st Anytown US 00000,2,000,000,
52,20.197514,0.999999,census_2030,reference_file,17136,Laura,Laura,2,D,D,...,1,1968-09-19 00:00:00,1968-09-19 00:00:00,1,10834 glenorchy place Anytown US 00000,10834 glenorchy place Anytown US 00000,2,000,000,
77,23.708476,1.000000,census_2030,reference_file,11580,Vernon,Vernon,2,B,B,...,1,1963-05-15 00:00:00,1963-05-15 00:00:00,1,18884 american pr pl Anytown US 00000,18884 american pr pl Anytown US 00000,2,000,000,
89,21.275516,1.000000,census_2030,reference_file,395,Randy,Randy,2,J,J,...,1,1978-06-09 00:00:00,1978-06-09 00:00:00,1,10133 ridgeline cir Anytown US 00000,10133 ridgeline cir Anytown US 00000,2,000,000,
101,7.578586,0.994796,census_2030,reference_file,2013,Ralph,Ralph,2,M,M,...,1,1984-62-01 08:00:00,,0,4925 citation avenue Anytown US 00000,4925 citation avenue Anytown US 00000,2,000,000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9773,14.319620,0.999951,census_2030,reference_file,1332,Douglas,Douglas,2,A,A,...,0,1990-08-13 00:00:00,1990-08-13 00:00:00,1,1416 nrte brazee street Anytown US 00000,1416 nrte brazee street Anytown US 00000,2,000,000,
9774,21.340218,1.000000,census_2030,reference_file,17588,Jaxson,Jaxson,2,O,O,...,-1,2012-03-23 00:00:00,2012-03-23 00:00:00,1,1416 nrte brazee street Anytown US 00000,1416 nrte brazee street Anytown US 00000,2,000,000,
9804,22.925180,1.000000,census_2030,reference_file,9360,Isiah,Isiah,2,A,A,...,-1,2003-09-05 00:00:00,2003-09-05 00:00:00,1,9 151st ln n Anytown US 00000,9 151st ln n Anytown US 00000,2,000,000,
9806,14.734658,0.999963,census_2030,reference_file,12025,Tiffany,Tiffany,2,A,A,...,0,1986-05-27 00:00:00,1986-05-27 00:00:00,1,3785 sw 151st st Anytown US 00000,3785 sw 151st st Anytown US 00000,2,000,000,


## Pass 3: Block on full name and street address

In [48]:
all_combos, pik_pairs = geosearch_pass(["first_name", "middle_initial", "last_name", "street_number", "street_name"])

266 links above threshold
266 matches remain after dealing with multiple matches
Matched 266 records; 55.32% still unmatched


### Look at diagnostics

In [49]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,2,0,-1,0.984081,6
2,1,2,0,1,0.999778,9
2,1,2,1,0,0.999982,3
2,1,2,1,-1,0.999999,80
2,1,2,1,1,1.0,168


In [50]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,street_name_l,street_name_r,street_number_l,street_number_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,22.470143,1.000000,census_2030,reference_file,11339,Terrance,Terrance,2,R,R,...,,19802 westminster dr Anytown US 00000,-1,000,000,westminster dr,westminster dr,19802,19802,
31,30.444141,1.000000,census_2030,reference_file,18429,Randy,Randy,2,M,M,...,19461 fire twr rd Anytown US 00000,19461 fire twr rd Anytown AK 00000,1,000,000,fire twr rd,fire twr rd,19461,19461,
37,30.074192,1.000000,census_2030,reference_file,1659,Gerard,Gerard,2,M,M,...,9 tuttletown r Antgown US 00000,9 tuttletown r Anytown US 00000,1,000,000,tuttletown r,tuttletown r,9,9,
56,27.716640,1.000000,census_2030,reference_file,5045,William,William,2,C,C,...,940 sw 45th ave Anytown US 00000,940 sw 45th ave Anytown TN 00000,1,000,000,sw 45th ave,sw 45th ave,940,940,
93,24.901645,1.000000,census_2030,reference_file,14939,Mary,Mary,2,D,D,...,4260 townhall st apt number 348 Anytown US 00000,4260 townhall st apt number 348 Znytown US 00000,1,000,000,townhall st,townhall st,4260,4260,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9644,29.363221,1.000000,census_2030,reference_file,5173,Rachel,Rachel,2,J,J,...,7927 tilden street Anytown US 00000,7927 tilden street Anytown US 00010,1,000,000,tilden street,tilden street,7927,7927,
9679,14.796711,0.999965,census_2030,reference_file,15587,Rachel,Rachel,2,I,I,...,837 oak lawn dr Anytown US 00000,837 oak lawn dr unit e 8th floor Anytown US 00000,0,000,000,oak lawn dr,oak lawn dr,837,837,
9694,23.588787,1.000000,census_2030,reference_file,11266,Brent,Brent,2,R,R,...,,8210 carolwood dr Anytown US 00000,-1,000,000,carolwood dr,carolwood dr,8210,8210,
9715,3.554156,0.921547,census_2030,reference_file,8602,Rachel,Rachel,2,I,I,...,,7503 mourning dove cir Anytown US 00000,-1,000,000,mourning dove cir,mourning dove cir,7503,7503,


## Pass 4: Block on first name and street address

In [51]:
all_combos, pik_pairs = geosearch_pass(["first_name", "street_number", "street_name"])

23 links above threshold
23 matches remain after dealing with multiple matches
Matched 23 records; 55.09% still unmatched


### Look at diagnostics

In [52]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,0,0,-1,6e-06,6
2,0,0,0,1,0.001412,4
2,0,0,0,2,0.006019,100
2,-1,0,0,2,0.011481,1
2,0,1,0,2,0.228992,1
2,1,0,0,2,0.247281,25
2,1,0,1,-1,0.956383,1
2,1,0,1,1,0.99929,4
2,-1,-1,1,1,0.999793,1
2,1,-1,1,-1,0.999807,1


In [53]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,street_name_l,street_name_r,street_number_l,street_number_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
260,24.853747,1.0,census_2030,reference_file,6521,Baylee,Baylee,2,X,C,...,1702 meisner rd Anytown US 00000,1702 meisner rd Anyrown US 00000,1,0,0,meisner rd,meisner rd,1702,1702,
1011,18.311111,0.999997,census_2030,reference_file,10165,Kathy,Kathy,2,D,D,...,19521 w main st Anytlwn US 00000,19521 w main st Anytown US 00000,1,0,0,w main st,w main st,19521,19521,
1028,4.454611,0.956383,census_2030,reference_file,9142,Blake,Blake,2,A,A,...,,1874 fiesta ct Anytown US 00000,-1,0,0,fiesta ct,fiesta ct,1874,1874,
2273,15.512744,0.999979,census_2030,reference_file,371,Isabella,Isabella,2,S,,...,,2405 cth gg Anytown US 00000,-1,0,0,cth gg,cth gg,2405,2405,
2424,9.318969,0.998437,census_2030,reference_file,674,Michael,Michael,2,J,J,...,14410 avon st Anytown IL 00000,14410 avon st Anytown US 00000,1,0,0,avon st,avon st,14410,14410,
2455,27.300712,1.0,census_2030,reference_file,5524,Deborah,Deborah,2,,K,...,41255 holland road Anytlwn US 00000,41255 holland road Anytown US 00000,1,0,0,holland road,holland road,41255,41255,
3379,17.029753,0.999993,census_2030,reference_file,10063,Michael,Michael,2,D,D,...,155 phillips raod Anytown VT 00000,155 phillips raod Anytown US 00000,1,0,0,phillips raod,phillips raod,155,155,
3474,13.590073,0.999919,census_2030,reference_file,11347,Rita,Rita,2,T,T,...,814 van buren dr Anytown ID 00000,814 van buren dr Anytown US 00000,1,0,0,van buren dr,van buren dr,814,814,
4247,12.384233,0.999813,census_2030,reference_file,19102,Anthony,Anthony,2,C,,...,,791 northwest saltzman road Anytown US 00000,-1,0,0,northwest saltzman road,northwest saltzman road,791,791,
4325,29.363721,1.0,census_2030,reference_file,14432,Rachael,Rachael,2,,C,...,44 rosedale dr Anytown MI 00000,44 rosedale dr Anytown US 00000,1,0,0,rosedale dr,rosedale dr,44,44,


## Pass 5: Block on first and last name

In [54]:
all_combos, pik_pairs = geosearch_pass(["first_name", "last_name"])

3208 links above threshold
3197 matches remain after dealing with multiple matches
Matched 3197 records; 22.58% still unmatched


### Look at diagnostics

In [55]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,2,0,0,0.081995,78
2,0,2,0,-1,0.255074,24
2,-1,2,0,0,0.475732,7
2,-1,2,0,-1,0.538467,5
2,1,2,0,0,0.899852,146
2,1,2,0,-1,0.969289,64
2,1,2,0,1,0.999853,7
2,-1,2,1,-1,0.99994,16
2,-1,2,0,1,0.999949,1
2,-1,2,1,0,0.99995,49


In [56]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,gamma_last_name,date_of_birth_l,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,19.751623,0.999999,census_2030,reference_file,17582,Linda,Linda,2,L,L,...,2,1948-09-27 00:00:00,1948-09-27 00:00:00,1,19802 westminster dr Anytown US 00000,406 nrth pittsburgh avenue Anytown US 00000,0,000,000,
11,21.813967,1.000000,census_2030,reference_file,8840,Gloria,Gloria,2,A,A,...,2,1973-07-23 00:00:00,1973-07-23 00:00:00,1,2265 erik paul dr Anytown US 00000,,-1,000,000,
12,17.588725,0.999995,census_2030,reference_file,19015,Robert,Robert,2,J,J,...,2,1981-10-25 00:00:00,1981-10-25 00:00:00,1,610 105th ave se Anytown US 00000,1702 meisner rd Anytown US 00000,0,000,000,
19,19.566358,0.999999,census_2030,reference_file,16351,Dylan,Dylan,2,A,A,...,2,2016-09-22 00:00:00,2016-09-22 00:00:00,1,9635 lambert st Anytown US 00000,5344 aberfoyle place nw Anytown US 00000,0,000,000,
24,22.781370,1.000000,census_2030,reference_file,7654,Seth,Seth,2,V,V,...,2,2004-09-28 00:00:00,2004-09-28 00:00:00,1,1728 burnt oak ln Anytown US 00000,5014 andover ct Anytown US 00000,0,000,000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9799,22.266797,1.000000,census_2030,reference_file,6768,Ivan,Ivan,2,W,W,...,2,2000-05-05 00:00:00,2000-05-05 00:00:00,1,4304 high range road Anytown US 00000,9 wing st Anytown US 00000,0,000,000,
9801,19.833838,0.999999,census_2030,reference_file,4565,Sofia,Sofia,2,A,A,...,2,2014-06-03 00:00:00,2014-06-03 00:00:00,1,427 forest hill dr Anytown US 00000,30088 e guadalupe rd Anytown US 00000,0,000,000,
9802,26.099369,1.000000,census_2030,reference_file,15278,Zaira,Zaira,2,B,B,...,2,2016-01-30 00:00:00,2016-01-30 00:00:00,1,427 forest hill dr Anytown US 00000,,-1,000,000,
9810,15.460946,0.999978,census_2030,reference_file,1581,Stephen,Stephen,2,J,J,...,2,1983-12-29 00:00:00,1983-12-29 00:00:00,1,16506 poplar view court Anytown US 00000,703 martin rd nw Anytown US 00000,0,000,000,


In [57]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,2,0,0,0.081995,78
2,0,2,0,-1,0.255074,24
2,-1,2,0,0,0.475732,7
2,-1,2,0,-1,0.538467,5
2,1,2,0,0,0.899852,146
2,1,2,0,-1,0.969289,64
2,1,2,0,1,0.999853,7
2,-1,2,1,-1,0.99994,16
2,-1,2,0,1,0.999949,1
2,-1,2,1,0,0.99995,49


In [58]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,gamma_last_name,date_of_birth_l,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,zip3_l,zip3_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,19.751623,0.999999,census_2030,reference_file,17582,Linda,Linda,2,L,L,...,2,1948-09-27 00:00:00,1948-09-27 00:00:00,1,19802 westminster dr Anytown US 00000,406 nrth pittsburgh avenue Anytown US 00000,0,000,000,
11,21.813967,1.000000,census_2030,reference_file,8840,Gloria,Gloria,2,A,A,...,2,1973-07-23 00:00:00,1973-07-23 00:00:00,1,2265 erik paul dr Anytown US 00000,,-1,000,000,
12,17.588725,0.999995,census_2030,reference_file,19015,Robert,Robert,2,J,J,...,2,1981-10-25 00:00:00,1981-10-25 00:00:00,1,610 105th ave se Anytown US 00000,1702 meisner rd Anytown US 00000,0,000,000,
19,19.566358,0.999999,census_2030,reference_file,16351,Dylan,Dylan,2,A,A,...,2,2016-09-22 00:00:00,2016-09-22 00:00:00,1,9635 lambert st Anytown US 00000,5344 aberfoyle place nw Anytown US 00000,0,000,000,
24,22.781370,1.000000,census_2030,reference_file,7654,Seth,Seth,2,V,V,...,2,2004-09-28 00:00:00,2004-09-28 00:00:00,1,1728 burnt oak ln Anytown US 00000,5014 andover ct Anytown US 00000,0,000,000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9799,22.266797,1.000000,census_2030,reference_file,6768,Ivan,Ivan,2,W,W,...,2,2000-05-05 00:00:00,2000-05-05 00:00:00,1,4304 high range road Anytown US 00000,9 wing st Anytown US 00000,0,000,000,
9801,19.833838,0.999999,census_2030,reference_file,4565,Sofia,Sofia,2,A,A,...,2,2014-06-03 00:00:00,2014-06-03 00:00:00,1,427 forest hill dr Anytown US 00000,30088 e guadalupe rd Anytown US 00000,0,000,000,
9802,26.099369,1.000000,census_2030,reference_file,15278,Zaira,Zaira,2,B,B,...,2,2016-01-30 00:00:00,2016-01-30 00:00:00,1,427 forest hill dr Anytown US 00000,,-1,000,000,
9810,15.460946,0.999978,census_2030,reference_file,1581,Stephen,Stephen,2,J,J,...,2,1983-12-29 00:00:00,1983-12-29 00:00:00,1,16506 poplar view court Anytown US 00000,703 martin rd nw Anytown US 00000,0,000,000,


# NameSearch

>    The NameSearch module, by contrast, does not use any geographic variables for matching. Only the
>    Name and DOB are used to match. There are four NameSearch passes defined for the ACS. All passes
>    use the first characters of the First and Last names to define cuts...

In [59]:
def namesearch_pass(blocking_cols):
    return pvs_matching_pass(["first_initial_cut", "last_initial_cut"] + blocking_cols)

## Pass 1: Block on full name and DOB

In [60]:
all_combos, pik_pairs = namesearch_pass(["first_name", "middle_initial", "last_name", "date_of_birth"])

956 links above threshold
956 matches remain after dealing with multiple matches
Matched 956 records; 12.85% still unmatched


### Look at diagnostics

In [61]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,2,1,0,0.999989,26
2,1,2,1,-1,0.999998,886
2,1,2,1,1,1.0,44


In [62]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,last_initial_cut_l,last_initial_cut_r,first_initial_cut_l,first_initial_cut_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23,18.582920,0.999997,census_2030,reference_file,6824,Delilah,Delilah,2,A,A,...,2022-07-07 00:00:00,1,5 n goldwater dr Anytown US 00000,,-1,T,T,D,D,
27,24.820456,1.000000,census_2030,reference_file,11028,Irie,Irie,2,S,S,...,2018-06-13 00:00:00,1,4100 corano ln Anytown US 00000,,-1,L,L,I,I,
29,18.701515,0.999998,census_2030,reference_file,956,Noah,Noah,2,J,J,...,2023-03-23 00:00:00,1,4100 corano ln Anytown US 00000,,-1,L,L,N,N,
48,22.302608,1.000000,census_2030,reference_file,19226,Yaretzi,Yaretzi,2,E,E,...,2022-07-26 00:00:00,1,,,-1,U-Z,U-Z,U-Z,U-Z,
49,17.694925,0.999995,census_2030,reference_file,12327,Ava,Ava,2,M,M,...,2028-03-15 00:00:00,1,21980 meandering wy Anytown US 00000,,-1,N,N,A-or-blank,A-or-blank,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9821,20.155767,0.999999,census_2030,reference_file,7724,Zoe,Zoe,2,B,B,...,2028-01-27 00:00:00,1,1303 charles street Anytown US 00000,,-1,M,M,U-Z,U-Z,
9826,21.717645,1.000000,census_2030,reference_file,5580,Phillip,Phillip,2,R,R,...,2022-04-02 00:00:00,1,352 wesley chapel rd Anytown US 00000,,-1,H,H,P,P,
9828,19.353592,0.999999,census_2030,reference_file,16542,Noah,Noah,2,W,W,...,2024-09-09 00:00:00,1,17902 w pacific ave Anytown US 00000,,-1,R,R,N,N,
9830,22.224605,1.000000,census_2030,reference_file,15080,Teddy,Teddy,2,R,R,...,2028-03-27 00:00:00,1,17902 w pacific ave Anytown US 00000,,-1,R,R,T,T,


## Pass 2: Block on first name and DOB

In [63]:
all_combos, pik_pairs = namesearch_pass(["first_name", "date_of_birth"])

63 links above threshold
63 matches remain after dealing with multiple matches
Matched 63 records; 12.21% still unmatched


### Look at diagnostics

In [64]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,0,1,-1,0.848455,2
2,1,0,1,0,0.874624,2
2,0,1,1,0,0.939044,1
2,1,-1,1,0,0.980869,1
2,1,1,1,0,0.998728,20
2,1,-1,1,-1,0.999037,1
2,1,1,1,-1,0.999706,12
2,0,2,1,-1,0.999971,1
2,-1,2,1,-1,0.999987,24
2,1,1,1,1,0.999998,1


In [65]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,last_initial_cut_l,last_initial_cut_r,first_initial_cut_l,first_initial_cut_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
975,12.471779,0.999824,census_2030,reference_file,18421,Sue,Sue,2,R,R,...,1952-10-20 00:00:00,1,1043 trotter ln Anytown US 00000,1702 meisner rd Anytown US 00000,0,U-Z,U-Z,S,S,
1002,11.886817,0.999736,census_2030,reference_file,8288,Hayden,Hayden,2,D,D,...,2019-06-15 00:00:00,1,4371 southeast 120th avnu Anytown US 00000,1549 broadway st Anytown US 00000,0,G,G,H,H,
1132,13.812498,0.999930,census_2030,reference_file,8290,Lindsay,Lindsay,2,C,C,...,1984-10-16 00:00:00,1,5300 tea rose ct Anytown US 00000,,-1,F,F,L,L,
1158,7.613798,0.994921,census_2030,reference_file,17330,John,John,2,C,C,...,1989-01-29 00:00:00,1,2020 w arthur av Anytown US 00000,862 tully ave nw Anytown US 00000,0,B,B,J,J,
1232,16.390965,0.999988,census_2030,reference_file,18471,Jaylin,Jaylin,2,,C,...,2027-01-27 00:00:00,1,1748 braeburn ndr Anytown US 00000,,-1,N,N,J,J,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9335,9.449412,0.998572,census_2030,reference_file,13884,Matthew,Matthew,2,B,B,...,2010-09-03 00:00:00,1,5540 indian vw tr Anytown US 00000,75 wadsworth byp Anytown US 00000,0,O,O,M,M,
9680,9.114227,0.998199,census_2030,reference_file,3842,Andrew,Andrew,2,C,C,...,1990-08-26 00:00:00,1,2379 schenck d Anytown US 00000,1125 36th avenue southwest Anytown US 00000,0,R,R,A-or-blank,A-or-blank,
9689,17.189331,0.999993,census_2030,reference_file,10790,Nicole,Nicole,2,S,,...,1986-08-26 00:00:00,1,13175 s 16th st Anytown US 00000,,-1,D,D,N,N,
9765,9.743859,0.998835,census_2030,reference_file,8129,Zoe,Zoe,2,C,C,...,2021-08-29 00:00:00,1,2613 granite stre Anytown US 00000,7514 tern rd apt 4029 4th floor Anytown US 00000,0,G,G,U-Z,U-Z,


## Pass 3: Block on last name and DOB

In [66]:
all_combos, pik_pairs = namesearch_pass(["last_name", "date_of_birth"])

68 links above threshold
68 matches remain after dealing with multiple matches
Matched 68 records; 11.52% still unmatched


### Look at diagnostics

In [67]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,2,1,-1,0.989489,1
0,1,2,1,0,0.992421,1
-1,0,2,1,-1,0.995077,1
-1,1,2,1,0,0.998452,5
0,-1,2,1,-1,0.998817,1
1,-1,2,1,-1,0.999062,1
0,1,2,1,-1,0.999108,3
-1,1,2,1,-1,0.999794,4
1,1,2,1,0,0.999808,10
1,1,2,1,-1,0.999946,4


In [68]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,last_initial_cut_l,last_initial_cut_r,first_initial_cut_l,first_initial_cut_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
273,14.057366,0.999941,census_2030,reference_file,10992,Hollu,Holly,1,R,R,...,2010-05-06 00:00:00,1,,7905 heritage dr Anytown US 00000,-1,C,C,H,H,
319,20.687952,0.999999,census_2030,reference_file,6304,Nahalia,Natalia,1,R,R,...,2010-04-22 00:00:00,1,5035 missouri ave Anytown US 00000,5035 missouri ave Anytown US 00000,2,R,R,N,N,
321,22.803429,1.000000,census_2030,reference_file,10607,Aidan,Akdqn,1,J,J,...,2007-01-19 00:00:00,1,1702 meisner rd Anytown US 00000,1702 meisner rd Anytown US 00000,2,T,T,A-or-blank,A-or-blank,
488,10.160076,0.999127,census_2030,reference_file,8931,Allen,,-1,J,J,...,1974-09-29 00:00:00,1,1702 meisner rd Anytown US 00000,113 fielder way Anytown US 00000,0,M,M,A-or-blank,A-or-blank,
497,12.918648,0.999871,census_2030,reference_file,1455,,,-1,J,J,...,2027-05-03 00:00:00,1,1702 meisner rd Anytown US 00000,,-1,B,B,A-or-blank,A-or-blank,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9639,22.581036,1.000000,census_2030,reference_file,15487,Christopher,Christophef,1,E,E,...,1974-05-22 00:00:00,1,465 wst avenue Anytown US 00000,465 wst avenue Anytown US 00000,2,C,C,C,C,
9663,23.803429,1.000000,census_2030,reference_file,9561,Patricia,Patridia,1,D,D,...,1951-03-11 00:00:00,1,258 sheffield drive fl 5 Anytown US 00000,258 sheffield drive fl 5 Anytown US 00000,2,M,M,P,P,
9741,12.131685,0.999777,census_2030,reference_file,1239,Nichkolas,Nicholas,1,J,J,...,2002-04-28 00:00:00,1,2215 lutheran street Anytown US 00000,113 south college avenue Anytown US 00000,0,U-Z,U-Z,N,N,
9829,14.057366,0.999941,census_2030,reference_file,8470,Xavifer,Xavier,1,E,E,...,2026-07-01 00:00:00,1,352 wesley chapel rd Anytown US 00000,,-1,H,H,U-Z,U-Z,


## Pass 4: Block on DOB

In [69]:
all_combos, pik_pairs = namesearch_pass(["date_of_birth"])

0 links above threshold
0 matches remain after dealing with multiple matches
Matched 0 records; 11.52% still unmatched


### Look at diagnostics

In [70]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,1,0,0.00031,1
-1,0,0,1,0,0.000591,1
0,0,0,1,-1,0.000883,1
-1,0,0,1,-1,0.001683,1
2,1,0,1,-1,0.724181,1
2,1,0,1,0,0.779481,1


In [71]:
pik_pairs

Unnamed: 0_level_0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_r,first_name_l,first_name_r,gamma_first_name,middle_initial_l,middle_initial_r,...,date_of_birth_r,gamma_date_of_birth,geokey_l,geokey_r,gamma_geokey,last_initial_cut_l,last_initial_cut_r,first_initial_cut_l,first_initial_cut_r,runner_up_match_weight
unique_id_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# Resulting PIKs

In [72]:
census_2030

Unnamed: 0,record_id,first_name,middle_initial,last_name,sex,race_ethnicity,age,date_of_birth,housing_type,relation_to_household_head,...,street_name,unit_number,city,state,zipcode,geokey,zip3,first_initial_cut,last_initial_cut,pik
0,0,John,E,Davis,Male,Black,87,1942-06-29 00:00:00,Standard,Reference person,...,browning ave,,Anytown,US,00000,147-153 browning ave Anytown US 00000,000,J,D,7113.0
1,1,Sharon,T,Plummer,Female,White,69,1960-10-10 00:00:00,Standard,Reference person,...,stallion st,,Anytown,US,00000,107 stallion st Anytown US 00000,000,S,P,2320.0
2,2,Gail,K,Durand,Female,Multiracial or Other,77,1953-01-03 00:00:00,Standard,Reference person,...,cannon dr,,Anytown,US,00000,2115 cannon dr Anytown US 00000,000,G,D,6680.0
3,3,John,J,Bartlett,Male,White,81,1948-11-24 00:00:00,Standard,Reference person,...,westminster dr,,Anytown,US,00000,19802 westminster dr Anytown US 00000,000,J,B,19095.0
4,4,Linda,L,Bartlett,Female,White,81,1948-09-27 00:00:00,Standard,Opp-sex spouse,...,westminster dr,,Anytown,US,00000,19802 westminster dr Anytown US 00000,000,L,B,17582.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9828,9828,Noah,W,Randall,Male,White,5,2024-09-09 00:00:00,Standard,Biological child,...,w pacific ave,,Anytown,US,00000,17902 w pacific ave Anytown US 00000,000,N,R,16542.0
9829,9829,Xavifer,E,Hurlbut,Male,White,3,2026-07-01 00:00:00,Standard,Biological child,...,wesley chapel rd,,Anytown,US,00000,352 wesley chapel rd Anytown US 00000,000,U-Z,H,8470.0
9830,9830,Teddy,R,Randall,Male,White,2,2028-03-27 00:00:00,Standard,Biological child,...,w pacific ave,,Anytown,US,00000,17902 w pacific ave Anytown US 00000,000,T,R,15080.0
9831,9831,Henry,T,Sawin,Male,White,7,2022-11-18 00:00:00,Standard,Reference person,...,frost view dr,apartment 1,Anytown,US,00000,2349 frost view dr apartment 1 Anytown US 00000,000,H,S,18304.0


In [73]:
census_2030.pik.notnull().mean()

0.8847757551103427

In [74]:
# Not possible to be PIKed
(~census_2030_ground_truth.isin(reference_file_ground_truth)).mean()

0.04952710261364792

In [75]:
census_2030.pik.notnull().mean() / census_2030_ground_truth.isin(reference_file_ground_truth).mean()

0.9308795206505456

In [76]:
# Multiple Census rows assigned the same PIK, indicating duplicates in Census
census_2030.pik.value_counts().value_counts()

count
1    8686
2       7
Name: count, dtype: int64

In [77]:
duplicate_piks = census_2030.pik.value_counts()[census_2030.pik.value_counts() > 1].index

In [78]:
census_2030[census_2030.pik.isin(duplicate_piks)].sort_values('pik')

Unnamed: 0,record_id,first_name,middle_initial,last_name,sex,race_ethnicity,age,date_of_birth,housing_type,relation_to_household_head,...,street_name,unit_number,city,state,zipcode,geokey,zip3,first_initial_cut,last_initial_cut,pik
1015,1015,William,M,Carper,Male,White,14,2015-12-13 00:00:00,Standard,Biological child,...,n prospect av,,Anytown,US,0,18548 n prospect av Anytown US 00000,0,U-Z,C,2357.0
1016,1016,William,M,Carper,Male,White,13,2016-10-13 00:00:00,Standard,Biological child,...,n prospect av,,Anytown,US,0,18548 n prospect av Anytown US 00000,0,U-Z,C,2357.0
3183,3183,Thomas,I,Mcclellon,Male,White,47,1982-07-08 00:00:00,Standard,Reference person,...,e 7th st,,Anytown,US,0,8722 e 7th st Anytown US 00000,0,T,M,3185.0
3195,3195,Thomas,G,Mcclellon,Male,White,3,2027-02-12 00:00:00,Standard,Grandchild,...,e 7th st,,Anytown,US,0,8722 e 7th st Anytown US 00000,0,T,M,3185.0
2181,2181,Yana,E,Ortiz,Female,Latino,13,2016-08-16 00:00:00,Standard,Biological child,...,harvestfish ct,,Anytown,,0,,0,U-Z,O,7769.0
8639,8639,Yana,E,Ortiz,Female,Latino,12,2017-08-01 00:00:00,Standard,Other nonrelative,...,sw nazaneen dr,,Anytown,US,0,1917 sw nazaneen dr Anytown US 00000,0,U-Z,O,7769.0
486,486,Liam,C,,Male,White,7,2023-02-07 00:00:00,Carceral,Institutionalized GQ pop,...,meisner rd,,Anytown,US,0,1702 meisner rd Anytown US 00000,0,L,A-or-blank,8593.0
2515,2515,Liam,C,Ramey,Male,White,8,2022-02-19 00:00:00,Standard,Other nonrelative,...,pierce r,,Anytown,VA,0,2205 pierce r Anytown VA 00000,0,L,R,8593.0
4957,4957,Jesus,N,Brown,Male,Latino,19,2011-02-08 00:00:00,Standard,Other relative,...,la porte st,,,US,0,,0,J,B,9971.0
8725,8725,Jesus,N,Brown,Male,Multiracial or Other,17,2012-04-20 00:00:00,Standard,Sibling,...,clark ave,,Anytown,US,0,413 clark ave Anytown US 00000,0,J,B,9971.0


## PIK accuracy

In [79]:
pik_simulant_id = census_2030.pik.map(reference_file_ground_truth)
pik_simulant_id

0         0_923
1        0_2641
2        0_6176
3       0_13972
4       0_13973
         ...   
9828    0_21929
9829    0_22511
9830    0_23096
9831    0_21280
9832    0_22439
Name: pik, Length: 9833, dtype: object

In [80]:
(pik_simulant_id[pik_simulant_id.notnull()] == census_2030_ground_truth[pik_simulant_id.notnull()]).mean()

0.9985057471264368

In [81]:
errors = census_2030[census_2030.pik.notnull() & (pik_simulant_id != census_2030_ground_truth)]
confused_for = reference_file.set_index('record_id').loc[errors.pik].reset_index().set_index(errors.index)
errors[common_cols].compare(confused_for[common_cols], keep_shape=True, keep_equal=True)

Unnamed: 0_level_0,record_id,record_id,first_name,first_name,middle_initial,middle_initial,last_name,last_name,date_of_birth,date_of_birth,...,zipcode,zipcode,geokey,geokey,zip3,zip3,first_initial_cut,first_initial_cut,last_initial_cut,last_initial_cut
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,...,self,other,self,other,self,other,self,other,self,other
230,230,5366,Alexis,Alexis,H,H,Buchholz Robbins,Buchholz Robbins,2006-09-15 00:00:00,2005-06-09 00:00:00,...,0,0.0,1702 meisner rd Anytown US 00000,100 sweetbriar plc Anytown US 00000,0,0.0,A-or-blank,A-or-blank,B,B
486,486,8593,Liam,Liam,C,C,,Ramey,2023-02-07 00:00:00,2022-02-19 00:00:00,...,0,0.0,1702 meisner rd Anytown US 00000,1702 meisner rd Anytown US 00000,0,0.0,L,L,A-or-blank,R
666,666,8285,Michael,Michael,D,L,Rahman,Rahman,1963-10-52 00:01:00,1999-03-28 00:00:00,...,0,0.0,69411 n holly rd Anytown US 00000,69411 n holly rd Anytown US 00000,0,0.0,M,M,R,R
1016,1016,2357,William,William,M,M,Carper,Carper,2016-10-13 00:00:00,2015-12-13 00:00:00,...,0,0.0,18548 n prospect av Anytown US 00000,18548 n prospect av Anytown US 00000,0,0.0,U-Z,U-Z,C,C
1155,1155,9528,Stephanie,Stephanie,C,H,Chandler,Chandler,2024-12-25 00:00:00,2004-07-23 00:00:00,...,0,0.0,402 hodges blvd Anytown US 00000,402 hodges blvd Anytown US 00000,0,0.0,S,S,C,C
2181,2181,7769,Yana,Yana,E,E,Ortiz,Ortiz,2016-08-16 00:00:00,2017-08-01 00:00:00,...,0,0.0,,6449 pierce st Anytown US 00000,0,0.0,U-Z,U-Z,O,O
3114,3114,6234,Everleigh,Everleigh,L,L,Lizotte,Lizotte,2029-08-28 00:00:00,2020-09-16 00:00:00,...,0,0.0,325 3rd st Anytown US 00000,2820 fairview rd Anytown US 00000,0,0.0,E,E,L,L
3195,3195,3185,Thomas,Thomas,G,I,Mcclellon,Mcclellon,2027-02-12 00:00:00,1982-07-08 00:00:00,...,0,0.0,8722 e 7th st Anytown US 00000,8722 e 7th st Anytown US 00000,0,0.0,T,T,M,M
3398,3398,17083,W,Yousef,F,C,Arvizu,Arvizu,2021-08-23 00:00:00,2021-08-23 00:00:00,...,0,0.0,,1702 meisner rd Anytown US 00000,0,0.0,U-Z,U-Z,A-or-blank,A-or-blank
4957,4957,9971,Jesus,Jesus,N,N,Brown,Brown,2011-02-08 00:00:00,2012-04-20 00:00:00,...,0,0.0,,,0,0.0,J,J,B,B
