# Simulated PIK statistics

Here we inspect the accuracy and characteristics of the PIKs assigned,
leveraging our knowledge of ground truth from pseudopeople.

The ground-truth comparison would not be possible with the real PVS, but
Layne, Wagner, and Rothhaas did something similar: they redacted the SSN from real records,
sent those records through PVS without the SSN, and then used the true SSN
as ground truth.
The health care records they used are probably quite different from a CUF,
but they found a **very** good overall PIK accuracy (see cell below).

In [1]:
# Query planning is now on by default, but it has some rough edges.
# See https://github.com/dask/dask/issues/10995 for general discussion
# and https://github.com/dask/dask-expr/issues/1060 for the particular
# issue I ran into.
# NOTE(review): this presumably has to run before anything imports
# dask.dataframe for the setting to take effect, which would explain why it
# sits in its own cell ahead of the other imports -- confirm against the Dask docs.
import dask
# Turn the dataframe query planner off for this session.
dask.config.set({"dataframe.query-planning": False})

<dask.config.set at 0x7f2a7ee76950>

In [2]:
import datetime, os, time

from vivarium_research_prl import distributed_compute, utils
from IPython.display import display

In [3]:
# Record in the saved output when this cell was executed.
print(datetime.datetime.now())

2024-05-29 21:20:10.822567


In [4]:
# DO NOT EDIT if this notebook is not called ground_truth_accuracy.ipynb!
# This notebook is designed to be run with papermill; this cell is tagged 'parameters'
# Everything assigned here is a default that a papermill run may override.

# Which dataset to analyze, and where the simulated inputs and the case-study
# outputs live; a later cell appends data_to_use to both directories.
data_to_use = 'small_sample'
simulated_data_output_dir = 'output/generate_simulated_data'
case_study_output_dir = 'output'

compute_engine = 'pandas'
# The settings below only matter when using a distributed compute engine
compute_engine_num_workers = 3
compute_engine_cpus_per_worker = 2
compute_engine_threads_per_worker = 1
compute_engine_memory_per_worker = "1GB"
queue = None
account = None
# NOTE: This is, as Dask requests, a directory local to the compute node.
# But IHME's cluster doesn't support this very well -- it can be small-ish,
# full of stuff from other users, etc.
# NOTE(review): os.environ['USER'] raises KeyError when USER is not set in the
# environment (e.g. some containers) -- confirm that cannot happen where this runs.
compute_engine_local_directory = f"/tmp/{os.environ['USER']}_{int(time.time())}_person_linkage_case_study"
# NOTE(review): this is derived from the defaults assigned in this cell, so
# overriding case_study_output_dir or data_to_use through papermill does not
# move the log directory unless this parameter is overridden as well.
compute_engine_log_directory = f'{case_study_output_dir}/{data_to_use}/logs'
walltime = None
compute_engine_memory_constrained = True
compute_engine_scheduler = "slurm"

In [5]:
# Parameters
# NOTE(review): this looks like the cell papermill injects after the tagged
# 'parameters' cell; the assignments below replace the defaults set there.
# Both directory values end in '/'.
data_to_use = "small_sample"
simulated_data_output_dir = "output/generate_simulated_data/"
case_study_output_dir = "output/results/"
queue = "long.q"
account = "proj_simscience"
walltime = "16-00:00:00"
compute_engine = "pandas"


In [6]:
# The scratch directory is only prepared for engines whose name starts with 'dask'.
# NOTE(review): utils.ensure_empty presumably creates the directory and/or
# verifies that it is empty -- confirm against vivarium_research_prl.utils.
if compute_engine.startswith('dask'):
    utils.ensure_empty(compute_engine_local_directory)

In [7]:
# Scope both directories to the dataset being analyzed.
# Trailing slashes are stripped first: the parameter values may end in '/'
# (e.g. "output/results/"), which would otherwise produce a path with a
# doubled separator such as "output/results//small_sample".
# NOTE: this cell rebinds its own inputs, so it is not idempotent -- re-run the
# parameter cells before re-running it.
case_study_output_dir = f"{case_study_output_dir.rstrip('/')}/{data_to_use}"
simulated_data_output_dir = f"{simulated_data_output_dir.rstrip('/')}/{data_to_use}"

In [8]:
# Start the configured compute engine using the parameters defined above.
# The call returns two handles used throughout the rest of the notebook:
# `df_ops`, whose helpers (read_parquet, compute, persist, head, ...) are used
# for all dataframe work below, and `pd`.
# NOTE(review): `pd` is presumably the pandas-compatible module matching the
# chosen engine -- confirm against distributed_compute.start_compute_engine.
df_ops, pd = distributed_compute.start_compute_engine(
    compute_engine,
    num_workers=compute_engine_num_workers,
    cpus_per_worker=compute_engine_cpus_per_worker,
    threads_per_worker=compute_engine_threads_per_worker,
    memory_per_worker=compute_engine_memory_per_worker,
    worker_walltime=walltime,
    local_directory=compute_engine_local_directory,
    log_directory=compute_engine_log_directory,
    memory_constrained=compute_engine_memory_constrained,
    scheduler=compute_engine_scheduler,
    queue=queue,
    account=account,
)

In [9]:
# Load the case-study outputs: the Census file with its assigned PIKs, and the
# confirmed links behind those PIKs.
# NOTE(review): despite its name, confirmed_piks_with_ground_truth is read from
# confirmed_piks.parquet; later cells use its record_id_raw_input_file,
# record_id_reference_file, pik, module_name and pass_name columns -- confirm
# the full schema.
census_2030_piked = df_ops.read_parquet(f'{case_study_output_dir}/census_2030_piked.parquet')
confirmed_piks_with_ground_truth = df_ops.read_parquet(f'{case_study_output_dir}/confirmed_piks.parquet')

In [10]:
# Share of Census records that ended up with a (non-null) PIK.
# For comparison: Wagner and Layne (Table 2, p. 18) report that 90.28% of the
# input records were PIKed in the 2010 CUF.
piked_proportion = df_ops.compute(
    census_2030_piked['pik'].notnull().mean()
)
print(f'{piked_proportion:.2%} of the input records were PIKed')

89.41% of the input records were PIKed


In [11]:
# Count how many Census rows share each PIK. A count above 1 means the model
# treated several Census rows as duplicates of the same person.
pik_sizes = df_ops.persist(
    df_ops.groupby_agg_small_groups(
        census_2030_piked,
        by='pik',
        agg_func=lambda groups: groups.size(),
    )
)
df_ops.compute(pik_sizes.value_counts())

1    9775
2      49
Name: count, dtype: int64

In [12]:
# Interesting: pseudopeople sometimes gives siblings the same (common) first
# name, which makes their records almost identical -- only age and DOB give
# them away. Presumably this tends not to happen in real life.
# Keep the PIKs assigned to more than one Census row ...
duplicate_piks = (
    pik_sizes
        .rename('pik_size')
        .reset_index()
        .pipe(lambda sizes: sizes[sizes.pik_size > 1])
)

# ... and preview the Census rows that share them, grouped together by PIK.
df_ops.head(
    census_2030_piked
        .merge(duplicate_piks, on="pik")
        .sort_values('pik')
)

Unnamed: 0,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year,record_id,pik,pik_size
76,0_5753,Jayla,Q,Brooks,1,12/28/2028,2317,dunbar avenue,,Anytown,WA,0,Household,Biological child,Female,Multiracial or Other,2030,simulated_census_2030_0_9391,40_20,2
75,0_5753,Camille,A,Brooks,1,12/28/2028,2317,dunbar avenue,,Anytown,WA,0,Household,Biological child,Female,Multiracial or Other,2030,simulated_census_2030_0_9390,40_20,2
19,0_6990,Samuel,G,Anderson,16,04/13/1982,11716,dogwood street,,Anytown,WA,0,Household,Biological child,Male,White,2030,simulated_census_2030_0_4244,40_332,2
18,0_6990,Kari,A,Anderson,47,04/13/1982,11716,dogwood street,,Anytown,WA,0,Household,Reference person,Female,White,2030,simulated_census_2030_0_4241,40_332,2
16,0_10884,Robert,D,Brimhall,58,02/24/1946,11234,private road 4685,,Anytown,WA,0,Household,Other nonrelative,Male,White,2030,simulated_census_2030_0_4141,40_407,2
15,0_10884,Perry,F,Robertson,84,02/24/1946,11234,private roac 5685,,Anytown,WA,0,Household,Reference person,Male,Multiracial or Other,2030,simulated_census_2030_0_4135,40_407,2
77,0_6036,Linda,K,Phillips,61,09/17/1969,900,sw inez st,,Anytown,WA,0,Household,Reference person,Female,White,2030,simulated_census_2030_0_9914,40_573,2
78,0_6036,Jeremy,K,Phillips,60,09/14/1969,900,sw inez st,,Anytown,WA,0,Household,Opposite-sex spouse,Male,White,2030,simulated_census_2030_0_9915,40_573,2
30,0_2000,Criselda,R,Trejo,52,09/23/1977,6544,forest ave,,Anytown,WA,0,Household,Opposite-sex spouse,Female,Asian,2030,simulated_census_2030_0_5035,40_683,2
29,0_2000,Jay,J,Trejo,66,09/23/1977,6544,forest ave,,Anytown,WA,0,Household,Reference person,Male,Latino,2030,simulated_census_2030_0_5034,40_683,2


## Ground truth statistics

In [13]:
# Load the ground truth for the simulated 2030 Census and persist it, since
# several later cells merge against it.
census_2030_ground_truth = df_ops.persist(
    df_ops.read_parquet(f'{simulated_data_output_dir}/simulated_census_2030_ground_truth.parquet')
)

In [14]:
# In this version of pseudopeople, there are no actual duplicates in Census,
# which means all of the duplicates identified above are wrong.
# The check: dropping duplicate ground-truth rows must not change the row count.
assert len(census_2030_ground_truth) == len(df_ops.drop_duplicates(census_2030_ground_truth))

In [15]:
# Stack the ground truth of both reference files into one table. Each file's
# own n_unique_simulants column is dropped on load; the count is recomputed
# over the combined table in a later cell.
reference_files_ground_truth = df_ops.persist(df_ops.concat(
    [
        df_ops.read_parquet(ground_truth_path).drop(columns=['n_unique_simulants'])
        for ground_truth_path in (
            f'{simulated_data_output_dir}/simulated_geobase_reference_file_ground_truth.parquet',
            f'{simulated_data_output_dir}/simulated_name_dob_reference_file_ground_truth.parquet',
        )
    ],
    ignore_index=True,
))

In [16]:
# Reference-file records, on the other hand, can correspond to more than one
# simulant, because of errors in how the reference files are constructed by SSN.
# Count the distinct simulants behind every reference-file record.
n_unique_simulants = df_ops.persist(
    df_ops.groupby_agg_small_groups(
        reference_files_ground_truth,
        by='record_id',
        agg_func=lambda groups: groups.simulant_id.nunique(),
    )
    .rename('n_unique_simulants')
    .reset_index()
)
df_ops.compute(n_unique_simulants.n_unique_simulants.value_counts())

n_unique_simulants
1    51724
2     1294
3       41
Name: count, dtype: int64

In [17]:
# Attach to every reference-file ground-truth row the number of distinct
# simulants that share its record_id.
# NOTE: this rebinds reference_files_ground_truth, so re-running the cell on
# its own output would merge the count column in a second time.
reference_files_ground_truth = df_ops.persist(reference_files_ground_truth.merge(
    n_unique_simulants,
    on='record_id',
    how='left',
))
# Preview through df_ops.head, as every other preview in this notebook does,
# rather than calling the engine-specific DataFrame.head directly.
df_ops.head(reference_files_ground_truth, n=100)

Unnamed: 0,record_id,simulant_id,n_unique_simulants
0,simulated_geobase_reference_file_0_1027,0_6212,1
1,simulated_geobase_reference_file_0_1108,0_9842,1
2,simulated_geobase_reference_file_0_1312,0_1402,1
3,simulated_geobase_reference_file_0_1419,0_13861,1
4,simulated_geobase_reference_file_0_1430,0_6611,1
...,...,...,...
95,simulated_geobase_reference_file_17_468,0_3507,1
96,simulated_geobase_reference_file_17_817,0_7775,1
97,simulated_geobase_reference_file_17_967,0_10871,1
98,simulated_geobase_reference_file_17_985,0_4273,1


In [18]:
# Preview the reference-file rows whose record is shared by the largest number
# of distinct simulants.
df_ops.head(reference_files_ground_truth[reference_files_ground_truth.n_unique_simulants == df_ops.compute(reference_files_ground_truth.n_unique_simulants.max())])

Unnamed: 0,record_id,simulant_id,n_unique_simulants
794,simulated_geobase_reference_file_3_292,0_1439,3
795,simulated_geobase_reference_file_3_292,0_1441,3
796,simulated_geobase_reference_file_3_292,0_1440,3
4007,simulated_geobase_reference_file_6_1501,0_22202,3
4008,simulated_geobase_reference_file_6_1501,0_22201,3
4009,simulated_geobase_reference_file_6_1501,0_22205,3
4569,simulated_geobase_reference_file_2_570,0_13378,3
4570,simulated_geobase_reference_file_2_570,0_23442,3
4571,simulated_geobase_reference_file_2_570,0_7085,3
4686,simulated_geobase_reference_file_11_89,0_21667,3


In [19]:
# Flag each Census record by whether its simulant appears in any reference
# file: 1 when it does, 0 (filled in after the left merge) when it does not.
census_2030_ground_truth = df_ops.persist(
    census_2030_ground_truth
        .merge(
            df_ops.drop_duplicates(reference_files_ground_truth[['simulant_id']])
                .assign(possible_to_pik=1),
            how='left',
            on='simulant_id',
        )
        .assign(possible_to_pik=lambda merged: merged.possible_to_pik.fillna(0))
)
# The mean of the 0/1 flag is the share of records that could be PIKed correctly.
possible_to_pik_proportion = df_ops.compute(
    census_2030_ground_truth.possible_to_pik.mean()
)
print(
    f'{(1 - possible_to_pik_proportion):.2%} of the input records are '
    'impossible to PIK correctly, since they are not in any reference files'
)

0.45% of the input records are impossible to PIK correctly, since they are not in any reference files


In [20]:
# piked_proportion / possible_to_pik_proportion equals
# (number of PIKed records) / (number of PIK-able records).
# NOTE(review): the numerator counts every PIKed record, including any whose
# simulant is in no reference file, so this approximates the PIK rate among
# PIK-able records rather than measuring it exactly -- confirm that is intended.
print(
    f'Assigned PIKs to {(piked_proportion / possible_to_pik_proportion):.2%} of PIK-able records'
)

Assigned PIKs to 89.81% of PIK-able records


In [21]:
# Stack the two simulated reference files (geobase, then name/DOB) into one
# table with a fresh index.
reference_file = df_ops.concat(
    [
        df_ops.read_parquet(reference_file_path)
        for reference_file_path in (
            f'{simulated_data_output_dir}/simulated_geobase_reference_file.parquet',
            f'{simulated_data_output_dir}/simulated_name_dob_reference_file.parquet',
        )
    ],
    ignore_index=True,
)

In [22]:
# Keep only the record-to-PIK mapping from the combined reference file.
reference_file_piks = df_ops.persist(reference_file[['record_id', 'pik']])
reference_file_piks

Unnamed: 0,record_id,pik
0,simulated_geobase_reference_file_0_0,38_8
1,simulated_geobase_reference_file_0_15,38_252
2,simulated_geobase_reference_file_0_16,38_252
3,simulated_geobase_reference_file_0_17,38_252
4,simulated_geobase_reference_file_0_20,38_287
...,...,...
53054,simulated_name_dob_reference_file_7_21,75_17
53055,simulated_name_dob_reference_file_7_22,75_17
53056,simulated_name_dob_reference_file_10_169,75_392
53057,simulated_name_dob_reference_file_10_170,75_392


In [23]:
# Sanity check: record_id is unique in the record-to-PIK mapping, i.e. no
# reference-file record appears more than once.
assert len(reference_file_piks) == len(df_ops.drop_duplicates(reference_file_piks[['record_id']]))

In [24]:
# Distinct (PIK, simulant) pairs: join each reference-file record's true
# simulant(s) to the PIK carried by that record, then de-duplicate.
pik_simulant_pairs = df_ops.persist(
    df_ops.drop_duplicates(
        reference_files_ground_truth
            .merge(reference_file_piks, on='record_id')
            [['pik', 'simulant_id']]
    )
)

In [25]:
# PIKs, too, can correspond to more than one simulant, because of errors in
# how the reference files are constructed by SSN.
# Count the distinct simulants behind every PIK. This rebinds
# n_unique_simulants: from here on it is keyed by PIK, not by record.
n_unique_simulants = df_ops.persist(
    df_ops.groupby_agg_small_groups(
        pik_simulant_pairs,
        by='pik',
        agg_func=lambda groups: groups.simulant_id.nunique(),
    )
    .rename('n_unique_simulants')
    .reset_index()
)
df_ops.compute(n_unique_simulants.n_unique_simulants.value_counts())

n_unique_simulants
1    17885
2     1042
3       51
Name: count, dtype: int64

In [26]:
# Attach to every (PIK, simulant) pair the number of distinct simulants that
# share its PIK.
# NOTE: this rebinds pik_simulant_pairs, so re-running the cell on its own
# output would merge the count column in a second time.
pik_simulant_pairs = df_ops.persist(pik_simulant_pairs.merge(
    n_unique_simulants,
    on='pik',
    how='left',
))
pik_simulant_pairs

Unnamed: 0,pik,simulant_id,n_unique_simulants
0,69_497,0_19610,2
1,60_572,0_14452,2
2,41_86,0_17207,2
3,60_718,0_22516,2
4,72_820,0_21653,2
...,...,...,...
20117,75_280,0_15578,1
20118,75_239,0_22864,1
20119,75_247,0_22123,1
20120,75_252,0_22467,1


In [27]:
# Preview the (PIK, simulant) pairs whose PIK is shared by the largest number
# of distinct simulants.
df_ops.head(pik_simulant_pairs[pik_simulant_pairs.n_unique_simulants == df_ops.compute(pik_simulant_pairs.n_unique_simulants.max())])

Unnamed: 0,pik,simulant_id,n_unique_simulants
25,44_592,0_1439,3
95,49_593,0_21545,3
129,53_401,0_13378,3
130,53_401,0_23442,3
133,40_233,0_21667,3
135,68_622,0_1121,3
136,68_622,0_1123,3
158,60_779,0_12214,3
163,70_21,0_9421,3
172,64_842,0_10345,3


## Definitions of accuracy

1. (most strict) Assigning any PIK with multiple simulants is incorrect
2. Assigning a PIK with multiple simulants is neither incorrect nor correct (excluded from denominator)
3. (most lenient) Assigning a PIK with multiple simulants is correct, as long as at least one of those simulants matches the truth

In [28]:
# Real-life PVS accuracy over all modules for the Medicare database, calculated
# from Layne, Wagner, and Rothhaas, Table 1 (p. 15).
# NOTE(review): the four numerator terms look like incorrect-link counts and
# the four denominator terms like the matching totals -- confirm against the table.
real_life_pvs_accuracy = 1 - (
    sum((2_585, 60_709, 129_480, 89_094))
    / sum((52_406_981, 5_170_924, 49_374_794, 50_327_034))
)
f'{real_life_pvs_accuracy:.5%}'

'99.82079%'

### Definition 1

In [29]:
# Number of Census records that received a PIK; used as the denominator for
# the accuracy figures under definitions 1 and 3.
piks_assigned = df_ops.compute(census_2030_piked.pik.notnull().sum())
piks_assigned

9873

In [30]:
# Preview the (PIK, simulant) pairs whose PIK belongs to more than one
# simulant; definition 1 counts any assignment of such a PIK as incorrect.
df_ops.head(pik_simulant_pairs[pik_simulant_pairs.n_unique_simulants > 1])

Unnamed: 0,pik,simulant_id,n_unique_simulants
0,69_497,0_19610,2
1,60_572,0_14452,2
2,41_86,0_17207,2
3,60_718,0_22516,2
4,72_820,0_21653,2
5,44_313,0_21855,2
6,59_73,0_1355,2
7,60_118,0_15299,2
8,65_559,0_2438,2
9,40_109,0_19838,2


In [31]:
# Count the assigned PIKs that belong to exactly one simulant AND whose
# simulant matches the Census record's true simulant. After the two merges,
# simulant_id_x is the PIK's simulant and simulant_id_y the Census ground truth.
single_sim_piks_correct = df_ops.compute(
    census_2030_piked[['record_id', 'pik']]
        .merge(pik_simulant_pairs, on='pik')
        .merge(census_2030_ground_truth, on='record_id')
        .pipe(
            lambda linked: (linked.n_unique_simulants == 1)
            & (linked.simulant_id_x == linked.simulant_id_y)
        )
        .sum()
)
single_sim_piks_correct

9018

In [32]:
# Overall accuracy, treating it as a black box
# Definition 1: correct single-simulant PIKs over all PIKs assigned, so every
# PIK shared by multiple simulants counts against the accuracy.
(
    single_sim_piks_correct / piks_assigned
)

0.9134001823154057

In [33]:
# Sanity check: the confirmed-links table has as many rows as there are PIKs
# assigned in the Census file.
assert len(confirmed_piks_with_ground_truth) == piks_assigned

In [34]:
# Preview the Census ground truth; the rename only affects this display and
# does not modify census_2030_ground_truth.
df_ops.head(census_2030_ground_truth.rename(columns={'record_id': 'record_id_census_2030'}))

Unnamed: 0,record_id_census_2030,simulant_id,possible_to_pik
0,simulated_census_2030_0_0,0_923,1.0
1,simulated_census_2030_0_1,0_2348,1.0
2,simulated_census_2030_0_2,0_2641,1.0
3,simulated_census_2030_0_3,0_6176,1.0
4,simulated_census_2030_0_4,0_10251,1.0
5,simulated_census_2030_0_5,0_13047,1.0
6,simulated_census_2030_0_6,0_13861,1.0
7,simulated_census_2030_0_7,0_13972,1.0
8,simulated_census_2030_0_8,0_13973,1.0
9,simulated_census_2030_0_9,0_13974,1.0


In [35]:
# Looking at whether the exact *record* linked was from the same simulant
# Each confirmed link is joined to the ground truth of its Census record and
# of its reference-file record. A link counts as correct when both sides have
# the same simulant and the reference-file record belongs to exactly one simulant.
# NOTE(review): simulant_id_x / simulant_id_y are taken to be the Census-side
# and reference-side simulants respectively, which assumes
# confirmed_piks_with_ground_truth has no simulant_id column of its own -- confirm.
single_sim_record_links_correct = df_ops.compute(
    confirmed_piks_with_ground_truth
        .merge(
            census_2030_ground_truth.rename(columns={'record_id': 'record_id_raw_input_file'}),
            on='record_id_raw_input_file',
        )
        .merge(
            reference_files_ground_truth.rename(columns={'record_id': 'record_id_reference_file'}),
            on='record_id_reference_file',
        )
        .pipe(lambda df: (df.simulant_id_x == df.simulant_id_y) & (df.n_unique_simulants == 1))
        .sum()
)
single_sim_record_links_correct

9224

In [36]:
# Definition 1 at the link level: correct single-simulant record links over
# all PIKs assigned.
(
    single_sim_record_links_correct / piks_assigned
)

0.9342651676288869

### Definition 2

In [37]:
# Number of Census records whose assigned PIK belongs to exactly one simulant;
# this is the denominator for definition 2. The join key is spelled out
# ('pik' is the only column the two frames share).
single_sim_piks_assigned = len(
    census_2030_piked[['record_id', 'pik']].merge(
        pik_simulant_pairs[pik_simulant_pairs.n_unique_simulants == 1][['pik', 'simulant_id']],
        on='pik',
    )
)
single_sim_piks_assigned

9055

In [38]:
# Overall accuracy, treating it as a black box
# Definition 2: PIKs shared by multiple simulants are excluded from both the
# numerator and the denominator.
(
    single_sim_piks_correct / single_sim_piks_assigned
)

0.9959138597459967

In [39]:
# Looking at whether the exact *record* linked was from the same simulant
# Denominator for definition 2 at the link level: the number of confirmed
# links whose reference-file record belongs to exactly one simulant.
single_sim_record_links_assigned = df_ops.compute(
    (confirmed_piks_with_ground_truth
        .merge(
            reference_files_ground_truth.rename(columns={'record_id': 'record_id_reference_file'}),
            on='record_id_reference_file',
        )
        .n_unique_simulants == 1).sum()
)
single_sim_record_links_assigned

9262

In [40]:
# Definition 2 at the link level: correct single-simulant record links over
# the links made to single-simulant reference-file records.
(
    single_sim_record_links_correct / single_sim_record_links_assigned
)

0.9958972144245304

### Definition 3

In [41]:
# Display the (PIK, simulant) pairs again for reference; definition 3 is
# computed from them in the next cell.
pik_simulant_pairs

Unnamed: 0,pik,simulant_id,n_unique_simulants
0,69_497,0_19610,2
1,60_572,0_14452,2
2,41_86,0_17207,2
3,60_718,0_22516,2
4,72_820,0_21653,2
...,...,...,...
20117,75_280,0_15578,1
20118,75_239,0_22864,1
20119,75_247,0_22123,1
20120,75_252,0_22467,1


In [42]:
# Definition 3 at the PIK level: an assigned PIK is correct when at least one
# of the simulants behind it is the Census record's true simulant. Every
# (record, PIK) pair is expanded to all simulants of the PIK, compared against
# the ground truth, and collapsed back with "any".
piks_at_least_partially_correct = df_ops.persist(
    census_2030_piked[['record_id', 'pik']]
        .merge(pik_simulant_pairs, on='pik')
        .merge(census_2030_ground_truth, on='record_id')
        .pipe(df_ops.drop_duplicates)
        .assign(correct=lambda linked: linked.simulant_id_x == linked.simulant_id_y)
        .pipe(
            df_ops.groupby_agg_small_groups,
            by=["record_id", "pik"],
            agg_func=lambda groups: groups.correct.any(),
        )
        .reset_index()
)
piks_at_least_partially_correct

Unnamed: 0,record_id,pik,correct
0,simulated_census_2030_0_0,55_190,True
1,simulated_census_2030_0_1,72_838,True
2,simulated_census_2030_0_10,71_108,True
3,simulated_census_2030_0_100,49_141,True
4,simulated_census_2030_0_1000,42_49,True
...,...,...,...
9868,simulated_census_2030_0_9995,69_243,True
9869,simulated_census_2030_0_9996,69_684,True
9870,simulated_census_2030_0_9997,64_218,True
9871,simulated_census_2030_0_9998,68_532,True


In [43]:
# Overall accuracy, treating it as a black box
# Definition 3: at-least-partially-correct PIKs over all PIKs assigned.
piks_correct_proportion = (df_ops.compute(piks_at_least_partially_correct.correct.sum()) / piks_assigned)
piks_correct_proportion

0.9960498328775448

In [44]:
# Put the simulated (definition 3) accuracy side by side with the real-life
# PVS accuracy computed earlier.
print(f'{piks_correct_proportion:.5%} of the PIKs assigned were correct; compare with {real_life_pvs_accuracy:.5%} in real life')

99.60498% of the PIKs assigned were correct; compare with 99.82079% in real life


In [45]:
# Definition 3 at the link level: check whether the exact *record* linked came
# from the same simulant. A link is correct when at least one simulant behind
# the linked reference-file record is the Census record's true simulant.
sim_record_links_at_least_partially_correct = df_ops.persist(
    confirmed_piks_with_ground_truth
        .merge(
            census_2030_ground_truth.rename(columns={'record_id': 'record_id_raw_input_file'}),
            on='record_id_raw_input_file',
        )
        .merge(
            reference_files_ground_truth.rename(columns={'record_id': 'record_id_reference_file'}),
            on='record_id_reference_file',
        )
        .assign(correct=lambda linked: linked.simulant_id_x == linked.simulant_id_y)
        .pipe(
            df_ops.groupby_agg_small_groups,
            by=[
                "record_id_raw_input_file",
                "record_id_reference_file",
                "pik",
                "module_name",
                "pass_name",
            ],
            agg_func=lambda groups: groups.correct.any(),
        )
        .reset_index()
)
sim_record_links_at_least_partially_correct

Unnamed: 0,record_id_raw_input_file,record_id_reference_file,pik,module_name,pass_name,correct
0,simulated_census_2030_0_0,simulated_geobase_reference_file_8_767,55_190,geosearch,geokey,True
1,simulated_census_2030_0_1,simulated_geobase_reference_file_13_1607,72_838,geosearch,geokey,True
2,simulated_census_2030_0_10,simulated_geobase_reference_file_4_1349,71_108,geosearch,geokey,True
3,simulated_census_2030_0_100,simulated_geobase_reference_file_15_392,49_141,geosearch,geokey,True
4,simulated_census_2030_0_1000,simulated_geobase_reference_file_9_197,42_49,geosearch,some name and DOB information,True
...,...,...,...,...,...,...
9868,simulated_census_2030_0_9995,simulated_geobase_reference_file_15_1238,69_243,hhcompsearch,year of birth,True
9869,simulated_census_2030_0_9996,simulated_geobase_reference_file_13_1451,69_684,geosearch,geokey,True
9870,simulated_census_2030_0_9997,simulated_geobase_reference_file_10_1098,64_218,geosearch,geokey,True
9871,simulated_census_2030_0_9998,simulated_geobase_reference_file_10_1336,68_532,geosearch,house number and street name Soundex,True


In [46]:
# Number of link rows after collapsing; compared with the distinct-pair count
# in the next cell.
len(sim_record_links_at_least_partially_correct)

9873

In [47]:
# Number of distinct (Census record, reference-file record) pairs; equal to
# the row count above when each pair appears only once.
len(df_ops.drop_duplicates(sim_record_links_at_least_partially_correct[['record_id_raw_input_file', 'record_id_reference_file']]))

9873

In [48]:
# Definition 3 at the link level: at-least-partially-correct record links over
# all PIKs assigned.
(
    df_ops.compute(sim_record_links_at_least_partially_correct.correct.sum()) / piks_assigned
)

0.9959485465410716

In [49]:
# Sanity check: no Census record is linked to more than one distinct
# reference-file record.
assert df_ops.compute(
    (
        df_ops.groupby_agg_small_groups(
            confirmed_piks_with_ground_truth,
            by='record_id_raw_input_file',
            agg_func=lambda groups: groups.record_id_reference_file.nunique(),
        )
        <= 1
    ).all()
)

In [50]:
# Using definition 3 -- at the PIK level
# Attach the module and pass that produced each confirmed link, so accuracy
# can be broken down by them below.
# NOTE: this rebinds piks_at_least_partially_correct, so re-running the cell on
# its own output would merge module_name/pass_name in a second time.
piks_at_least_partially_correct = df_ops.persist(
    piks_at_least_partially_correct
        .rename(columns={'record_id': 'record_id_raw_input_file'})
        .merge(confirmed_piks_with_ground_truth[['record_id_raw_input_file', 'module_name', 'pass_name']], on='record_id_raw_input_file')
)
piks_at_least_partially_correct

Unnamed: 0,record_id_raw_input_file,pik,correct,module_name,pass_name
0,simulated_census_2030_0_0,55_190,True,geosearch,geokey
1,simulated_census_2030_0_1,72_838,True,geosearch,geokey
2,simulated_census_2030_0_10,71_108,True,geosearch,geokey
3,simulated_census_2030_0_100,49_141,True,geosearch,geokey
4,simulated_census_2030_0_1000,42_49,True,geosearch,some name and DOB information
...,...,...,...,...,...
9868,simulated_census_2030_0_9995,69_243,True,hhcompsearch,year of birth
9869,simulated_census_2030_0_9996,69_684,True,geosearch,geokey
9870,simulated_census_2030_0_9997,64_218,True,geosearch,geokey
9871,simulated_census_2030_0_9998,68_532,True,geosearch,house number and street name Soundex


In [51]:
# Accuracy by module (mean of `correct`) with the number of PIKs per module,
# least accurate first. With the sample data this shows the opposite pattern
# to Layne et al., who found GeoSearch to be much *more* accurate.
df_ops.compute(
    piks_at_least_partially_correct
        .groupby("module_name")
        .correct
        .agg(["mean", "size"])
        .sort_values("mean")
)

Unnamed: 0_level_0,mean,size
module_name,Unnamed: 1_level_1,Unnamed: 2_level_1
geosearch,0.995798,9282
dobsearch,1.0,166
hhcompsearch,1.0,37
namesearch,1.0,388


In [52]:
# Accuracy by pass, least accurate first. This could be used to tune
# pass-specific cutoffs, though it might not be too informative while we are
# still using the sample data.
df_ops.compute(
    piks_at_least_partially_correct
        .groupby(["module_name", "pass_name"])
        .correct
        .agg(["mean", "size"])
        .sort_values("mean")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,size
module_name,pass_name,Unnamed: 2_level_1,Unnamed: 3_level_1
geosearch,geokey,0.99434,6714
geosearch,house number and street name Soundex,0.998255,573
dobsearch,initials name switch,1.0,5
dobsearch,first two characters of first name and year of birth,1.0,120
dobsearch,reverse Soundex of name,1.0,41
geosearch,geokey name switch,1.0,45
geosearch,house number and street name Soundex name switch,1.0,2
geosearch,some name and DOB information,1.0,1948
hhcompsearch,initials,1.0,29
hhcompsearch,year of birth,1.0,8


In [53]:
# Using definition 3 -- at the link level.
# Accuracy and number of links per module, least accurate first.
df_ops.compute(
    sim_record_links_at_least_partially_correct
        .groupby("module_name")
        .correct
        .agg(["mean", "size"])
        .sort_values("mean")
)

Unnamed: 0_level_0,mean,size
module_name,Unnamed: 1_level_1,Unnamed: 2_level_1
geosearch,0.995691,9282
dobsearch,1.0,166
hhcompsearch,1.0,37
namesearch,1.0,388


In [54]:
# Link-level accuracy and number of links per (module, pass), least accurate first.
df_ops.compute(
    sim_record_links_at_least_partially_correct
        .groupby(["module_name", "pass_name"])
        .correct
        .agg(["mean", "size"])
        .sort_values("mean")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,size
module_name,pass_name,Unnamed: 2_level_1,Unnamed: 3_level_1
geosearch,geokey,0.99434,6714
geosearch,house number and street name Soundex,0.99651,573
dobsearch,initials name switch,1.0,5
dobsearch,first two characters of first name and year of birth,1.0,120
dobsearch,reverse Soundex of name,1.0,41
geosearch,geokey name switch,1.0,45
geosearch,house number and street name Soundex name switch,1.0,2
geosearch,some name and DOB information,1.0,1948
hhcompsearch,initials,1.0,29
hhcompsearch,year of birth,1.0,8


In [55]:
# Number of incorrect links per (module, pass), smallest count first.
df_ops.compute(
    sim_record_links_at_least_partially_correct
        [~sim_record_links_at_least_partially_correct.correct]
        .groupby(["module_name", "pass_name"])
        .size()
).sort_values()

module_name  pass_name                           
geosearch    house number and street name Soundex     2
             geokey                                  38
dtype: int64

### Incorrect and missed PIKs

In [56]:
# Distinct (Census record, reference-file record) pairs whose link is not
# correct even under the lenient definition 3.
incorrectly_linked_pairs = df_ops.persist(df_ops.drop_duplicates(
    sim_record_links_at_least_partially_correct[~sim_record_links_at_least_partially_correct.correct]
        [["record_id_raw_input_file", "record_id_reference_file"]]
))
incorrectly_linked_pairs

Unnamed: 0,record_id_raw_input_file,record_id_reference_file
275,simulated_census_2030_0_10272,simulated_geobase_reference_file_10_1362
386,simulated_census_2030_0_1038,simulated_geobase_reference_file_19_1219
502,simulated_census_2030_0_10493,simulated_geobase_reference_file_9_1296
726,simulated_census_2030_0_10713,simulated_geobase_reference_file_5_227
831,simulated_census_2030_0_10816,simulated_geobase_reference_file_12_1235
1790,simulated_census_2030_0_187,simulated_geobase_reference_file_15_844
1932,simulated_census_2030_0_2000,simulated_geobase_reference_file_15_166
2354,simulated_census_2030_0_2468,simulated_geobase_reference_file_15_820
2392,simulated_census_2030_0_2503,simulated_geobase_reference_file_3_776
3627,simulated_census_2030_0_3780,simulated_geobase_reference_file_3_190


In [57]:
# How many incorrectly linked pairs there are in total.
len(incorrectly_linked_pairs)

40

In [58]:
# Take at most 100 of the incorrectly linked pairs for manual inspection.
# NOTE(review): df_ops.head presumably returns an in-memory frame, since the
# result is merged with computed frames below -- confirm.
incorrect_links = df_ops.head(incorrectly_linked_pairs, n=100)
incorrect_links

Unnamed: 0,record_id_raw_input_file,record_id_reference_file
275,simulated_census_2030_0_10272,simulated_geobase_reference_file_10_1362
386,simulated_census_2030_0_1038,simulated_geobase_reference_file_19_1219
502,simulated_census_2030_0_10493,simulated_geobase_reference_file_9_1296
726,simulated_census_2030_0_10713,simulated_geobase_reference_file_5_227
831,simulated_census_2030_0_10816,simulated_geobase_reference_file_12_1235
1790,simulated_census_2030_0_187,simulated_geobase_reference_file_15_844
1932,simulated_census_2030_0_2000,simulated_geobase_reference_file_15_166
2354,simulated_census_2030_0_2468,simulated_geobase_reference_file_15_820
2392,simulated_census_2030_0_2503,simulated_geobase_reference_file_3_776
3627,simulated_census_2030_0_3780,simulated_geobase_reference_file_3_190


In [59]:
# Delete the persisted pairs now that the rows to inspect are held in incorrect_links.
%xdel incorrectly_linked_pairs

In [60]:
# Fields to compare side by side between the Census record and the
# reference-file record of each incorrect link.
comparison_cols = [
    "first_name",
    "middle_name",
    "last_name",
    "date_of_birth",
    "street_number",
    "street_name",
    "unit_number",
    "city",
    "state",
]

# Build one row per incorrect link carrying the comparison fields from both
# sides. Only the records involved in incorrect links are computed, via isin.
incorrect_links_detail = (
    incorrect_links
        .merge(
            # Census side: middle_initial is renamed to middle_name so that it
            # lines up with the reference-file column of that name.
            df_ops.compute(census_2030_piked[census_2030_piked.record_id.isin(incorrect_links.record_id_raw_input_file)])
                .rename(columns={"record_id": "record_id_raw_input_file", "middle_initial": "middle_name"})
                [["record_id_raw_input_file"] + comparison_cols],
            on="record_id_raw_input_file",
            how="left",
        )
        .merge(
            # Reference-file side: the 'mailing_address_' prefix is removed from
            # column names so that the address fields match comparison_cols.
            df_ops.compute(reference_file[reference_file.record_id.isin(incorrect_links.record_id_reference_file)])
                .rename(columns={"record_id": "record_id_reference_file"})
                .rename(columns=lambda c: c.replace('mailing_address_', ''))
                [["record_id_reference_file"] + comparison_cols],
            on="record_id_reference_file",
            how="left",
            # The comparison columns now exist on both sides, so this merge
            # suffixes them: left (Census) and right (reference file).
            suffixes=("_census", "_reference_file"),
        )
)
def flatten(xss):
    """Concatenate the items of every iterable in ``xss`` into a single list.

    Only one level of nesting is removed, and the original order is kept.
    """
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat

# Order the columns so each attribute's census value sits directly beside its
# reference-file value.
incorrect_paired_cols = flatten([(f'{c}_census', f'{c}_reference_file') for c in comparison_cols])
incorrect_links_detail[incorrect_paired_cols]

Unnamed: 0,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
0,Charlene,Mayra,L,Alaya,Stockman,Stockman,05/22/2025,20250522,13711,13711,filbert st,FILBERT ST,,,Anytown,ANYTOWN,WA,WA
1,Aniylah,Kristin,R,Michelle,Reeder,Reeder,12/29/1982,19821229,6302,6302,n carleton ave,N CARLETON AVE,,,Anytown,ANYTOWN,WA,WA
2,Elroy,Cecflia,J,Karen,Abraham,Abeaham,08/17/1976,19760817,1242,1242,ashburton rd,ASHBURTON RD,,,Anytown,ANYTOWN,WA,WA
3,Jossph,Dylan,B,Jimmy,Hartwell,Hartwell,07/15/1992,19920715,225,225,av 360,AV 360,,,Anytown,ANYTOWN,WA,WA
4,Amara,King,L,Louis,Vanover,Vanover,06/08/2027,20270608,,,sw 178th st,SW 178TH ST,,,Anytown,ANYTOWN,WA,WA
5,Remington,Domonique,F,Jacquelyn,Woods,Woods,12/11/1988,19881211,8173,8173,meridian ave nth,MERIDIAN AVE NTH,,,Anytown,ANYTOWN,WA,WA
6,Carolyn,Mark,E,Daniel,Tamariz,Tamariz,07/12/1966,19660712,3809,3809,brier creek ct,BRIER CREEK CT,,,Anytown,ANYTOWN,WA,WA
7,Vivian,Tilly,M,Spencer,Ervin,Ervin,07/01/2019,20190701,11711,11711,sherwood st,SHERWOOD ST,,,Anytown,ANYTOWN,WA,WA
8,Jorge,Colleen,C,Carol,Patel-Gallardo,Patel-Gallardo,07/18/1974,19740718,1850,1850,via blairo,VIA BLAIRO,,,Anytown,ANYTOWN,WA,WA
9,Angel,Darryl,J,Gregory,Mata-Gonzalez,Mata-Gonzalez,08/26/1970,19700826,11990,11990,n 18th st,N 18TH ST,,,Anytown,ANYTOWN,WA,WA


In [61]:
# Pair every census record that received no PIK with the reference-file records
# that ground truth assigns to the same simulant. The overlapping record_id
# columns are disambiguated as record_id_census / record_id_reference_file.
# NOTE(review): n_unique_simulants == 1 presumably keeps only reference records
# that belong to exactly one simulant -- confirm where
# reference_files_ground_truth is built.
missed_links = df_ops.persist(
    census_2030_piked[census_2030_piked.pik.isnull()][["record_id"]]
        .merge(census_2030_ground_truth, on="record_id")
        .merge(reference_files_ground_truth[reference_files_ground_truth.n_unique_simulants == 1], on="simulant_id", suffixes=("_census", "_reference_file"))
)

In [62]:
# Number of (census record, reference record) pairs; a census record appears once
# per matching reference record, so this is not a count of distinct census records.
len(missed_links)

3284

In [63]:
# Take the first 100 missed pairs and keep the distinct simulants among them;
# a simulant can account for several of those rows.
first_missed_rows = df_ops.head(missed_links[["simulant_id"]], n=100)
simulants_missed = first_missed_rows["simulant_id"].unique()
simulants_missed

<StringArray>
['0_10251', '0_13861',     '0_9', '0_21615', '0_21616',    '0_21',  '0_2154',
 '0_11857', '0_18062', '0_17384',  '0_5474', '0_18927', '0_12284',  '0_4702',
 '0_21693', '0_22602',  '0_4991',  '0_7310', '0_12456', '0_17692',  '0_4445',
  '0_2048',  '0_2538',    '0_74',  '0_1471',  '0_1611',  '0_1637',  '0_1980',
  '0_2057',  '0_2144',  '0_2767',  '0_2957',  '0_2990',  '0_3536',  '0_4519',
  '0_4583',  '0_4763',  '0_4807']
Length: 38, dtype: string

In [64]:
# Fetch every missed pair for the sampled simulants, not only the rows that
# happened to fall within the first 100.
missed_pairs = df_ops.compute(
    missed_links[missed_links["simulant_id"].isin(list(simulants_missed))]
)
missed_pairs

Unnamed: 0,record_id_census,simulant_id,possible_to_pik,record_id_reference_file,n_unique_simulants
0,simulated_census_2030_0_4,0_10251,1.0,simulated_geobase_reference_file_11_612,1
1,simulated_census_2030_0_4,0_10251,1.0,simulated_geobase_reference_file_11_613,1
2,simulated_census_2030_0_4,0_10251,1.0,simulated_geobase_reference_file_11_614,1
3,simulated_census_2030_0_4,0_10251,1.0,simulated_name_dob_reference_file_1_3553,1
4,simulated_census_2030_0_6,0_13861,1.0,simulated_geobase_reference_file_0_1419,1
...,...,...,...,...,...
98,simulated_census_2030_0_309,0_4763,1.0,simulated_name_dob_reference_file_0_7086,1
99,simulated_census_2030_0_310,0_4807,1.0,simulated_geobase_reference_file_14_1027,1
100,simulated_census_2030_0_310,0_4807,1.0,simulated_geobase_reference_file_14_1028,1
101,simulated_census_2030_0_310,0_4807,1.0,simulated_geobase_reference_file_14_1026,1


In [65]:
# The sampled rows now live in missed_pairs, so release the full persisted frame.
# %xdel deletes the variable and also tries to clear IPython's own references to it.
%xdel missed_links

In [66]:
# Census rows for the sampled missed pairs, keyed the same way as missed_pairs.
# middle_initial is renamed so it shares the name middle_name with the other side.
missed_census_side = df_ops.compute(
    census_2030_piked[census_2030_piked.record_id.isin(list(missed_pairs.record_id_census))]
).rename(columns={"record_id": "record_id_census", "middle_initial": "middle_name"})
missed_pairs_with_census = missed_pairs.merge(missed_census_side, on="record_id_census")

# Reference-file rows for the same pairs, with the 'mailing_address_' prefix
# stripped from their column names.
missed_reference_side = (
    df_ops.compute(reference_file[reference_file.record_id.isin(missed_pairs.record_id_reference_file)])
    .rename(columns=lambda col: col.replace('mailing_address_', ''))
    .rename(columns={"record_id": "record_id_reference_file"})
)

# Columns present on both sides are suffixed with the side they came from.
missed_links_detail = missed_pairs_with_census.merge(
    missed_reference_side,
    on="record_id_reference_file",
    suffixes=("_census", "_reference_file"),
)

In [67]:
# The column selection is the same for every simulant, so build it once:
# simulant_id first, then each attribute's census value beside its reference value.
missed_detail_cols = ['simulant_id'] + flatten([(f'{c}_census', f'{c}_reference_file') for c in comparison_cols])

# Show one table per sampled simulant, preceded by the simulant's ID.
for simulant in simulants_missed:
    print(simulant)
    rows_for_simulant = missed_links_detail[missed_links_detail.simulant_id == simulant]
    display(rows_for_simulant[missed_detail_cols])

0_10251


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
0,0_10251,Bobby,Bpbbu,S,Shane,Rhimpson,Thompson,05/30/1985,19850520,,,winding trail rd,WINDING TRAIL RD,,,Anytown,ANYTOWN,WA,WA
1,0_10251,Bobby,Bpbbu,S,Shane,Rhimpson,Thompson,05/30/1985,19850520,,17861.0,winding trail rd,WINDING TRAIL RD,,,Anytown,ANYTOWN,WA,WA
2,0_10251,Bobby,Bpbbu,S,Shane,Rhimpson,Thompson,05/30/1985,19850520,,17868.0,winding trail rd,WINDING TRAIL RD,,,Anytown,ANYTOWN,WA,WA
3,0_10251,Bobby,Bpbbu,S,Shane,Rhimpson,Thompson,05/30/1985,19850520,,,winding trail rd,,,,Anytown,,WA,


0_13861


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
4,0_13861,Levi J,Levi,,Jared,Shirley,Sbirlev,01/19/198o,19800119,32597,1307.0,delacorte dr,ROSEWOOD AVE,,,Anytown,ANYTOWN,WA,WA
5,0_13861,Levi J,Levi,,Jared,Shirley,Sbirlev,01/19/198o,19800119,32597,32597.0,delacorte dr,DELACORTE DR,,,Anytown,ANYTOWN,WA,WA
6,0_13861,Levi J,Levi,,Jared,Shirley,Sbirlev,01/19/198o,19800119,32597,,delacorte dr,,,,Anytown,,WA,


0_9


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
7,0_9,Elijah,Elijah,N,Noah,Esquivel,Esquivel,10/31/2015,29151031,1648,1648.0,,EAGLE HEIGHTS,,,Anytown,ANYTOWN,WA,WA
8,0_9,Elijah,Elijah,N,Noah,Esquivel,Esquivel,10/31/2015,29151031,1648,,,,,,Anytown,,WA,


0_21615


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
9,0_21615,Camila,Camila,F,Farrah,Skhwartz,Schwartz,11/05/2016,20400819,211,3434.0,winchester rd,LAKESIDE CLUB BLVD 7,,,Anytown,ANYTOWN,WA,WA
10,0_21615,Camila,Camila,F,Farrah,Skhwartz,Schwartz,11/05/2016,20400819,211,211.0,winchester rd,WINCHESTER RD,,,Anytown,ANYTOWN,WA,WA
11,0_21615,Camila,Camila,F,Farrah,Skhwartz,Schwartz,11/05/2016,20400819,211,,winchester rd,,,,Anytown,,WA,


0_21616


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
12,0_21616,Brooke,Brooke,S,Emery,Schwartz,Schwartz,10/12/2020,20671012,211,211.0,winchester rd,WINCHESTER RD,,,,ANYTOWN,WA,WA
13,0_21616,Brooke,Brooke,S,Emery,Schwartz,Schwartz,10/12/2020,20671012,211,3434.0,winchester rd,LAKESIDE CLUB BLVD 7,,,,ANYTOWN,WA,WA
14,0_21616,Brooke,Brooke,S,Emery,Schwartz,Schwartz,10/12/2020,20671012,211,,winchester rd,,,,,,WA,


0_21


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
15,0_21,Gianna,Gianna,R,Rosemary,Doucet,Doucet,09/05/1964,19610308,13208,13208.0,w warren ave,W WARREN AVE,,,Anytown,ANYTOWN,WA,WA
16,0_21,Gianna,Gianna,R,Rosemary,Doucet,Doucet,09/05/1964,19610308,13208,13208.0,w warren ave,W WARAHEN AVE,,,Anytown,ANYTOWN,WA,WA
17,0_21,Gianna,Gianna,R,Rosemary,Doucet,Doucet,09/05/1964,19610308,13208,13208.0,w warren ave,W WARREN AVE,,,Anytown,ANYTOWN,WA,WA
18,0_21,Gianna,Gianna,R,Rosemary,Doucet,Doucet,09/05/1964,19610308,13208,13208.0,w warren ave,W WARREN AVE,,,Anytown,ANYTOWN,WA,WA
19,0_21,Gianna,Gianna,R,Rosemary,Doucet,Doucet,09/05/1964,19610308,13208,,w warren ave,,,,Anytown,,WA,


0_2154


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
20,0_2154,Wayne,Wayme,J,Jason,Jimenez,Jimenez,08/06/1678,19780824,2520,2520.0,westminster ave,WESTMINSTER AVE,,,Anytown,ANYTOWN,WA,WA
21,0_2154,Wayne,Wayme,J,Jason,Jimenez,Jimenez,08/06/1678,19780824,2520,2520.0,westminster ave,WESTMINSTER AVE,,,Anytown,4NYTOWN,WA,WA
22,0_2154,Wayne,Wayme,J,Jason,Jimenez,Jimenez,08/06/1678,19780824,2520,,westminster ave,,,,Anytown,,WA,


0_11857


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
23,0_11857,Noah,Noah,B,Bennett,Kron,Kron,08/12/2003,20030812,112,28819.0,stonewall tell rd,GOODMAN STREET,,,Anytown,ANYTOWN,WA,WA
24,0_11857,Noah,Noah,B,Bennett,Kron,Kron,08/12/2003,20030812,112,112.0,stonewall tell rd,STONEWALL TELL RD,,,Anytown,ANYTOWN,WA,PA
25,0_11857,Noah,Noah,B,Bennett,Kron,Kron,08/12/2003,20030812,112,112.0,stonewall tell rd,STONEWALL TELL RD,,,Anytown,ANYTOWN,WA,WA
26,0_11857,Noah,Noah,B,Bennett,Kron,Kron,08/12/2003,20030812,112,28819.0,stonewall tell rd,GOODMAN STREET,,,Anytown,ANITOWN,WA,WA
27,0_11857,Noah,Noah,B,Bennett,Kron,Kron,08/12/2003,20030812,112,,stonewall tell rd,,,,Anytown,,WA,


0_18062


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
28,0_18062,Man,Kevin,S,Inmate,Thurman,Thurman,10/29/5985,19851029,215,215.0,nrthw 66th street,NRTHW 66TH STREET,,,Anytown,ANYTOWN,WA,WA
29,0_18062,Man,Kevin,S,Inmate,Thurman,Thurman,10/29/5985,19851029,215,,nrthw 66th street,,,,Anytown,,WA,


0_17384


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
30,0_17384,Bonnie,Bonnie,D,Danna,Reyes,Reyes,04/30/1969,19690030,9,9.0,hickogy dr,HICKORY DR,,,Anytown,,WA,WA
31,0_17384,Bonnie,Bonnie,D,Danna,Reyes,Reyes,04/30/1969,19690030,9,,hickogy dr,,,,Anytown,,WA,


0_5474


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
32,0_5474,Dylan,Dylan,A,Aceson,Dewyer,Dewyer,07/31/2016,20160731,2055,2055.0,bridlewood dr,BRIDLEWOOD DR,,,Anyton,ANYTOWN,WA,WA
33,0_5474,Dylan,Dylan,A,Aceson,Dewyer,Dewyer,07/31/2016,20160731,2055,,bridlewood dr,,,,Anyton,,WA,


0_18927


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
34,0_18927,Danny M,Danny,,Mark,Moore,Moore,,19630522,2994,2994.0,franklin ave,FRANKLIN AVE,,,Anytown,ANYTOWN,WA,WA
35,0_18927,Danny M,Danny,,Mark,Moore,Moore,,19630522,2994,,franklin ave,,,,Anytown,,WA,


0_12284


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
36,0_12284,Inmate,Nathaniel,C,Carson,Brooks-Lebron,Brooks-Lebron,10/26/2004,20041026,2324,2327.0,n college ave,N COLLEGE AVE,,,Anytown,ANYTOWN,WA,WA
37,0_12284,Inmate,Nathaniel,C,Carson,Brooks-Lebron,Brooks-Lebron,10/26/2004,20041026,2324,2327.0,n college ave,N COLLEGE AVE,,,Anytown,ANYTOWN,WA,WA
38,0_12284,Inmate,Nathaniel,C,Carson,Brooks-Lebron,Brooks-Lebron,10/26/2004,20041026,2324,,n college ave,,,,Anytown,,WA,


0_4702


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
39,0_4702,Brooke,Brooke,A,A,Blackwell,Blackwell,06/22/1995,,5301,5301.0,e lputnah ave,W PUTNAM AVE,,,Anytown,ANYTOWN,WA,WA
40,0_4702,Brooke,Brooke,A,A,Blackwell,Blackwell,06/22/1995,,5301,3548.0,e lputnah ave,S BALDWIN,,,Anytown,ANYTOWN,WA,WA
41,0_4702,Brooke,Brooke,A,A,Blackwell,Blackwell,06/22/1995,,5301,,e lputnah ave,,,,Anytown,,WA,


0_21693


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
42,0_21693,Hordan,Jordan,M,M,Cremoja,Cremona,04/27/1998,,1807,12580.0,n oaklawn ave,LASALLE BLVD,,,Anytown,ANYTOWN,WA,WA
43,0_21693,Hordan,Jordan,M,M,Cremoja,Cremona,04/27/1998,,1807,,n oaklawn ave,,,,Anytown,,WA,


0_22602


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
44,0_22602,Male Child,Emily,J,Jenna,,Blackwell,11/27/2026,20261127,5301,,w putnam ave,,,,Anytown,,WA,


0_4991


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
45,0_4991,Ryan,Ryan,T,T,Cdawforc,Crawford,11/04/1976,,1410,,w chas mdws dr,,,,Anytown,ANYTOWN,WA,WA
46,0_4991,Ryan,Ryan,T,T,Cdawforc,Craw,11/04/1976,,1410,,w chas mdws dr,,,,Anytown,ANYTOWN,WA,WA
47,0_4991,Ryan,Ryan,T,T,Cdawforc,Crawford,11/04/1976,,1410,,w chas mdws dr,,,,Anytown,,WA,
48,0_4991,Ryan,Ryan,T,T,Cdawforc,Craw,11/04/1976,,1410,,w chas mdws dr,,,,Anytown,,WA,


0_7310


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
49,0_7310,Eiizabeyh,Elizabeth,H,Heather,Miller,Doh,08/15/1974,19740815,7809,,august dr,,,,Anytown,,WA,
50,0_7310,Eiizabeyh,Elizabeth,H,Heather,Miller,Doh,08/15/1974,19740815,7809,,august dr,,,,Anytown,,WA,


0_12456


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
51,0_12456,Gregory,Greg,J,Harrington,Harrington,John,12/27/1962,19621227,14 1 2,54 1 2,2nd avenue,2ND AVENUE,floor # 1 apartment 1,FLOOR # 1 APARTMENT 1,Anytown,ANYTOWN,WA,WA
52,0_12456,Gregory,Greg,J,Harrington,Harrington,John,12/27/1962,19621227,14 1 2,14 1 2,2nd avenue,2ND AVENUE,floor # 1 apartment 1,FLOOR # 1 APARTMENT 1,Anytown,ANYTOSWN,WA,WA
53,0_12456,Gregory,Greg,J,Harrington,Harrington,John,12/27/1962,19621227,14 1 2,14 1 2,2nd avenue,2ND AVENUE,floor # 1 apartment 1,FLOOR # 1 APARTMENT 1,Anytown,ANYTOWN,WA,WA
54,0_12456,Gregory,Greg,J,Harrington,Harrington,John,12/27/1962,19621227,14 1 2,14 1 2,2nd avenue,2ND AVENUE,floor # 1 apartment 1,FLOOR # 1 APARTMENT 1,Anytown,ANYTOWN,WA,PA
55,0_12456,Gregory,Greg,J,Harrington,Harrington,John,12/27/1962,19621227,14 1 2,,2nd avenue,,floor # 1 apartment 1,,Anytown,,WA,


0_17692


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
56,0_17692,Suzanne,Sue,J,Judith,Mcfarland,Mcfarland,11/13/1967,19671113,813,813.0,carriage hill ln,CARRIAGE HILL LN,,,Anytown,ANYTOWN,WA,WA
57,0_17692,Suzanne,Sue,J,Judith,Mcfarland,Mcfarland,11/13/1967,19671113,813,,carriage hill ln,,,,Anytown,,WA,


0_4445


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
58,0_4445,James,James,W,William,Cobnn,Conn,08/25/1942,19740213,403,403.0,rt 55,RT 55,,,Anytown,ANYTOWN,,KY
59,0_4445,James,James,W,William,Cobnn,Conn,08/25/1942,19740213,403,603.0,rt 55,RT 55,,,Anytown,,,WA
60,0_4445,James,James,W,William,Cobnn,Conn,08/25/1942,19740213,403,403.0,rt 55,RT 55,,,Anytown,ANYTOWN,,WA
61,0_4445,James,James,W,William,Cobnn,Conn,08/25/1942,19740213,403,,rt 55,,,,Anytown,,,


0_2048


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
62,0_2048,Ddsidee,Ddsiree,R,Rose,Mabry,Mabry,03/22/1956,1956032z,687,687.0,,BUCKSHOT DR,,,Anytown,ANYTOWN,CA,KY
63,0_2048,Ddsidee,Ddsiree,R,Rose,Mabry,Mabry,03/22/1956,1956032z,687,687.0,,BUCKSHOT DR,,,Anytown,ANYTOWN,CA,WA
64,0_2048,Ddsidee,Ddsiree,R,Rose,Mabry,Mabry,03/22/1956,1956032z,687,,,,,,Anytown,,CA,


0_2538


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
65,0_2538,Mary,Mary,J,J,Kam,Kam,06/04/1970,,3268,3268.0,,WALNUT STREET,floor number 1 apartment # 377,FLOOR NUMBER 3 APARTMENT # 377,Anytown,ANYTOWN,WA,WA
66,0_2538,Mary,Mary,J,J,Kam,Kam,06/04/1970,,3268,,,,floor number 1 apartment # 377,,Anytown,,WA,


0_74


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
67,0_74,Ddew,Drew,O,Omar,Kennedy,Kennedy,12/24/2001,,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
68,0_74,Ddew,Drew,O,Omar,Kennedy,Kennedy,12/24/2001,,34,34.0,bowen cir sw,BOWEM CIR SW,,,Anytown,ANYTOWN,WA,WA
69,0_74,Ddew,Drew,O,Omar,Kennedy,Kennedy,12/24/2001,,34,,bowen cir sw,,,,Anytown,,WA,


0_1471


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
70,0_1471,Kenzie,Kenzie,A,Aubrey,Turner,Turner,01/22/2007,20072201,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
71,0_1471,Kenzie,Kenzie,A,Aubrey,Turner,Turner,01/22/2007,20072201,34,,bowen cir sw,,,,Anytown,,WA,


0_1611


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
72,0_1611,Maricela,Maricela,J,Jeanette,Dhoi,Choi,12/08/1973,19730812,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,
73,0_1611,Maricela,Maricela,J,Jeanette,Dhoi,Choi,12/08/1973,19730812,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
74,0_1611,Maricela,Maricela,J,Jeanette,Dhoi,Choi,12/08/1973,19730812,34,,bowen cir sw,,,,Anytown,,WA,


0_1637


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
75,0_1637,Dad,Angelina,M,Makayla,Marko,Marko,08/06/2004,20040806,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
76,0_1637,Dad,Angelina,M,Makayla,Marko,Marko,08/06/2004,20040806,34,32.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
77,0_1637,Dad,Angelina,M,Makayla,Marko,Marko,08/06/2004,20040806,34,,bowen cir sw,,,,Anytown,,WA,


0_1980


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
78,0_1980,Charlotte,Charlotte,K,Kasey,Rinehart,Rinehart,04/23/2012,26120423,34,,bowen cir sw,,,,Anytown,,WA,
79,0_1980,Charlotte,Charlotte,K,Kasey,Rinehart,Rinehart,04/23/2012,26120423,34,,bowen cir sw,,,,Anytown,,WA,


0_2057


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
80,0_2057,Dayne,Wayne,J,Jamie,Anaya,Anaya,06/19/1975,19750619,34,34.0,biwen cir saw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
81,0_2057,Dayne,Wayne,J,Jamie,Anaya,Anaya,06/19/1975,19750619,34,,biwen cir saw,,,,Anytown,,WA,


0_2144


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
82,0_2144,,Mai,T,Thomas,Mai,Joshua,11/10/2004,20041110,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
83,0_2144,,Mai,T,Thomas,Mai,Joshua,11/10/2004,20041110,34,,bowen cir sw,,,,Anytown,,WA,


0_2767


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
84,0_2767,,Lynette,G,Geneva,Kimbrough,Kimbrough,06/20/1952,19520620,34,34.0,bowsn cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,KY,WA
85,0_2767,,Lynette,G,Geneva,Kimbrough,Kimbrough,06/20/1952,19520620,34,,bowsn cir sw,,,,Anytown,,KY,


0_2957


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
86,0_2957,Elizabeth,Elizabeth,M,Marilyn,Declined,Martin,04/03/2011,2011o403,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
87,0_2957,Elizabeth,Elizabeth,M,Marilyn,Declined,Martin,04/03/2011,2011o403,34,,bowen cir sw,,,,Anytown,,WA,


0_2990


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
88,0_2990,David,David,J,Josh,Herold,Herold,18/06/1978,19780619,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
89,0_2990,David,David,J,Josh,Herold,Herold,18/06/1978,19780619,34,,bowen cir sw,,,,Anytown,,WA,


0_3536


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
90,0_3536,Elyse,Elyse,M,Maria,Arceo,Arceo,31/06/2003,20030126,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
91,0_3536,Elyse,Elyse,M,Maria,Arceo,Arceo,31/06/2003,20030126,34,,bowen cir sw,,,,Anytown,,WA,


0_4519


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
92,0_4519,Gabriel,Wyatt,W,Gabriel,Simmons,Of House,12/20/2010,20101220,34,37.0,bowen cir sw,BOWEN CIR SW,,,Anytown,AGYTOWN,WA,WA
93,0_4519,Gabriel,Wyatt,W,Gabriel,Simmons,Of House,12/20/2010,20101220,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
94,0_4519,Gabriel,Wyatt,W,Gabriel,Simmons,Of House,12/20/2010,20101220,34,,bowen cir sw,,,,Anytown,,WA,


0_4583


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
95,0_4583,Garrett,,W,Wyatt,Nickell,Nickell,07/29/1995,19950729,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anydown,ANYTOWN,WA,WA
96,0_4583,Garrett,,W,Wyatt,Nickell,Nickell,07/29/1995,19950729,34,,bowen cir sw,,,,Anydown,,WA,


0_4763


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
97,0_4763,Doris,Doris,L,Lana,Gleeson,Gleeson,,19430504,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
98,0_4763,Doris,Doris,L,Lana,Gleeson,Gleeson,,19430504,34,,bowen cir sw,,,,Anytown,,WA,


0_4807


Unnamed: 0,simulant_id,first_name_census,first_name_reference_file,middle_name_census,middle_name_reference_file,last_name_census,last_name_reference_file,date_of_birth_census,date_of_birth_reference_file,street_number_census,street_number_reference_file,street_name_census,street_name_reference_file,unit_number_census,unit_number_reference_file,city_census,city_reference_file,state_census,state_reference_file
99,0_4807,Zoey,Zoey,A,Audrey,Leach,Leach,09/58/2010,20100908,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
100,0_4807,Zoey,Zoey,A,Audrey,Leach,Leach,09/58/2010,20100908,34,34.0,bowen cir sw,BOWEN CIR SW,,,Anytown,ANYTOWN,WA,WA
101,0_4807,Zoey,Zoey,A,Audrey,Leach,Leach,09/58/2010,20100908,34,4828.0,bowen cir sw,W LINCOLN AVE,,,Anytown,ANYTOWN,WA,WA
102,0_4807,Zoey,Zoey,A,Audrey,Leach,Leach,09/58/2010,20100908,34,,bowen cir sw,,,,Anytown,,WA,
