# Simulated PIK statistics

Here we inspect the accuracy and characteristics of the PIKs assigned,
leveraging our knowledge of ground truth from pseudopeople.

It wouldn't be possible to do the ground truth part with the real PVS, but
Layne, Wagner, and Rothhaas did something similar by redacting SSN from real records,
sending them through PVS without the SSN, and then using the true SSN
as ground truth.
The health care records they used are probably quite different from a CUF,
but they found a **very** good overall PIK accuracy (see cell below).

In [1]:
import pandas as pd

In [2]:
! date

Tue 28 Nov 2023 03:41:21 PM PST


In [3]:
data_to_use = 'small_sample'
simulated_data_output_dir = 'generate_simulated_data/output'
case_study_output_dir = 'output'

In [4]:
census_2030_piked = pd.read_parquet(f'{case_study_output_dir}/{data_to_use}/census_2030_piked.parquet')
confirmed_piks_with_ground_truth = pd.read_parquet(f'{case_study_output_dir}/{data_to_use}/confirmed_piks.parquet')

In [5]:
piked_proportion = census_2030_piked.pik.notnull().mean()
# Compare with 90.28% of input records PIKed in the 2010 CUF,
# as reported in Wagner and Layne, Table 2, p. 18 
print(f'{piked_proportion:.2%} of the input records were PIKed')

93.83% of the input records were PIKed


In [6]:
# Multiple Census rows assigned the same PIK, indicating the model thinks they are duplicates in Census
census_2030_piked.pik.value_counts().value_counts()

count
1    10259
2       45
Name: count, dtype: int64

In [7]:
# Interesting: in pseudopeople, sometimes siblings are assigned the same (common) first name, making them almost identical.
# The only giveaway is their age and DOB.
# Presumably, this tends not to happen in real life.
duplicate_piks = census_2030_piked.pik.value_counts()[census_2030_piked.pik.value_counts() > 1].index
census_2030_piked[census_2030_piked.pik.isin(duplicate_piks)].sort_values('pik')

Unnamed: 0,record_id,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year,pik
5101,simulated_census_2030_5101,0_2876,Logan,L,Stevens,23,05/01/2006,115,kensington drive,,Anytown,WA,00000,Household,Biological child,Male,White,2030,101095
5106,simulated_census_2030_5106,0_2876,Baker,A,Stevens,1,05/01/2006,115,kensington drive,,Anytown,WA,00000,Household,Grandchild,Male,White,2030,101095
9486,simulated_census_2030_9486,0_3662,Allen,R,Yoder,24,02/27/2011,2255,vint hill rd,unit # 1013,Anytown,WA,00000,Household,Biological child,Male,White,2030,102228
9487,simulated_census_2030_9487,0_3662,Aaliyah,I,,24,27/02/2011,2255,vint hill rd,unit # 1013,Anytown,WA,00000,Household,Biological child,Female,White,2030,102228
5026,simulated_census_2030_5026,0_846,Kenya,M,Then,16,08/10/2013,1921,fohney dr,,Anytown,WA,00000,Household,Biological child,Female,Latino,2030,102780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3896,simulated_census_2030_3896,0_5848,Joey,O,Catania,5,10/22/1995,3951,green acrs rd,,Anytown,WA,00000,Household,Grandchild,Male,White,2030,98537
6561,simulated_census_2030_6561,0_3750,Denise,J,Crisp,51,10/06/1997,2635,se 82nd ave,,Anytown,WA,00000,Household,Reference person,Female,Black,2030,98942
6562,simulated_census_2030_6562,0_3750,Taylor,S,Crisp,32,10/06/1997,2635,se 82nd ave,,Anytown,WA,00000,Household,Biological child,Female,Black,2030,98942
9648,simulated_census_2030_9648,0_10886,Manisha,A,Segura,30,01/19/2000,3415,w cypress st,no 4005,Anytown,WA,00000,Household,Other nonrelative,Female,Latino,2030,99482


## Ground truth statistics

In [8]:
census_2030_ground_truth = (
    pd.read_parquet(f'{simulated_data_output_dir}/{data_to_use}/simulated_census_2030_ground_truth.parquet')
)

In [9]:
# In this version of pseudopeople, there are no actual duplicates in Census,
# which means all of the duplicates identified above are wrong.
assert not census_2030_ground_truth.duplicated().any()

In [10]:
reference_files_ground_truth = pd.concat([
    pd.read_parquet(f'{simulated_data_output_dir}/{data_to_use}/simulated_geobase_reference_file_ground_truth.parquet').drop(columns=['n_unique_simulants']),
    pd.read_parquet(f'{simulated_data_output_dir}/{data_to_use}/simulated_name_dob_reference_file_ground_truth.parquet').drop(columns=['n_unique_simulants']),
], ignore_index=True)

In [11]:
# However, there can be reference file records that correspond to multiple simulants,
# due to errors in the reference file construction by SSN
n_unique_simulants = reference_files_ground_truth.groupby('record_id', as_index=False).simulant_id.nunique().rename(columns={'simulant_id': 'n_unique_simulants'})
n_unique_simulants.n_unique_simulants.value_counts()

n_unique_simulants
1    51390
2     1270
3       40
Name: count, dtype: int64

In [12]:
reference_files_ground_truth = reference_files_ground_truth.merge(
    n_unique_simulants,
    on='record_id',
    how='left',
)
reference_files_ground_truth

Unnamed: 0,record_id,simulant_id,n_unique_simulants
0,simulated_geobase_reference_file_26168,0_730,1
1,simulated_geobase_reference_file_1,0_1366,1
2,simulated_geobase_reference_file_2,0_1366,1
3,simulated_geobase_reference_file_26970,0_1366,1
4,simulated_geobase_reference_file_26971,0_1366,1
...,...,...,...
54045,simulated_name_dob_reference_file_19858,0_23319,1
54046,simulated_name_dob_reference_file_19862,0_1328,1
54047,simulated_name_dob_reference_file_19864,0_23720,1
54048,simulated_name_dob_reference_file_19868,0_5904,1


In [13]:
reference_files_ground_truth.sort_values(['n_unique_simulants', 'record_id'])

Unnamed: 0,record_id,simulant_id,n_unique_simulants
7003,simulated_geobase_reference_file_0,0_2471,1
1,simulated_geobase_reference_file_1,0_1366,1
19123,simulated_geobase_reference_file_10,0_17992,1
18152,simulated_geobase_reference_file_100,0_6438,1
25720,simulated_geobase_reference_file_1000,0_11533,1
...,...,...,...
17374,simulated_geobase_reference_file_8918,0_12218,3
17375,simulated_geobase_reference_file_8918,0_12214,3
22919,simulated_geobase_reference_file_9450,0_10359,3
22920,simulated_geobase_reference_file_9450,0_10356,3


In [14]:
possible_to_pik_proportion = census_2030_ground_truth.simulant_id.isin(reference_files_ground_truth.simulant_id).mean()
print(
    f'{(1 - possible_to_pik_proportion):.2%} of the input records are '
    'impossible to PIK correctly, since they are not in any reference files'
)

0.45% of the input records are impossible to PIK correctly, since they are not in any reference files


In [15]:
print(
    f'Assigned PIKs to {(piked_proportion / possible_to_pik_proportion):.2%} of PIK-able records'
)

Assigned PIKs to 94.26% of PIK-able records


In [16]:
reference_file_piks = pd.concat([
    pd.read_parquet(f'{simulated_data_output_dir}/{data_to_use}/simulated_geobase_reference_file.parquet', columns=['record_id', 'pik']),
    pd.read_parquet(f'{simulated_data_output_dir}/{data_to_use}/simulated_name_dob_reference_file.parquet', columns=['record_id', 'pik']),
], ignore_index=True)
reference_file_piks

Unnamed: 0,record_id,pik
0,simulated_geobase_reference_file_0,105906
1,simulated_geobase_reference_file_1,104653
2,simulated_geobase_reference_file_2,104653
3,simulated_geobase_reference_file_3,106223
4,simulated_geobase_reference_file_4,106223
...,...,...
52695,simulated_name_dob_reference_file_19870,108816
52696,simulated_name_dob_reference_file_19871,108817
52697,simulated_name_dob_reference_file_19872,108818
52698,simulated_name_dob_reference_file_19873,108819


In [17]:
assert reference_file_piks.record_id.is_unique

In [18]:
pik_simulant_pairs = reference_files_ground_truth.merge(reference_file_piks, on='record_id')[['pik', 'simulant_id']].drop_duplicates()

In [19]:
# However, there can be PIKs that correspond to multiple simulants,
# due to errors in the reference file construction by SSN
n_unique_simulants = pik_simulant_pairs.groupby('pik', as_index=False).simulant_id.nunique().rename(columns={'simulant_id': 'n_unique_simulants'})
n_unique_simulants.n_unique_simulants.value_counts()

n_unique_simulants
1    17895
2     1034
3       50
Name: count, dtype: int64

In [20]:
pik_simulant_pairs = pik_simulant_pairs.merge(
    n_unique_simulants,
    on='pik',
    how='left',
)
pik_simulant_pairs

Unnamed: 0,pik,simulant_id,n_unique_simulants
0,104068,0_730,1
1,104653,0_1366,1
2,106546,0_1812,1
3,104654,0_3267,1
4,108181,0_19848,1
...,...,...,...
20108,104720,0_20757,1
20109,91181,0_4838,1
20110,94888,0_15626,1
20111,96729,0_8746,1


In [21]:
pik_simulant_pairs.sort_values(['n_unique_simulants', 'pik'])

Unnamed: 0,pik,simulant_id,n_unique_simulants
10987,100000,0_19054,1
11828,100001,0_19670,1
15787,100003,0_19495,1
9396,100006,0_19545,1
16153,100007,0_13364,1
...,...,...,...
2868,99589,0_18049,3
17688,99589,0_9421,3
2931,99808,0_1122,3
2932,99808,0_1121,3


## Definitions of accuracy

1. (most strict) Assigning any PIK with multiple simulants is incorrect
2. Assigning a PIK with multiple simulants is neither incorrect nor correct (excluded from denominator)
3. (most lenient) Assigning a PIK with multiple simulants is correct, as long as at least one of those simulants matches the truth

In [22]:
# All modules, Medicare database, calculated from Layne, Wagner, and Rothhaas Table 1 (p. 15)
real_life_pvs_accuracy = 1 - (2_585 + 60_709 + 129_480 + 89_094) / (52_406_981 + 5_170_924 + 49_374_794 + 50_327_034)
f'{real_life_pvs_accuracy:.5%}'

'99.82079%'

### Definition 1

In [23]:
piks_assigned = census_2030_piked.pik.notnull().sum()
piks_assigned

10349

In [24]:
single_sim_piks_correct = (
    census_2030_piked[['record_id', 'pik']].merge(pik_simulant_pairs, on='pik').merge(census_2030_ground_truth, on='record_id')
        .pipe(lambda df: (df.simulant_id_x == df.simulant_id_y) & (df.n_unique_simulants == 1))
        .sum()
)
single_sim_piks_correct

9472

In [25]:
# Overall accuracy, treating it as a black box
(
    single_sim_piks_correct / piks_assigned
)

0.9152575128031694

In [26]:
assert len(confirmed_piks_with_ground_truth) == piks_assigned

In [27]:
census_2030_ground_truth.rename(columns={'record_id': 'record_id_census_2030'})

Unnamed: 0,record_id_census_2030,simulant_id
0,simulated_census_2030_0,0_923
1,simulated_census_2030_1,0_2348
2,simulated_census_2030_2,0_2641
3,simulated_census_2030_3,0_6176
4,simulated_census_2030_4,0_10251
...,...,...
11024,simulated_census_2030_11024,0_17003
11025,simulated_census_2030_11025,0_19380
11026,simulated_census_2030_11026,0_20272
11027,simulated_census_2030_11027,0_21997


In [28]:
# Looking at whether the exact *record* linked was from the same simulant
single_sim_record_links_correct = (
    confirmed_piks_with_ground_truth
        .merge(
            census_2030_ground_truth.rename(columns={'record_id': 'record_id_raw_input_file'}),
            on='record_id_raw_input_file',
        )
        .merge(
            reference_files_ground_truth.rename(columns={'record_id': 'record_id_reference_file'}),
            on='record_id_reference_file',
        )
        .pipe(lambda df: (df.simulant_id_x == df.simulant_id_y) & (df.n_unique_simulants == 1))
        .sum()
)
single_sim_record_links_correct

9660

In [29]:
(
    single_sim_record_links_correct / piks_assigned
)

0.9334235191805972

### Definition 2

In [30]:
single_sim_piks_assigned = len(census_2030_piked[['record_id', 'pik']].merge(pik_simulant_pairs[pik_simulant_pairs.n_unique_simulants == 1][['pik', 'simulant_id']]))
single_sim_piks_assigned

9515

In [31]:
# Overall accuracy, treating it as a black box
(
    single_sim_piks_correct / single_sim_piks_assigned
)

0.9954808197582764

In [32]:
# Looking at whether the exact *record* linked was from the same simulant
single_sim_record_links_assigned = (
    (confirmed_piks_with_ground_truth
        .merge(
            reference_files_ground_truth.rename(columns={'record_id': 'record_id_reference_file'}),
            on='record_id_reference_file',
        )
        .n_unique_simulants == 1).sum()
)
single_sim_record_links_assigned

9703

In [33]:
(
    single_sim_record_links_correct / single_sim_record_links_assigned
)

0.9955683809131196

### Definition 3

In [34]:
piks_at_least_partially_correct = (
    census_2030_piked[['record_id', 'pik']].merge(pik_simulant_pairs, on='pik').merge(census_2030_ground_truth, on='record_id')
        .assign(correct=lambda df: df.simulant_id_x == df.simulant_id_y)
)
piks_at_least_partially_correct

Unnamed: 0,record_id,pik,simulant_id_x,n_unique_simulants,simulant_id_y,correct
0,simulated_census_2030_0,89484,0_923,1,0_923,True
1,simulated_census_2030_1,98736,0_2348,1,0_2348,True
2,simulated_census_2030_2,91258,0_2641,1,0_2641,True
3,simulated_census_2030_3,90622,0_6176,1,0_6176,True
4,simulated_census_2030_4,96379,0_10251,1,0_10251,True
...,...,...,...,...,...,...
11216,simulated_census_2030_11023,105205,0_21139,1,0_21139,True
11217,simulated_census_2030_11024,95963,0_17003,1,0_17003,True
11218,simulated_census_2030_11026,104164,0_20272,1,0_20272,True
11219,simulated_census_2030_11027,106182,0_21997,1,0_21997,True


In [35]:
# Overall accuracy, treating it as a black box
piks_correct_proportion = (piks_at_least_partially_correct.correct.sum() / piks_assigned)
piks_correct_proportion

0.9956517537926369

In [36]:
print(f'{piks_correct_proportion:.5%} of the PIKs assigned were correct; compare with {real_life_pvs_accuracy:.5%} in real life')

99.56518% of the PIKs assigned were correct; compare with 99.82079% in real life


In [37]:
# Looking at whether the exact *record* linked was from the same simulant
sim_record_links_at_least_partially_correct = (
    confirmed_piks_with_ground_truth
        .merge(
            census_2030_ground_truth.rename(columns={'record_id': 'record_id_raw_input_file'}),
            on='record_id_raw_input_file',
        )
        .merge(
            reference_files_ground_truth.rename(columns={'record_id': 'record_id_reference_file'}),
            on='record_id_reference_file',
        )
        .assign(correct=lambda df: df.simulant_id_x == df.simulant_id_y)
)
sim_record_links_at_least_partially_correct

Unnamed: 0,record_id_raw_input_file,pik,record_id_census_2030,record_id_reference_file,module_name,pass_name,match_probability,simulant_id_x,simulant_id_y,n_unique_simulants,correct
0,simulated_census_2030_0,89484,census_2030_preprocessed_0,simulated_geobase_reference_file_951,geosearch,geokey,1.000000,0_923,0_923,1,True
1,simulated_census_2030_1,98736,census_2030_preprocessed_1,simulated_geobase_reference_file_17348,geosearch,geokey,1.000000,0_2348,0_2348,1,True
2,simulated_census_2030_10,94481,census_2030_preprocessed_10,simulated_geobase_reference_file_9789,geosearch,geokey,1.000000,0_13975,0_13975,1,True
3,simulated_census_2030_100,100835,census_2030_preprocessed_100,simulated_geobase_reference_file_21247,geosearch,some name and DOB information,1.000000,0_12975,0_12975,1,True
4,simulated_census_2030_1000,93179,census_2030_preprocessed_1000,simulated_geobase_reference_file_7496,geosearch,geokey,1.000000,0_208,0_208,1,True
...,...,...,...,...,...,...,...,...,...,...,...
11012,simulated_census_2030_2161,89295,census_2030_preprocessed_2161,simulated_geobase_reference_file_608,hhcompsearch,year of birth,0.995957,0_6656,0_6656,1,True
11013,simulated_census_2030_3395,106196,census_2030_preprocessed_3395,simulated_geobase_reference_file_29096,hhcompsearch,year of birth,0.995957,0_21992,0_21992,1,True
11014,simulated_census_2030_5275,105747,census_2030_preprocessed_5275,simulated_geobase_reference_file_28463,hhcompsearch,year of birth,0.995957,0_21607,0_21607,1,True
11015,simulated_census_2030_6921,103290,census_2030_preprocessed_6921,simulated_geobase_reference_file_25197,hhcompsearch,year of birth,0.995957,0_7483,0_7483,1,True


In [38]:
(
    sim_record_links_at_least_partially_correct.correct.sum() / piks_assigned
)

0.9956517537926369

In [39]:
assert (confirmed_piks_with_ground_truth.groupby('record_id_raw_input_file').record_id_reference_file.nunique() <= 1).all()

In [40]:
# Using definition 3 -- at the PIK level
piks_at_least_partially_correct = (
    piks_at_least_partially_correct
        .rename(columns={'record_id': 'record_id_raw_input_file'})
        .merge(confirmed_piks_with_ground_truth[['record_id_raw_input_file', 'module_name', 'pass_name']], on='record_id_raw_input_file')
)
piks_at_least_partially_correct

Unnamed: 0,record_id_raw_input_file,pik,simulant_id_x,n_unique_simulants,simulant_id_y,correct,module_name,pass_name
0,simulated_census_2030_0,89484,0_923,1,0_923,True,geosearch,geokey
1,simulated_census_2030_1,98736,0_2348,1,0_2348,True,geosearch,geokey
2,simulated_census_2030_2,91258,0_2641,1,0_2641,True,geosearch,geokey
3,simulated_census_2030_3,90622,0_6176,1,0_6176,True,geosearch,some name and DOB information
4,simulated_census_2030_4,96379,0_10251,1,0_10251,True,geosearch,some name and DOB information
...,...,...,...,...,...,...,...,...
11216,simulated_census_2030_11023,105205,0_21139,1,0_21139,True,geosearch,some name and DOB information
11217,simulated_census_2030_11024,95963,0_17003,1,0_17003,True,geosearch,some name and DOB information
11218,simulated_census_2030_11026,104164,0_20272,1,0_20272,True,geosearch,geokey
11219,simulated_census_2030_11027,106182,0_21997,1,0_21997,True,geosearch,geokey


In [41]:
# Accuracy by module -- note that this shows the opposite pattern (with the sample data)
# relative to the results of Layne et al., who found GeoSearch was much *more* accurate
piks_at_least_partially_correct.groupby("module_name").correct.agg(["mean", "size"]).sort_values("mean")

Unnamed: 0_level_0,mean,size
module_name,Unnamed: 1_level_1,Unnamed: 2_level_1
geosearch,0.917378,10784
namesearch,0.930514,331
dobsearch,0.97,100
hhcompsearch,1.0,6


In [42]:
# Accuracy by pass -- could be used to tune pass-specific cutoffs, but
# this might not be too informative while we are still using the sample data.
piks_at_least_partially_correct.groupby(["module_name", "pass_name"]).correct.agg(["mean", "size"]).sort_values("mean")

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,size
module_name,pass_name,Unnamed: 2_level_1,Unnamed: 3_level_1
namesearch,DOB and initials,0.883117,77
geosearch,geokey,0.915223,8658
geosearch,house number and street name Soundex,0.918391,870
geosearch,some name and DOB information,0.931529,1256
namesearch,DOB and NYSIIS of name,0.940367,218
dobsearch,reverse Soundex of name,0.961538,26
namesearch,birthday and first two characters of name,0.96875,32
dobsearch,first two characters of first name and year of birth,0.972973,74
hhcompsearch,year of birth,1.0,6
namesearch,year of birth and first two characters of name,1.0,4


In [43]:
# Using definition 3 -- at the link level
sim_record_links_at_least_partially_correct.groupby("module_name").correct.agg(["mean", "size"]).sort_values("mean")

Unnamed: 0_level_0,mean,size
module_name,Unnamed: 1_level_1,Unnamed: 2_level_1
geosearch,0.932774,10606
dobsearch,1.0,97
hhcompsearch,1.0,6
namesearch,1.0,308


In [44]:
sim_record_links_at_least_partially_correct.groupby(["module_name", "pass_name"]).correct.agg(["mean", "size"]).sort_values("mean")

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,size
module_name,pass_name,Unnamed: 2_level_1,Unnamed: 3_level_1
geosearch,geokey,0.928739,8532
geosearch,some name and DOB information,0.948905,1233
geosearch,house number and street name Soundex,0.950059,841
dobsearch,reverse Soundex of name,1.0,25
dobsearch,first two characters of first name and year of birth,1.0,72
hhcompsearch,year of birth,1.0,6
namesearch,DOB and NYSIIS of name,1.0,205
namesearch,DOB and initials,1.0,68
namesearch,birthday and first two characters of name,1.0,31
namesearch,year of birth and first two characters of name,1.0,4


### Incorrect PIKs

In [45]:
incorrectly_linked_pairs = (
    sim_record_links_at_least_partially_correct[~sim_record_links_at_least_partially_correct.correct]
        [["record_id_raw_input_file", "record_id_reference_file"]].drop_duplicates()
)
incorrectly_linked_pairs

Unnamed: 0,record_id_raw_input_file,record_id_reference_file
6,simulated_census_2030_10000,simulated_geobase_reference_file_5372
9,simulated_census_2030_10003,simulated_geobase_reference_file_2568
11,simulated_census_2030_10005,simulated_geobase_reference_file_16813
78,simulated_census_2030_10074,simulated_geobase_reference_file_19504
79,simulated_census_2030_10075,simulated_geobase_reference_file_20797
...,...,...
10487,simulated_census_2030_9885,simulated_geobase_reference_file_21798
10494,simulated_census_2030_9890,simulated_geobase_reference_file_30805
10497,simulated_census_2030_9894,simulated_geobase_reference_file_3334
10518,simulated_census_2030_9915,simulated_geobase_reference_file_7641


In [46]:
comparison_cols = [
    "first_name",
    "middle_name",
    "last_name",
    "date_of_birth",
]

address_cols = [
    "street_number",
    "street_name",
    "unit_number",
    "city",
    "state",
]

census_incorrectly_linked = (
    census_2030_piked.rename(columns={"record_id": "record_id_raw_input_file", "middle_initial": "middle_name"})
        .merge(incorrectly_linked_pairs, on="record_id_raw_input_file", how="right")
        .sort_values("record_id_raw_input_file")
        .reset_index()
)

reference_file_incorrectly_linked = (
    pd.concat([
        pd.read_parquet(
            f'{simulated_data_output_dir}/{data_to_use}/simulated_geobase_reference_file.parquet',
            columns=['record_id'] + comparison_cols + ['mailing_address_' + c for c in address_cols],
            filters=[("record_id", "in", incorrectly_linked_pairs.record_id_reference_file)],
        ),
        pd.read_parquet(
            f'{simulated_data_output_dir}/{data_to_use}/simulated_name_dob_reference_file.parquet',
            columns=['record_id'] + comparison_cols,
            filters=[("record_id", "in", incorrectly_linked_pairs.record_id_reference_file)],
        ),
    ], ignore_index=True)
        .rename(columns={"record_id": "record_id_reference_file"})
        .rename(columns=lambda c: c.replace('mailing_address_', ''))
        .merge(incorrectly_linked_pairs, on="record_id_reference_file", how="right")
        .sort_values("record_id_raw_input_file")
        .reset_index()
)

census_incorrectly_linked[comparison_cols + address_cols].compare(
    reference_file_incorrectly_linked[comparison_cols + address_cols],
    keep_shape=True,
    keep_equal=True,
)

Unnamed: 0_level_0,first_name,first_name,middle_name,middle_name,last_name,last_name,date_of_birth,date_of_birth,street_number,street_number,street_name,street_name,unit_number,unit_number,city,city,state,state
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other
0,William,William,J,Jeffrey,Babcock,Babcock,07/26/1962,19620726,329,329,scofield rd,SCOFIELD RD,,,Anytown,ANYTOWN,WA,WA
1,Eric,Eric,R,Richard,Zigler,Zigler,05/06/1952,19520506,21,21,joyce ct,JOYCE CT,# 9,# 9,Anytown,ANYTOWN,WA,WA
2,Olivia,Olivia,QA,Asha,Dunn,Dunn,05/01/1995,19950501,21,21,joyce ct,JOYCE CT,# 9,# 9,Anytown,ANYTOWN,WA,WA
3,Paige,Paige,M,Madison,Noble,Noble,11/05/2001,20011105,15,15,terrington dr,TERRINGTON DR,,,Anytown,ANYTOWN,WA,WA
4,Emily,Emily,I,Isis,Noble,Noble,06/24/2004,20340624,15,15,terrington dr,TERRINGTON DR,,,Anytown,ANYTOWN,WA,WA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,David,David,J,Jordan,Ochoa,Ochoa,06/07/2006,20060607,4427,4427,arkwright st,ARKWRIGHT ST,,,Anytown,ANYTOWN,WA,WA
685,Sophia,Sophia,A,Amelia,Carrillo,Carrillo,01/01/2028,20261125,4427,4427,arkwright st,ARKWRIGHT ST,,,Anytown,ANYTOWN,WA,WA
686,Swrah,Heidi,M,Maureen,Corcoran,Grams,07/26/1955,19550726,9250,9250,w iron ave,W IRON AVE,,,Anytown,ANYTOWN,WA,WA
687,Jeremy,Jeremy,K,Kevin,Mcarthur,Mcarthur,09/14/1969,19690914,900,900,sw inez st,SW INEZ ST,,,Anytown,ANYTOWN,WA,WA
