# Generate simulated data to link

Using v0.6.4 of the `pseudopeople` package.

In [1]:
import pseudopeople
import pandas as pd, numpy as np

In [2]:
!pip freeze | grep pseudopeople

pseudopeople==0.6.4


## Load simulated data

Imagined scenario: PIKing the 2030 census.

One way to do this:
* Use (cumulative) SSA Numident up to that time.
* Link it (deterministically, using SSN) to taxes to get the most recent address for each person.
  * Likely would use 1040 here, but I'll use W2 for now.
* Link probabilistically to the census data.

In [3]:
default_configuration = pseudopeople.get_config()

In [4]:
def column_noise_value(dataset, column, noise_type, default_value):
    if dataset in ('decennial_census', 'taxes_w2_and_1099', 'social_security'):
        if noise_type == "make_typos":
            if column == "middle_initial":
                return {"cell_probability": 0.05, "token_probability": 1}
            elif column in ("first_name", "last_name", "street_name"):
                return {"cell_probability": 0.1, "token_probability": 0.1}
        elif noise_type == "write_wrong_digits":
            return {"cell_probability": 0.1, "token_probability": 0.1}

    return default_value

def row_noise_value(dataset, noise_type, default_value):
    return default_value

In [5]:
custom_configuration = {
    dataset: {
        noise_category: (
            ({
                column: {
                    noise_type: column_noise_value(dataset, column, noise_type, noise_type_config)
                    for noise_type, noise_type_config in column_config.items()
                }
                for column, column_config in noise_category_config.items()
            }
            if noise_category == "column_noise" else
            {
                noise_type: row_noise_value(dataset, noise_type, noise_type_config)
                for noise_type, noise_type_config in noise_category_config.items()
            })
        )
        for noise_category, noise_category_config in dataset_config.items()
    }
    for dataset, dataset_config in default_configuration.items()
}

In [6]:
%%time

# Here I've figured that there would be some delay in getting the Numident -- so by Census processing time
# for the 2030 Census, only the SSA by the end of 2029 would be available.
# Note that with pseudopeople's current design it is only possible to set a cutoff at the end of a calendar year.
ssa = pseudopeople.generate_social_security(year=2029, config=custom_configuration)
ssa

                                                               

CPU times: user 871 ms, sys: 31.5 ms, total: 903 ms
Wall time: 925 ms




Unnamed: 0,simulant_id,first_name,middle_initial,last_name,date_of_birth,ssn,event_type,event_date
0,0_19979,Mary,M,Pierce,12/04/1919,786-77-6454,creation,19191204
1,0_6846,Peter,M,Mundell,06/07/1921,688-88-6377,creation,19210607
2,0_19941,Anna,H,Causey,03/07/1922,665-25-7858,creation,12220307
3,0_19825,Gertrude,M,Osornia,05/11/1922,875-10-2359,creation,19220511
4,0_19806,Edna,A,Hunter,05/25/1922,420-19-3737,creation,19220525
...,...,...,...,...,...,...,...,...
20027,0_23620,Mila,M,Saldana,01/09/2030,133-85-8593,creation,20291218
20028,0_23629,Luna,N,Bonnell,01/09/2030,422-69-9071,creation,20291218
20029,0_23630,Charlotte,A,May,01/10/2030,826-03-0946,creation,20291218
20030,0_23624,Liam,C,Vanover,01/12/2030,778-37-9317,creation,20291218


In [7]:
%%time

# Consider the last few years of taxes -- 2029 taxes would be filed a couple months before Census day 2030
w2_1099 = pd.concat([
    pseudopeople.generate_taxes_w2_and_1099(year=year, config=custom_configuration).assign(tax_year=year) for year in (2025, 2026, 2027, 2028, 2029)
], ignore_index=True)
w2_1099

                                                               

CPU times: user 5.69 s, sys: 172 ms, total: 5.86 s
Wall time: 5.87 s




Unnamed: 0,simulant_id,first_name,middle_initial,last_name,age,date_of_birth,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,...,employer_id,employer_name,employer_street_number,employer_street_name,employer_unit_number,employer_city,employer_state,employer_zipcode,tax_form,tax_year
0,0_5,Michelle,M,Ticas,44,08/10/1981,1312,commonwealth avnue,,,...,12,Jj Rubys Salon Studios,1300,windsor lane,,Anytown,US,00000,W2,2025
1,0_5,Michelle,M,Ticas,44,08/10/1981,1312,commonwealth avnue,,,...,60,Freeway Insurance Agency,1105,largess ln,,Anytown,US,00000,W2,2025
2,0_5623,Gloria,A,Quintana,52,07/23/1973,,,,14011.0,...,46,Nashville City Properties,411,sthe 20th avenue,,Anytown,US,00000,W2,2025
3,0_7252,Tamara,D,Sosa,48,05/15/1977,15,julian dr,,,...,69,Rancho Vistoso Trails Mental Health,4056,goliad st,,Anytown,US,00000,W2,2025
4,0_7252,Tamara,D,Sosa,48,05/15/1977,15,julian dr,,,...,84,New Era Home,222,w hemlock st,,Anytown,US,00000,W2,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54141,0_3456,Amanda,M,Mitchell,49,02/15/1980,3,goodland avnue,,,...,84,New Era Home,222,w hemlock st,,Anytown,US,00000,W2,2029
54142,0_3457,Steven,R,Mitchell,49,03/13/1980,3,goodland avnue,,,...,75,France,2506,mccullough lane,,Anytown,US,00000,W2,2029
54143,0_19046,Delbert,D,Hawkins,89,03/15/1940,3,goodland avnue,,,...,43,Ram Fashion Nail,20308,hancock str,,Anytown,US,00000,W2,2029
54144,0_19046,Delbert,D,Hawkins,89,03/15/1940,3,goodland avnue,,,...,53,A Car Title Loans,6100,e ball rd,,Anytown,US,00000,W2,2029


In [8]:
# Slightly hacky workaround for a bug in pseudopeople with the type of the PO box column
# We make sure everything is a string and remove the decimal part
# (we know the decimal point will still be there since there is no noise type that currently affects punctuation)
po_box_fixed = w2_1099.mailing_address_po_box.astype(str).str.replace('\\..*$', '', regex=True).replace('nan', np.nan)

assert po_box_fixed[po_box_fixed.apply(type) == float].isnull().all()
assert not po_box_fixed[(po_box_fixed.apply(type) == str)].str.contains('.', regex=False).any()

po_box_fixed.apply(type).value_counts()

mailing_address_po_box
<class 'float'>    52269
<class 'str'>       1877
Name: count, dtype: int64

In [9]:
po_box_fixed[po_box_fixed.notnull()]

2        14011
69        6846
165       9973
200      16924
218      10131
         ...  
53589    10937
53765    10066
53839      984
53841      984
53871    18713
Name: mailing_address_po_box, Length: 1877, dtype: object

In [10]:
w2_1099['mailing_address_po_box'] = po_box_fixed

In [11]:
%%time

census_2030 = pseudopeople.generate_decennial_census(year=2030, config=custom_configuration)
census_2030

                                                               

CPU times: user 526 ms, sys: 8.31 ms, total: 534 ms
Wall time: 542 ms




Unnamed: 0,simulant_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relation_to_reference_person,sex,race_ethnicity
0,0_923,John,E,Mcueever,86,06/29/1942,147-153,browning ave,,Anytown,US,00000,Reference person,Male,Black
1,0_2641,Sharon,T,Schmidt,69,10/50/1960,109,stqllion sr,,Anytown,US,00000,Reference person,Female,White
2,0_6176,Gail,K,Durand,77,01/03/1953,2115,cannon dr,,Anytown,US,00000,Reference person,Female,Multiracial or Other
3,0_13972,John,J,Williams,81,11/24/1948,146,delaware av,,Anytown,US,00000,Reference person,Male,White
4,0_13973,Child,L,Wukliamz,81,09/27/1948,146,delaware av,,Anytown,US,00000,Opp-sex spouse,Female,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11048,0_22741,Chloe,A,Maryknez-Alvarez,21,07/12/2008,207,harrison st,,Anytown,US,00000,Biological child,Female,Latino
11049,0_22742,Zachary,E,Martinez-Alvarez,18,06/29/2011,207,harrison st,,Anytown,US,00000,Biological child,Male,
11050,0_22743,Madeline,A,Martinez-Alvarez,16,08/12/2013,207,harrison st,,Anytown,US,00000,Biological child,Female,Latino
11051,0_23271,Naomi,A,Martinez-Aldarez,1,11/01/2028,207,harrison st,,Anytown,US,00000,Grandchild,Female,Latino


In [12]:
# Similar to the above issue, there is weird type stuff with age
age_fixed = census_2030['age'].astype(str).replace('nan', np.nan)

assert age_fixed[age_fixed.apply(type) == float].isnull().all()
assert not age_fixed[age_fixed.apply(type) == str].str.contains('.', regex=False).any()

age_fixed.apply(type).value_counts()

age
<class 'str'>      10963
<class 'float'>       90
Name: count, dtype: int64

In [13]:
age_fixed[age_fixed.notnull()]

0        86
1        69
2        77
3        81
4        81
         ..
11048    21
11049    18
11050    16
11051     1
11052    47
Name: age, Length: 10963, dtype: object

In [14]:
census_2030['age'] = age_fixed

## Deterministically get a recent address

In [15]:
# Within each year, we do not have a date field for W2. So we don't know which order jobs happened in.
# We take the address associated with the most income in the most recent year. Note that part or all
# of this address may be missing.
w2_1099.groupby(['tax_year', 'ssn']).mailing_address_street_name.nunique(dropna=False).sort_values()

tax_year  ssn        
2025      000-74-9102    1
2028      308-42-7924    1
          308-48-4345    1
          308-78-1837    1
          309-02-7977    1
                        ..
2025      681-62-4798    2
          325-59-2336    2
2029      298-41-5379    2
2027      413-65-8440    2
2029      385-02-7530    3
Name: mailing_address_street_name, Length: 40697, dtype: int64

In [16]:
w2_1099.ssn.nunique()

14796

In [17]:
w2_1099

Unnamed: 0,simulant_id,first_name,middle_initial,last_name,age,date_of_birth,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,...,employer_id,employer_name,employer_street_number,employer_street_name,employer_unit_number,employer_city,employer_state,employer_zipcode,tax_form,tax_year
0,0_5,Michelle,M,Ticas,44,08/10/1981,1312,commonwealth avnue,,,...,12,Jj Rubys Salon Studios,1300,windsor lane,,Anytown,US,00000,W2,2025
1,0_5,Michelle,M,Ticas,44,08/10/1981,1312,commonwealth avnue,,,...,60,Freeway Insurance Agency,1105,largess ln,,Anytown,US,00000,W2,2025
2,0_5623,Gloria,A,Quintana,52,07/23/1973,,,,14011,...,46,Nashville City Properties,411,sthe 20th avenue,,Anytown,US,00000,W2,2025
3,0_7252,Tamara,D,Sosa,48,05/15/1977,15,julian dr,,,...,69,Rancho Vistoso Trails Mental Health,4056,goliad st,,Anytown,US,00000,W2,2025
4,0_7252,Tamara,D,Sosa,48,05/15/1977,15,julian dr,,,...,84,New Era Home,222,w hemlock st,,Anytown,US,00000,W2,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54141,0_3456,Amanda,M,Mitchell,49,02/15/1980,3,goodland avnue,,,...,84,New Era Home,222,w hemlock st,,Anytown,US,00000,W2,2029
54142,0_3457,Steven,R,Mitchell,49,03/13/1980,3,goodland avnue,,,...,75,France,2506,mccullough lane,,Anytown,US,00000,W2,2029
54143,0_19046,Delbert,D,Hawkins,89,03/15/1940,3,goodland avnue,,,...,43,Ram Fashion Nail,20308,hancock str,,Anytown,US,00000,W2,2029
54144,0_19046,Delbert,D,Hawkins,89,03/15/1940,3,goodland avnue,,,...,53,A Car Title Loans,6100,e ball rd,,Anytown,US,00000,W2,2029


In [18]:
recent_addresses = (
    w2_1099[w2_1099.ssn.notnull()].assign(income=lambda x: x.income.fillna(0).astype(float)).sort_values(['tax_year', 'income'], ascending=False).drop_duplicates('ssn').set_index('ssn').filter(like='mailing_address')
)
recent_addresses

Unnamed: 0_level_0,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,mailing_address_state,mailing_address_zipcode
ssn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
298-65-5311,3232,maple grove ln,,,Anytown,US,00000
189-95-7819,6250,strand cir,,,Anytown,US,00000
716-56-1374,202,valley green rd,,,Anytown,US,00000
461-95-3444,868,se 9th cir,,,Anytown,US,00000
231-44-7736,12055,westdale avnue,,,Anytown,US,00000
...,...,...,...,...,...,...,...
370-93-5222,17632,wyoming avenue,,,Anytown,US,00000
172-62-3409,93,w beverly rd,,,Anytown,US,00000
108-87-0184,11836,sunset av,,,Anytown,US,00000
095-30-8721,12729,stadium blvd s,,,Anytown,US,00000


In [19]:
# Everyone with any taxes has a most recent address.
assert set(recent_addresses.index) == set(w2_1099.ssn.dropna().unique())

In [20]:
# Some people won't have a recent address, if they don't have any taxes.
ssa[~ssa.ssn.isin(recent_addresses.index)]

Unnamed: 0,simulant_id,first_name,middle_initial,last_name,date_of_birth,ssn,event_type,event_date
0,0_19979,Mary,M,Pierce,12/04/1919,786-77-6454,creation,19191204
1,0_6846,Peter,M,Mundell,06/07/1921,688-88-6377,creation,19210607
2,0_19941,Anna,H,Causey,03/07/1922,665-25-7858,creation,12220307
3,0_19825,Gertrude,M,Osornia,05/11/1922,875-10-2359,creation,19220511
4,0_19806,Edna,A,Hunter,05/25/1922,420-19-3737,creation,19220525
...,...,...,...,...,...,...,...,...
20027,0_23620,Mila,M,Saldana,01/09/2030,133-85-8593,creation,20291218
20028,0_23629,Luna,N,Bonnell,01/09/2030,422-69-9071,creation,20291218
20029,0_23630,Charlotte,A,May,01/10/2030,826-03-0946,creation,20291218
20030,0_23624,Liam,C,Vanover,01/12/2030,778-37-9317,creation,20291218


## Create a fake Numident file

In [21]:
# The probable real-life approach would be: take date of birth from the (first) creation event, date of death (if any)
# from the (last) death event, name from the most recent event of any kind.
# We don't want to throw out events with a missing/invalid date, so we'll fill them with the value *least* likely to be chosen
# (early if taking the latest, late if taking the earliest).
fill_dates = lambda df, fill_type: pd.to_datetime(df.event_date, errors='coerce').fillna(pd.Timestamp('2100-01-01' if fill_type == 'latest' else '1900-01-01'))

date_of_birth = (
    ssa[ssa.event_type == 'creation']
        .assign(event_date_for_sort=lambda df: fill_dates(df, 'latest'))
        .sort_values('event_date_for_sort')
        .drop_duplicates('ssn', keep='first')
        .set_index('ssn')
        .date_of_birth
)
date_of_death = (
    ssa[ssa.event_type == 'death']
        .assign(event_date_for_sort=lambda df: fill_dates(df, 'earliest'))
        .sort_values('event_date_for_sort')
        .drop_duplicates('ssn', keep='last')
        .set_index('ssn')
        .event_date
        .rename('date_of_death')
)
name = (
    ssa
        .assign(event_date_for_sort=lambda df: fill_dates(df, 'earliest'))
        .sort_values('event_date_for_sort')
        .drop_duplicates('ssn', keep='last')
        .set_index('ssn')[['first_name', 'middle_initial', 'last_name']]
)

# What to do about ground truth here? This simple linkage could already be introducing errors, since SSN is not without noise!
# For now, I'll take the most common ground truth.
simulant_id = ssa.groupby('ssn').simulant_id.agg(lambda x: pd.Series.mode(x)[0])

fake_numident = pd.DataFrame(simulant_id).join(date_of_birth, how='left').join(name, how='left').join(date_of_death, how='left').reset_index()
fake_numident

Unnamed: 0,ssn,simulant_id,date_of_birth,first_name,middle_initial,last_name,date_of_death
0,001-02-4588,0_13602,08/08/2008,Isabella,G,Windom,
1,001-15-8330,0_16514,05/04/1976,Gerald,J,Beckham,
2,001-16-0077,0_13906,02/07/1970,Jerald,J,Alvarez,
3,001-17-9511,0_13442,11/20/1966,Teresa,A,Togni,
4,001-25-8258,0_22495,06/29/2026,Bethany,G,Tenorio,
...,...,...,...,...,...,...,...
18769,976-30-9537,0_4258,06/12/1976,Aron,C,Frausto Ferretiz,
18770,978-78-6109,0_19947,05/22/1963,Claude,M,Page,
18771,979-44-7835,0_20792,08/01/1979,Thomas,A,Martinez-Puentes,
18772,998-22-9577,0_9017,04/17/2002,Jeffery,P,Shaw,


In [22]:
# Most people have not died
fake_numident.date_of_death.isnull().mean()

0.9172792159369341

## Create a composite reference file for linking

In [23]:
reference_file = (
    # Exclude those who have died before the census
    fake_numident[~(pd.to_datetime(fake_numident.date_of_death, errors='coerce') <= pd.Timestamp('2020-04-01'))]
        .merge(recent_addresses, on='ssn', how='left')
)
reference_file

Unnamed: 0,ssn,simulant_id,date_of_birth,first_name,middle_initial,last_name,date_of_death,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,mailing_address_state,mailing_address_zipcode
0,001-02-4588,0_13602,08/08/2008,Isabella,G,Windom,,685,emerson st,,,Anytown,US,00000
1,001-15-8330,0_16514,05/04/1976,Gerald,J,Beckham,,5010,south doctor martin luther king jr dr,,,Anytown,US,00000
2,001-16-0077,0_13906,02/07/1970,Jerald,J,Alvarez,,,,,,,,
3,001-17-9511,0_13442,11/20/1966,Teresa,A,Togni,,150,s sheldon rd,,,Anytown,US,00000
4,001-25-8258,0_22495,06/29/2026,Bethany,G,Tenorio,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18521,976-30-9537,0_4258,06/12/1976,Aron,C,Frausto Ferretiz,,,,,,,,
18522,978-78-6109,0_19947,05/22/1963,Claude,M,Page,,,,,,,,
18523,979-44-7835,0_20792,08/01/1979,Thomas,A,Martinez-Puentes,,,,,,,,
18524,998-22-9577,0_9017,04/17/2002,Jeffery,P,Shaw,,,,,,,,


In [24]:
# Generate PIKs
reference_file = reference_file.reset_index(drop=True).reset_index().rename(columns={'index': 'pik'})

## Save results

In [25]:
# Add a unique record ID -- could do this within the pipeline, but then it's harder to match up the ground truth
reference_file = reference_file.reset_index(drop=True).reset_index().rename(columns={'index': 'record_id'})
census_2030 = census_2030.reset_index(drop=True).reset_index().rename(columns={'index': 'record_id'})

# Separate ground truth
reference_file_ground_truth = reference_file[['record_id', 'simulant_id']]
census_2030_ground_truth = census_2030[['record_id', 'simulant_id']]

reference_file = reference_file.drop(columns=['simulant_id'])
census_2030 = census_2030.drop(columns=['simulant_id'])

In [26]:
reference_file.to_parquet('../reference_file_sample.parquet')
census_2030.to_parquet('../census_2030_sample.parquet')

reference_file_ground_truth.to_parquet('../reference_file_ground_truth_sample.parquet')
census_2030_ground_truth.to_parquet('../census_2030_ground_truth_sample.parquet')