# Generate pseudopeople simulated datasets

The very first step is generating pseudopeople data that will be used to create the files for the case study.

In [1]:
import warnings
import pseudopeople as psp
import os, time
import logging
from pathlib import Path

# Importing pandas for access, regardless of whether we are using it as the compute engine
import pandas

In [2]:
%load_ext autoreload
%autoreload 1

In [3]:
from person_linkage_case_study import distributed_compute, utils

In [4]:
# Ignore all FutureWarning messages from here on, to keep the cell outputs readable.
warnings.simplefilter(action="ignore", category=FutureWarning)

In [5]:
# DO NOT EDIT if this notebook is not called 01_generate_pseudopeople_simulated_datasets.ipynb!
# This notebook is designed to be run with papermill; this cell is tagged 'parameters'.
# When you run this, save it to another filename.

# Which simulated population to use: "small_sample", "ri", or "usa".
data_to_use = "small_sample"
# Base output directory; a subdirectory named after data_to_use holds the actual outputs.
output_dir = "output/01_generate_pseudopeople_simulated_datasets"
# Compute engine name. Values starting with "dask" also switch pseudopeople to its dask engine.
compute_engine = "pandas"
# The settings below only matter if distributing.
compute_engine_num_workers = 5
compute_engine_cpus_per_worker = 2
compute_engine_threads_per_worker = 1
compute_engine_memory_per_worker = "10GB"
queue = None
account = None
# NOTE: This is, as Dask requests, a directory local to the compute node.
# But IHME's cluster doesn't support this very well -- it can be small-ish,
# full of stuff from other users, etc.
# The user name and the current Unix time are embedded in the path.
# NOTE(review): os.environ['USER'] raises KeyError when USER is unset -- confirm that
# is acceptable everywhere this notebook runs.
compute_engine_local_directory = (
    f"/tmp/{os.environ['USER']}_{int(time.time())}_person_linkage_case_study"
)
# Passed to the compute engine as worker_walltime.
walltime = None
compute_engine_memory_constrained = True
compute_engine_scheduler = "slurm"

# Whether to raise noise levels above the pseudopeople defaults for selected datasets.
very_noisy = True
# Seed passed to pseudopeople's dataset generation functions.
pseudopeople_seed = 0

# Locations of the full simulated populations; required when data_to_use is "ri" / "usa".
ri_simulated_population = None
usa_simulated_population = None

# NOTE(review): this is computed from the default values above, so parameters that are
# overridden after this cell do not change it unless log_directory itself is
# overridden too -- confirm this is intended.
log_directory = f"{output_dir}/{data_to_use}/logs"

In [6]:
# Parameters
data_to_use = "small_sample"
output_dir = "output/01_generate_pseudopeople_simulated_datasets/"
very_noisy = True
compute_engine = "pandas"

In [7]:
# The compute-node-local directory is only prepared when not running on plain pandas.
# NOTE(review): utils.ensure_empty presumably creates the directory and/or checks that
# it has no contents -- confirm against its definition.
if compute_engine != "pandas":
    utils.ensure_empty(compute_engine_local_directory)

In [8]:
# From here on, output_dir is the subdirectory named after the dataset being used.
output_dir = str(Path(output_dir) / data_to_use)
# NOTE(review): utils.ensure_empty presumably creates the directory and/or checks that
# it has no contents -- confirm against its definition.
utils.ensure_empty(output_dir)

In [9]:
# Start the compute engine selected in the parameters. The call returns two objects:
# df_ops (used below to write DataFrames to Parquet) and pd.
# NOTE(review): pd is presumably a pandas-compatible module for the chosen engine -- confirm.
df_ops, pd = distributed_compute.start_compute_engine(
    compute_engine,
    num_workers=compute_engine_num_workers,
    cpus_per_worker=compute_engine_cpus_per_worker,
    threads_per_worker=compute_engine_threads_per_worker,
    memory_per_worker=compute_engine_memory_per_worker,
    worker_walltime=walltime,
    local_directory=compute_engine_local_directory,
    log_directory=log_directory,
    memory_constrained=compute_engine_memory_constrained,
    scheduler=compute_engine_scheduler,
    queue=queue,
    account=account,
)

In [10]:
! date

Mon 03 Jun 2024 04:23:01 PM PDT


In [11]:
psp.__version__

'1.1.0'

## Load simulated data

In [12]:
# Resolve which simulated population pseudopeople should read from.
# Parameter problems are reported with explicit ValueErrors (rather than asserts, which
# are stripped under `python -O` and carry no message) so a misconfigured run fails
# with an actionable error.
if data_to_use == "small_sample":
    # None is passed to pseudopeople as `source`; per the pseudopeople documentation
    # this selects its bundled sample population.
    pseudopeople_input_dir = None
elif data_to_use == "ri":
    if ri_simulated_population is None:
        raise ValueError(
            "data_to_use='ri' requires the ri_simulated_population parameter to be set"
        )
    pseudopeople_input_dir = ri_simulated_population
elif data_to_use == "usa":
    if usa_simulated_population is None:
        raise ValueError(
            "data_to_use='usa' requires the usa_simulated_population parameter to be set"
        )
    pseudopeople_input_dir = usa_simulated_population
else:
    raise ValueError(
        f"Unknown data_to_use {data_to_use!r}; expected 'small_sample', 'ri', or 'usa'"
    )

In [13]:
# Keyword arguments shared by the pseudopeople generate_* calls in this notebook.
psp_kwargs = dict(source=pseudopeople_input_dir, seed=pseudopeople_seed)
if compute_engine.startswith("dask"):
    # For dask compute engines, have pseudopeople use its dask engine as well.
    psp_kwargs.update(engine="dask")

### Noise configuration

In order to give ourselves more of a challenge, we significantly increase the amount of noise
from the pseudopeople defaults.

In [14]:
default_configuration = psp.get_config()

In [15]:
# Helper functions for changing the default configuration according to a pattern
def column_noise_value(dataset, column, noise_type, default_value):
    """Return the noise setting for one (dataset, column, noise type) combination.

    When the notebook-level ``very_noisy`` flag is set and ``dataset`` is one of the
    decennial census, W2/1099, or social security datasets, selected noise types
    get a higher-than-default setting. In every other case ``default_value`` is
    returned unchanged.

    Args:
        dataset: pseudopeople dataset name, e.g. ``"decennial_census"``.
        column: Column name within that dataset.
        noise_type: Noise type name, e.g. ``"make_typos"``.
        default_value: The pseudopeople default setting for this combination.

    Returns:
        A new dict of probabilities when an override applies, otherwise
        ``default_value`` itself.
    """
    noisier_datasets = ("decennial_census", "taxes_w2_and_1099", "social_security")
    if not very_noisy or dataset not in noisier_datasets:
        return default_value

    if noise_type == "make_typos":
        if column == "middle_initial":
            # 5% of middle initials (which are all a single token anyway) are wrong.
            return {"cell_probability": 0.05, "token_probability": 1}
        if column in ("first_name", "last_name", "street_name"):
            # 10% of these text columns were entered carelessly, at a rate of
            # 1 error per 10 characters. The pseudopeople default is 1% careless.
            return {"cell_probability": 0.1, "token_probability": 0.1}
        # Typos in any other column stay at the default.
        return default_value

    # We never introduce digit errors on the SSN in the SSA dataset.
    is_ssa_ssn = dataset == "social_security" and column == "ssn"
    if noise_type == "write_wrong_digits" and not is_ssa_ssn:
        # 10% of number columns were written carelessly, at a rate of 1 error
        # per 10 characters. The pseudopeople default is 1% careless.
        # Note that this is applied on top of (the default lower levels of) typos,
        # since typos also apply to numeric characters.
        return {"cell_probability": 0.1, "token_probability": 0.1}

    return default_value


def row_noise_value(dataset, noise_type, default_value):
    """Return the noise setting to use for one row-noise type of one dataset.

    Row noise is currently left as given: ``default_value`` is returned unchanged
    regardless of ``dataset`` and ``noise_type``.
    """
    return default_value

In [16]:
# Rebuild the default configuration with the same nesting, passing every leaf setting
# through the helper functions:
#   column noise: dataset -> "column_noise" -> column -> noise type -> setting
#   other noise:  dataset -> noise category -> noise type -> setting
custom_configuration = {
    dataset_name: {
        category: (
            {
                col: {
                    kind: column_noise_value(dataset_name, col, kind, kind_default)
                    for kind, kind_default in col_cfg.items()
                }
                for col, col_cfg in category_cfg.items()
            }
            if category == "column_noise"
            else {
                kind: row_noise_value(dataset_name, kind, kind_default)
                for kind, kind_default in category_cfg.items()
            }
        )
        for category, category_cfg in dataset_cfg.items()
    }
    for dataset_name, dataset_cfg in default_configuration.items()
}

In [17]:
psp_kwargs["config"] = custom_configuration

### Simulated 1040 tax filings

We assume that the last 5 years of taxes would be available and used in the construction of the reference files -- see the section on reference files below.

Note that these are retrieved by *tax* year, so the 2029 taxes would be available in early 2030
(around when our hypothetical case study is taking place).

In [18]:
tax_years = list(range(2025, 2030))
tax_years

[2025, 2026, 2027, 2028, 2029]

In [19]:
psp_kwargs

{'source': None,
 'seed': 0,
 'config': {'decennial_census': {'row_noise': {'do_not_respond': {'row_probability': 0.0145},
    'omit_row': {'row_probability': 0.0},
    'duplicate_with_guardian': {'row_probability_in_households_under_18': 0.02,
     'row_probability_in_college_group_quarters_under_24': 0.05}},
   'column_noise': {'first_name': {'leave_blank': {'cell_probability': 0.01},
     'use_nickname': {'cell_probability': 0.01},
     'use_fake_name': {'cell_probability': 0.01},
     'make_phonetic_errors': {'cell_probability': 0.01,
      'token_probability': 0.1},
     'make_ocr_errors': {'cell_probability': 0.01, 'token_probability': 0.1},
     'make_typos': {'cell_probability': 0.1, 'token_probability': 0.1}},
    'middle_initial': {'leave_blank': {'cell_probability': 0.01},
     'make_phonetic_errors': {'cell_probability': 0.01,
      'token_probability': 0.1},
     'make_ocr_errors': {'cell_probability': 0.01, 'token_probability': 0.1},
     'make_typos': {'cell_probability'

In [20]:
%%time

# Generate one simulated 1040 dataset per tax year and write each to its own
# Parquet file in output_dir.
for year in tax_years:
    print(year)
    df = psp.generate_taxes_1040(
        year=year,
        **psp_kwargs,
    )
    df_ops.to_parquet(
        df, str(Path(output_dir) / f"simulated_taxes_1040_{year}.parquet")
    )

2025
[32m2024-06-03 16:23:02.039[0m | [36mpseudopeople.configuration.validator[0m:[36mvalidate_noise_level_proportions[0m:[36m335[0m - [33m[1mThe configured 'use_nickname' noise level for column_noise 'dependent_4_first_name' is 0.01, which is higher than the maximum possible value based on the provided data for 'taxes_1040'. Noising as many rows as possible. [0m


Applying noise:   0%|                                                                                                                                                                                                              | 0/15 [00:00<?, ?type/s]

Applying noise:  27%|████████████████████████████████████████████████████▊                                                                                                                                                 | 4/15 [00:00<00:00, 30.41type/s]

Applying noise:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 10/15 [00:00<00:00, 44.96type/s]

Applying noise: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 10.09type/s]

                                                                                                                                                                                                                                                            

2026
[32m2024-06-03 16:23:03.638[0m | [36mpseudopeople.configuration.validator[0m:[36mvalidate_noise_level_proportions[0m:[36m335[0m - [33m[1mThe configured 'use_nickname' noise level for column_noise 'dependent_4_first_name' is 0.01, which is higher than the maximum possible value based on the provided data for 'taxes_1040'. Noising as many rows as possible. [0m




Applying noise:   0%|                                                                                                                                                                                                              | 0/15 [00:00<?, ?type/s]

Applying noise:  27%|████████████████████████████████████████████████████▊                                                                                                                                                 | 4/15 [00:00<00:00, 29.88type/s]

Applying noise:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 10/15 [00:00<00:00, 45.03type/s]

Applying noise: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 11.18type/s]

                                                                                                                                                                                                                                                            

2027
[32m2024-06-03 16:23:05.106[0m | [36mpseudopeople.configuration.validator[0m:[36mvalidate_noise_level_proportions[0m:[36m335[0m - [33m[1mThe configured 'use_nickname' noise level for column_noise 'dependent_4_first_name' is 0.01, which is higher than the maximum possible value based on the provided data for 'taxes_1040'. Noising as many rows as possible. [0m


[32m2024-06-03 16:23:05.108[0m | [36mpseudopeople.configuration.validator[0m:[36mvalidate_noise_level_proportions[0m:[36m335[0m - [33m[1mThe configured 'copy_from_household_member' noise level for column_noise 'dependent_4_ssn' is 0.01, which is higher than the maximum possible value based on the provided data for 'taxes_1040'. Noising as many rows as possible. [0m




Applying noise:   0%|                                                                                                                                                                                                              | 0/15 [00:00<?, ?type/s]

Applying noise:  27%|████████████████████████████████████████████████████▊                                                                                                                                                 | 4/15 [00:00<00:00, 29.48type/s]

Applying noise:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 10/15 [00:00<00:00, 43.74type/s]

Applying noise: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 11.07type/s]

                                                                                                                                                                                                                                                            

2028
[32m2024-06-03 16:23:06.598[0m | [36mpseudopeople.configuration.validator[0m:[36mvalidate_noise_level_proportions[0m:[36m335[0m - [33m[1mThe configured 'use_nickname' noise level for column_noise 'dependent_4_first_name' is 0.01, which is higher than the maximum possible value based on the provided data for 'taxes_1040'. Noising as many rows as possible. [0m


[32m2024-06-03 16:23:06.600[0m | [36mpseudopeople.configuration.validator[0m:[36mvalidate_noise_level_proportions[0m:[36m335[0m - [33m[1mThe configured 'copy_from_household_member' noise level for column_noise 'dependent_4_ssn' is 0.01, which is higher than the maximum possible value based on the provided data for 'taxes_1040'. Noising as many rows as possible. [0m




Applying noise:   0%|                                                                                                                                                                                                              | 0/15 [00:00<?, ?type/s]

Applying noise:  27%|████████████████████████████████████████████████████▊                                                                                                                                                 | 4/15 [00:00<00:00, 30.24type/s]

Applying noise:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 10/15 [00:00<00:00, 43.80type/s]

Applying noise: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 11.21type/s]

                                                                                                                                                                                                                                                            

2029
[32m2024-06-03 16:23:08.162[0m | [36mpseudopeople.configuration.validator[0m:[36mvalidate_noise_level_proportions[0m:[36m335[0m - [33m[1mThe configured 'use_nickname' noise level for column_noise 'dependent_3_first_name' is 0.01, which is higher than the maximum possible value based on the provided data for 'taxes_1040'. Noising as many rows as possible. [0m


[32m2024-06-03 16:23:08.164[0m | [36mpseudopeople.configuration.validator[0m:[36mvalidate_noise_level_proportions[0m:[36m335[0m - [33m[1mThe configured 'use_nickname' noise level for column_noise 'dependent_4_first_name' is 0.01, which is higher than the maximum possible value based on the provided data for 'taxes_1040'. Noising as many rows as possible. [0m


[32m2024-06-03 16:23:08.165[0m | [36mpseudopeople.configuration.validator[0m:[36mvalidate_noise_level_proportions[0m:[36m335[0m - [33m[1mThe configured 'copy_from_household_member' noise level for column_noise 'dependent_4_ssn' is 0.01, which is higher than the maximum possible value based on the provided data for 'taxes_1040'. Noising as many rows as possible. [0m




Applying noise:   0%|                                                                                                                                                                                                              | 0/15 [00:00<?, ?type/s]

Applying noise:  27%|████████████████████████████████████████████████████▊                                                                                                                                                 | 4/15 [00:00<00:00, 28.25type/s]

Applying noise:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 10/15 [00:00<00:00, 41.95type/s]

Applying noise: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 10.84type/s]

                                                                                                                                                                                                                                                            

CPU times: user 7.43 s, sys: 286 ms, total: 7.71 s
Wall time: 7.66 s




### Simulated W2/1099 tax filings

We assume that the last 5 years of taxes would be available and used in the construction of the reference files.

Note that these are retrieved by *tax* year, so the 2029 taxes would be available in early 2030
(around when our hypothetical case study is taking place).

In [21]:
%%time

# Generate one simulated W2/1099 dataset per tax year and write each to its own
# Parquet file in output_dir.
for year in tax_years:
    print(year)
    df = psp.generate_taxes_w2_and_1099(
        year=year,
        **psp_kwargs,
    )
    df_ops.to_parquet(
        df, str(Path(output_dir) / f"simulated_taxes_w2_and_1099_{year}.parquet")
    )

2025


Applying noise:   0%|                                                                                                                                                                                                              | 0/15 [00:00<?, ?type/s]

Applying noise:  27%|████████████████████████████████████████████████████▊                                                                                                                                                 | 4/15 [00:00<00:00, 39.11type/s]

Applying noise:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 10/15 [00:00<00:00, 47.07type/s]

Applying noise: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 14.94type/s]

                                                                                                                                                                                                                                                            



2026


Applying noise:   0%|                                                                                                                                                                                                              | 0/15 [00:00<?, ?type/s]

Applying noise:  27%|████████████████████████████████████████████████████▊                                                                                                                                                 | 4/15 [00:00<00:00, 37.53type/s]

Applying noise:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 10/15 [00:00<00:00, 46.02type/s]

Applying noise: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 14.85type/s]

                                                                                                                                                                                                                                                            



2027


Applying noise:   0%|                                                                                                                                                                                                              | 0/15 [00:00<?, ?type/s]

Applying noise:  33%|██████████████████████████████████████████████████████████████████                                                                                                                                    | 5/15 [00:00<00:00, 39.75type/s]

Applying noise:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 10/15 [00:00<00:00, 33.55type/s]

Applying noise:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 14/15 [00:00<00:00, 16.04type/s]

                                                                                                                                                                                                                                                            



2028


Applying noise:   0%|                                                                                                                                                                                                              | 0/15 [00:00<?, ?type/s]

Applying noise:  27%|████████████████████████████████████████████████████▊                                                                                                                                                 | 4/15 [00:00<00:00, 39.47type/s]

Applying noise:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 10/15 [00:00<00:00, 47.43type/s]

Applying noise: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 14.01type/s]

                                                                                                                                                                                                                                                            



2029


Applying noise:   0%|                                                                                                                                                                                                              | 0/15 [00:00<?, ?type/s]

Applying noise:  27%|████████████████████████████████████████████████████▊                                                                                                                                                 | 4/15 [00:00<00:00, 39.60type/s]

Applying noise:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 10/15 [00:00<00:00, 47.31type/s]

Applying noise: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 15.41type/s]

                                                                                                                                                                                                                                                            

CPU times: user 5.66 s, sys: 214 ms, total: 5.88 s
Wall time: 5.85 s




### Simulated 2030 Census Unedited File (CUF)

For now, we gloss over the data schema for addresses.
We don't know how addresses would be formatted in the CUF (and it's hard to guess, because
address is not part of the Census form), but it likely would have some of these fields
(street number, street name, etc.) combined.

While PVS input files do not in general have names split into first, middle, and last,
I am guessing the CUF **would** have first name, middle initial, last name (which is how pseudopeople
generates it), because that [matches the Census questionnaire](https://www2.census.gov/programs-surveys/decennial/2020/technical-documentation/questionnaires-and-instructions/questionnaires/2020-informational-questionnaire-english_DI-Q1.pdf).

In [22]:
%%time

# Generate the simulated 2030 decennial census and write it to Parquet in output_dir.
simulated_census_2030 = psp.generate_decennial_census(
    year=2030,
    **psp_kwargs,
)
# The file name has no placeholders, so it is a plain string rather than an f-string.
df_ops.to_parquet(
    simulated_census_2030, str(Path(output_dir) / "simulated_census_2030.parquet")
)

Applying noise:   0%|                                                                                                                                                                                                              | 0/15 [00:00<?, ?type/s]

Applying noise:  20%|███████████████████████████████████████▌                                                                                                                                                              | 3/15 [00:00<00:00, 28.25type/s]

Applying noise:  40%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                      | 6/15 [00:00<00:00, 25.10type/s]

Applying noise:  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 12/15 [00:00<00:00, 37.78type/s]

                                                                                                                                                                                                                                                            

CPU times: user 890 ms, sys: 20.4 ms, total: 910 ms
Wall time: 905 ms




### Simulated SSA Numident

Wagner and Layne, p.4:

> The reference files are derived from the Social Security Administration
    (SSA) Numerical Identification file (SSA Numident). The Numident contains all
    transactions recorded against one Social Security Number (SSN)...

Based on the [SSA Numident through 2007 which is publicly available from NARA](https://aad.archives.gov/aad/series-description.jsp?s=5057),
we know there are three kinds of transactions: SSN applications, deaths, and claiming benefits.
SSN holders may change their information (e.g. changing name or sex) by submitting another application,
which generates an additional application transaction.
(The policies about this are found [on the SSA website](https://secure.ssa.gov/poms.nsf/lnx/0110212200).)

The paper ["Likely Transgender Individuals in U.S. Federal Administrative Records and the 2010 Census" by Benjamin Cerf Harris](https://www.census.gov/content/dam/Census/library/working-papers/2015/adrm/carra-wp-2015-03.pdf)
includes some helpful statistics (Table 2).
The average person in the SSA Numident has 2.2 transactions (called "claims" in that paper, but with the same definition
as our term "transaction": "Any time an SSN is created or information associated with an existing SSN is changed, that event is registered
as a claim.").

pseudopeople does not currently include correction, name change, or benefits claim transactions.
It only includes SSN creation and death of the SSN holder.

I've figured that there would be some delay in getting the Numident -- so by Census processing time
for the 2030 Census, only the SSA transactions by the end of 2029 would be available.
Note that with pseudopeople's current design it is only possible to set this cutoff at the end of a calendar year.
The NORC report says that "the Census NUMIDENT is recreated each year, to reflect
Social Security transaction records through **March** of each year" (p. 105),
though it isn't clear when in the year the Census Numident is actually re-created.

In [23]:
%%time

# Generate the simulated SSA Numident using 2029 as the cutoff year
# (see the discussion of the delay in receiving the Numident above).
simulated_ssa_numident = psp.generate_social_security(
    year=2029,
    **psp_kwargs,
)

Applying noise:   0%|                                                                                                                                                                                                              | 0/15 [00:00<?, ?type/s]

Applying noise:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 10/15 [00:00<00:00, 86.08type/s]

                                                                                                                                                                                                                                                            

CPU times: user 623 ms, sys: 20.2 ms, total: 643 ms
Wall time: 640 ms




In [24]:
%%time

# Write the simulated SSA Numident to Parquet in output_dir, alongside the other outputs.
df_ops.to_parquet(
    simulated_ssa_numident, str(Path(output_dir) / "simulated_ssa_numident.parquet")
)

CPU times: user 25 ms, sys: 47 μs, total: 25 ms
Wall time: 31.2 ms


In [25]:
! date

Mon 03 Jun 2024 04:23:17 PM PDT
