# Generate simulated data to link

In this case study, we imagine running PVS on the 2030 Census Unedited File (CUF) -- see the main notebook for more details, including references used throughout this notebook.
This notebook creates input (CUF) and reference files approximating what would be used in such a PVS process.

In [1]:
import pseudopeople as psp
import os
import logging
import warnings
import os
import shutil
from pathlib import Path
# Importing pandas for access, regardless of whether we are using it as the compute engine
import pandas
import numpy

In [2]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# DO NOT EDIT if this notebook is not called generate_simulated_data_small_sample.ipynb!
# This notebook is designed to be run with papermill; this cell is tagged 'parameters'
# If running with the default parameters, you can overwrite this notebook; otherwise,
# save it to another filename.
# TODO: Rename the notebook to omit 'small_sample' in the filename and omit all outputs
# from the 'canonical version'
data_to_use = 'small_sample'
output_dir = 'output'
compute_engine = 'pandas'
num_jobs = 10
cpus_per_job = 2
memory_per_job = "10GB"

In [4]:
# Parameters
data_to_use = "usa"
output_dir = "/ihme/scratch/users/zmbc/pvs_like_case_study/generate_simulated_data/"
compute_engine = "dask"
num_jobs = 50
memory_per_job = "120GB"
cpus_per_job = 5


In [5]:
! date

Mon Nov 20 16:25:55 PST 2023


In [6]:
psp.__version__

'0.8.3.dev6+g31db93a'

In [7]:
def start_dask_distributed_over_slurm():
    import dask
    # Make Dask much less conservative with memory management: don't start spilling
    # until over 85%, don't kill until basically at memory limit
    # (I don't much mind whether Dask or slurm kills a worker)
    # We want to avoid spilling if at all possible, since it uses a resource
    # (local disk space) which is unpredictably allocated and running out of
    # it can cause the whole computation to fail
    dask.config.set({"distributed.worker.memory.target": False})
    dask.config.set({"distributed.worker.memory.spill": 0.85})
    dask.config.set({"distributed.worker.memory.pause": 0.85})
    dask.config.set({"distributed.worker.memory.terminate": 0.975})

    from dask_jobqueue import SLURMCluster

    cluster = SLURMCluster(
        queue='long.q',
        account="proj_simscience",
        # If you give dask workers more than one core, they will use it to
        # run more tasks at once, which can use more memory than is available.
        # To have more than one thread per worker but use them all for
        # multi-threading code in one task
        # at a time, you have to set cores=1, processes=1 and job_cpu > 1.
        cores=1,
        processes=1,
        memory=memory_per_job,
        walltime="10-00:00:00",
        # Dask distributed looks at OS-reported memory to decide whether a worker is running out.
        # If the memory allocator is not returning the memory to the OS promptly (even when holding onto it
        # is smart), it will lead Dask to make bad decisions.
        # By default, pyarrow uses jemalloc, but I could not get that to release memory quickly.
        # Even this doesn't seem to be completely working, but in combination with small-ish partitions
        # it seems to do okay -- unmanaged memory does seem to shrink from time to time, which it wasn't
        # previously doing.
        job_script_prologue="export ARROW_DEFAULT_MEMORY_POOL=system\nexport MALLOC_TRIM_THRESHOLD_=0",
        job_cpu=cpus_per_job,
        # NOTE: This is, as Dask requests, a directory local to the compute node.
        # But IHME's cluster doesn't support this very well -- it can be small-ish,
        # full of stuff from other users, etc.
        local_directory=f"/tmp/{os.environ['USER']}_dask_generate_simulated_data",
        # NOTE: Network file system -- probably slow and doing a lot of unnecessary I/O!
        # local_directory=f"/ihme/scratch/users/{os.environ['USER']}/dask_work_dir/dask_generate_simulated_data",
        # HACK: Avoid nodes with /tmp too full (as of 11/17/2023)
        job_extra_directives=["-x long-slurm-sarchive-p00[53-59]"],
        log_directory=f"/ihme/temp/slurmoutput/{os.environ['USER']}",
    )

    cluster.scale(n=num_jobs)
    # Supposedly, this will start new jobs if the existing
    # ones fail for some reason.
    # https://stackoverflow.com/a/61295019
    cluster.adapt(minimum_jobs=num_jobs, maximum_jobs=num_jobs)

    from distributed import Client
    client = Client(cluster)

    client.wait_for_workers(n_workers=num_jobs)

    return cluster, client

if compute_engine == 'pandas':
    import pandas as pd
elif compute_engine == 'dask':
    # import dask
    # HACK: Use Python instead of pyarrow strings; this will usually be much slower and
    # require more memory, but pyarrow string columns have a 2GB max
    # Worked around this using large_strings in pyarrow instead
    # dask.config.set({"dataframe.convert-string": False})

    cluster, client = start_dask_distributed_over_slurm()

    import dask.dataframe as pd

    display(client)
elif compute_engine.startswith('modin'):
    if compute_engine.startswith('modin_dask_'):
        import modin.config as modin_cfg
        modin_cfg.Engine.put("dask") # Use dask instead of ray (which is the default)

        if compute_engine == 'modin_dask_distributed':
            cluster, client = start_dask_distributed_over_slurm()
        else:
            from distributed import Client
            cpus_available = int(os.environ['SLURM_CPUS_ON_NODE'])
            client = Client(n_workers=int(cpus_available / 2), threads_per_worker=2)

        # Why is this necessary?!
        # For some reason, if I don't set NPartitions, it seems to default to 0?!
        num_row_groups = 1 if data_to_use == 'small_sample' else 334
        modin_cfg.NPartitions.put(min(num_jobs * 5, num_row_groups))
        modin_cfg.MinPartitionSize.put(1_000) # ensure no column-axis partitions -- they'll need to be joined up right away anyway by our row-wise noising

        display(client)
    elif compute_engine == 'modin_ray':
        # Haven't worked on distributing this across multiple nodes
        import ray
        ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}}, num_cpus=int(os.environ['SLURM_CPUS_ON_NODE']))
    else:
        # Use serial Python backend (good for debugging errors)
        import modin.config as modin_cfg
        modin_cfg.IsDebug.put(True)

    import modin.pandas as pd

    # https://modin.readthedocs.io/en/stable/usage_guide/advanced_usage/progress_bar.html
    from modin.config import ProgressBar
    ProgressBar.enable()

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://10.158.106.9:8787/status,

0,1
Dashboard: http://10.158.106.9:8787/status,Workers: 50
Total threads: 50,Total memory: 5.46 TiB

0,1
Comm: tcp://10.158.106.9:39609,Workers: 50
Dashboard: http://10.158.106.9:8787/status,Total threads: 50
Started: 1 minute ago,Total memory: 5.46 TiB

0,1
Comm: tcp://10.158.147.200:38377,Total threads: 1
Dashboard: http://10.158.147.200:33585/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.200:39705,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-9im9icj1,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-9im9icj1
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 8.0%,Last seen: Just now
Memory usage: 133.46 MiB,Spilled bytes: 0 B
Read bytes: 563.45 MiB,Write bytes: 2.24 MiB

0,1
Comm: tcp://10.158.147.199:33107,Total threads: 1
Dashboard: http://10.158.147.199:33019/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.199:41609,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-rnkre9oy,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-rnkre9oy
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 135.01 MiB,Spilled bytes: 0 B
Read bytes: 171.17 MiB,Write bytes: 695.16 kiB

0,1
Comm: tcp://10.158.147.204:43275,Total threads: 1
Dashboard: http://10.158.147.204:43025/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.204:41275,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-1cfue5ux,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-1cfue5ux
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 133.57 MiB,Spilled bytes: 0 B
Read bytes: 10.87 kiB,Write bytes: 5.09 kiB

0,1
Comm: tcp://10.158.100.144:34317,Total threads: 1
Dashboard: http://10.158.100.144:38541/status,Memory: 111.76 GiB
Nanny: tcp://10.158.100.144:38937,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-_ilg0g93,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-_ilg0g93
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 134.11 MiB,Spilled bytes: 0 B
Read bytes: 784.1243765256852 B,Write bytes: 397.0757712329557 B

0,1
Comm: tcp://10.158.147.217:42837,Total threads: 1
Dashboard: http://10.158.147.217:35503/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.217:39793,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-vrz8np79,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-vrz8np79
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 133.25 MiB,Spilled bytes: 0 B
Read bytes: 2.53 kiB,Write bytes: 1.24 kiB

0,1
Comm: tcp://10.158.100.16:38473,Total threads: 1
Dashboard: http://10.158.100.16:37989/status,Memory: 111.76 GiB
Nanny: tcp://10.158.100.16:41195,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-71iuxezr,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-71iuxezr
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 137.09 MiB,Spilled bytes: 0 B
Read bytes: 20.66 MiB,Write bytes: 79.50 kiB

0,1
Comm: tcp://10.158.148.63:44293,Total threads: 1
Dashboard: http://10.158.148.63:39061/status,Memory: 111.76 GiB
Nanny: tcp://10.158.148.63:36215,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-iv4kw7zk,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-iv4kw7zk
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 136.04 MiB,Spilled bytes: 0 B
Read bytes: 25.64 kiB,Write bytes: 23.49 kiB

0,1
Comm: tcp://10.158.96.180:45431,Total threads: 1
Dashboard: http://10.158.96.180:38587/status,Memory: 111.76 GiB
Nanny: tcp://10.158.96.180:37711,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-a_ibb_dh,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-a_ibb_dh
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 136.75 MiB,Spilled bytes: 0 B
Read bytes: 7.79 kiB,Write bytes: 1.54 kiB

0,1
Comm: tcp://10.158.100.42:34253,Total threads: 1
Dashboard: http://10.158.100.42:44835/status,Memory: 111.76 GiB
Nanny: tcp://10.158.100.42:36557,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-9u0zkdoj,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-9u0zkdoj
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 133.88 MiB,Spilled bytes: 0 B
Read bytes: 424.28 MiB,Write bytes: 1.28 MiB

0,1
Comm: tcp://10.158.106.11:43169,Total threads: 1
Dashboard: http://10.158.106.11:42219/status,Memory: 111.76 GiB
Nanny: tcp://10.158.106.11:40465,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-x0omy1rm,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-x0omy1rm
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 136.88 MiB,Spilled bytes: 0 B
Read bytes: 5.19 kiB,Write bytes: 4.75 kiB

0,1
Comm: tcp://10.158.100.16:43773,Total threads: 1
Dashboard: http://10.158.100.16:33217/status,Memory: 111.76 GiB
Nanny: tcp://10.158.100.16:33845,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-eemb5bq1,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-eemb5bq1
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 136.73 MiB,Spilled bytes: 0 B
Read bytes: 20.79 MiB,Write bytes: 86.94 kiB

0,1
Comm: tcp://10.158.106.26:45341,Total threads: 1
Dashboard: http://10.158.106.26:42033/status,Memory: 111.76 GiB
Nanny: tcp://10.158.106.26:46403,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-g1fd82yy,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-g1fd82yy
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 133.34 MiB,Spilled bytes: 0 B
Read bytes: 4.63 kiB,Write bytes: 4.36 kiB

0,1
Comm: tcp://10.158.96.150:46489,Total threads: 1
Dashboard: http://10.158.96.150:33693/status,Memory: 111.76 GiB
Nanny: tcp://10.158.96.150:45575,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-xbz3fi6q,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-xbz3fi6q
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 133.91 MiB,Spilled bytes: 0 B
Read bytes: 1.58 kiB,Write bytes: 1.54 kiB

0,1
Comm: tcp://10.158.148.23:36365,Total threads: 1
Dashboard: http://10.158.148.23:34157/status,Memory: 111.76 GiB
Nanny: tcp://10.158.148.23:42919,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-g5i84nxn,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-g5i84nxn
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 135.27 MiB,Spilled bytes: 0 B
Read bytes: 2.20 kiB,Write bytes: 396.7780585972491 B

0,1
Comm: tcp://10.158.100.157:41167,Total threads: 1
Dashboard: http://10.158.100.157:34101/status,Memory: 111.76 GiB
Nanny: tcp://10.158.100.157:45173,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-hgdlje3x,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-hgdlje3x
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 135.05 MiB,Spilled bytes: 0 B
Read bytes: 2.52 kiB,Write bytes: 1.28 kiB

0,1
Comm: tcp://10.158.111.17:35335,Total threads: 1
Dashboard: http://10.158.111.17:46287/status,Memory: 111.76 GiB
Nanny: tcp://10.158.111.17:42727,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-9ytvzbm0,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-9ytvzbm0
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 134.42 MiB,Spilled bytes: 0 B
Read bytes: 334.74 MiB,Write bytes: 2.17 MiB

0,1
Comm: tcp://10.158.100.147:46267,Total threads: 1
Dashboard: http://10.158.100.147:38341/status,Memory: 111.76 GiB
Nanny: tcp://10.158.100.147:38171,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-q92jwm28,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-q92jwm28
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 135.07 MiB,Spilled bytes: 0 B
Read bytes: 782.2334260597927 B,Write bytes: 396.1182055238848 B

0,1
Comm: tcp://10.158.147.211:41629,Total threads: 1
Dashboard: http://10.158.147.211:34329/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.211:32999,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-13x3go7l,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-13x3go7l
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 133.91 MiB,Spilled bytes: 0 B
Read bytes: 2.61 kiB,Write bytes: 1.30 kiB

0,1
Comm: tcp://10.158.147.204:38069,Total threads: 1
Dashboard: http://10.158.147.204:45231/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.204:38841,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-pozr0tlk,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-pozr0tlk
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 134.85 MiB,Spilled bytes: 0 B
Read bytes: 10.11 kiB,Write bytes: 7.43 kiB

0,1
Comm: tcp://10.158.148.28:36791,Total threads: 1
Dashboard: http://10.158.148.28:35251/status,Memory: 111.76 GiB
Nanny: tcp://10.158.148.28:39829,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-8wvi37ia,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-8wvi37ia
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 132.51 MiB,Spilled bytes: 0 B
Read bytes: 7.00 kiB,Write bytes: 15.12 kiB

0,1
Comm: tcp://10.158.111.17:46235,Total threads: 1
Dashboard: http://10.158.111.17:38865/status,Memory: 111.76 GiB
Nanny: tcp://10.158.111.17:43637,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-5ztn61yd,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-5ztn61yd
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 134.59 MiB,Spilled bytes: 0 B
Read bytes: 296.66 MiB,Write bytes: 1.61 MiB

0,1
Comm: tcp://10.158.147.211:40133,Total threads: 1
Dashboard: http://10.158.147.211:44057/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.211:40989,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-nb746nf8,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-nb746nf8
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 132.73 MiB,Spilled bytes: 0 B
Read bytes: 4.13 kiB,Write bytes: 10.00 kiB

0,1
Comm: tcp://10.158.100.54:45129,Total threads: 1
Dashboard: http://10.158.100.54:32919/status,Memory: 111.76 GiB
Nanny: tcp://10.158.100.54:38443,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-g5ty9y1z,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-g5ty9y1z
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 134.89 MiB,Spilled bytes: 0 B
Read bytes: 7.82 kiB,Write bytes: 9.67 kiB

0,1
Comm: tcp://10.158.106.11:44629,Total threads: 1
Dashboard: http://10.158.106.11:41613/status,Memory: 111.76 GiB
Nanny: tcp://10.158.106.11:38247,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-_52mup4n,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-_52mup4n
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 137.49 MiB,Spilled bytes: 0 B
Read bytes: 6.72 kiB,Write bytes: 13.46 kiB

0,1
Comm: tcp://10.158.147.213:42329,Total threads: 1
Dashboard: http://10.158.147.213:40423/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.213:36125,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-9gfj9i7o,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-9gfj9i7o
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 133.63 MiB,Spilled bytes: 0 B
Read bytes: 573.30 MiB,Write bytes: 2.64 MiB

0,1
Comm: tcp://10.158.148.23:38701,Total threads: 1
Dashboard: http://10.158.148.23:34683/status,Memory: 111.76 GiB
Nanny: tcp://10.158.148.23:40407,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-5bqbx7x5,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-5bqbx7x5
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 136.91 MiB,Spilled bytes: 0 B
Read bytes: 1.53 kiB,Write bytes: 4.74 kiB

0,1
Comm: tcp://10.158.148.27:34515,Total threads: 1
Dashboard: http://10.158.148.27:41795/status,Memory: 111.76 GiB
Nanny: tcp://10.158.148.27:43439,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-zf9upqi3,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-zf9upqi3
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 133.92 MiB,Spilled bytes: 0 B
Read bytes: 558.02 MiB,Write bytes: 1.87 MiB

0,1
Comm: tcp://10.158.96.180:45113,Total threads: 1
Dashboard: http://10.158.96.180:44057/status,Memory: 111.76 GiB
Nanny: tcp://10.158.96.180:36115,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-c9cibxec,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-c9cibxec
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 136.89 MiB,Spilled bytes: 0 B
Read bytes: 8.53 kiB,Write bytes: 5.89 kiB

0,1
Comm: tcp://10.158.96.184:46591,Total threads: 1
Dashboard: http://10.158.96.184:33707/status,Memory: 111.76 GiB
Nanny: tcp://10.158.96.184:42025,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-cigfziey,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-cigfziey
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 58.98 MiB,Spilled bytes: 0 B
Read bytes: 0.0 B,Write bytes: 0.0 B

0,1
Comm: tcp://10.158.148.28:35019,Total threads: 1
Dashboard: http://10.158.148.28:35487/status,Memory: 111.76 GiB
Nanny: tcp://10.158.148.28:37225,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-ll1zo2m9,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-ll1zo2m9
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 135.76 MiB,Spilled bytes: 0 B
Read bytes: 1.58 kiB,Write bytes: 1.54 kiB

0,1
Comm: tcp://10.158.148.24:42633,Total threads: 1
Dashboard: http://10.158.148.24:38365/status,Memory: 111.76 GiB
Nanny: tcp://10.158.148.24:42279,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-za376cll,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-za376cll
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 132.16 MiB,Spilled bytes: 0 B
Read bytes: 180.92 MiB,Write bytes: 743.21 kiB

0,1
Comm: tcp://10.158.147.199:33701,Total threads: 1
Dashboard: http://10.158.147.199:45501/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.199:41207,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-d5h7dsre,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-d5h7dsre
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 134.66 MiB,Spilled bytes: 0 B
Read bytes: 203.28 MiB,Write bytes: 1.17 MiB

0,1
Comm: tcp://10.158.106.11:34695,Total threads: 1
Dashboard: http://10.158.106.11:34355/status,Memory: 111.76 GiB
Nanny: tcp://10.158.106.11:42795,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-nt9i5tlr,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-nt9i5tlr
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 134.83 MiB,Spilled bytes: 0 B
Read bytes: 4.51 kiB,Write bytes: 14.32 kiB

0,1
Comm: tcp://10.158.100.42:41985,Total threads: 1
Dashboard: http://10.158.100.42:33941/status,Memory: 111.76 GiB
Nanny: tcp://10.158.100.42:37789,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-qc5bmz0x,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-qc5bmz0x
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 133.85 MiB,Spilled bytes: 0 B
Read bytes: 424.45 MiB,Write bytes: 1.29 MiB

0,1
Comm: tcp://10.158.147.201:41809,Total threads: 1
Dashboard: http://10.158.147.201:40699/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.201:34069,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-vjqit2qf,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-vjqit2qf
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 135.34 MiB,Spilled bytes: 0 B
Read bytes: 194.69 MiB,Write bytes: 1.14 MiB

0,1
Comm: tcp://10.158.111.17:37639,Total threads: 1
Dashboard: http://10.158.111.17:40585/status,Memory: 111.76 GiB
Nanny: tcp://10.158.111.17:43531,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-9mnjg9w1,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-9mnjg9w1
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 134.81 MiB,Spilled bytes: 0 B
Read bytes: 333.13 MiB,Write bytes: 1.90 MiB

0,1
Comm: tcp://10.158.96.137:40929,Total threads: 1
Dashboard: http://10.158.96.137:42733/status,Memory: 111.76 GiB
Nanny: tcp://10.158.96.137:44069,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-870ypr2a,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-870ypr2a
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 134.05 MiB,Spilled bytes: 0 B
Read bytes: 2.77 kiB,Write bytes: 2.42 kiB

0,1
Comm: tcp://10.158.147.201:42593,Total threads: 1
Dashboard: http://10.158.147.201:44803/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.201:37049,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-81b234xb,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-81b234xb
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 135.79 MiB,Spilled bytes: 0 B
Read bytes: 195.66 MiB,Write bytes: 1.14 MiB

0,1
Comm: tcp://10.158.100.40:41747,Total threads: 1
Dashboard: http://10.158.100.40:39933/status,Memory: 111.76 GiB
Nanny: tcp://10.158.100.40:46155,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-lpwtk5i7,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-lpwtk5i7
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 135.52 MiB,Spilled bytes: 0 B
Read bytes: 242.32 MiB,Write bytes: 1.03 MiB

0,1
Comm: tcp://10.158.100.16:41299,Total threads: 1
Dashboard: http://10.158.100.16:46349/status,Memory: 111.76 GiB
Nanny: tcp://10.158.100.16:41757,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-n5m6u521,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-n5m6u521
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 134.04 MiB,Spilled bytes: 0 B
Read bytes: 17.93 MiB,Write bytes: 87.30 kiB

0,1
Comm: tcp://10.158.147.211:37531,Total threads: 1
Dashboard: http://10.158.147.211:46467/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.211:45469,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-1lj_da49,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-1lj_da49
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 132.89 MiB,Spilled bytes: 0 B
Read bytes: 3.38 kiB,Write bytes: 5.68 kiB

0,1
Comm: tcp://10.158.147.211:38431,Total threads: 1
Dashboard: http://10.158.147.211:43631/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.211:33477,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-d5g25h56,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-d5g25h56
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 132.80 MiB,Spilled bytes: 0 B
Read bytes: 4.25 kiB,Write bytes: 14.61 kiB

0,1
Comm: tcp://10.158.148.11:39797,Total threads: 1
Dashboard: http://10.158.148.11:41605/status,Memory: 111.76 GiB
Nanny: tcp://10.158.148.11:43229,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-v54fiklb,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-v54fiklb
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 134.14 MiB,Spilled bytes: 0 B
Read bytes: 351.30 kiB,Write bytes: 352.07 kiB

0,1
Comm: tcp://10.158.147.141:34671,Total threads: 1
Dashboard: http://10.158.147.141:40075/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.141:34567,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-oa_tsor_,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-oa_tsor_
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 8.0%,Last seen: Just now
Memory usage: 133.47 MiB,Spilled bytes: 0 B
Read bytes: 536.93 MiB,Write bytes: 2.13 MiB

0,1
Comm: tcp://10.158.106.11:36915,Total threads: 1
Dashboard: http://10.158.106.11:37547/status,Memory: 111.76 GiB
Nanny: tcp://10.158.106.11:45191,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-i4wopk2b,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-i4wopk2b
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 136.48 MiB,Spilled bytes: 0 B
Read bytes: 5.96 kiB,Write bytes: 9.10 kiB

0,1
Comm: tcp://10.158.100.54:40479,Total threads: 1
Dashboard: http://10.158.100.54:39743/status,Memory: 111.76 GiB
Nanny: tcp://10.158.100.54:37721,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-lu5hetjk,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-lu5hetjk
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 134.91 MiB,Spilled bytes: 0 B
Read bytes: 8.59 kiB,Write bytes: 14.03 kiB

0,1
Comm: tcp://10.158.148.27:36117,Total threads: 1
Dashboard: http://10.158.148.27:37389/status,Memory: 111.76 GiB
Nanny: tcp://10.158.148.27:35971,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-l3d_a64r,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-l3d_a64r
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 136.32 MiB,Spilled bytes: 0 B
Read bytes: 578.73 MiB,Write bytes: 1.93 MiB

0,1
Comm: tcp://10.158.148.25:44221,Total threads: 1
Dashboard: http://10.158.148.25:37633/status,Memory: 111.76 GiB
Nanny: tcp://10.158.148.25:40801,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-sww7f8b9,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-sww7f8b9
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 134.05 MiB,Spilled bytes: 0 B
Read bytes: 0.93 MiB,Write bytes: 1.38 MiB

0,1
Comm: tcp://10.158.147.170:46529,Total threads: 1
Dashboard: http://10.158.147.170:39097/status,Memory: 111.76 GiB
Nanny: tcp://10.158.147.170:46107,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-lzwm_zyf,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-lzwm_zyf
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 136.05 MiB,Spilled bytes: 0 B
Read bytes: 1.90 GiB,Write bytes: 6.64 MiB

0,1
Comm: tcp://10.158.148.19:44371,Total threads: 1
Dashboard: http://10.158.148.19:37139/status,Memory: 111.76 GiB
Nanny: tcp://10.158.148.19:41749,
Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-22kxdtkl,Local directory: /tmp/zmbc_dask_generate_simulated_data/dask-scratch-space/worker-22kxdtkl
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 6.0%,Last seen: Just now
Memory usage: 136.43 MiB,Spilled bytes: 0 B
Read bytes: 447.33 MiB,Write bytes: 2.33 MiB


In [8]:
# Helpers for dealing with lazy evaluation -- Dask doesn't actually compute
# anything until you explicitly tell it to, while Pandas and Modin are eager

def persist(*args):
    if compute_engine == 'dask':
        if len(args) == 1:
            return client.persist(args[0])

        return client.persist(args)
    else:
        # Eagerly computed already
        if len(args) == 1:
            return args[0]

        return args

def compute(obj):
    if compute_engine == 'dask':
        return obj.compute()
    else:
        # Eagerly computed already
        return obj

In [9]:
# DataFrame operations that need to be done in specific ways for Dask

def drop_duplicates(df, subset=None, sort_col=None, keep='last'):
    original_columns = list(df.columns)

    if subset is None:
        subset = original_columns.copy()
    elif isinstance(subset, str):
        subset = [subset]
    else:
        subset = list(subset)

    if sort_col is not None:
        df = df.sort_values(sort_col)

    if compute_engine == 'pandas' or compute_engine.startswith('modin'):
        return df.drop_duplicates(subset=subset, keep=keep)
    elif compute_engine == 'dask':
        # NOTE: This approach depends crucially on https://github.com/dask/dask/issues/8437, as described in
        # https://github.com/dask/dask/issues/8437#issuecomment-983440465
        index_before = None
        if df.index.name is not None:
            index_before = df.index.name
            df = df.reset_index()

        if len(subset) == 1:
            # Cannot set_index with a column that contains any null values. Any rows that have nulls in any of subset
            # are by definition not duplicates.
            temp_index = subset[0]
            df = concat_avoid_fragmentation([
                df[df[temp_index].isnull()],
                df[df[temp_index].notnull()].set_index(temp_index).map_partitions(lambda x: x[~x.index.duplicated(keep=keep)]).reset_index(),
            ], ignore_index=True)
        else:
            # NOTE: This means it is best to put a high-cardinality column as the first item of subset
            temp_index = subset[0]
            # Cannot set_index with a column that contains any null values. Any rows that have nulls in any of subset
            # are by definition not duplicates.
            df = concat_avoid_fragmentation([
                df[df[temp_index].isnull()],
                df[df[temp_index].notnull()].set_index(temp_index).map_partitions(lambda x: x.reset_index().drop_duplicates(subset=subset, keep=keep).set_index(temp_index)).reset_index(),
            ], ignore_index=True)

        if index_before is None:
            return df
        else:
            return df.set_index(index_before, sort=False)
        # NOTE: The following is another approach I tried. It turns out that Dask groupbys don't work the way
        # you might expect for small groups, which is described more in groupby_agg_small_groups.
        # But even after working around that, it turned out to be much simpler to use the index-based approach above.
#     elif compute_engine == 'dask':
#         if sort_col is None:
#             df = df.assign(dummy_for_cumsum=1).assign(drop_duplicates_unique_id=lambda df: df.dummy_for_cumsum.cumsum()).drop(columns=['dummy_for_cumsum'])
#             sort_col = 'drop_duplicates_unique_id'

#         if keep == 'last':
#             to_keep = df.groupby(subset, dropna=False)[sort_col].max()
#         elif keep == 'first':
#             to_keep = df.groupby(subset, dropna=False)[sort_col].min()
#         else:
#             raise ValueError()

#         result = df.merge(to_keep.to_frame(), on=(subset + [sort_col]), how='inner')[original_columns]

#         if sort_col == 'drop_duplicates_unique_id':
#             return result
#         else:
#             # No guarantee of uniqueness
#             return drop_duplicates(result, subset=(subset + [sort_col]), keep='last')
    else:
        raise ValueError()

# NOTE: Dask groupbys don't work the way you might expect for small groups.
# In our application, when we groupby, we are usually grouping by a column (set)
# with very high cardinality -- almost as many groups as we have rows.
# Dask's agg function tries to create a data structure that is O(N) with the number of
# groups on a single node, which OOMs in this situation.
# Instead, we take advantage of https://github.com/dask/dask/issues/8437 again to turn
# this into a P2P shuffle operation that never holds any substantial amount of the data
# in any one place.
def groupby_agg_small_groups(df, by, agg_func):
    if compute_engine == 'pandas' or compute_engine.startswith('modin'):
        return agg_func(df.groupby(by))
    elif compute_engine == 'dask':
        if isinstance(by, str):
            by = [by]
        else:
            by = list(by)

        index_before = None
        if df.index.name is not None:
            index_before = df.index.name
            df = df.reset_index()

        # NOTE: This means it is best to put a high-cardinality column as the first item of by
        temp_index = by[0]

        # Cannot set_index with a column that contains any null values. Any rows that have nulls in any of subset
        # are not put into any group, like the default pandas behavior
        return df[df[temp_index].notnull()].set_index(temp_index).map_partitions(lambda x: agg_func(x.reset_index().groupby(by)))
    else:
        raise ValueError()

def concat_avoid_fragmentation(*args, **kwargs):
    result = pd.concat(*args, **kwargs)

    if compute_engine == 'dask' and result.npartitions > (num_jobs * 20):
        # By default, a Dask concat operation of A and B will lead to
        # a result with A.npartitions + B.npartitions partitions.
        # We do several operations that look like
        # df = concat([transformation_1(df), transformation_2(df)])
        # which doubles the number of partitions.
        # If we don't repartition, this doubling leads to a partition explosion,
        # which scales scheduler overhead and the memory size of the task graph.
        result = result.repartition(npartitions=(num_jobs * 5))

    return result

In [10]:
# NOTE: By default, Dask uses PyArrow string dtypes, not Python ones.
# This is great, because they are faster to work with, more memory efficient, and
# (crucially) nullable -- all of our string columns can be missing, which we represent
# in NumPy land with NaN, but that is finicky with Parquet.
# Unfortunately, the default PyArrow string dtype has a limit of 2GB of data per
# PyArrow "chunk," and although PyArrow chunks are supposed to work invisibly to the
# user, there are a number of bugs in PyArrow that cause common operations to try to
# switch an entire array to be a single chunk.
# See https://github.com/dask/dask/issues/10139#issuecomment-1812817180 for more on this.
# Due to these issues, we were running into the 2GB limit.
# PyArrow has a "large_string" dtype that has effectively no limit on size (64 bit instead
# of 32 bit offset).
# Dask lets us use this dtype, so long as we use it for *all* strings (due to a bug in Dask,
# see https://github.com/dask/dask/issues/10139#issuecomment-1812969372).

# Based on https://github.com/dask/dask/blob/b2f11d026d2c6f806036c050ff5dbd59d6ceb6ec/dask/dataframe/_pyarrow.py#L64-L98
# and code referenced from there
import pyarrow as pa

def is_pyarrow_string_dtype(dtype):
    """Is the input dtype a pyarrow string?"""

    pa_string_types = [pandas.StringDtype("pyarrow"), pandas.ArrowDtype(pa.string()), pandas.ArrowDtype(pa.large_string())]
    return dtype in pa_string_types

def is_pyarrow_string_index(x):
    if isinstance(x, pandas.MultiIndex):
        return any(is_pyarrow_string_index(level) for level in x.levels)
    return isinstance(x, pandas.Index) and is_pyarrow_string_dtype(x.dtype)

def to_pyarrow_large_string(df):
    string_dtype = pandas.ArrowDtype(pa.large_string())

    # Possibly convert DataFrame/Series/Index to string_dtype
    dtypes = None
    if isinstance(df, pandas.DataFrame):
        dtypes = {
            col: string_dtype for col, dtype in df.dtypes.items() if is_pyarrow_string_dtype(dtype)
        }
    elif dtype_check(df.dtype):
        dtypes = string_dtype

    if dtypes:
        df = df.astype(dtypes, copy=False)

    # Convert DataFrame/Series index too
    if is_pyarrow_string_index(df.index):
        if isinstance(df.index, pandas.MultiIndex):
            levels = {
                i: level.astype(string_dtype)
                for i, level in enumerate(df.index.levels)
                if is_pyarrow_string_dtype(level.dtype)
            }
            # set verify_integrity=False to preserve index codes
            df.index = df.index.set_levels(
                levels.values(), level=levels.keys(), verify_integrity=False
            )
        else:
            df.index = df.index.astype(string_dtype)
    return df

def increase_string_capacity(df):
    if compute_engine != 'dask':
        # Not using pyarrow strings by default
        return df

    return df.map_partitions(
        # NOTE: In Dask they use enforce_metadata=False
        to_pyarrow_large_string, token="to_pyarrow_large_string"
    )

In [11]:
def remove_path(path):
    path = Path(path)
    if path.is_file():
        os.remove(path)
    elif path.exists():
        shutil.rmtree(path)

def save_file_with_ground_truth(file_name, file, ground_truth):
    # Check that file and ground truth have the same records
    assert (
        # record_id is unique
        len(file) ==
        len(drop_duplicates(file[['record_id']]))
    )
    assert (
        len(ground_truth) ==
        len(file[['record_id']].merge(ground_truth[['record_id']], on='record_id', how='inner'))
    )

    file_path = f'{output_dir}/{data_to_use}/{file_name}.parquet'
    remove_path(file_path)
    file.to_parquet(file_path)

    ground_truth_path = f'{output_dir}/{data_to_use}/{file_name}_ground_truth.parquet'
    remove_path(ground_truth_path)
    ground_truth.to_parquet(ground_truth_path)

## Load pseudopeople simulated datasets

### Record ID tracking (data lineage)

We do a little bit of work here to enable tracking the "ground truth" (the simulant IDs from
pseudopeople).
We give each pseudopeople record/row a unique identifier for tracking, and then we immediately
separate the ground truth information (which we would not have if we were using real data)
from the rest of the columns (which we would have).
The ground truth is only used in the specific "ground truth" section of this notebook,
to help avoid accidentally leaking information into the case study.

Since we also combine/aggregate pseudopeople records as part of the process of generating the
simulated PVS reference files, ground truth is a bit more complicated than you might imagine.
For example, the ground truth may tell us that a single row in a reference file is actually
a composite of several individuals, because even the deterministic linkage (by SSN) we use
here is not without error.

We handle this by tracking *all* source records used in the construction of each record in
our reference files.
This is achieved by having a table mapping composite record IDs to the "source record IDs"
(IDs of records that were directly generated by pseudopeople).
When we aggregate records, this is combined accordingly.

In [12]:
def add_unique_id_col(df, col_name='unique_id', value_prefix=''):
    if compute_engine == 'pandas' or compute_engine.startswith('modin'):
        return df.reset_index().rename(columns={'index': col_name}).assign(**{col_name: lambda df: value_prefix + df[col_name].astype(str)})
    elif compute_engine == 'dask':
        # Can use cumsum as in https://stackoverflow.com/a/60852409/ if it needs
        # to be incrementing, but we just need uniqueness

        def add_id_to_partition(df_part, partition_info=None):
            return (
                df_part
                    .assign(**{col_name: range(len(df_part))})
                    .assign(**{col_name: lambda x: (
                            value_prefix +
                            str(partition_info['number'] if partition_info is not None else 0) +
                            '_' +
                            x[col_name].astype(str)
                        ).astype('large_string[pyarrow]')}
                    )
            )

        df = df.map_partitions(add_id_to_partition)

        return df
    else:
        raise ValueError()

def add_unique_record_id(df, dataset_name):
    return add_unique_id_col(df, col_name='record_id', value_prefix=f'{dataset_name}_')

# Initializes a table listing the pairs between record_ids and source record_ids.
# Should only be called on "source records"; that is, records that
# come directly out of pseudopeople.
def record_id_to_single_source_record_pairs(df, source_col='record_id'):
    if source_col == 'record_id':
        # We can't have duplicate column names, so we make a new column
        # literally called 'source_col'
        df = df.assign(source_col=lambda df: df[source_col])
        source_col = 'source_col'

    return df[['record_id', source_col]].rename(columns={source_col: 'source_record_id'})

In [13]:
# Operations that aggregate records, combining the source record pairs
# between all records that are aggregated

def merge_preserving_source_records(dfs, source_record_pairings, new_record_id_prefix, *args, **kwargs):
    assert len(dfs) == len(source_record_pairings)
    for df in dfs:
        assert 'record_id' in df.columns

    on = kwargs.get('on', None)
    if on is not None:
        # If there are nulls in any of the merge columns, they can't match to anything
        dfs = [df.dropna(subset=on, how='any') for df in dfs]

    result = dfs[0]
    source_record_pairs = source_record_pairings[0]
    dfs_and_source_record_pairs_to_combine = list(zip(dfs[1:], source_record_pairings[1:]))
    for index, (df_to_merge, source_record_pairs_to_merge) in enumerate(dfs_and_source_record_pairs_to_combine):
        result = (
            result.merge(df_to_merge, *args, **kwargs)
        )
        if index == len(dfs_and_source_record_pairs_to_combine) - 1:
            # Since this is the last step, these are the record_ids that will actually be returned
            accumulate_step_record_id_prefix = new_record_id_prefix
        else:
            # A dummy intermediate -- this shouldn't be exposed to the user
            accumulate_step_record_id_prefix = f'merge_iter_{index}'

        result = add_unique_record_id(result, accumulate_step_record_id_prefix)
        source_record_pairs = concat_avoid_fragmentation([
            # The pairs that were already in result
            source_record_pairs
                .rename(columns={'record_id': 'record_id_x'})
                .merge(result[['record_id', 'record_id_x']], on='record_id_x')
                .drop(columns=['record_id_x']),
            # The new ones
            source_record_pairs_to_merge
                .rename(columns={'record_id': 'record_id_y'})
                .merge(result[['record_id', 'record_id_y']], on='record_id_y')
                .drop(columns=['record_id_y']),
        ])
        result = result.drop(columns=['record_id_x', 'record_id_y'])

    return result, source_record_pairs


def dedupe_preserving_source_records(df, source_record_pairs, columns_to_dedupe, new_record_id_prefix):#, source_records_col='source_record_ids'):
    result = drop_duplicates(df[columns_to_dedupe])
    result = add_unique_record_id(result, new_record_id_prefix)
    df_to_result_mapping = (
        df[['record_id'] + columns_to_dedupe]
            .rename(columns={'record_id': 'record_id_pre_dedupe'})
            .merge(result, on=columns_to_dedupe)
            [['record_id', 'record_id_pre_dedupe']]
    )
    result_source_record_pairs = (
        source_record_pairs
            .rename(columns={'record_id': 'record_id_pre_dedupe'})
            .merge(df_to_result_mapping, on='record_id_pre_dedupe')
            .drop(columns=['record_id_pre_dedupe'])
    )
    return result, result_source_record_pairs


def concat_preserving_source_records(dfs, source_record_pairings, new_record_id_prefix):
    dfs = [df.rename(columns={'record_id': 'record_id_pre_concat'}) for df in dfs]
    result = concat_avoid_fragmentation(dfs, ignore_index=True)
    result = add_unique_record_id(result, new_record_id_prefix)

    record_id_mapping = (
        result[['record_id', 'record_id_pre_concat']]
    )
    validate_kwarg = {
        'validate': 'm:1',
    } if compute_engine != 'dask' else {} # Dask doesn't support validate

    all_source_record_pairings = persist(concat_avoid_fragmentation(source_record_pairings, ignore_index=False))
    result_source_record_pairs = persist(
        all_source_record_pairings
            .rename(columns={'record_id': 'record_id_pre_concat'})
            .merge(record_id_mapping, on='record_id_pre_concat', **validate_kwarg)
            .drop(columns=['record_id_pre_concat'])
    )

    if compute_engine == 'dask':
        # Manual alternative to the 'validate' kwarg
        assert len(result_source_record_pairs) == len(all_source_record_pairings)

    return result.drop(columns=['record_id_pre_concat']), result_source_record_pairs

### Simulated SSA Numident

In [14]:
%%time

simulated_ssa_numident = pd.read_parquet(f'{output_dir}/{data_to_use}/pseudopeople_simulated_datasets/simulated_ssa_numident.parquet')
simulated_ssa_numident = increase_string_capacity(simulated_ssa_numident)
simulated_ssa_numident = add_unique_record_id(simulated_ssa_numident, 'simulated_ssa_numident')
simulated_ssa_numident = persist(simulated_ssa_numident)
simulated_ssa_numident_source_record_pairs = persist(record_id_to_single_source_record_pairs(simulated_ssa_numident))

CPU times: user 93.1 ms, sys: 18.2 ms, total: 111 ms
Wall time: 303 ms


In [15]:
simulated_ssa_numident_ground_truth = simulated_ssa_numident[['record_id', 'simulant_id']]
simulated_ssa_numident = simulated_ssa_numident.drop(columns=['simulant_id'])
simulated_ssa_numident, simulated_ssa_numident_ground_truth = persist(simulated_ssa_numident, simulated_ssa_numident_ground_truth)
simulated_ssa_numident

Unnamed: 0_level_0,ssn,first_name,middle_name,last_name,date_of_birth,sex,event_type,event_date,record_id
npartitions=300,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],category[unknown],category[unknown],large_string[pyarrow],large_string[pyarrow]
,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...


### 1040 tax filings

We assume that the last 5 years of taxes would be available and used in the construction of the reference files -- see section about reference files below.

Note that these are retrieved by *tax* year, so the 2029 taxes would be available in early 2030
(around when our hypothetical case study is taking place).

In [16]:
tax_years = list(range(2025, 2030))
tax_years

[2025, 2026, 2027, 2028, 2029]

In [17]:
%%time

# Combine 1040 for all years.
simulated_taxes_1040 = concat_avoid_fragmentation([
    pd.read_parquet(
        f'{output_dir}/{data_to_use}/pseudopeople_simulated_datasets/simulated_taxes_1040_{year}.parquet',
        columns=[
            'simulant_id',
            'ssn',
            'first_name',
            'middle_initial',
            'last_name',
            'mailing_address_street_number',
            'mailing_address_street_name',
            'mailing_address_unit_number',
            'mailing_address_po_box',
            'mailing_address_city',
            'mailing_address_state',
            'mailing_address_zipcode',
        ],
    )
    for year in tax_years
], ignore_index=True)
simulated_taxes_1040 = add_unique_record_id(simulated_taxes_1040, 'simulated_1040')
simulated_taxes_1040 = increase_string_capacity(simulated_taxes_1040)
simulated_taxes_1040 = persist(simulated_taxes_1040)
simulated_taxes_1040_source_record_pairs = persist(record_id_to_single_source_record_pairs(simulated_taxes_1040))

CPU times: user 874 ms, sys: 58.2 ms, total: 932 ms
Wall time: 839 ms


In [18]:
simulated_taxes_1040_ground_truth = simulated_taxes_1040[['record_id', 'simulant_id']]
simulated_taxes_1040 = simulated_taxes_1040.drop(columns=['simulant_id'])
simulated_taxes_1040, simulated_taxes_1040_ground_truth = persist(simulated_taxes_1040, simulated_taxes_1040_ground_truth)
simulated_taxes_1040

Unnamed: 0_level_0,ssn,first_name,middle_initial,last_name,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,mailing_address_state,mailing_address_zipcode,record_id
npartitions=250,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
,large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],category[unknown],large_string[pyarrow],large_string[pyarrow]
,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...


### W2/1099 tax filings

We assume that the last 5 years of taxes would be available and used in the construction of the reference files --
see section about reference files below.

Note that these are retrieved by *tax* year, so the 2029 taxes would be available in early 2030
(around when our hypothetical case study is taking place).

In [19]:
%%time

# Combine W2/1099 for all years.
simulated_w2_1099 = concat_avoid_fragmentation([
    pd.read_parquet(
        f'{output_dir}/{data_to_use}/pseudopeople_simulated_datasets/simulated_taxes_w2_and_1099_{year}.parquet',
        columns=[
            'simulant_id',
            'ssn',
            'first_name',
            'middle_initial',
            'last_name',
            'mailing_address_street_number',
            'mailing_address_street_name',
            'mailing_address_unit_number',
            'mailing_address_po_box',
            'mailing_address_city',
            'mailing_address_state',
            'mailing_address_zipcode',
        ],
    )
    for year in tax_years
], ignore_index=True)
simulated_w2_1099 = add_unique_record_id(simulated_w2_1099, 'simulated_w2_1099')
simulated_w2_1099 = persist(increase_string_capacity(simulated_w2_1099))
simulated_w2_1099_source_record_pairs = persist(record_id_to_single_source_record_pairs(simulated_w2_1099))

CPU times: user 654 ms, sys: 71.9 ms, total: 726 ms
Wall time: 831 ms


In [20]:
simulated_w2_1099_ground_truth = simulated_w2_1099[['record_id', 'simulant_id']]
simulated_w2_1099 = simulated_w2_1099.drop(columns=['simulant_id'])
simulated_w2_1099, simulated_w2_1099_ground_truth = persist(simulated_w2_1099, simulated_w2_1099_ground_truth)
simulated_w2_1099

Unnamed: 0_level_0,ssn,first_name,middle_initial,last_name,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,mailing_address_state,mailing_address_zipcode,record_id
npartitions=250,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
,large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],category[unknown],large_string[pyarrow],large_string[pyarrow]
,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...


In [21]:
simulated_taxes, simulated_taxes_source_record_pairs = concat_preserving_source_records(
    [simulated_taxes_1040, simulated_w2_1099],
    [simulated_taxes_1040_source_record_pairs, simulated_w2_1099_source_record_pairs],
    new_record_id_prefix='simulated_taxes',
)

In [22]:
simulated_taxes_1040_with_itins = simulated_taxes_1040[simulated_taxes_1040.ssn.notnull() & simulated_taxes_1040.ssn.str.startswith('9')]
simulated_taxes_1040_with_itins_source_record_pairs = simulated_taxes_1040_source_record_pairs.merge(simulated_taxes_1040_with_itins[['record_id']], on='record_id', how='inner')
simulated_taxes_1040_with_itins, simulated_taxes_1040_with_itins_source_record_pairs = persist(simulated_taxes_1040_with_itins, simulated_taxes_1040_with_itins_source_record_pairs)

%xdel simulated_taxes_1040
%xdel simulated_taxes_1040_source_record_pairs

In [23]:
%xdel simulated_w2_1099
%xdel simulated_w2_1099_source_record_pairs

In [24]:
# "... many of the [IRS] records contain only the first four letters of the last name."
# (Brown et al. 2023, p.30, footnote 19)
# This should be updated in pseudopeople but for now we do it here.
# Note that this truncation only matters for ITIN PIKing since for SSNs that are present in SSA we use name from SSA.
PROPORTION_OF_IRS_RECORDS_WITH_TRUNCATION = 0.4 # is this a good guess at "many" in the quote above?
if compute_engine == 'dask':
    # import dask.array as np
    # rng = np.random.default_rng(seed=1234)
    # random_df = pd.from_dask_array(rng.random(len(simulated_taxes), chunks=int(np.ceil(len(simulated_taxes) / simulated_taxes.npartitions))), columns=['random'])
    # random_df.index = simulated_taxes.index
    # to_truncate = simulated_taxes.assign(random=random_df.random).random < PROPORTION_OF_IRS_RECORDS_WITH_TRUNCATION
    to_truncate, not_to_truncate = simulated_taxes.random_split([PROPORTION_OF_IRS_RECORDS_WITH_TRUNCATION, 1 - PROPORTION_OF_IRS_RECORDS_WITH_TRUNCATION], random_state=1234)
    to_truncate = to_truncate.assign(last_name=to_truncate.last_name.str[:4])
    simulated_taxes = concat_avoid_fragmentation([to_truncate, not_to_truncate], ignore_index=True)
else:
    import numpy as np
    to_truncate = simulated_taxes.sample(frac=PROPORTION_OF_IRS_RECORDS_WITH_TRUNCATION, random_state=1234).index

    simulated_taxes.loc[to_truncate, 'last_name'] = simulated_taxes.loc[to_truncate, 'last_name'].str[:4]

In [25]:
simulated_taxes, simulated_taxes_source_record_pairs = persist(simulated_taxes, simulated_taxes_source_record_pairs)

In [26]:
source_record_ground_truth = persist(concat_avoid_fragmentation([
    simulated_ssa_numident_ground_truth,
    simulated_taxes_1040_ground_truth,
    simulated_w2_1099_ground_truth,
]).rename(columns={'record_id': 'source_record_id'}))
source_record_ground_truth

Unnamed: 0_level_0,source_record_id,simulant_id
npartitions=800,Unnamed: 1_level_1,Unnamed: 2_level_1
,large_string[pyarrow],large_string[pyarrow]
,...,...
...,...,...
,...,...
,...,...


In [27]:
%xdel simulated_taxes_1040_ground_truth
%xdel simulated_w2_1099_ground_truth

### 2030 Census Unedited File (CUF)

For now, we gloss over the data schema for addresses.
We don't know how addresses would be formatted in the CUF (and it's hard to guess, because
address is not part of the Census form), but it likely would have some of these fields
(street number, street name, etc) combined.

While PVS input files do not in general have names split into first, middle, and last,
I am guessing the CUF **would** have first name, middle initial, last name (which is how pseudopeople
generates it), because that [matches the Census questionnaire](https://www2.census.gov/programs-surveys/decennial/2020/technical-documentation/questionnaires-and-instructions/questionnaires/2020-informational-questionnaire-english_DI-Q1.pdf).

In [28]:
%%time

simulated_census_2030 = pd.read_parquet(f'{output_dir}/{data_to_use}/pseudopeople_simulated_datasets/simulated_census_2030.parquet')
simulated_census_2030 = add_unique_record_id(simulated_census_2030, 'simulated_census_2030')
simulated_census_2030 = increase_string_capacity(simulated_census_2030)
simulated_census_2030 = persist(simulated_census_2030)

CPU times: user 317 ms, sys: 33.3 ms, total: 350 ms
Wall time: 327 ms


In [29]:
simulated_census_2030_ground_truth = simulated_census_2030[['record_id', 'simulant_id']]
simulated_census_2030 = simulated_census_2030.drop(columns=['simulant_id'])
simulated_census_2030, simulated_census_2030_ground_truth = persist(simulated_census_2030, simulated_census_2030_ground_truth)
simulated_census_2030

Unnamed: 0_level_0,household_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,housing_type,relationship_to_reference_person,sex,race_ethnicity,year,record_id
npartitions=300,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
,large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],category[unknown],large_string[pyarrow],large_string[pyarrow],category[unknown],category[unknown],category[unknown],int64,large_string[pyarrow]
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [30]:
save_file_with_ground_truth('simulated_census_2030', simulated_census_2030, simulated_census_2030_ground_truth)

In [31]:
%xdel simulated_census_2030
%xdel simulated_census_2030_ground_truth

## Create reference files

> The Census Numident – all Social Security Administration (SSA) Numident SSN records are
  edited (collapsed) to produce a Census Numident file that contains “one best-data record” for
  each SSN. All variants of name information for each SSN are retained in the Alternate Name
  Numident file, while all variants of date of birth data are retained in the Alternate DOB
  Numident. The SSN-PIK crosswalk file is used to attach a corresponding unique PIK value for
  each SSN value in the Census Numident file.

### Census Numident

Luque and Wagner, p. 4:
  
> The SSA Numident file contains all transactions ever recorded against any single SSN - with each entry
representing an addition or change (such as name changes) to the SSN record. This file is edited to
create the **Census Numident**, which contains one record for each SSN. Each SSN record in the Census
Numident contains name, DOB, sex, race, place of birth, parents’ name, citizenship status and date of death information.

and in footnote 5:

> Name edits, DOB reconciliation, and race identifiers are some of the edits conducted to produce this Numident
file. **The resulting Numident file contains the most recent name and DOB data.**

We are missing quite a few columns, since they are missing in pseudopeople's SSA Numident: race, place of birth, parents' name,
citizenship status.
However, I'm pretty sure none of these are used in linking.

In [32]:
def fill_dates(df, fill_with):
    return (
        # Replace invalid dates with nans
        pd.to_datetime(df.event_date, format='%Y%m%d', errors='coerce')
            .fillna(pandas.to_datetime('2100-01-01' if fill_with == 'latest' else '1900-01-01'))
    )

def best_data_from_columns(df, columns, best_is_latest=True):
    # We don't want to throw out events with a missing/invalid date, so we'll fill them with the value *least* likely to be chosen
    # (earlier than all values if taking the latest, later than all values if taking the earliest).
    fill_with = 'earliest' if best_is_latest else 'latest'

    result = (
        df
            # Without mutating the existing date column, get one that is actually
            # a date type and can be used for sorting.
            # Note: we actually convert this to an integer for sorting purposes, because Modin was having trouble
            # sorting by it as an actual datetime
            .assign(event_date_for_sort=lambda df: fill_dates(df, fill_with=fill_with).astype(numpy.int64) // 10 ** 9)
            .dropna(subset=columns, how='all')
            .pipe(lambda df: drop_duplicates(df, subset='ssn', sort_col='event_date_for_sort', keep=('last' if best_is_latest else 'first')))
            [['record_id', 'ssn'] + columns]
    )

    return result, record_id_to_single_source_record_pairs(result)

best_name, best_name_source_record_pairs = best_data_from_columns(
    simulated_ssa_numident,
    columns=['first_name', 'middle_name', 'last_name'],
)

best_date_of_birth, best_date_of_birth_source_record_pairs = best_data_from_columns(
    simulated_ssa_numident,
    columns=['date_of_birth'],
)

best_date_of_death, best_date_of_death_source_record_pairs = best_data_from_columns(
    simulated_ssa_numident[simulated_ssa_numident.event_type == 'death'],
    columns=['event_date'],
)
best_date_of_death = best_date_of_death.rename(columns={'event_date': 'date_of_death'})

simulated_census_numident, simulated_census_numident_source_record_pairs = persist(merge_preserving_source_records(
    [best_name, best_date_of_birth, best_date_of_death],
    [best_name_source_record_pairs, best_date_of_birth_source_record_pairs, best_date_of_death_source_record_pairs],
    new_record_id_prefix='simulated_census_numident',
    on=['ssn'],
    how='left',
))
simulated_census_numident

Unnamed: 0_level_0,ssn,first_name,middle_name,last_name,date_of_birth,date_of_death,record_id
npartitions=600,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow]
,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


### Alternate Name Numident

Wagner and Layne, p. 9:

>  All variants of name information for each SSN are retained in the Alternate Name
Numident file...

In [33]:
simulated_alternate_name_numident, simulated_alternate_name_numident_source_record_pairs = persist(dedupe_preserving_source_records(
    simulated_ssa_numident,
    simulated_ssa_numident_source_record_pairs,
    columns_to_dedupe=['ssn', 'first_name', 'middle_name', 'last_name'],
    new_record_id_prefix='simulated_alternate_name_numident',
))
simulated_alternate_name_numident

Unnamed: 0_level_0,ssn,first_name,middle_name,last_name,record_id
npartitions=600,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow]
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


In [34]:
compute(groupby_agg_small_groups(simulated_alternate_name_numident, by='ssn', agg_func=lambda x: x.size()).describe())

count    3.834553e+08
mean     1.032053e+00
std      1.761406e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      2.000000e+00
dtype: float64

In [35]:
# Show alternate names for the same simulated SSN
simulated_alternate_name_numident.merge(
    groupby_agg_small_groups(simulated_alternate_name_numident[["ssn"]], by="ssn", agg_func=lambda x: x.size()).pipe(lambda x: x[x > 1]).reset_index()[['ssn']],
    on='ssn',
    how='inner'
).sort_values('ssn').tail(100)

Unnamed: 0,ssn,first_name,middle_name,last_name,record_id
25714,899-99-6992,Flodence,Doris,Tripoei,simulated_alternate_name_numident_597_4532979
25715,899-99-6992,Florence,Doris,Apellido,simulated_alternate_name_numident_597_4532980
29102,899-99-7081,Joann,Ann,Hunt,simulated_alternate_name_numident_597_4533018
29103,899-99-7081,,Ann,Hunt,simulated_alternate_name_numident_597_4533019
25016,899-99-7168,Donna,Mary,Zimmdr,simulated_alternate_name_numident_597_4533057
...,...,...,...,...,...
21577,899-99-9823,Mary,Carmen,Leyba,simulated_alternate_name_numident_599_70
18570,899-99-9847,Cjarles,Louis,Carr,simulated_alternate_name_numident_599_82
18571,899-99-9847,Chxarlfs,Louis,Carr,simulated_alternate_name_numident_599_83
21346,899-99-9867,James,Darryl,Frassineoli,simulated_alternate_name_numident_599_92


### Alternate DOB Numident

Wagner and Layne, p. 9:

> ... while all variants of date of birth data are retained in the Alternate DOB
Numident.

In [36]:
simulated_alternate_dob_numident, simulated_alternate_dob_numident_source_record_pairs = persist(dedupe_preserving_source_records(
    simulated_ssa_numident,
    simulated_ssa_numident_source_record_pairs,
    columns_to_dedupe=['ssn', 'date_of_birth'],
    new_record_id_prefix='simulated_alternate_dob_numident',
))
simulated_alternate_dob_numident

Unnamed: 0_level_0,ssn,date_of_birth,record_id
npartitions=600,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,large_string[pyarrow],large_string[pyarrow],large_string[pyarrow]
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [37]:
compute(groupby_agg_small_groups(simulated_alternate_dob_numident, by='ssn', agg_func=lambda x: x.size()).describe())

count    3.834553e+08
mean     1.016533e+00
std      1.275149e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      2.000000e+00
dtype: float64

In [38]:
simulated_alternate_dob_numident.merge(
    groupby_agg_small_groups(simulated_alternate_dob_numident[["ssn"]], by="ssn", agg_func=lambda x: x.size()).pipe(lambda x: x[x > 1]).reset_index()[['ssn']],
    on='ssn',
    how='inner'
).sort_values('ssn').tail(100)

Unnamed: 0,ssn,date_of_birth,record_id
10998,899-99-3444,19340308,simulated_alternate_dob_numident_597_3590246
10999,899-99-3444,19349308,simulated_alternate_dob_numident_597_3590247
10940,899-99-3485,19470121,simulated_alternate_dob_numident_597_3590264
10941,899-99-3485,19419121,simulated_alternate_dob_numident_597_3590265
11004,899-99-3571,19480507,simulated_alternate_dob_numident_597_3590296
...,...,...,...
11075,899-99-9400,20280205,simulated_alternate_dob_numident_598_346
12310,899-99-9451,19480427,simulated_alternate_dob_numident_598_369
12311,899-99-9451,17480427,simulated_alternate_dob_numident_598_370
15564,899-99-9867,19742407,simulated_alternate_dob_numident_599_109


### Name/DOB Reference File

Wagner and Layne, p. 9:

> The Name and DOB Reference files are reformatted versions of the Census Numident
and includes **all possible combinations of alternate names and dates of birth, as well as
ITIN data**. All of the reference files contain SSN/ITIN and the corresponding PIK. When
an input record is linked to a reference file, the corresponding PIK is assigned. Table 1
presents the number of observations in each of the reference files.

A slightly confusing point: sometimes the Name and DOB reference files are described
as one and the same thing, and sometimes as separate.
I believe this is because **they differ only in how they are "cut" for the PVS process:**
the name reference file is cut by first and last initial,
while the DOB reference file is cut by month and day of birth.

This is described in Wagner and Layne, p.15:

> The [DOBSearch] module matches against a re-split
version of the Numident Name Reference file, splitting the data based on month and day
of birth.

Since we handle the logic of "cutting" in the linkage process itself, we generate
a single reference file here.

Note that unlike for addresses, and unlike for the pre-processing of PVS *input* files
(as opposed to reference files), there is no explicit nickname processing/correction here.
I am fairly sure that is accurate to the real PVS, which I believe assumes that nicknames
would not be present in SSA/tax records (or at least, that the real name would appear
at least once in these records).

In [39]:
simulated_name_dob_numident_records, simulated_name_dob_numident_records_source_record_pairs = persist(merge_preserving_source_records(
    [simulated_alternate_name_numident, simulated_alternate_dob_numident],
    [simulated_alternate_name_numident_source_record_pairs, simulated_alternate_dob_numident_source_record_pairs],
    on=['ssn'],
    how='left',
    new_record_id_prefix='name_dob_numident_records',
))
simulated_name_dob_numident_records

Unnamed: 0_level_0,ssn,first_name,middle_name,last_name,date_of_birth,record_id
npartitions=600,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow]
,...,...,...,...,...,...
...,...,...,...,...,...,...
,...,...,...,...,...,...
,...,...,...,...,...,...


In [40]:
# Show alternate name/DOB for the same simulated SSN
simulated_name_dob_numident_records.merge(
    groupby_agg_small_groups(simulated_name_dob_numident_records[["ssn"]], by="ssn", agg_func=lambda x: x.size()).pipe(lambda x: x[x > 1]).reset_index()[['ssn']],
    on='ssn',
    how='inner',
).sort_values('ssn').tail(100)

Unnamed: 0,ssn,first_name,middle_name,last_name,date_of_birth,record_id
28040,899-99-7942,Darrell,Harvey,Kimmell,19330308,name_dob_numident_records_232_305501
28041,899-99-7942,Darrell,Mom,Kimmell,19330308,name_dob_numident_records_232_305502
30770,899-99-7944,Gerald,Charles,Diorio,19460101,name_dob_numident_records_526_336256
30771,899-99-7944,Gerald,Minor,Diorio,19460101,name_dob_numident_records_526_336257
27894,899-99-7963,Lila,Ofelia,Veaglf,19320507,name_dob_numident_records_288_300761
...,...,...,...,...,...,...
25309,899-99-9847,Chxarlfs,Louis,Carr,19720520,name_dob_numident_records_393_274870
29240,899-99-9867,James,Darryl,Frassineoli,19742407,name_dob_numident_records_78_317345
29241,899-99-9867,James,Darryl,Frassineoli,19740404,name_dob_numident_records_78_317346
29242,899-99-9867,James,Darryl,Frassinelli,19742407,name_dob_numident_records_78_317347


#### Incorporating people with ITINs

Individual Taxpayer Identification Numbers (ITINs) can be issued to people who are required to file
federal taxes but are not eligible for a Social Security Number.
The most common reason for this is being an undocumented immigrant and therefore not being authorized
to work in the United States.

People without SSNs used to be impossible to assign PIKs to.
In 2011 the NORC report stated (p. 38, footnote 19):

> NORC understands that the Census Bureau has undertaken an effort to enhance the PVS reference files with IRS
files that include Individual Taxpayer Identification Numbers (ITIN). For those people who are required to file a tax
return but do not have, and may not want an SSN—such as a non-U.S. citizen—the IRS issues the taxpayer an ITIN.
This enhancement to the PVS reference file may help to match more non-U.S citizens.

By 2014 (Wagner and Layne, p. 5):

> One of the key enhancements [made in recent years] increased the coverage of the reference files by
including records for persons with Individual Taxpayer Identification Numbers assigned
by the Internal Revenue Service (ITINs) to [along with?] the SSN-based Numident data. 

I have not found a specific description of how ITIN records are constructed in any of the
publicly-available sources.
This may be because it is straightforward, or because the tax data schema is confidential.
I assume that only IRS data is used, since no other data source that I am aware of would
report ITIN.

It is stated that the ITIN records are created directly from tax filings and not
from ITIN applications (Brown et al. p. 29, footnote 16), which is convenient
because the tax filing data is what we can simulate with
pseudopeople:

> The NUMIDENT provides the PII on the SSN-holder from the issuing agency (SSA), and that PII is used in SSN
verification. **For ITINs, the Census Bureau does not have access to the ITIN applications** to the issuing agency (IRS),
so name and DOB verification of ITINs is less reliable.

"Less reliable" is a bit confusing here, because as stated above when generating
the simulated tax data, IRS data should not contain date of birth at all.
Here, we have stayed true to this by omitting it entirely.

We assume that only 1040 filings would be used for this purpose; we wouldn't expect ITINs to
show up on employer-filed W-2/1099 forms.

In [41]:
# Analogous to the process of getting alternate names and dates of birth
# from SSA, we retain all versions of the name from taxes.
simulated_name_for_itins, simulated_name_for_itins_source_record_pairs = persist(dedupe_preserving_source_records(
    simulated_taxes_1040_with_itins.rename(columns={'middle_initial': 'middle_name'}),
    simulated_taxes_1040_with_itins_source_record_pairs,
    columns_to_dedupe=['ssn', 'first_name', 'middle_name', 'last_name'],
    new_record_id_prefix='simulated_name_for_itins',
))
simulated_name_for_itins

Unnamed: 0_level_0,ssn,first_name,middle_name,last_name,record_id
npartitions=500,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow]
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


In [42]:
compute(groupby_agg_small_groups(simulated_name_for_itins, by='ssn', agg_func=lambda x: x.size()).describe())

count    1.520003e+07
mean     1.234247e+00
std      4.755774e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      6.000000e+00
dtype: float64

In [43]:
# How many records in the simulated name/DOB numident are in the ITIN range?
# With normal pseudopeople settings, this shouldn't happen, because SSN in SSA is noiseless.
# If this number is >0, we may be adding ITIN records as more alternates to existing records.
compute(simulated_name_dob_numident_records.ssn.str.startswith('9').sum())

0

In [44]:
simulated_name_dob_reference_file, simulated_name_dob_reference_file_source_record_pairs = persist(concat_preserving_source_records(
    [simulated_name_dob_numident_records, simulated_name_for_itins],
    [simulated_name_dob_numident_records_source_record_pairs, simulated_name_for_itins_source_record_pairs],
    new_record_id_prefix='simulated_name_dob_reference_file',
))
simulated_name_dob_reference_file

Unnamed: 0_level_0,ssn,first_name,middle_name,last_name,date_of_birth,record_id
npartitions=250,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow]
,...,...,...,...,...,...
...,...,...,...,...,...,...
,...,...,...,...,...,...
,...,...,...,...,...,...


### GeoBase Reference File

Wagner and Layne, p. 9:

> PVS creates three other sets of reference
files containing Numident data: the **GeoBase Reference File**, the Name Reference File,
and the DOB Reference file.
The GeoBase Reference File appends addresses from administrative records attached
to Numident data, including all possible combinations of alternate names and dates of
birth for SSN. Addresses from administrative records are edited and processed through
commercial software product to clean and standardize address data. ITIN data is also
incorporated into the Geobase.

Luque and Wagner, p. 5:

> Reference files contain data from the Numident file enhanced with address
data obtained from federal AR [administrative records] files.<sup>8</sup>
The reference files, thus, contain all variants of a person’s name, DOB,
and sex, as well as current and recent addresses. These reference files are
referred to as the (PVS) Geobase reference file since addresses (a geographic component)
are appended to each person record.<sup>9</sup> It is important to note that there are
multiple Geobase reference files that are created depending on the vintage of the
incoming file to be processed through PVS.

> <sup>8</sup> Namely, data from the IRS, Department of Housing and Urban Development,
several files from the Department of Health and Human Services, and Selective Service.

> <sup>9</sup> In particular, the address data is cleaned and standardized and used
to construct a variable called GEOKEY. The GEOKEY variable is constructed as a subset
of the full address, and then is appended to the Numident data to create the
PVS Geobase Reference file.

We only have IRS data to use for addresses, and specifically only W-2/1099 data,
which is a limitation of this case study.
I can't find a concrete definition of "recent" -- as noted above, we use 5 years
of IRS data.
This may be longer than the true window, but this may end up making up for
the lack of non-IRS data sources, and get us closer to a realistic number of
alternate addresses.

Also, our address data comes out of pseudopeople already parsed into address parts
like street name, etc.
For more realism, pseudopeople should output a single string that we have to (imperfectly) parse apart.

I haven't been able to find out more about what kind of "subset" the geokey is.
It is unclear to me why geokey is "interesting" since it is just derived from the
address parts.

In [45]:
address_cols = [c for c in simulated_taxes.columns if 'mailing_address' in c]

def standardize_address_part(column):
    return (
        column
            # Remove leading or trailing whitespace
            .str.strip()
            # Turn any strings of consecutive whitespace into a single space
            .str.replace('\s+', ' ', regex=True)
            # Normalize case
            .str.upper()
            # Normalize the word street as described in the example quoted above
            # In reality, there would be many rules like this
            .str.replace('\b(STREET|STR)\b', 'ST', regex=True)
            # Make sure missingness is represented consistently
            .replace('', numpy.nan)
    )

simulated_tax_addresses = (
    simulated_taxes
        # Can only link these to the other files if they have an SSN
        .dropna(subset=['ssn'])
        [['record_id', 'ssn'] + address_cols]
        .assign(**{c: lambda df, c=c: standardize_address_part(df[c]) for c in address_cols})
        .pipe(increase_string_capacity)
)

simulated_addresses_by_ssn, simulated_addresses_by_ssn_source_record_pairs = persist(dedupe_preserving_source_records(
    simulated_tax_addresses,
    simulated_taxes_source_record_pairs,
    columns_to_dedupe=['ssn'] + address_cols,
    new_record_id_prefix='addresses_by_ssn',
))
simulated_addresses_by_ssn

Unnamed: 0_level_0,ssn,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,mailing_address_state,mailing_address_zipcode,record_id
npartitions=250,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow]
,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...


In [46]:
num_addresses = groupby_agg_small_groups(simulated_addresses_by_ssn, by='ssn', agg_func=lambda x: x.size()).rename('size').reset_index().sort_values('size')
num_addresses

Unnamed: 0_level_0,ssn,size
npartitions=8,Unnamed: 1_level_1,Unnamed: 2_level_1
,large_string[pyarrow],int64
,...,...
...,...,...
,...,...
,...,...


In [47]:
# Show some SSNs with a lot of address variation
compute(pd.merge(num_addresses.tail(10), simulated_addresses_by_ssn, on='ssn', how='inner').sort_values('ssn'))

Unnamed: 0,ssn,size,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,mailing_address_state,mailing_address_zipcode,record_id
0,236-98-4856,325,681,CEDAR AVE,,,LAFAYETTE,IN,47660,addresses_by_ssn_157_4617950
1,236-98-4856,325,440,VALLEY ST,,,UNINCORPORATED,FL,33126,addresses_by_ssn_157_4617951
2,236-98-4856,325,7964,HERRING CR RD,,,COHOES,NY,10458,addresses_by_ssn_157_4617952
3,236-98-4856,325,11,SOUTH SULLIVAN STREET,,,MIDDLESEX,NJ,07094,addresses_by_ssn_157_4617953
4,236-98-4856,325,1360,VIA ESPRILLO,,,HERCULES,CA,94087,addresses_by_ssn_157_4617954
...,...,...,...,...,...,...,...,...,...,...
320,862-25-2275,325,5857,HAWLEY,,,APOPKA,FL,33020,addresses_by_ssn_243_2843391
321,862-25-2275,325,8,SE 100TH CT,,,,AZ,85713,addresses_by_ssn_243_2843392
322,862-25-2275,325,491,NORTH TOLL STREET,,,SURFSIDE BEACH,SC,29406,addresses_by_ssn_243_2843393
323,862-25-2275,325,9566,HAROLD ST,,,SENECA FALLS,NY,11377,addresses_by_ssn_243_2843394


In [48]:
# Rough estimate of how many rows we should have in our reference file, once we do this Cartesian product
compute(
    len(simulated_name_dob_reference_file) *
    groupby_agg_small_groups(simulated_addresses_by_ssn, by='ssn', agg_func=lambda x: x.size()).mean()
)

889081291.6787932

In [49]:
simulated_geobase_reference_file, simulated_geobase_reference_file_source_record_pairs = persist(merge_preserving_source_records(
    [simulated_name_dob_reference_file, simulated_addresses_by_ssn],
    [simulated_name_dob_reference_file_source_record_pairs, simulated_addresses_by_ssn_source_record_pairs],
    on=['ssn'],
    how='left',
    new_record_id_prefix='simulated_geobase_reference_file',
))
simulated_geobase_reference_file

Unnamed: 0_level_0,ssn,first_name,middle_name,last_name,date_of_birth,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,mailing_address_state,mailing_address_zipcode,record_id
npartitions=250,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
,large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow],large_string[pyarrow]
,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...


In [50]:
# Actual number of rows
len(simulated_geobase_reference_file)

951990015

## Track ground truth for reference files

In [51]:
def get_simulants_of_source_records(source_record_pairs, filter_record_ids=None):
    if filter_record_ids is not None:
        source_record_pairs = source_record_pairs.pipe(filter_record_ids)

    result = (
        drop_duplicates(source_record_pairs)
            .merge(source_record_ground_truth, on='source_record_id')
            [['record_id', 'simulant_id']]
            .pipe(drop_duplicates)
    )
    result = result.merge(
        groupby_agg_small_groups(result, by='record_id', agg_func=lambda x: x.simulant_id.nunique().rename('n_unique_simulants')).reset_index(),
        on='record_id',
        how='left',
    )
    return result

def get_ground_truth_for_records(source_record_pairs):
    result = persist(get_simulants_of_source_records(source_record_pairs))

    result_nunique_describe = compute(result['n_unique_simulants'].describe())

    if numpy.isclose(result_nunique_describe.loc['max'], 1):
        print('No collisions')
        return result

    print('Collisions:')
    display(result_nunique_describe)

    print('Simulated tax records with the most collisions:')
    most_collisions_record_id = result.sort_values('n_unique_simulants', ascending=False).head(1).record_id.iloc[0]
    most_collisions_source_record_ids = persist(
        drop_duplicates(source_record_pairs[source_record_pairs.record_id == most_collisions_record_id][['source_record_id']])
    )
    most_collisions_tax_filings = (
        most_collisions_source_record_ids
            .merge(drop_duplicates(simulated_taxes_source_record_pairs[['record_id', 'source_record_id']]), on='source_record_id', how='inner')
            .merge(simulated_taxes, on='record_id', how='left')
            .merge(source_record_ground_truth, on='source_record_id', how='left')
    )
    display(compute(most_collisions_tax_filings))

    print('Simulated SSA records with the most collisions:')
    display(
        compute(
            most_collisions_source_record_ids
                .merge(simulated_ssa_numident.rename(columns={'record_id': 'source_record_id'}), on='source_record_id', how='inner')
                .merge(source_record_ground_truth, on='source_record_id', how='left')
        )
    )

    return result

### Simulated Census Numident

In [52]:
len(simulated_census_numident_source_record_pairs)

801719241

In [53]:
simulated_census_numident_ground_truth = get_ground_truth_for_records(simulated_census_numident_source_record_pairs)

No collisions


In [54]:
simulated_census_numident_ground_truth

Unnamed: 0_level_0,record_id,simulant_id,n_unique_simulants
npartitions=250,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,large_string[pyarrow],large_string[pyarrow],int64
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


### Simulated Alternate Name Numident

In [55]:
len(simulated_alternate_name_numident_source_record_pairs)

419397698

In [56]:
simulated_alternate_name_numident_ground_truth = get_ground_truth_for_records(simulated_alternate_name_numident_source_record_pairs)

No collisions


In [57]:
simulated_alternate_name_numident_ground_truth

Unnamed: 0_level_0,record_id,simulant_id,n_unique_simulants
npartitions=250,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,large_string[pyarrow],large_string[pyarrow],int64
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


### Alternate DOB Numident

In [58]:
len(simulated_alternate_dob_numident_source_record_pairs)

419397698

In [59]:
simulated_alternate_dob_numident_ground_truth = get_ground_truth_for_records(simulated_alternate_dob_numident_source_record_pairs)

No collisions


In [60]:
simulated_alternate_dob_numident_ground_truth

Unnamed: 0_level_0,record_id,simulant_id,n_unique_simulants
npartitions=250,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,large_string[pyarrow],large_string[pyarrow],int64
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


### Simulated Name/DOB Reference File

In [61]:
len(simulated_name_dob_reference_file_source_record_pairs)

919701212

In [62]:
simulated_name_dob_reference_file_ground_truth = get_ground_truth_for_records(simulated_name_dob_reference_file_source_record_pairs)

No collisions


In [63]:
simulated_name_dob_reference_file_ground_truth

Unnamed: 0_level_0,record_id,simulant_id,n_unique_simulants
npartitions=250,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,large_string[pyarrow],large_string[pyarrow],int64
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


### Simulated GeoBase Reference File

In [64]:
len(simulated_geobase_reference_file_source_record_pairs)

4887572638

In [65]:
simulated_geobase_reference_file_ground_truth = get_ground_truth_for_records(simulated_geobase_reference_file_source_record_pairs)

Collisions:


count    1.055236e+09
mean     1.199412e+00
std      4.141232e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      9.000000e+00
Name: n_unique_simulants, dtype: float64

Simulated tax records with the most collisions:


Unnamed: 0,source_record_id,record_id,ssn,first_name,middle_initial,last_name,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,mailing_address_city,mailing_address_state,mailing_address_zipcode,simulant_id
0,simulated_1040_185_1866523,simulated_taxes_185_1866523,808-26-5998,David,D,Ronero,114,w 5th st,,,doraville,,31906,6554_95389
0,simulated_w2_1099_85_2064619,simulated_taxes_335_2064619,808-26-5998,Justin,J,Flores,114,w 5th st,,,doraville,,31906,6554_95396
0,simulated_1040_185_1866536,simulated_taxes_185_1866536,808-26-5998,Justin,A,Koscielski,114,w 5th st,,,doraville,,31906,6554_95406
0,simulated_1040_235_1901957,simulated_taxes_235_1901957,808-26-5998,Justin,A,Koscielski,114,w 5th st,,,doraville,,31906,6554_95406
0,simulated_1040_35_1757674,simulated_taxes_35_1757674,808-26-5998,Justin,A,Kosc,114,w 5th st,,,doraville,,31906,6554_95406
0,simulated_w2_1099_135_2074019,simulated_taxes_385_2074019,808-26-5998,Justina,A,Koscielski,114,w 5th st,,,doraville,,31906,6554_95406
0,simulated_w2_1099_235_2088011,simulated_taxes_485_2088011,808-26-5998,Justin,A,Koscielski,114,w 5th st,,,doraville,,31906,6554_95406
0,simulated_w2_1099_85_2064612,simulated_taxes_335_2064612,808-26-5998,Bradley,M,Hwrn,114,w 5th st,,,doraville,,31906,6554_95392
0,simulated_w2_1099_135_2074022,simulated_taxes_385_2074022,808-26-5998,Justin,A,Koscielski,114,w 5th st,,,doraville,,31906,6554_95406
0,simulated_w2_1099_35_2057346,simulated_taxes_285_2057346,808-26-5998,Ismael,B,Jime,114,w 5th st,,,doraville,,31906,6554_95403


Simulated SSA records with the most collisions:


Unnamed: 0,source_record_id,ssn,first_name,middle_name,last_name,date_of_birth,sex,event_type,event_date,simulant_id
0,simulated_ssa_numident_212_681571,808-26-5998,Justin,Austin,Koscielski,19960210,,,19960210,6554_95406


In [66]:
simulated_geobase_reference_file_ground_truth

Unnamed: 0_level_0,record_id,simulant_id,n_unique_simulants
npartitions=250,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,large_string[pyarrow],large_string[pyarrow],int64
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [67]:
%xdel simulated_taxes
%xdel simulated_taxes_source_record_pairs

### Get ground truth by SSN

In [68]:
simulated_ssn_simulant_pairs = persist(
    concat_avoid_fragmentation([
        simulated_census_numident[["record_id", "ssn"]].merge(simulated_census_numident_ground_truth, on="record_id"),
        simulated_alternate_name_numident[["record_id", "ssn"]].merge(simulated_alternate_name_numident_ground_truth, on="record_id"),
        simulated_alternate_dob_numident[["record_id", "ssn"]].merge(simulated_alternate_dob_numident_ground_truth, on="record_id"),
        simulated_name_dob_reference_file[["record_id", "ssn"]].merge(simulated_name_dob_reference_file_ground_truth, on="record_id"),
        simulated_geobase_reference_file[["record_id", "ssn"]].merge(simulated_geobase_reference_file_ground_truth, on="record_id"),
    ])
        [['ssn', 'simulant_id']]
        .pipe(drop_duplicates)
)
simulated_ssn_simulant_pairs

Unnamed: 0_level_0,ssn,simulant_id
npartitions=500,Unnamed: 1_level_1,Unnamed: 2_level_1
,large_string[pyarrow],large_string[pyarrow]
,...,...
...,...,...
,...,...
,...,...


In [69]:
# How many simulated SSNs have different simulant IDs
# contributing to them?
compute((groupby_agg_small_groups(simulated_ssn_simulant_pairs, by='ssn', agg_func=lambda x: x.simulant_id.nunique()) > 1).sum())

76178097

## Save results

In [70]:
files = {
    'simulated_census_numident': (simulated_census_numident, simulated_census_numident_ground_truth),
    'simulated_alternate_name_numident': (simulated_alternate_name_numident, simulated_alternate_name_numident_ground_truth),
    'simulated_alternate_dob_numident': (simulated_alternate_dob_numident, simulated_alternate_dob_numident_ground_truth),
    'simulated_geobase_reference_file': (simulated_geobase_reference_file, simulated_geobase_reference_file_ground_truth),
    'simulated_name_dob_reference_file': (simulated_name_dob_reference_file, simulated_name_dob_reference_file_ground_truth),
}

In [71]:
simulated_reference_files = [
    simulated_census_numident,
    simulated_alternate_name_numident,
    simulated_alternate_dob_numident,
    simulated_geobase_reference_file,
    simulated_name_dob_reference_file,
]
# TODO: Rename the ssn column to explicitly include itins, since this is confusing
simulated_all_ssns_itins_in_reference_files = concat_avoid_fragmentation([df[["ssn"]] for df in simulated_reference_files], ignore_index=True)
simulated_ssn_to_pik = (
    simulated_all_ssns_itins_in_reference_files
        .pipe(drop_duplicates)
        .pipe(add_unique_id_col, col_name='pik')
        [['ssn', 'pik']]
)
simulated_ssn_to_pik

Unnamed: 0_level_0,ssn,pik
npartitions=500,Unnamed: 1_level_1,Unnamed: 2_level_1
,large_string[pyarrow],large_string[pyarrow]
,...,...
...,...,...
,...,...
,...,...


In [72]:
simulated_pik_simulant_pairs = (
    simulated_ssn_simulant_pairs
        .merge(simulated_ssn_to_pik, on="ssn", how="inner")
        [['pik', 'simulant_id']]
)
simulated_pik_simulant_pairs

Unnamed: 0_level_0,pik,simulant_id
npartitions=500,Unnamed: 1_level_1,Unnamed: 2_level_1
,large_string[pyarrow],large_string[pyarrow]
,...,...
...,...,...
,...,...
,...,...


In [73]:
%xdel simulated_ssn_simulant_pairs

In [74]:
for file_name, (file, ground_truth) in files.items():
    file = persist(file.merge(simulated_ssn_to_pik, on='ssn', how='left'))
    assert compute(file.pik.notnull().all())

    save_file_with_ground_truth(file_name, file, ground_truth)

In [75]:
simulated_pik_simulant_pairs_path = f'{output_dir}/{data_to_use}/simulated_pik_simulant_pairs.parquet'
remove_path(simulated_pik_simulant_pairs_path)
simulated_pik_simulant_pairs.to_parquet(simulated_pik_simulant_pairs_path)

In [76]:
! date

Mon Nov 20 22:54:53 PST 2023
