# Simulated PIK statistics

Here we inspect the accuracy and characteristics of the PIKs assigned,
leveraging our knowledge of ground truth from pseudopeople.

It wouldn't be possible to do the ground truth part with the real PVS, but
Layne, Wagner, and Rothhaas did something similar by redacting SSN from real records,
sending them through PVS without the SSN, and then using the true SSN
as ground truth.
The health care records they used are probably quite different from a CUF,
but they found **very** good overall PIK accuracy (computed in the "Definitions of accuracy" section below).

In [None]:
# Query planning is now on by default, but it has some rough edges.
# See https://github.com/dask/dask/issues/10995 for general discussion
# and https://github.com/dask/dask-expr/issues/1060 for the particular
# issue I ran into.
try:
    import dask
except ImportError:
    # Dask is not installed, so there is nothing to configure.
    pass
else:
    # Only the import itself is guarded: an error raised while applying the
    # setting should surface instead of being mistaken for "dask is missing".
    dask.config.set({"dataframe.query-planning": False})

In [None]:
import datetime, os, time

from person_linkage_case_study_utils import distributed_compute, utils
from IPython.display import display

In [None]:
# Print the current local timestamp into the notebook output.
print(datetime.datetime.now())

In [None]:
# DO NOT EDIT if this notebook is not called ground_truth_accuracy.ipynb!
# This notebook is designed to be run with papermill; this cell is tagged 'parameters'
# Name of the data variant; it is interpolated into the log directory below.
data_to_use = "small_sample"
case_study_files_dir = "output/02_generate_case_study_files"
case_study_output_dir = "output/03_link_datasets"

compute_engine = "pandas"
# Only matters if using a distributed compute engine
compute_engine_num_workers = 3
compute_engine_cpus_per_worker = 2
compute_engine_threads_per_worker = 1
compute_engine_memory_per_worker = "1GB"
queue = None
account = None
# NOTE: This is, as Dask requests, a directory local to the compute node.
# But IHME's cluster doesn't support this very well -- it can be small-ish,
# full of stuff from other users, etc.
# The path combines the current user name and the current Unix time (whole seconds).
# NOTE(review): os.environ['USER'] raises KeyError when USER is not set in the
# environment -- confirm that it is always set wherever this notebook runs.
compute_engine_local_directory = (
    f"/tmp/{os.environ['USER']}_{int(time.time())}_person_linkage_case_study"
)
compute_engine_log_directory = f"{case_study_output_dir}/{data_to_use}/logs"
walltime = None
compute_engine_memory_constrained = True
scheduler = "slurm"

In [None]:
# The local directory is only prepared for compute engines whose name starts with "dask".
# NOTE(review): utils.ensure_empty presumably creates and/or clears the directory -- confirm.
if compute_engine.startswith("dask"):
    utils.ensure_empty(compute_engine_local_directory)

In [None]:
# Point both directories at the subdirectory for the selected data variant.
# These assignments overwrite their own inputs, so re-running this cell appends
# the `data_to_use` suffix a second time.
case_study_output_dir = f"{case_study_output_dir}/{data_to_use}"
case_study_files_dir = f"{case_study_files_dir}/{data_to_use}"

In [None]:
# Collect the engine settings from the parameters cell, then start the engine.
# The call returns the dataframe-operations helper used throughout this notebook
# and a second object bound to `pd` (presumably the dataframe module -- confirm).
engine_options = dict(
    num_workers=compute_engine_num_workers,
    cpus_per_worker=compute_engine_cpus_per_worker,
    threads_per_worker=compute_engine_threads_per_worker,
    memory_per_worker=compute_engine_memory_per_worker,
    worker_walltime=walltime,
    local_directory=compute_engine_local_directory,
    log_directory=compute_engine_log_directory,
    memory_constrained=compute_engine_memory_constrained,
    scheduler=scheduler,
    queue=queue,
    account=account,
)
df_ops, pd = distributed_compute.start_compute_engine(compute_engine, **engine_options)

In [None]:
# Load the two linkage outputs: the census file with a `pik` column, and the
# table of confirmed PIK links.
census_2030_piked_path = f"{case_study_output_dir}/census_2030_piked.parquet"
confirmed_piks_path = f"{case_study_output_dir}/confirmed_piks.parquet"

census_2030_piked = df_ops.read_parquet(census_2030_piked_path)
confirmed_piks_with_ground_truth = df_ops.read_parquet(confirmed_piks_path)

In [None]:
# Share of census rows whose `pik` is not null.
# Compare with 90.28% of input records PIKed in the 2010 CUF,
# as reported in Wagner and Layne, Table 2, p. 18
has_pik = census_2030_piked.pik.notnull()
piked_proportion = df_ops.compute(has_pik.mean())
print(f"{piked_proportion:.2%} of the input records were PIKed")

In [None]:
# Multiple Census rows assigned the same PIK, indicating the model thinks they are duplicates in Census
# Count the census rows per PIK, then tabulate how often each group size occurs.
rows_per_pik = df_ops.groupby_agg_small_groups(
    census_2030_piked,
    by="pik",
    agg_func=lambda grouped: grouped.size(),
)
pik_sizes = df_ops.persist(rows_per_pik)
df_ops.compute(pik_sizes.value_counts())

In [None]:
# Interesting: in pseudopeople, sometimes siblings are assigned the same (common) first name, making them almost identical.
# The only giveaway is their age and DOB.
# Presumably, this tends not to happen in real life.
# Keep only the PIKs assigned to more than one census row, then preview those rows sorted by PIK.
pik_size_table = pik_sizes.rename("pik_size").reset_index()
duplicate_piks = pik_size_table[pik_size_table.pik_size > 1]

census_rows_sharing_a_pik = census_2030_piked.merge(duplicate_piks, on="pik")
df_ops.head(census_rows_sharing_a_pik.sort_values("pik"))

## Ground truth statistics

In [None]:
# Load and persist the ground truth for the simulated 2030 census.
census_ground_truth_path = (
    f"{case_study_files_dir}/simulated_census_2030_ground_truth.parquet"
)
census_2030_ground_truth = df_ops.persist(
    df_ops.read_parquet(census_ground_truth_path)
)

In [None]:
# In this version of pseudopeople, there are no actual duplicates in Census,
# which means all of the duplicates identified above are wrong.
# Verified by checking that dropping duplicate rows does not change the row count.
n_ground_truth_rows = len(census_2030_ground_truth)
n_distinct_ground_truth_rows = len(df_ops.drop_duplicates(census_2030_ground_truth))
assert n_ground_truth_rows == n_distinct_ground_truth_rows

In [None]:
# Stack the ground truth of both reference files. The stored
# `n_unique_simulants` column is dropped from each part before concatenating.
geobase_ground_truth = df_ops.read_parquet(
    f"{case_study_files_dir}/simulated_geobase_reference_file_ground_truth.parquet"
).drop(columns=["n_unique_simulants"])
name_dob_ground_truth = df_ops.read_parquet(
    f"{case_study_files_dir}/simulated_name_dob_reference_file_ground_truth.parquet"
).drop(columns=["n_unique_simulants"])

reference_files_ground_truth = df_ops.persist(
    df_ops.concat(
        [geobase_ground_truth, name_dob_ground_truth],
        ignore_index=True,
    )
)

In [None]:
# However, there can be reference file records that correspond to multiple simulants,
# due to errors in the reference file construction by SSN
# Count distinct simulants per reference record, then tabulate those counts.
simulants_per_record = df_ops.groupby_agg_small_groups(
    reference_files_ground_truth,
    by="record_id",
    agg_func=lambda group: group.simulant_id.nunique(),
)
n_unique_simulants = df_ops.persist(
    simulants_per_record.rename("n_unique_simulants").reset_index()
)
df_ops.compute(n_unique_simulants.n_unique_simulants.value_counts())

In [None]:
# Attach the per-record simulant count to every ground-truth row.
# Note: this overwrites its own input, so re-running the cell merges the
# count column in a second time.
reference_files_ground_truth = df_ops.persist(
    reference_files_ground_truth.merge(
        n_unique_simulants,
        on="record_id",
        how="left",
    )
)
# Preview through df_ops.head, like every other preview in this notebook,
# instead of calling the frame's own .head() (which, on a Dask frame, only
# looks at the first partition).
df_ops.head(reference_files_ground_truth, n=100)

In [None]:
# Preview the reference records that correspond to the largest number of simulants.
max_simulants_per_record = df_ops.compute(
    reference_files_ground_truth.n_unique_simulants.max()
)
has_max_simulants = (
    reference_files_ground_truth.n_unique_simulants == max_simulants_per_record
)
df_ops.head(reference_files_ground_truth[has_max_simulants])

In [None]:
# Flag each census record by whether its simulant appears in any reference file:
# 1 if it does, 0 (filled in after the left merge) if it does not.
simulants_in_reference_files = df_ops.drop_duplicates(
    reference_files_ground_truth[["simulant_id"]]
).assign(possible_to_pik=1)


def _fill_missing_possible_to_pik(df):
    """Replace the nulls left by the left merge with 0."""
    return df.assign(possible_to_pik=df.possible_to_pik.fillna(0))


census_2030_ground_truth = df_ops.persist(
    census_2030_ground_truth.merge(
        simulants_in_reference_files,
        on="simulant_id",
        how="left",
    ).pipe(_fill_missing_possible_to_pik)
)
possible_to_pik_proportion = df_ops.compute(
    census_2030_ground_truth.possible_to_pik.mean()
)
print(
    f"{(1 - possible_to_pik_proportion):.2%} of the input records are "
    "impossible to PIK correctly, since they are not in any reference files"
)

In [None]:
# Ratio of the share of records that were PIKed to the share that could be PIKed.
# Both shares are taken over all census records, so the numerator also counts
# any PIKs assigned to records that are not in a reference file.
print(
    f"Assigned PIKs to {(piked_proportion / possible_to_pik_proportion):.2%} of PIK-able records"
)

In [None]:
# Stack the two simulated reference files (not their ground truth) into one frame.
geobase_reference_file = df_ops.read_parquet(
    f"{case_study_files_dir}/simulated_geobase_reference_file.parquet",
)
name_dob_reference_file = df_ops.read_parquet(
    f"{case_study_files_dir}/simulated_name_dob_reference_file.parquet",
)
reference_file = df_ops.concat(
    [geobase_reference_file, name_dob_reference_file],
    ignore_index=True,
)

In [None]:
# Keep only the record-to-PIK mapping from the reference file, persist it, and display it.
reference_file_piks = df_ops.persist(reference_file[["record_id", "pik"]])
reference_file_piks

In [None]:
# Each reference record_id must appear only once in the record-to-PIK mapping.
n_reference_rows = len(reference_file_piks)
n_distinct_reference_record_ids = len(
    df_ops.drop_duplicates(reference_file_piks[["record_id"]])
)
assert n_reference_rows == n_distinct_reference_record_ids

In [None]:
# Distinct (PIK, simulant) pairs, built by joining each reference record's
# true simulant(s) to that record's PIK.
reference_truth_with_piks = reference_files_ground_truth.merge(
    reference_file_piks, on="record_id"
)
pik_simulant_pairs = df_ops.persist(
    df_ops.drop_duplicates(reference_truth_with_piks[["pik", "simulant_id"]])
)

In [None]:
# However, there can be PIKs that correspond to multiple simulants,
# due to errors in the reference file construction by SSN
# Count distinct simulants per PIK, then tabulate those counts.
simulants_per_pik = df_ops.groupby_agg_small_groups(
    pik_simulant_pairs,
    by="pik",
    agg_func=lambda group: group.simulant_id.nunique(),
)
n_unique_simulants = df_ops.persist(
    simulants_per_pik.rename("n_unique_simulants").reset_index()
)
df_ops.compute(n_unique_simulants.n_unique_simulants.value_counts())

In [None]:
# Attach the per-PIK simulant count to every (PIK, simulant) pair, then display the result.
pairs_with_simulant_counts = pik_simulant_pairs.merge(
    n_unique_simulants, on="pik", how="left"
)
pik_simulant_pairs = df_ops.persist(pairs_with_simulant_counts)
pik_simulant_pairs

In [None]:
# Preview the pairs whose PIK corresponds to the largest number of simulants.
max_simulants_per_pik = df_ops.compute(pik_simulant_pairs.n_unique_simulants.max())
pik_has_max_simulants = pik_simulant_pairs.n_unique_simulants == max_simulants_per_pik
df_ops.head(pik_simulant_pairs[pik_has_max_simulants])

## Definitions of accuracy

1. (most strict) Assigning a PIK that corresponds to multiple simulants is always counted as incorrect
2. Assigning a PIK that corresponds to multiple simulants is counted as neither correct nor incorrect (such assignments are excluded from the denominator)
3. (most lenient) Assigning a PIK that corresponds to multiple simulants is counted as correct, as long as at least one of those simulants matches the truth

In [None]:
# All modules, Medicare database, calculated from Layne, Wagner, and Rothhaas Table 1 (p. 15)
# Accuracy is one minus (sum of the first four counts) / (sum of the second four).
# NOTE(review): presumably these are per-module error counts and record counts -- confirm against the table.
_pvs_error_counts = (2_585, 60_709, 129_480, 89_094)
_pvs_total_counts = (52_406_981, 5_170_924, 49_374_794, 50_327_034)
real_life_pvs_accuracy = 1 - sum(_pvs_error_counts) / sum(_pvs_total_counts)
f"{real_life_pvs_accuracy:.5%}"

### Definition 1

In [None]:
# Number of census rows with a non-null PIK; used as the denominator for definitions 1 and 3.
piks_assigned = df_ops.compute(census_2030_piked.pik.notnull().sum())
piks_assigned

In [None]:
# Preview the (PIK, simulant) pairs whose PIK corresponds to more than one simulant.
df_ops.head(pik_simulant_pairs[pik_simulant_pairs.n_unique_simulants > 1])

In [None]:
def _is_correct_single_sim_pik(df):
    """Flag rows whose PIK has exactly one simulant and that simulant matches.

    After the two merges, `simulant_id_x` comes from `pik_simulant_pairs` and
    `simulant_id_y` from the census ground truth (pandas' default suffixes).
    """
    same_simulant = df.simulant_id_x == df.simulant_id_y
    return same_simulant & (df.n_unique_simulants == 1)


# Count the census records whose assigned PIK is single-simulant and correct.
single_sim_piks_correct = df_ops.compute(
    census_2030_piked[["record_id", "pik"]]
    .merge(pik_simulant_pairs, on="pik")
    .merge(census_2030_ground_truth, on="record_id")
    .pipe(_is_correct_single_sim_pik)
    .sum()
)
single_sim_piks_correct

In [None]:
# Overall accuracy, treating it as a black box
# Definition 1: correct single-simulant PIKs as a share of all PIKs assigned.
(single_sim_piks_correct / piks_assigned)

In [None]:
# Sanity check: the confirmed-PIKs table has as many rows as there are census rows with a non-null PIK.
assert len(confirmed_piks_with_ground_truth) == piks_assigned

In [None]:
# Preview the census ground truth with `record_id` renamed (display only; nothing is assigned).
df_ops.head(
    census_2030_ground_truth.rename(columns={"record_id": "record_id_census_2030"})
)

In [None]:
# Looking at whether the exact *record* linked was from the same simulant
def _is_correct_single_sim_link(df):
    """Flag links to a single-simulant reference record of the same simulant.

    NOTE(review): assumes `simulant_id_x` is the census-side simulant and
    `simulant_id_y` the reference-side one, i.e. that the confirmed-PIKs table
    has no `simulant_id` column of its own -- confirm.
    """
    same_simulant = df.simulant_id_x == df.simulant_id_y
    return same_simulant & (df.n_unique_simulants == 1)


census_truth_by_input_record = census_2030_ground_truth.rename(
    columns={"record_id": "record_id_raw_input_file"}
)
reference_truth_by_reference_record = reference_files_ground_truth.rename(
    columns={"record_id": "record_id_reference_file"}
)
single_sim_record_links_correct = df_ops.compute(
    confirmed_piks_with_ground_truth.merge(
        census_truth_by_input_record,
        on="record_id_raw_input_file",
    )
    .merge(
        reference_truth_by_reference_record,
        on="record_id_reference_file",
    )
    .pipe(_is_correct_single_sim_link)
    .sum()
)
single_sim_record_links_correct

In [None]:
# Definition 1 at the link level: correct single-simulant record links as a share of all PIKs assigned.
(single_sim_record_links_correct / piks_assigned)

### Definition 2

In [None]:
# Definition 2 denominator: census records whose assigned PIK corresponds to
# exactly one simulant.
single_sim_pik_pairs = pik_simulant_pairs[pik_simulant_pairs.n_unique_simulants == 1][
    ["pik", "simulant_id"]
]
# Join explicitly on "pik", like the other merges in this notebook. Without
# `on`, the merge uses every column the two frames share, which would change
# silently if either side gained a column.
single_sim_piks_assigned = len(
    census_2030_piked[["record_id", "pik"]].merge(single_sim_pik_pairs, on="pik")
)
single_sim_piks_assigned

In [None]:
# Overall accuracy, treating it as a black box
# Definition 2: correct single-simulant PIKs as a share of single-simulant PIKs assigned.
(single_sim_piks_correct / single_sim_piks_assigned)

In [None]:
# Looking at whether the exact *record* linked was from the same simulant
# Definition 2 denominator at the link level: joined rows whose reference
# record corresponds to exactly one simulant.
def _count_single_simulant_rows(df):
    """Count the rows whose `n_unique_simulants` equals 1."""
    return (df.n_unique_simulants == 1).sum()


single_sim_record_links_assigned = df_ops.compute(
    confirmed_piks_with_ground_truth.merge(
        reference_files_ground_truth.rename(
            columns={"record_id": "record_id_reference_file"}
        ),
        on="record_id_reference_file",
    ).pipe(_count_single_simulant_rows)
)
single_sim_record_links_assigned

In [None]:
# Definition 2 at the link level: correct single-simulant links as a share of single-simulant links assigned.
(single_sim_record_links_correct / single_sim_record_links_assigned)

### Definition 3

In [None]:
# Display the (PIK, simulant) pairs that the definition 3 calculation starts from.
pik_simulant_pairs

In [None]:
def _mark_pik_matches_truth(df):
    """Add `correct`: whether the PIK-side simulant equals the census-side one."""
    return df.assign(correct=df.simulant_id_x == df.simulant_id_y)


# One row per (census record, candidate simulant of its PIK), with duplicates removed.
assigned_pik_candidates = df_ops.drop_duplicates(
    census_2030_piked[["record_id", "pik"]]
    .merge(pik_simulant_pairs, on="pik")
    .merge(census_2030_ground_truth, on="record_id")
)
# Definition 3: an assigned PIK counts as correct if any of its simulants matches.
any_candidate_correct = df_ops.groupby_agg_small_groups(
    assigned_pik_candidates.pipe(_mark_pik_matches_truth),
    by=["record_id", "pik"],
    agg_func=lambda group: group.correct.any(),
)
piks_at_least_partially_correct = df_ops.persist(any_candidate_correct.reset_index())
piks_at_least_partially_correct

In [None]:
# Overall accuracy, treating it as a black box
# Definition 3: at-least-partially-correct PIKs as a share of all PIKs assigned.
n_piks_at_least_partially_correct = df_ops.compute(
    piks_at_least_partially_correct.correct.sum()
)
piks_correct_proportion = n_piks_at_least_partially_correct / piks_assigned
piks_correct_proportion

In [None]:
# Report the definition 3 accuracy next to the real-life PVS accuracy computed earlier.
print(
    f"{piks_correct_proportion:.5%} of the PIKs assigned were correct; compare with {real_life_pvs_accuracy:.5%} in real life"
)

In [None]:
# Looking at whether the exact *record* linked was from the same simulant
def _mark_link_matches_truth(df):
    """Add `correct`: whether the two merged `simulant_id` columns agree."""
    return df.assign(correct=df.simulant_id_x == df.simulant_id_y)


link_group_keys = [
    "record_id_raw_input_file",
    "record_id_reference_file",
    "pik",
    "module_name",
    "pass_name",
]
links_with_truth = (
    confirmed_piks_with_ground_truth.merge(
        census_2030_ground_truth.rename(
            columns={"record_id": "record_id_raw_input_file"}
        ),
        on="record_id_raw_input_file",
    )
    .merge(
        reference_files_ground_truth.rename(
            columns={"record_id": "record_id_reference_file"}
        ),
        on="record_id_reference_file",
    )
    .pipe(_mark_link_matches_truth)
)
# Definition 3: a link counts as correct if any simulant of the linked
# reference record matches the census record's simulant.
any_link_simulant_correct = df_ops.groupby_agg_small_groups(
    links_with_truth,
    by=link_group_keys,
    agg_func=lambda group: group.correct.any(),
)
sim_record_links_at_least_partially_correct = df_ops.persist(
    any_link_simulant_correct.reset_index()
)
sim_record_links_at_least_partially_correct

In [None]:
# Number of rows in the link-level table (one per group of the keys used above).
len(sim_record_links_at_least_partially_correct)

In [None]:
# Number of distinct (input record, reference record) pairs among the links,
# for comparison with the row count of the link-level table.
distinct_link_pairs = df_ops.drop_duplicates(
    sim_record_links_at_least_partially_correct[
        ["record_id_raw_input_file", "record_id_reference_file"]
    ]
)
len(distinct_link_pairs)

In [None]:
# Definition 3 at the link level: at-least-partially-correct links as a share of all PIKs assigned.
(
    df_ops.compute(sim_record_links_at_least_partially_correct.correct.sum())
    / piks_assigned
)

In [None]:
# Every input record must be linked to at most one distinct reference record.
reference_records_per_input_record = df_ops.groupby_agg_small_groups(
    confirmed_piks_with_ground_truth,
    by="record_id_raw_input_file",
    agg_func=lambda group: group.record_id_reference_file.nunique(),
)
assert df_ops.compute((reference_records_per_input_record <= 1).all())

In [None]:
# Using definition 3 -- at the PIK level
# Attach the module and pass that produced each link to the PIK-level results.
# This overwrites its own input, so the cell is not safe to re-run.
link_provenance = confirmed_piks_with_ground_truth[
    ["record_id_raw_input_file", "module_name", "pass_name"]
]
pik_results_by_input_record = piks_at_least_partially_correct.rename(
    columns={"record_id": "record_id_raw_input_file"}
)
piks_at_least_partially_correct = df_ops.persist(
    pik_results_by_input_record.merge(link_provenance, on="record_id_raw_input_file")
)
piks_at_least_partially_correct

In [None]:
# Accuracy by module -- note that this shows the opposite pattern (with the sample data)
# relative to the results of Layne et al., who found GeoSearch was much *more* accurate
# "mean" is the share of correct PIKs and "size" the number of PIKs per module.
pik_correct_by_module = piks_at_least_partially_correct.groupby("module_name").correct
pik_accuracy_by_module = pik_correct_by_module.agg(["mean", "size"]).sort_values("mean")
df_ops.compute(pik_accuracy_by_module)

In [None]:
# Accuracy by pass -- could be used to tune pass-specific cutoffs, but
# this might not be too informative while we are still using the sample data.
pik_correct_by_pass = piks_at_least_partially_correct.groupby(
    ["module_name", "pass_name"]
).correct
pik_accuracy_by_pass = pik_correct_by_pass.agg(["mean", "size"]).sort_values("mean")
df_ops.compute(pik_accuracy_by_pass)

In [None]:
# Using definition 3 -- at the link level
# Same per-module summary as for PIKs, computed on the link-level table.
link_correct_by_module = sim_record_links_at_least_partially_correct.groupby(
    "module_name"
).correct
link_accuracy_by_module = link_correct_by_module.agg(["mean", "size"]).sort_values(
    "mean"
)
df_ops.compute(link_accuracy_by_module)

In [None]:
# Link-level accuracy ("mean") and link count ("size") for each module and pass.
link_correct_by_pass = sim_record_links_at_least_partially_correct.groupby(
    ["module_name", "pass_name"]
).correct
link_accuracy_by_pass = link_correct_by_pass.agg(["mean", "size"]).sort_values("mean")
df_ops.compute(link_accuracy_by_pass)

In [None]:
# Number of incorrect links for each module and pass, sorted ascending after computing.
is_incorrect_link_row = ~sim_record_links_at_least_partially_correct.correct
incorrect_link_counts = (
    sim_record_links_at_least_partially_correct[is_incorrect_link_row]
    .groupby(["module_name", "pass_name"])
    .size()
)
df_ops.compute(incorrect_link_counts).sort_values()

### Incorrect and missed PIKs

In [None]:
# Distinct (input record, reference record) pairs among the links marked incorrect.
link_is_incorrect = ~sim_record_links_at_least_partially_correct.correct
incorrectly_linked_pairs = df_ops.persist(
    df_ops.drop_duplicates(
        sim_record_links_at_least_partially_correct[link_is_incorrect][
            ["record_id_raw_input_file", "record_id_reference_file"]
        ]
    )
)
incorrectly_linked_pairs

In [None]:
# Number of distinct incorrectly linked record pairs.
len(incorrectly_linked_pairs)

In [None]:
# Take up to 100 incorrectly linked pairs to inspect in detail in the next cells.
incorrect_links = df_ops.head(incorrectly_linked_pairs, n=100)
incorrect_links

In [None]:
# The sample above is all that is used from here on, so delete the full table
# (IPython's %xdel also tries to clear IPython's own references to it).
%xdel incorrectly_linked_pairs

In [None]:
# Attributes shown side by side for the census record and the reference record.
comparison_cols = [
    "first_name",
    "middle_name",
    "last_name",
    "date_of_birth",
    "street_number",
    "street_name",
    "unit_number",
    "city",
    "state",
]

# Census side: rows for the sampled input records, with columns renamed to the
# comparison names (`middle_initial` becomes `middle_name`).
incorrect_census_records = df_ops.compute(
    census_2030_piked[
        census_2030_piked.record_id.isin(incorrect_links.record_id_raw_input_file)
    ]
).rename(
    columns={
        "record_id": "record_id_raw_input_file",
        "middle_initial": "middle_name",
    }
)[
    ["record_id_raw_input_file"] + comparison_cols
]

# Reference side: rows for the sampled reference records, with the
# "mailing_address_" prefix removed from the column names.
incorrect_reference_records = (
    df_ops.compute(
        reference_file[
            reference_file.record_id.isin(incorrect_links.record_id_reference_file)
        ]
    )
    .rename(columns={"record_id": "record_id_reference_file"})
    .rename(columns=lambda c: c.replace("mailing_address_", ""))[
        ["record_id_reference_file"] + comparison_cols
    ]
)

# The first merge adds the census attributes unsuffixed; the second merge then
# suffixes the overlapping columns as "_census" / "_reference_file".
incorrect_links_detail = incorrect_links.merge(
    incorrect_census_records,
    on="record_id_raw_input_file",
    how="left",
).merge(
    incorrect_reference_records,
    on="record_id_reference_file",
    how="left",
    suffixes=("_census", "_reference_file"),
)


def flatten(xss):
    """Return one flat list holding the items of each iterable in ``xss``, in order."""
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat


# Show each attribute's census value next to its reference-file value
# (columns interleaved as <col>_census, <col>_reference_file).
incorrect_links_detail[
    flatten([(f"{c}_census", f"{c}_reference_file") for c in comparison_cols])
]

In [None]:
# Missed links: census records without a PIK, paired with every single-simulant
# reference record belonging to the same simulant.
unpiked_record_ids = census_2030_piked[census_2030_piked.pik.isnull()][["record_id"]]
single_simulant_reference_truth = reference_files_ground_truth[
    reference_files_ground_truth.n_unique_simulants == 1
]
missed_links = df_ops.persist(
    unpiked_record_ids.merge(census_2030_ground_truth, on="record_id").merge(
        single_simulant_reference_truth,
        on="simulant_id",
        suffixes=("_census", "_reference_file"),
    )
)

In [None]:
# Number of (un-PIKed census record, matching reference record) pairs.
len(missed_links)

In [None]:
# Unique simulant IDs among the first (up to) 100 missed-link rows.
missed_simulant_sample = df_ops.head(missed_links[["simulant_id"]], n=100)
simulants_missed = missed_simulant_sample.simulant_id.unique()
simulants_missed

In [None]:
# Materialize every missed link that belongs to one of the sampled simulants.
missed_pairs = df_ops.compute(
    missed_links[missed_links.simulant_id.isin(list(simulants_missed))]
)
missed_pairs

In [None]:
# Only the sampled pairs are used from here on, so delete the full table
# (IPython's %xdel also tries to clear IPython's own references to it).
%xdel missed_links

In [None]:
# Census side: rows for the sampled un-PIKed records, with columns renamed to
# line up with the comparison names.
missed_census_records = df_ops.compute(
    census_2030_piked[
        census_2030_piked.record_id.isin(list(missed_pairs.record_id_census))
    ]
).rename(columns={"record_id": "record_id_census", "middle_initial": "middle_name"})

# Reference side: rows for the matching reference records, with the
# "mailing_address_" prefix removed from the column names.
missed_reference_records = (
    df_ops.compute(
        reference_file[
            reference_file.record_id.isin(missed_pairs.record_id_reference_file)
        ]
    )
    .rename(columns=lambda c: c.replace("mailing_address_", ""))
    .rename(columns={"record_id": "record_id_reference_file"})
)

# Columns present on both sides of the second merge are suffixed
# "_census" / "_reference_file".
missed_links_detail = missed_pairs.merge(
    missed_census_records,
    on="record_id_census",
).merge(
    missed_reference_records,
    on="record_id_reference_file",
    suffixes=("_census", "_reference_file"),
)

In [None]:
# For each sampled simulant, print its ID and show the census and
# reference-file attributes of its missed links side by side.
missed_detail_columns = ["simulant_id"] + flatten(
    [(f"{c}_census", f"{c}_reference_file") for c in comparison_cols]
)
for simulant in simulants_missed:
    print(simulant)
    rows_for_simulant = missed_links_detail[missed_links_detail.simulant_id == simulant]
    display(rows_for_simulant[missed_detail_columns])