# **PVS-like case study: sample data**

In [1]:
import re
import pandas as pd, numpy as np

# Load data

See code in `generate_simulated_data` directory for how we generated the files to link

In [2]:
reference_file = pd.read_parquet('reference_file_sample.parquet')
census_2030 = pd.read_parquet('census_2030_sample.parquet')

# Pre-process data

Not much needed here because the datasets are already so tidy and similar to each other.

In [3]:
# Use NaN for all forms of missingness, including empty string
reference_file = reference_file.fillna(np.nan).replace('', np.nan)
census_2030 = census_2030.fillna(np.nan).replace('', np.nan)

In [4]:
# We want to compare mailing address with physical address
reference_file = reference_file.rename(columns=lambda c: c.replace('mailing_address_', ''))

In [5]:
# My working theory: the "geokey" exists because the individual address parts violate conditional independence
def get_geokey(x):
    """Build a single-string address key ("geokey") for every row of ``x``.

    The street number, street name, unit number, city, state and ZIP code
    columns of ``x`` are concatenated in that order, separated by single
    spaces. A missing unit number is treated as an empty string. The result is
    stripped and any run of whitespace is collapsed to one space.
    """
    address_parts = [
        x.street_number,
        x.street_name,
        x.unit_number.fillna(''),
        x.city,
        x.state.astype(str),
        x.zipcode,
    ]
    geokey = address_parts[0]
    for part in address_parts[1:]:
        geokey = geokey + ' ' + part
    # Splitting on whitespace and re-joining removes the double space left by an empty unit number
    return geokey.str.strip().str.split().str.join(' ')
reference_file = reference_file.assign(geokey=get_geokey)
census_2030 = census_2030.assign(geokey=get_geokey)

In [6]:
# Add columns used to "cut the database": ZIP3 and a grouping of first and last initial
reference_file = reference_file.assign(zip3=lambda x: x.zipcode.str[:3])
census_2030 = census_2030.assign(zip3=lambda x: x.zipcode.str[:3])

# Page 20 of the NORC report: "Name-cuts are defined by combinations of the first characters of the first and last names. The twenty letter groupings
# for the first character are: A-or-blank, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, and U-Z."
def initial_cut(x):
    """Map each name in the Series ``x`` to the name-cut group of its first character.

    Missing names are grouped with 'A' as 'A-or-blank', the initials U through Z
    are pooled into 'U-Z', and every other first character is returned unchanged.
    """
    # Filling missing names with 'A' puts them in the same cut as names starting with A
    first_chars = x.fillna('A').str[0]
    first_chars = first_chars.replace('A', 'A-or-blank')
    return first_chars.replace(['U', 'V', 'W', 'X', 'Y', 'Z'], 'U-Z')
reference_file = reference_file.assign(first_initial_cut=lambda x: initial_cut(x.first_name), last_initial_cut=lambda x: initial_cut(x.last_name))
census_2030 = census_2030.assign(first_initial_cut=lambda x: initial_cut(x.first_name), last_initial_cut=lambda x: initial_cut(x.last_name))

# Data, ready to link

Note: I have not yet introduced alternate names and dates of birth here.

In [7]:
reference_file

Unnamed: 0,record_id,pik,ssn,date_of_birth,first_name,middle_initial,last_name,date_of_death,street_number,street_name,unit_number,po_box,city,state,zipcode,geokey,zip3,first_initial_cut,last_initial_cut
0,0,0,001-02-4588,08/08/2008,Isabella,G,Windom,,685,emerson st,,,Anytown,US,00000,685 emerson st Anytown US 00000,000,I,U-Z
1,1,1,001-15-8330,05/04/1976,Gerald,J,Beckham,,5010,south doctor martin luther king jr dr,,,Anytown,US,00000,5010 south doctor martin luther king jr dr Any...,000,G,B
2,2,2,001-16-0077,02/07/1970,Jerald,J,Alvarez,,,,,,,,,,,J,A-or-blank
3,3,3,001-17-9511,11/20/1966,Teresa,A,Togni,,150,s sheldon rd,,,Anytown,US,00000,150 s sheldon rd Anytown US 00000,000,T,T
4,4,4,001-25-8258,06/29/2026,Bethany,G,Tenorio,,,,,,,,,,,B,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18521,18521,18521,976-30-9537,06/12/1976,Aron,C,Frausto Ferretiz,,,,,,,,,,,A-or-blank,F
18522,18522,18522,978-78-6109,05/22/1963,Claude,M,Page,,,,,,,,,,,C,P
18523,18523,18523,979-44-7835,08/01/1979,Thomas,A,Martinez-Puentes,,,,,,,,,,,T,M
18524,18524,18524,998-22-9577,04/17/2002,Jeffery,P,Shaw,,,,,,,,,,,J,S


In [8]:
census_2030

Unnamed: 0,record_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relation_to_reference_person,sex,race_ethnicity,geokey,zip3,first_initial_cut,last_initial_cut
0,0,John,E,Mcueever,86,06/29/1942,147-153,browning ave,,Anytown,US,00000,Reference person,Male,Black,147-153 browning ave Anytown US 00000,000,J,M
1,1,Sharon,T,Schmidt,69,10/50/1960,109,stqllion sr,,Anytown,US,00000,Reference person,Female,White,109 stqllion sr Anytown US 00000,000,S,S
2,2,Gail,K,Durand,77,01/03/1953,2115,cannon dr,,Anytown,US,00000,Reference person,Female,Multiracial or Other,2115 cannon dr Anytown US 00000,000,G,D
3,3,John,J,Williams,81,11/24/1948,146,delaware av,,Anytown,US,00000,Reference person,Male,White,146 delaware av Anytown US 00000,000,J,U-Z
4,4,Child,L,Wukliamz,81,09/27/1948,146,delaware av,,Anytown,US,00000,Opp-sex spouse,Female,White,146 delaware av Anytown US 00000,000,C,U-Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11048,11048,Chloe,A,Maryknez-Alvarez,21,07/12/2008,207,harrison st,,Anytown,US,00000,Biological child,Female,Latino,207 harrison st Anytown US 00000,000,C,M
11049,11049,Zachary,E,Martinez-Alvarez,18,06/29/2011,207,harrison st,,Anytown,US,00000,Biological child,Male,,207 harrison st Anytown US 00000,000,U-Z,M
11050,11050,Madeline,A,Martinez-Alvarez,16,08/12/2013,207,harrison st,,Anytown,US,00000,Biological child,Female,Latino,207 harrison st Anytown US 00000,000,M,M
11051,11051,Naomi,A,Martinez-Aldarez,1,11/01/2028,207,harrison st,,Anytown,US,00000,Grandchild,Female,Latino,207 harrison st Anytown US 00000,000,N,M


# Implement PVS-like matching with `splink`

## Estimate parameters (lambda, m, u) once for both modules

In the real PVS process, these parameters are not estimated from the data.
It is unclear to me whether they are actually the same for both modules, or even for different passes of the same module.

In [9]:
common_cols = [c for c in reference_file.columns if c in census_2030.columns]
common_cols

['record_id',
 'date_of_birth',
 'first_name',
 'middle_initial',
 'last_name',
 'street_number',
 'street_name',
 'unit_number',
 'city',
 'state',
 'zipcode',
 'geokey',
 'zip3',
 'first_initial_cut',
 'last_initial_cut']

In [10]:
def prep_table_for_splink(df, dataset_name):
    """Return the shared columns of ``df`` plus a constant ``dataset_name`` column.

    Only the columns listed in the notebook-level ``common_cols`` are kept, and
    a new ``dataset_name`` column holding the given label is added to every row.
    """
    shared_columns = df[common_cols]
    labelled = shared_columns.assign(dataset_name=dataset_name)
    return labelled

tables_for_splink = [prep_table_for_splink(reference_file, "reference_file"), prep_table_for_splink(census_2030, "census_2030")]

In [11]:
[len(t) for t in tables_for_splink]

[18526, 11053]

In [12]:
# splink's estimate_probability_two_random_records_match did not seem to give a reasonable estimate,
# so the prior is computed by hand: we estimate that around 90% of the census records
# are present in the reference file.
expected_fraction_of_census_in_reference = 0.90
expected_true_matches = expected_fraction_of_census_in_reference * len(census_2030)
possible_record_pairs = len(reference_file) * len(census_2030)
probability_two_random_records_match = expected_true_matches / possible_record_pairs
probability_two_random_records_match

4.858037352909425e-05

In [13]:
from splink.spark.comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
)

settings = {
    "link_type": "link_only",
    "comparisons": [
        levenshtein_at_thresholds("first_name", 2, term_frequency_adjustments=True),
        exact_match("middle_initial"),
        levenshtein_at_thresholds("last_name", 2, term_frequency_adjustments=True),
        # For some reason, this makes everything crash!?
        # levenshtein_at_thresholds("date_of_birth", 1),
        exact_match("date_of_birth"),
        levenshtein_at_thresholds("geokey", 5),
    ],
    "probability_two_random_records_match": probability_two_random_records_match,
    "unique_id_column_name": "record_id",
}

# https://moj-analytical-services.github.io/splink/demos/examples/spark/deduplicate_1k_synthetic.html
from splink.spark.jar_location import similarity_jar_location

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import types

conf = SparkConf()
conf.setMaster("local[2]")
conf.set("spark.driver.memory", "12g")
conf.set("spark.default.parallelism", "2")

# Add custom similarity functions, which are bundled with Splink
# documented here: https://github.com/moj-analytical-services/splink_scalaudfs
path = similarity_jar_location()
conf.set("spark.jars", path)

sc = SparkContext.getOrCreate(conf=conf)

spark = SparkSession(sc)
spark.sparkContext.setCheckpointDir("./tmp_checkpoints")

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import types
# Explicit Spark schema for the prepared tables: a column named "unique_id" would be
# typed as an integer and every other column as a nullable string.
# NOTE(review): the ID column configured in `settings` is "record_id", not "unique_id",
# so as written every column (the ID included) becomes StringType -- confirm this is intended.
schema = types.StructType([types.StructField(c, types.IntegerType() if c == "unique_id" else types.StringType(), True) for c in tables_for_splink[0].columns])
# Spark copies of the pandas tables, built with the schema above.
# NOTE(review): `spark_tables` is not used in the visible cells; the linker below is given
# the pandas `tables_for_splink` instead -- confirm which of the two was intended.
spark_tables = [spark.createDataFrame(df, schema) for df in tables_for_splink]

from splink.spark.linker import SparkLinker
# Linker used for the parameter estimation below. The aliases are listed in the same
# order as the entries of `tables_for_splink` (reference file first, census second).
linker = SparkLinker(
    tables_for_splink,
    settings,
    input_table_aliases=["reference_file", "census_2030"],
    spark=spark,
)

import warnings
# PySpark triggers a lot of Pandas warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# linker = DuckDBLinker(
#     tables_for_splink,
#     settings,
#     input_table_aliases=["reference_file", "census_2030"]
# )

23/09/13 19:42:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:
  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:


In [14]:
# So it turns out that even when using Spark, splink uses DuckDB a little bit
# I get a totally bizarre error that only happens the first time DuckDB is called
# This "flushes it out"?!?
# I have no clue why this is happening and didn't want to spend the time to investigate.
# May be a duckdb bug, or something really weird about my environment.
import duckdb

# The queries below refer to this relation by its Python variable name, so keep it as `r1`.
r1 = duckdb.sql('SELECT 42 AS i')

try:
    duckdb.sql('SELECT i * 2 AS k FROM r1').show()
except duckdb.InvalidInputException:
    # The failure seen on the first DuckDB call: report it and carry on.
    print('Hit the error!')

# Second query against the same relation, run after the failure has been triggered once.
duckdb.sql('SELECT i * 3 AS k FROM r1').show()

Hit the error!
┌───────┐
│   k   │
│ int32 │
├───────┤
│   126 │
└───────┘



In [15]:
# NOTE: This is not reproducible!
# The u probabilities are estimated from a random sample of record pairs (up to max_pairs).
# NOTE(review): newer splink releases appear to accept a `seed` argument for this call --
# confirm against the installed version before relying on it for reproducibility.
linker.estimate_u_using_random_sampling(max_pairs=1e5)

# First EM session: block on exact first and last name. Per the training log, estimates are
# made for the comparisons that are not part of the blocking rule.
# fix_probability_two_random_records_match=True presumably keeps the hand-computed prior
# fixed during EM -- TODO confirm in the splink documentation.
blocking_rule_for_training = "l.first_name = r.first_name and l.last_name = r.last_name"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, fix_probability_two_random_records_match=True)

# Second EM session: block on exact geokey instead (the log shows the geokey comparison
# deactivated for this session), which lets the name comparisons be trained.
blocking_rule_for_training = "l.geokey = r.geokey"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, fix_probability_two_random_records_match=True)

----- Estimating u probabilities using random sampling -----
23/09/13 19:43:04 WARN DataSource: All paths were ignored:                      
  file:/ihme/homes/zmbc/src/vivarium_research_prl/linkage/pvs_like_case_study/tmp_checkpoints/0f72b038-f695-40c9-b1a7-e35b39d3832c/__splink__df_concat_with_tf_131ef4ebb
                                                                                
Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - middle_initial (no m values are trained).
    - last_name (no m values are trained).
    - date_of_birth (no m values are trained).
    - geokey (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name and l.last_name = r.last_name

Parameter estimates will be made for the following comparison(s):
    - middle_initial
    - date_of_birth
    - ge

<EMTrainingSession, blocking on l.geokey = r.geokey, deactivating comparisons geokey>

In [16]:
linker.match_weights_chart()

In [17]:
# NOTE: EM appears to be finding people in the same family instead of the same person!
# See first_name m probabilities.
# For now, I address this by almost always blocking on first name.
# More experimentation needed to get reasonable values here.
linker.m_u_parameters_chart()

In [18]:
linker.parameter_estimate_comparisons_chart()

In [19]:
splink_settings = linker._settings_obj.as_dict()

In [20]:
PROBABILITY_THRESHOLD = 0.85

In [21]:
# Save these variables; this means that if you restart the kernel, you don't need to run this first part of the notebook again.
%store splink_settings PROBABILITY_THRESHOLD

Stored 'splink_settings' (dict)
Stored 'PROBABILITY_THRESHOLD' (float)


## Implement matching passes

In [22]:
# Calculate this once to save time -- mapping from record_id to index of each dataframe
# NOTE(review): `reference_file_index_of_ids` is not used in the visible cells; only
# `census_index_of_ids` is read by pvs_matching_pass -- confirm whether it is needed.
reference_file_index_of_ids = reference_file.reset_index().set_index('record_id')['index']
census_index_of_ids = census_2030.reset_index().set_index('record_id')['index']

# TODO: Have this function output more charts and diagnostics
def pvs_matching_pass(blocking_cols):
    """Run one matching pass and record the resulting PIKs on ``census_2030``.

    Census records whose ``pik`` is still null are linked against the whole
    reference file with a new SparkLinker that reuses ``splink_settings`` and a
    single prediction blocking rule requiring equality on every column in
    ``blocking_cols``. Links at or above ``PROBABILITY_THRESHOLD`` are kept.

    Parameters
    ----------
    blocking_cols
        Column names; each one becomes an ``l.<col> = r.<col>`` condition and
        the conditions are joined with ``and``.

    Returns
    -------
    (all_combos, links)
        ``all_combos``: mean and count of ``match_probability`` for every
        combination of ``gamma_*`` values over all predictions (no threshold),
        sorted by mean.
        ``links``: one row per matched census record with columns
        ``record_id_census_2030`` and ``pik``.

    Side effects
    ------------
    Writes the assigned PIKs into the global ``census_2030['pik']`` (which must
    already exist) and prints progress messages. A census record that matches
    more than one distinct PIK is given a PIK of -1 ("not linkable").
    """
    # Only census records without a PIK yet take part in this pass
    tables_for_splink = [prep_table_for_splink(reference_file, "reference_file"), prep_table_for_splink(census_2030[census_2030.pik.isnull()], "census_2030")]

    blocking_rule_parts = [f"l.{col} = r.{col}" for col in blocking_cols]
    blocking_rule = " and ".join(blocking_rule_parts)
    linker = SparkLinker(
        tables_for_splink,
        {**splink_settings, **{
            "blocking_rules_to_generate_predictions": [blocking_rule],
        }},
        # Must match order of tables_for_splink
        input_table_aliases=["reference_file", "census_2030"],
        spark=spark,
    )

    potential_links = (
        linker.predict(threshold_match_probability=PROBABILITY_THRESHOLD)
            .as_pandas_dataframe()
    )
    # Name the columns better than "_r" and "_l"
    # In practice it seems to always be one dataset on the right and another on the left,
    # but it's "backwards" relative to the order above and I don't want to rely on it
    potential_links_census_left = potential_links[potential_links.source_dataset_l == 'census_2030']
    assert (potential_links_census_left.source_dataset_r == 'reference_file').all()
    potential_links_census_left = (
        potential_links_census_left
            .rename(columns=lambda c: re.sub('_l$', '_census_2030', c))
            .rename(columns=lambda c: re.sub('_r$', '_reference_file', c))
    )

    potential_links_reference_left = potential_links[potential_links.source_dataset_l == 'reference_file']
    assert (potential_links_reference_left.source_dataset_r == 'census_2030').all()
    potential_links_reference_left = (
        potential_links_reference_left
            .rename(columns=lambda c: re.sub('_l$', '_reference_file', c))
            .rename(columns=lambda c: re.sub('_r$', '_census_2030', c))
    )

    # Every link must fall into exactly one of the two orientations handled above
    assert len(potential_links) == len(potential_links_census_left) + len(potential_links_reference_left)
    potential_links = pd.concat([potential_links_census_left, potential_links_reference_left], ignore_index=True)

    print(f'{len(potential_links)} links above threshold')

    # Post-processing: deal with multiple matches
    # According to the report, a record is considered not linkable if it has multiple matches above the threshold
    # I represent "not linkable" here with a PIK of -1 (different from NaN, which means yet-to-be-linked)
    # Look up the PIK of each linked reference-file record
    potential_links = potential_links.merge(reference_file[['record_id', 'pik']], left_on='record_id_reference_file', right_on='record_id', how='left').drop(columns=['record_id'])
    print(f'{potential_links.record_id_census_2030.nunique()} input records have a match')
    census_records_with_multiple_potential_piks = potential_links.groupby('record_id_census_2030').pik.nunique().pipe(lambda c: c[c > 1]).index
    if len(census_records_with_multiple_potential_piks) > 0:
        print(f'{len(census_records_with_multiple_potential_piks)} input records matched to multiple PIKs, marking as unlinkable')

    potential_links.loc[potential_links.record_id_census_2030.isin(census_records_with_multiple_potential_piks), 'pik'] = -1

    # After the -1 overwrite, each census record maps to exactly one PIK value
    assert (potential_links.groupby('record_id_census_2030').pik.nunique() == 1).all()
    links = potential_links.groupby('record_id_census_2030').pik.first().reset_index()
    # Translate record_ids to index labels of the global census_2030 and store the PIKs there (in place)
    census_2030.loc[census_index_of_ids.loc[links.record_id_census_2030], 'pik'] = links.pik.values

    # NOTE(review): `links` also contains the records marked -1 (unlinkable), so the
    # "Matched" count printed here includes them.
    print(f'Matched {len(links)} records; {census_2030.pik.isnull().mean():.2%} still eligible to match')
    
    # Diagnostic showing the predicted values for each combination of column similarity values
    # NOTE(review): this runs predict() a second time, now without a threshold.
    all_predictions = linker.predict().as_pandas_dataframe()
    all_combos = all_predictions.groupby(list(all_predictions.filter(like='gamma_').columns)).match_probability.agg(['mean', 'count']).sort_values('mean')
    
    return all_combos, links

# GeoSearch

> There are six passes through GeoSearch defined currently for an ACS PVS run. These passes use the first
  three digits of an address ZIP code (ZIP3) as a database “cutting” strategy...
>
> The GeoSearch matching
  variables include name and DOB, but also several variables derived from the Geokey (street name, house
  number, etc).

[(source)](https://www.norc.org/PDFs/May%202011%20Personal%20Validation%20and%20Entity%20Resolution%20Conference/PVS%20Assessment%20Report%20FINAL%20JULY%202011.pdf)

In [23]:
def geosearch_pass(blocking_cols):
    """Run one GeoSearch pass: block on ZIP3 in addition to ``blocking_cols``.

    ``"zip3"` is placed in front of the given list of column names and the
    combined list is handed to ``pvs_matching_pass``, whose result is returned.
    """
    cols_with_zip3_cut = ["zip3"] + blocking_cols
    return pvs_matching_pass(cols_with_zip3_cut)

## Pass 1: block on full name and entire address

In [24]:
census_2030['pik'] = np.nan

In [25]:
all_combos, pik_pairs = geosearch_pass(["first_name", "middle_initial", "last_name", "geokey"])

23/09/13 19:44:32 WARN SimpleFunctionRegistry: The function jaro_sim replaced a previously registered function.
23/09/13 19:44:32 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.
23/09/13 19:44:32 WARN SimpleFunctionRegistry: The function jaccard replaced a previously registered function.
23/09/13 19:44:32 WARN SimpleFunctionRegistry: The function cosine_distance replaced a previously registered function.
23/09/13 19:44:32 WARN SimpleFunctionRegistry: The function damerau_levenshtein replaced a previously registered function.
23/09/13 19:44:32 WARN SimpleFunctionRegistry: The function dmetaphone replaced a previously registered function.
23/09/13 19:44:32 WARN SimpleFunctionRegistry: The function dmetaphonealt replaced a previously registered function.
23/09/13 19:44:32 WARN SimpleFunctionRegistry: The function qgramtokeniser replaced a previously registered function.
23/09/13 19:44:36 WARN DataSource: All paths were ignored:            

3568 links above threshold
3568 input records have a match
Matched 3568 records; 67.72% still eligible to match


23/09/13 19:44:46 WARN DataSource: All paths were ignored:                      
  file:/ihme/homes/zmbc/src/vivarium_research_prl/linkage/pvs_like_case_study/tmp_checkpoints/0f72b038-f695-40c9-b1a7-e35b39d3832c/__splink__df_predict_fd23e90ed


### Look at diagnostics

In [26]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,2,0,2,0.99988,430
2,1,2,-1,2,0.999976,122
2,1,2,1,2,1.0,3016


In [27]:
pik_pairs

Unnamed: 0,record_id_census_2030,pik
0,3,18337
1,6,2021
2,12,10198
3,19,7422
4,22,9496
...,...,...
3563,11035,5058
3564,11037,18078
3565,11046,643
3566,11047,17273


## Pass 2: Block on first name and entire address

In [28]:
all_combos, pik_pairs = geosearch_pass(["first_name", "geokey"])

23/09/13 19:44:49 WARN SimpleFunctionRegistry: The function jaro_sim replaced a previously registered function.
23/09/13 19:44:49 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.
23/09/13 19:44:49 WARN SimpleFunctionRegistry: The function jaccard replaced a previously registered function.
23/09/13 19:44:49 WARN SimpleFunctionRegistry: The function cosine_distance replaced a previously registered function.
23/09/13 19:44:49 WARN SimpleFunctionRegistry: The function damerau_levenshtein replaced a previously registered function.
23/09/13 19:44:49 WARN SimpleFunctionRegistry: The function dmetaphone replaced a previously registered function.
23/09/13 19:44:49 WARN SimpleFunctionRegistry: The function dmetaphonealt replaced a previously registered function.
23/09/13 19:44:49 WARN SimpleFunctionRegistry: The function qgramtokeniser replaced a previously registered function.
23/09/13 19:44:53 WARN DataSource: All paths were ignored:            

1051 links above threshold
1049 input records have a match
2 input records matched to multiple PIKs, marking as unlinkable
Matched 1049 records; 58.23% still eligible to match


23/09/13 19:45:01 WARN DataSource: All paths were ignored:                      
  file:/ihme/homes/zmbc/src/vivarium_research_prl/linkage/pvs_like_case_study/tmp_checkpoints/0f72b038-f695-40c9-b1a7-e35b39d3832c/__splink__df_predict_ca3316db5


### Look at diagnostics

In [29]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,0,0,2,0.004398,329
2,-1,0,0,2,0.012271,9
2,0,0,-1,2,0.033917,11
2,0,-1,0,2,0.129059,13
2,1,0,0,2,0.223769,64
2,-1,-1,-1,2,0.425618,1
2,1,0,-1,2,0.59471,4
2,0,1,0,2,0.814856,6
2,1,-1,0,2,0.910198,20
2,1,-1,-1,2,0.911966,1


In [30]:
pik_pairs

Unnamed: 0,record_id_census_2030,pik
0,0,6829
1,14,907
2,24,1469
3,26,9816
4,35,12330
...,...,...
1044,11017,5591
1045,11019,2624
1046,11020,10308
1047,11042,13168


## Pass 3: Block on full name and street address

In [31]:
all_combos, pik_pairs = geosearch_pass(["first_name", "middle_initial", "last_name", "street_number", "street_name"])

23/09/13 19:45:04 WARN SimpleFunctionRegistry: The function jaro_sim replaced a previously registered function.
23/09/13 19:45:04 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.
23/09/13 19:45:04 WARN SimpleFunctionRegistry: The function jaccard replaced a previously registered function.
23/09/13 19:45:04 WARN SimpleFunctionRegistry: The function cosine_distance replaced a previously registered function.
23/09/13 19:45:04 WARN SimpleFunctionRegistry: The function damerau_levenshtein replaced a previously registered function.
23/09/13 19:45:04 WARN SimpleFunctionRegistry: The function dmetaphone replaced a previously registered function.
23/09/13 19:45:04 WARN SimpleFunctionRegistry: The function dmetaphonealt replaced a previously registered function.
23/09/13 19:45:04 WARN SimpleFunctionRegistry: The function qgramtokeniser replaced a previously registered function.
23/09/13 19:45:07 WARN DataSource: All paths were ignored:            

292 links above threshold
292 input records have a match
Matched 292 records; 55.59% still eligible to match


23/09/13 19:45:16 WARN DataSource: All paths were ignored:                      
  file:/ihme/homes/zmbc/src/vivarium_research_prl/linkage/pvs_like_case_study/tmp_checkpoints/0f72b038-f695-40c9-b1a7-e35b39d3832c/__splink__df_predict_39e6d37af


### Look at diagnostics

In [32]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,2,-1,0,0.469222,1
2,1,2,0,-1,0.99302,10
2,1,2,-1,-1,0.997456,4
2,1,2,0,1,0.999907,32
2,1,2,-1,1,0.999964,5
2,1,2,1,0,0.999986,5
2,1,2,1,-1,1.0,60
2,1,2,1,1,1.0,176


In [33]:
pik_pairs

Unnamed: 0,record_id_census_2030,pik
0,18,2575
1,60,13291
2,74,8705
3,106,5087
4,132,16060
...,...,...
287,10902,1881
288,10923,16689
289,10967,7650
290,11012,13714


## Pass 4: Block on first name and street address

In [34]:
all_combos, pik_pairs = geosearch_pass(["first_name", "street_number", "street_name"])

23/09/13 19:45:18 WARN SimpleFunctionRegistry: The function jaro_sim replaced a previously registered function.
23/09/13 19:45:18 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.
23/09/13 19:45:18 WARN SimpleFunctionRegistry: The function jaccard replaced a previously registered function.
23/09/13 19:45:18 WARN SimpleFunctionRegistry: The function cosine_distance replaced a previously registered function.
23/09/13 19:45:18 WARN SimpleFunctionRegistry: The function damerau_levenshtein replaced a previously registered function.
23/09/13 19:45:18 WARN SimpleFunctionRegistry: The function dmetaphone replaced a previously registered function.
23/09/13 19:45:18 WARN SimpleFunctionRegistry: The function dmetaphonealt replaced a previously registered function.
23/09/13 19:45:18 WARN SimpleFunctionRegistry: The function qgramtokeniser replaced a previously registered function.
23/09/13 19:45:21 WARN DataSource: All paths were ignored:            

73 links above threshold
73 input records have a match
Matched 73 records; 54.93% still eligible to match


23/09/13 19:45:28 WARN DataSource: All paths were ignored:                      
  file:/ihme/homes/zmbc/src/vivarium_research_prl/linkage/pvs_like_case_study/tmp_checkpoints/0f72b038-f695-40c9-b1a7-e35b39d3832c/__splink__df_predict_6176677fc
                                                                                

### Look at diagnostics

In [35]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,0,0,0,5.940343e-07,1
2,0,0,0,-1,3.703424e-05,6
2,0,0,0,1,0.002089388,11
2,1,0,0,-1,0.002330642,2
2,0,0,0,2,0.004055876,201
2,-1,0,0,2,0.004654058,2
2,0,0,-1,1,0.004942786,2
2,0,0,-1,2,0.04407394,7
2,1,0,0,1,0.1004593,6
2,0,-1,0,2,0.1212065,4


In [36]:
pik_pairs

Unnamed: 0,record_id_census_2030,pik
0,166,12042
1,229,7798
2,430,1537
3,549,4607
4,688,5915
...,...,...
68,10631,14000
69,10665,14512
70,10741,15077
71,10796,11136


## Pass 5: Block on first and last name

In [37]:
all_combos, pik_pairs = geosearch_pass(["first_name", "last_name"])

23/09/13 19:45:30 WARN SimpleFunctionRegistry: The function jaro_sim replaced a previously registered function.
23/09/13 19:45:30 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.
23/09/13 19:45:30 WARN SimpleFunctionRegistry: The function jaccard replaced a previously registered function.
23/09/13 19:45:30 WARN SimpleFunctionRegistry: The function cosine_distance replaced a previously registered function.
23/09/13 19:45:30 WARN SimpleFunctionRegistry: The function damerau_levenshtein replaced a previously registered function.
23/09/13 19:45:30 WARN SimpleFunctionRegistry: The function dmetaphone replaced a previously registered function.
23/09/13 19:45:30 WARN SimpleFunctionRegistry: The function dmetaphonealt replaced a previously registered function.
23/09/13 19:45:30 WARN SimpleFunctionRegistry: The function qgramtokeniser replaced a previously registered function.
23/09/13 19:45:33 WARN DataSource: All paths were ignored:            

1441 links above threshold
1439 input records have a match
2 input records matched to multiple PIKs, marking as unlinkable
Matched 1439 records; 41.91% still eligible to match


23/09/13 19:45:41 WARN DataSource: All paths were ignored:                      
  file:/ihme/homes/zmbc/src/vivarium_research_prl/linkage/pvs_like_case_study/tmp_checkpoints/0f72b038-f695-40c9-b1a7-e35b39d3832c/__splink__df_predict_ebeab9aa8


### Look at diagnostics

In [38]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,2,0,0,0.009866,35
2,0,2,-1,0,0.273351,3
2,0,2,0,-1,0.285417,22
2,1,2,0,0,0.4909,16
2,-1,2,0,-1,0.687806,4
2,1,2,-1,0,0.771658,4
2,0,2,0,2,0.839669,1
2,1,2,0,-1,0.982915,66
2,0,2,0,1,0.990794,12
2,1,2,-1,-1,0.995233,22


In [39]:
pik_pairs

Unnamed: 0,record_id_census_2030,pik
0,9,13649
1,11,18255
2,23,18202
3,28,13138
4,30,15889
...,...,...
1434,10989,596
1435,11003,13987
1436,11013,4139
1437,11015,10808


In [40]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,0,2,0,0,0.009866,35
2,0,2,-1,0,0.273351,3
2,0,2,0,-1,0.285417,22
2,1,2,0,0,0.4909,16
2,-1,2,0,-1,0.687806,4
2,1,2,-1,0,0.771658,4
2,0,2,0,2,0.839669,1
2,1,2,0,-1,0.982915,66
2,0,2,0,1,0.990794,12
2,1,2,-1,-1,0.995233,22


In [41]:
pik_pairs

Unnamed: 0,record_id_census_2030,pik
0,9,13649
1,11,18255
2,23,18202
3,28,13138
4,30,15889
...,...,...
1434,10989,596
1435,11003,13987
1436,11013,4139
1437,11015,10808


# NameSearch

>    The NameSearch module, by contrast, does not use any geographic variables for matching. Only the
>    Name and DOB are used to match. There are four NameSearch passes defined for the ACS. All passes
>    use the first characters of the First and Last names to define cuts...

In [42]:
def namesearch_pass(blocking_cols):
    """Run a single NameSearch matching pass.

    Every NameSearch pass is cut on the first- and last-initial groupings, so
    those two columns are always placed ahead of the pass-specific blocking
    columns before delegating to ``pvs_matching_pass``.

    Parameters
    ----------
    blocking_cols : list of str
        Pass-specific blocking columns. Must be a ``list``, because it is
        concatenated onto the list of cut columns with ``+``.

    Returns
    -------
    Whatever ``pvs_matching_pass`` returns; the calls in this notebook unpack
    it as ``(all_combos, pik_pairs)``.
    """
    name_cut_cols = ["first_initial_cut", "last_initial_cut"]
    return pvs_matching_pass(name_cut_cols + blocking_cols)

## Pass 1: Block on full name and DOB

In [43]:
all_combos, pik_pairs = namesearch_pass(["first_name", "middle_initial", "last_name", "date_of_birth"])

23/09/13 19:45:44 WARN SimpleFunctionRegistry: The function jaro_sim replaced a previously registered function.
23/09/13 19:45:44 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.
23/09/13 19:45:44 WARN SimpleFunctionRegistry: The function jaccard replaced a previously registered function.
23/09/13 19:45:44 WARN SimpleFunctionRegistry: The function cosine_distance replaced a previously registered function.
23/09/13 19:45:44 WARN SimpleFunctionRegistry: The function damerau_levenshtein replaced a previously registered function.
23/09/13 19:45:44 WARN SimpleFunctionRegistry: The function dmetaphone replaced a previously registered function.
23/09/13 19:45:44 WARN SimpleFunctionRegistry: The function dmetaphonealt replaced a previously registered function.
23/09/13 19:45:44 WARN SimpleFunctionRegistry: The function qgramtokeniser replaced a previously registered function.
23/09/13 19:45:48 WARN DataSource: All paths were ignored:            

1618 links above threshold
1618 input records have a match
Matched 1618 records; 27.27% still eligible to match


23/09/13 19:45:55 WARN DataSource: All paths were ignored:                      
  file:/ihme/homes/zmbc/src/vivarium_research_prl/linkage/pvs_like_case_study/tmp_checkpoints/0f72b038-f695-40c9-b1a7-e35b39d3832c/__splink__df_predict_25cf865d7


### Look at diagnostics

In [44]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,2,1,0,0.999998,2
2,1,2,1,-1,1.0,1578
2,1,2,1,1,1.0,38


In [45]:
pik_pairs

Unnamed: 0,record_id_census_2030,pik
0,2,6400
1,8,16268
2,16,6232
3,17,2140
4,20,3060
...,...,...
1613,11040,13175
1614,11043,14961
1615,11044,14948
1616,11045,18263


## Pass 2: Block on first name and DOB

In [46]:
all_combos, pik_pairs = namesearch_pass(["first_name", "date_of_birth"])

23/09/13 19:45:57 WARN SimpleFunctionRegistry: The function jaro_sim replaced a previously registered function.
23/09/13 19:45:57 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.
23/09/13 19:45:57 WARN SimpleFunctionRegistry: The function jaccard replaced a previously registered function.
23/09/13 19:45:57 WARN SimpleFunctionRegistry: The function cosine_distance replaced a previously registered function.
23/09/13 19:45:57 WARN SimpleFunctionRegistry: The function damerau_levenshtein replaced a previously registered function.
23/09/13 19:45:57 WARN SimpleFunctionRegistry: The function dmetaphone replaced a previously registered function.
23/09/13 19:45:57 WARN SimpleFunctionRegistry: The function dmetaphonealt replaced a previously registered function.
23/09/13 19:45:57 WARN SimpleFunctionRegistry: The function qgramtokeniser replaced a previously registered function.
23/09/13 19:46:00 WARN DataSource: All paths were ignored:            

473 links above threshold
473 input records have a match
Matched 473 records; 22.99% still eligible to match


23/09/13 19:46:08 WARN DataSource: All paths were ignored:                      
  file:/ihme/homes/zmbc/src/vivarium_research_prl/linkage/pvs_like_case_study/tmp_checkpoints/0f72b038-f695-40c9-b1a7-e35b39d3832c/__splink__df_predict_7ee2c33ef


### Look at diagnostics

In [47]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,0,1,-1,0.981104,12
2,1,1,1,0,0.998534,5
2,0,1,1,-1,0.999269,20
2,1,-1,1,-1,0.999692,2
2,-1,1,1,-1,0.999721,8
2,1,0,1,1,0.999769,2
2,0,2,1,-1,0.999983,169
2,1,1,1,-1,0.999986,165
2,0,1,1,1,0.999994,8
2,-1,2,1,-1,0.999998,27


In [48]:
pik_pairs

Unnamed: 0,record_id_census_2030,pik
0,37,8650
1,46,17694
2,58,12477
3,101,8612
4,103,17414
...,...,...
468,10945,3169
469,10959,70
470,10991,15699
471,10998,8331


## Pass 3: Block on last name and DOB

In [49]:
all_combos, pik_pairs = namesearch_pass(["last_name", "date_of_birth"])

23/09/13 19:46:10 WARN SimpleFunctionRegistry: The function jaro_sim replaced a previously registered function.
23/09/13 19:46:10 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.
23/09/13 19:46:10 WARN SimpleFunctionRegistry: The function jaccard replaced a previously registered function.
23/09/13 19:46:10 WARN SimpleFunctionRegistry: The function cosine_distance replaced a previously registered function.
23/09/13 19:46:10 WARN SimpleFunctionRegistry: The function damerau_levenshtein replaced a previously registered function.
23/09/13 19:46:10 WARN SimpleFunctionRegistry: The function dmetaphone replaced a previously registered function.
23/09/13 19:46:10 WARN SimpleFunctionRegistry: The function dmetaphonealt replaced a previously registered function.
23/09/13 19:46:10 WARN SimpleFunctionRegistry: The function qgramtokeniser replaced a previously registered function.
23/09/13 19:46:14 WARN DataSource: All paths were ignored:            

591 links above threshold
591 input records have a match
Matched 591 records; 17.64% still eligible to match


23/09/13 19:46:23 WARN DataSource: All paths were ignored:                      
  file:/ihme/homes/zmbc/src/vivarium_research_prl/linkage/pvs_like_case_study/tmp_checkpoints/0f72b038-f695-40c9-b1a7-e35b39d3832c/__splink__df_predict_873afe91f
                                                                                

### Look at diagnostics

In [50]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,2,1,-1,0.968133,3
-1,0,2,1,-1,0.98252,1
0,1,2,1,-1,0.999382,6
1,0,2,1,-1,0.999743,17
1,1,2,1,0,0.999939,5
1,-1,2,1,-1,0.99995,1
-1,1,2,1,-1,0.999972,4
-1,0,2,1,2,0.999986,1
0,0,2,1,2,0.999986,2
1,1,2,1,-1,0.999992,183


In [51]:
pik_pairs

Unnamed: 0,record_id_census_2030,pik
0,5,15686
1,7,2207
2,10,8512
3,42,5328
4,115,3358
...,...,...
586,10892,2038
587,10909,6972
588,10924,8436
589,10988,1263


## Pass 4: Block on DOB

In [52]:
all_combos, pik_pairs = namesearch_pass(["date_of_birth"])

23/09/13 19:46:25 WARN SimpleFunctionRegistry: The function jaro_sim replaced a previously registered function.
23/09/13 19:46:25 WARN SimpleFunctionRegistry: The function jaro_winkler replaced a previously registered function.
23/09/13 19:46:25 WARN SimpleFunctionRegistry: The function jaccard replaced a previously registered function.
23/09/13 19:46:25 WARN SimpleFunctionRegistry: The function cosine_distance replaced a previously registered function.
23/09/13 19:46:25 WARN SimpleFunctionRegistry: The function damerau_levenshtein replaced a previously registered function.
23/09/13 19:46:25 WARN SimpleFunctionRegistry: The function dmetaphone replaced a previously registered function.
23/09/13 19:46:25 WARN SimpleFunctionRegistry: The function dmetaphonealt replaced a previously registered function.
23/09/13 19:46:25 WARN SimpleFunctionRegistry: The function qgramtokeniser replaced a previously registered function.
23/09/13 19:46:29 WARN DataSource: All paths were ignored:            

43 links above threshold
43 input records have a match
Matched 43 records; 17.25% still eligible to match


23/09/13 19:46:35 WARN DataSource: All paths were ignored:                      
  file:/ihme/homes/zmbc/src/vivarium_research_prl/linkage/pvs_like_case_study/tmp_checkpoints/0f72b038-f695-40c9-b1a7-e35b39d3832c/__splink__df_predict_835acf0cc


### Look at diagnostics

In [53]:
all_combos

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean,count
gamma_first_name,gamma_middle_initial,gamma_last_name,gamma_date_of_birth,gamma_geokey,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,1,0,3.3e-05,1
-1,0,0,1,0,0.000102,2
0,0,0,1,-1,0.001479,2
1,0,0,1,-1,0.067924,1
1,1,0,1,-1,0.779643,3
1,0,0,1,1,0.924504,1
0,1,1,1,-1,0.986939,2
1,0,1,1,-1,0.987105,2
1,1,1,1,0,0.988094,2
-1,1,1,1,-1,0.995715,1


In [54]:
pik_pairs

Unnamed: 0,record_id_census_2030,pik
0,47,15004
1,406,1575
2,419,8487
3,1313,2448
4,1489,15469
5,1545,12814
6,1692,10169
7,1698,18269
8,1737,7394
9,2000,15301


# Post-processing multiple matches

In [55]:
# A PIK of -1 is the sentinel for "matched to more than one PIK"; show those records
census_2030.loc[lambda df: df["pik"].eq(-1)]

Unnamed: 0,record_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relation_to_reference_person,sex,race_ethnicity,geokey,zip3,first_initial_cut,last_initial_cut,pik
580,580,Natalie,A,,28,07/26/2001,1702.0,meisner rd,,Anytown,US,0,Institutionalized GQ pop,Female,AIAN,1702 meisner rd Anytown US 00000,0,N,A-or-blank,-1.0
4131,4131,Jacob,E,Collazo,24,03/27/2005,2520.0,5th placf,,Anytown,US,0,Biological child,Male,Latino,2520 5th placf Anytown US 00000,0,J,C,-1.0
5423,5423,John,E,Morales Valencia,84,11/26/1945,4828.0,farmville rd,,Anytown,US,0,Reference person,Male,Latino,4828 farmville rd Anytown US 00000,0,J,M,-1.0
6859,6859,Justin,A,Brown,41,04/20/1988,,seast washington str,,Anytown,US,0,Reference person,Male,Black,,0,J,B,-1.0


In [56]:
# Records matched to more than one PIK are unlinkable: replace the -1 sentinel
# with NaN so they are counted as un-PIKed in the summaries below
census_2030.loc[census_2030["pik"].eq(-1), "pik"] = np.nan

# Resulting PIKs

In [57]:
census_2030

Unnamed: 0,record_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relation_to_reference_person,sex,race_ethnicity,geokey,zip3,first_initial_cut,last_initial_cut,pik
0,0,John,E,Mcueever,86,06/29/1942,147-153,browning ave,,Anytown,US,00000,Reference person,Male,Black,147-153 browning ave Anytown US 00000,000,J,M,6829.0
1,1,Sharon,T,Schmidt,69,10/50/1960,109,stqllion sr,,Anytown,US,00000,Reference person,Female,White,109 stqllion sr Anytown US 00000,000,S,S,
2,2,Gail,K,Durand,77,01/03/1953,2115,cannon dr,,Anytown,US,00000,Reference person,Female,Multiracial or Other,2115 cannon dr Anytown US 00000,000,G,D,6400.0
3,3,John,J,Williams,81,11/24/1948,146,delaware av,,Anytown,US,00000,Reference person,Male,White,146 delaware av Anytown US 00000,000,J,U-Z,18337.0
4,4,Child,L,Wukliamz,81,09/27/1948,146,delaware av,,Anytown,US,00000,Opp-sex spouse,Female,White,146 delaware av Anytown US 00000,000,C,U-Z,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11048,11048,Chloe,A,Maryknez-Alvarez,21,07/12/2008,207,harrison st,,Anytown,US,00000,Biological child,Female,Latino,207 harrison st Anytown US 00000,000,C,M,17071.0
11049,11049,Zachary,E,Martinez-Alvarez,18,06/29/2011,207,harrison st,,Anytown,US,00000,Biological child,Male,,207 harrison st Anytown US 00000,000,U-Z,M,12689.0
11050,11050,Madeline,A,Martinez-Alvarez,16,08/12/2013,207,harrison st,,Anytown,US,00000,Biological child,Female,Latino,207 harrison st Anytown US 00000,000,M,M,10874.0
11051,11051,Naomi,A,Martinez-Aldarez,1,11/01/2028,207,harrison st,,Anytown,US,00000,Grandchild,Female,Latino,207 harrison st Anytown US 00000,000,N,M,10825.0


In [58]:
# Fraction of Census records that ended up with a PIK
census_2030["pik"].notna().mean()

0.8271057631412286

In [59]:
# Ground truth for each file: a Series of simulant_id indexed by record_id
census_2030_ground_truth = (
    pd.read_parquet('census_2030_ground_truth_sample.parquet')
    .set_index('record_id')
    ['simulant_id']
)
reference_file_ground_truth = (
    pd.read_parquet('reference_file_ground_truth_sample.parquet')
    .set_index('record_id')
    ['simulant_id']
)

In [60]:
# Share of Census records whose simulant does not appear in the reference file at all;
# these can never be (correctly) PIKed
in_reference_file = census_2030_ground_truth.isin(reference_file_ground_truth)
(~in_reference_file).mean()

0.04641273862299828

In [61]:
# PIK rate rescaled by the share of Census records that are truly present in the reference file
pik_rate = census_2030["pik"].notna().mean()
linkable_share = census_2030_ground_truth.isin(reference_file_ground_truth).mean()
pik_rate / linkable_share

0.8673624288425048

In [62]:
# Distribution of "number of Census rows per PIK": any PIK used by more than one row
# means the model treated those rows as duplicates within Census
census_2030["pik"].value_counts().value_counts()

count
1    9138
2       2
Name: count, dtype: int64

In [63]:
# In this version of pseudopeople, though, Census contains no true duplicates:
# every record's ground-truth simulant_id is distinct
assert census_2030_ground_truth.is_unique

In [64]:
# Observation: pseudopeople sometimes gives siblings the same (common) first name,
# which makes their records nearly identical -- only age and DOB tell them apart.
# Presumably this is rare in real life.
records_per_pik = census_2030["pik"].value_counts()
duplicate_piks = records_per_pik.loc[records_per_pik.gt(1)].index
census_2030.loc[census_2030["pik"].isin(duplicate_piks)].sort_values("pik")

Unnamed: 0,record_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relation_to_reference_person,sex,race_ethnicity,geokey,zip3,first_initial_cut,last_initial_cut,pik
10702,10702,John,A,Bean,17,12/31/2012,4837,knappton r,,Anytown,US,0,Biological child,Male,Multiracial or Other,4837 knappton r Anytown US 00000,0,J,B,4514.0
10701,10701,John,A,Bean,19,05/30/2010,4837,knappton r,,Anytown,US,0,Biological child,Male,Multiracial or Other,4837 knappton r Anytown US 00000,0,J,B,4514.0
8379,8379,Emily,K,Allen,16,07/30/2013,3076,hanna ave n,,Anytown,US,0,,Female,White,3076 hanna ave n Anytown US 00000,0,E,A-or-blank,4981.0
8378,8378,Emily,K,Allen,18,04/14/2011,3076,hanna ave n,,Anytown,US,0,Biological child,Female,White,3076 hanna ave n Anytown US 00000,0,E,A-or-blank,4981.0


## PIK accuracy

In [65]:
# Translate each assigned PIK (looked up as a reference-file record_id) into the
# simulant_id that reference-file record truly belongs to; un-PIKed rows stay NaN
pik_simulant_id = census_2030["pik"].map(reference_file_ground_truth)
pik_simulant_id

0          0_923
1            NaN
2         0_6176
3        0_13972
4            NaN
          ...   
11048    0_22741
11049    0_22742
11050    0_22743
11051    0_23271
11052    0_16724
Name: pik, Length: 11053, dtype: object

In [66]:
# Among records with a PIK that resolves to a simulant, the share pointing at the true simulant
has_pik_simulant = pik_simulant_id.notnull()
(pik_simulant_id[has_pik_simulant] == census_2030_ground_truth[has_pik_simulant]).mean()

0.9993436884707941

In [67]:
# Census records that received a PIK whose simulant differs from the record's true simulant
is_wrong_pik = census_2030["pik"].notnull() & (pik_simulant_id != census_2030_ground_truth)
errors = census_2030.loc[is_wrong_pik]

# The reference-file record each erroneous Census record was linked to, re-indexed
# to line up row-for-row with `errors`
confused_for = (
    reference_file
    .set_index('record_id')
    .loc[errors["pik"]]
    .reset_index()
    .set_index(errors.index)
)

# Side-by-side view (self = Census record, other = reference-file record it was confused for)
errors[common_cols].compare(confused_for[common_cols], keep_shape=True, keep_equal=True)

Unnamed: 0_level_0,record_id,record_id,date_of_birth,date_of_birth,first_name,first_name,middle_initial,middle_initial,last_name,last_name,...,zipcode,zipcode,geokey,geokey,zip3,zip3,first_initial_cut,first_initial_cut,last_initial_cut,last_initial_cut
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,...,self,other,self,other,self,other,self,other,self,other
2290,2290,9957,05/05/2026,09/27/2000,Bennett,Bennett,J,J,Smith,Smith,...,0,0,3232 maple grove ln Anyyown US 00000,,0,0,B,B,S,S
8379,8379,4981,07/30/2013,04/14/2011,Emily,Emily,K,K,Allen,Allen,...,0,0,3076 hanna ave n Anytown US 00000,3076 hanna ave n Anytown US 00000,0,0,E,E,A-or-blank,A-or-blank
9404,9404,4251,09/21/1932,09/21/1932,Margaret,Margaret,C,C,Moe,Miranda,...,0,0,4410 705 707 ivan ave s Anytown US 00000,,0,0,M,M,M,M
10038,10038,8876,04/18/1972,06/14/1978,John,John,D,D,Dietrick,Dietrick,...,0,0,519 s ocean blvd Anytown US 00000,519 s ocean blvd Anytown US 00000,0,0,J,J,D,D
10702,10702,4514,12/31/2012,05/30/2010,John,John,A,A,Bean,Beaj,...,0,0,4837 knappton r Anytown US 00000,4837 knappton r Anytown US 00000,0,0,J,J,B,B
10797,10797,2759,03/24/1979,09/16/1965,Kevin,Kevin,T,R,Herrera,Herrera,...,0,0,5096 e 22nd st Anytown US 00000,5096 e 22nd st Anytown US 00000,0,0,K,K,H,H


In [68]:
# Write the Census sample, including the `pik` column assigned by the matching passes, to parquet
census_2030.to_parquet('census_2030_with_piks_sample.parquet')

In [69]:
# Convert this notebook to a Python script.
# NOTE(review): the config/template under ../../nbconvert_no_magic presumably strip IPython
# magics from the exported script -- confirm. The notebook filename is hard-coded here, so
# this line must be updated if the notebook is renamed.
! jupyter nbconvert --config ../../nbconvert_no_magic/config.py --to python --template ../../nbconvert_no_magic/template pvs_like_case_study_sample_data_spark_local.ipynb

[NbConvertApp] Converting notebook pvs_like_case_study_sample_data_spark_local.ipynb to python
[NbConvertApp] Writing 14837 bytes to pvs_like_case_study_sample_data_spark_local.py
