This script aims to reproduce the synthetic ePBRN-error-simulated datasets for Scheme B, which are stored in two files, ePBRN_D_dup.csv and ePBRN_F_dup.csv. The original datasets used don't have any duplicates in them.

#### Importing required packages

In [1]:
import pandas as pd
import numpy as np
import random
from numpy.random import choice
import sys
import warnings
warnings.filterwarnings("ignore")

#### Initializing weights and percentage of shared records.

The following weights and percentages are replicated as-is from the paper so that the reproduced data matches the data used in the paper.

In [2]:
# Percentage of records that take part in a 2-, 3- and 4-record linkage,
# in that order. Each entry is divided by 100 when the sample sizes are computed.
pct_double_linked = 1.68+21.0659
pct_triple_linked = 1.9986 + 0.0471
pct_quad_linked = 0.05
count_shared = [pct_double_linked, pct_triple_linked, pct_quad_linked]

#### Defining functions to process the original data and reproduce simulated datasets.

In [3]:
# Reading and pre-processing the input data
def preprocess(inputfile):
    """Load ``PATH_DATA + inputfile + ".csv"`` and prepare it for error simulation.

    The module-level ``PATH_DATA`` prefix is read at call time.

    Steps performed:
      * ``street_number`` and ``postcode`` have missing values replaced and are
        cast to ``int`` (a missing value therefore becomes 0);
      * ``date_of_birth`` is split into string columns ``day``, ``month`` and
        ``year``, after which the original column is dropped;
      * ``rec_id`` (as string) and ``match_id`` (as int) are both set to the
        row position;
      * missing name / address / date parts are replaced by empty strings;
      * ``age``, ``phone_number``, ``soc_sec_id`` and ``blocking_number`` are
        dropped.

    Args:
        inputfile: File name without the ``.csv`` extension.

    Returns:
        A ``(df, col_list)`` tuple: the pre-processed frame and the list of its
        column names with ``'rec_id'`` removed.
    """
    df = pd.read_csv(
        PATH_DATA + inputfile + ".csv",
        parse_dates=["date_of_birth"])

    # Fill before casting: a column containing NaN cannot be cast to int.
    df['street_number'] = df['street_number'].fillna('0').astype(int)
    df['postcode'] = df['postcode'].fillna('0000').astype(int)

    # Zero-padded string components of the date of birth.
    df['day'] = df['date_of_birth'].dt.strftime('%d')
    df['month'] = df['date_of_birth'].dt.strftime('%m')
    df['year'] = df['date_of_birth'].dt.strftime('%Y')

    df["rec_id"] = range(len(df))
    df['rec_id'] = df['rec_id'].astype(str)

    # Blank out missing text fields (the original mapping listed
    # 'given_name' twice; the duplicate key had no effect and is removed).
    df = df.fillna({'surname': '',
                    'given_name': '',
                    'address_1': '',
                    'address_2': '',
                    'day': '',
                    'month': '',
                    'year': ''})
    df["match_id"] = range(len(df))

    df = df.drop(["age", "phone_number", "soc_sec_id", "blocking_number", "date_of_birth"], axis=1)

    # NOTE(review): 'match_id' is still part of this list. If the list is used
    # to pick a field to blank (as the 'drf' error type appears to do), the
    # ground-truth id of a duplicate can be erased - confirm this is intended.
    col_list = df.columns.values.tolist()
    col_list.remove('rec_id')

    print("============================================================================================")
    print("Data preprocess success")
    print("============================================================================================")
    print("[1] Total records in ", inputfile, " :", len(df))
    print("[2] Preprocessed ", inputfile, " data sample :")
    print(df.head())
    print("============================================================================================")

    return df, col_list

# Sampling and generating random indices for the reproduced linkages.
def sampling(df, count_shared, inputfile=None):
    """Draw disjoint random row positions for double, triple and quad links.

    Args:
        df: Any sized object; only ``len(df)`` is used.
        count_shared: Three percentages (of ``len(df)``) giving how many
            records are linked 2, 3 and 4 times respectively.
        inputfile: Label printed in the summary. When omitted, a module-level
            ``inputfile`` variable is used if one exists (this is what the
            function relied on before the parameter was added).

    Returns:
        ``(list_double_linked, list_triple_linked, list_quad_linked)``: three
        lists of row positions that share no element.

    Raises:
        ValueError: from ``random.sample`` if more positions are requested
            than remain available.
    """
    if inputfile is None:
        # Backward compatible fallback instead of a NameError when the
        # notebook-level variable has not been defined.
        inputfile = globals().get("inputfile", "input")

    records = len(df)
    double = int(records*count_shared[0]/100)
    triple = int(records*count_shared[1]/100)
    quad = int(records*count_shared[2]/100)

    # Each stage samples from the positions not used by earlier stages.
    # Sets make the exclusion test O(1); the comprehension keeps the original
    # ordering of the remaining positions, so the draws are unchanged.
    list_double_linked = random.sample(range(records), k=double)
    used = set(list_double_linked)
    unlinked = [item for item in range(records) if item not in used]
    list_triple_linked = random.sample(unlinked, k=triple)
    used = set(list_triple_linked)
    unlinked = [item for item in unlinked if item not in used]
    list_quad_linked = random.sample(unlinked, k=quad)

    # A group of 2, 3 or 4 records adds 1, 2 or 3 new rows and contains
    # 1, 3 or 6 matching pairs.
    tot_rec_gen = records + double + triple*2 + quad*3
    matched = double + triple*3 + quad*6
    print("============================================================================================")
    print("Sampling and random indices generation success")
    print("============================================================================================")
    print("[1] Total records in ", inputfile, " :", records)
    print("[2] Double links:", double)
    print("[3] Triple links:",triple)
    print("[4] Quad links:",quad)
    print("[5] Total records after generated:", tot_rec_gen)
    print("[6] Matched pairs:", matched)
    print("============================================================================================")

    return list_double_linked, list_triple_linked, list_quad_linked
    
# Process each record
def process_record(rc, all_fields):
    """Return a copy of ``rc`` with a random number of simulated errors applied.

    The number of errors is drawn from a Poisson distribution with mean 1 and
    each error type is drawn (with replacement) according to the weights
    below. The record passed in is left untouched; a modified copy is
    returned.

    The 'mar', 'twi' and 'add' errors take replacement values from random rows
    of the module-level ``df`` frame.

    Args:
        rc: A ``pandas.Series`` holding one record. The branches read and
            write the labels ``surname``, ``given_name``, ``day``, ``month``,
            ``year``, ``postcode``, ``address_1``, ``address_2`` and
            ``street_number``.
        all_fields: Labels from which the 'drf' error picks the field to blank.

    Returns:
        The modified copy of ``rc``.
    """
    # Weights of the error types that are sampled.
    # The weights and percentages are replicated as is from the paper so
    # that the data reproduced matches with the data used in the paper.
    error_weights = {
        'abr': 1,   # abbreviate surname to its first character
        'jwd1': 1,  # join surname and given name with a dash, into surname
        'jwd2': 1,  # join surname and given name with a dash, into given name
        'jwb1': 1,  # join surname and given name with a blank, into surname
        'jwb2': 1,  # join surname and given name with a blank, into given name
        'drf': 1,   # drop all tokens in a randomly chosen field
        'dlc1': 1,  # drop last character in surname: Peter -> Pete
        'dlc2': 1,  # drop last character in given name
        'swn': 1,   # swap surname and given name: John Peter -> Peter John
        'rsd': 1,   # reset day and month: 12/04/1991 -> 01/01/1991
        'chy': 1,   # change year of birth by a small margin
        'chz': 1,   # change one digit of the postcode
        'mar': 1,   # replace surname with the surname of a random row
        'twi': 1,   # replace given name with the given name of a random row
        'add': 1,   # replace address_1, address_2 and street_number
    }
    # NOTE(review): weights were also declared for 'swc1', 'swc2' (swap
    # characters), 'swd' (swap day and month), 'drz1' and 'drz2' (drop leading
    # zeros) but those types were never part of the sampled list. They stay
    # unsampled here so the error mix is unchanged; the 'swd' branch below is
    # therefore unreachable until 'swd' is added to `error_weights`.
    all_error_types = list(error_weights)
    all_error_weights = np.asarray(list(error_weights.values()), dtype=float)
    all_error_weights = all_error_weights / all_error_weights.sum()

    # Work on a private copy so the caller's record (possibly a row taken
    # from a DataFrame) is never modified.
    rc = rc.copy()

    no_error = np.random.poisson(1, 1)
    errortypes = choice(all_error_types, no_error, p=all_error_weights)
    for errortype in errortypes:
        if errortype == 'abr':
            if len(rc["surname"]) > 0:
                rc.at["surname"] = rc["surname"][0]
        elif errortype == 'jwd1':
            rc.at["surname"] = rc["surname"] + '-' + rc["given_name"]
            rc.at["given_name"] = ''
        elif errortype == 'jwd2':
            rc.at["given_name"] = rc["surname"] + '-' + rc["given_name"]
            rc.at["surname"] = ''
        elif errortype == 'jwb1':
            rc.at["surname"] = rc["surname"] + ' ' + rc["given_name"]
            rc.at["given_name"] = ''
        elif errortype == 'jwb2':
            rc.at["given_name"] = rc["surname"] + ' ' + rc["given_name"]
            rc.at["surname"] = ''
        elif errortype == 'drf':
            selected_field = random.choice(all_fields)
            rc.at[selected_field] = ''
        elif errortype == 'dlc1':
            if len(rc["surname"]) > 0:
                rc.at["surname"] = rc['surname'][0:-1]
        elif errortype == 'dlc2':
            if len(rc["given_name"]) > 0:
                rc.at["given_name"] = rc['given_name'][0:-1]
        elif errortype == 'swn':
            temp = rc['given_name']
            rc.at["given_name"] = rc['surname']
            rc.at["surname"] = temp
        elif errortype == 'swd':
            # Not currently sampled (see note above).
            temp = rc['day']
            rc.at['day'] = rc['month']
            rc.at['month'] = temp
        elif errortype == 'rsd':
            rc.at['day'] = '01'
            rc.at['month'] = '01'
        elif errortype == 'chy':
            if rc.at['year'] != 'NaT' and rc['year'] != '':
                # NOTE(review): range(-5, 5) yields -5..4, so +5 never occurs
                # and 0 (no change) can. Left as is to keep the simulated
                # distribution; confirm whether -5..5 was intended.
                margin = random.choice(range(-5, 5))
                rc.at['year'] = str(int(rc['year']) + margin)
        elif errortype == 'chz':
            # Only 4-character postcodes are altered.
            if len(str(rc['postcode'])) == 4:
                selected_digit = random.choice(range(4))
                code = list(str(rc['postcode']))
                # NOTE(review): range(9) yields 0..8, so the digit 9 is never
                # written and the replacement may equal the original digit.
                code[selected_digit] = str(random.choice(range(9)))
                rc.at['postcode'] = int(''.join(code))
        elif errortype == 'mar':
            rc.at["surname"] = df.iloc[random.choice(range(len(df)))]['surname']
        elif errortype == 'twi':
            rc.at["given_name"] = df.iloc[random.choice(range(len(df)))]['given_name']
        elif errortype == 'add':
            # Each address part is taken from an independently chosen row.
            rc.at['address_1'] = df.iloc[random.choice(range(len(df)))]['address_1']
            rc.at['address_2'] = df.iloc[random.choice(range(len(df)))]['address_2']
            rc.at['street_number'] = random.choice(range(500))
    return rc

def data_synthesizer(df, list_double_linked, list_triple_linked, list_quad_linked, outputfile):
    """Append error-simulated duplicates to ``df`` and write the result to CSV.

    Every position in ``list_double_linked`` receives 1 duplicate, every
    position in ``list_triple_linked`` 2 and every position in
    ``list_quad_linked`` 3. Each duplicate is produced by ``process_record``
    and gets ``"-dup-<k>"`` appended to its ``rec_id``. The combined frame is
    written to ``PATH_DATA + outputfile + ".csv"`` without the index.

    The module-level ``PATH_DATA`` and ``all_fields`` are read at call time.

    Args:
        df: Pre-processed source frame; it is not modified.
        list_double_linked: Row positions that get one duplicate.
        list_triple_linked: Row positions that get two duplicates.
        list_quad_linked: Row positions that get three duplicates.
        outputfile: Output file name without the ``.csv`` extension.
    """
    # Collect the new rows and concatenate once: DataFrame.append no longer
    # exists in pandas 2.x, and calling it per row copied the growing frame
    # on every iteration.
    duplicates = []
    linked_groups = [list_double_linked, list_triple_linked, list_quad_linked]
    for copies, linked_list in enumerate(linked_groups, start=1):
        for each in linked_list:
            for k in range(copies):
                # Explicit copy so the source frame can never be written to.
                each_record = df.iloc[each].copy()
                processed_record = process_record(each_record, all_fields)
                processed_record.at["rec_id"] = processed_record["rec_id"] + "-dup-" + str(k)
                duplicates.append(processed_record)

    if duplicates:
        processed_df = pd.concat([df, pd.DataFrame(duplicates)])
    else:
        processed_df = df

    OP_path = PATH_DATA + outputfile + ".csv"
    processed_df.to_csv(OP_path, index=False)
    print("============================================================================================")
    print("Data reproduction success")
    print("============================================================================================")
    print("[1] Total records in ", outputfile, " :", len(processed_df))
    print("[2] Reproduced data sample :")
    print(processed_df.head())
    print("[3] Data saved path :",OP_path)
    print("============================================================================================")

#### Recreation of F and D ePBRN datasets

##### F dataset recreation

In [4]:
# F dataset: read the original file, sample the linked positions and write
# the duplicated dataset. These names are assigned at module level, where
# the functions called below look some of them up.
PATH_DATA = "../data/"
inputfile, outputfile = 'ePBRN_F_original', 'ePBRN_F_dup'

df, all_fields = preprocess(inputfile)
(list_double_linked,
 list_triple_linked,
 list_quad_linked) = sampling(df, count_shared)
data_synthesizer(
    df,
    list_double_linked,
    list_triple_linked,
    list_quad_linked,
    outputfile,
)

Data preprocess success
[1] Total records in  ePBRN_F_original  : 11100
[2] Preprocessed  ePBRN_F_original  data sample :
  rec_id given_name   surname  street_number         address_1 address_2  \
0      0      jenna    kilpin            179    mcfarlan place             
1      1     bianca  randazzo             37  lindrum crescent  sunshine   
2      2      james   borlase             75                     rocklea   
3      3   nicholas    beeton             20         mugga way             
4      4      megan   footner              4      jewell close             

          suburb  postcode state day month  year  match_id  
0       hillarys      2768   vic  26    02  1950         0  
1        forster      2281    wa  07    04  1988         1  
2         casula      2460   qld  03    09  1913         2  
3       hawthorn      2480   vic  22    06  1999         3  
4  taylors lakes      3129   tas  22    09  1912         4  
Sampling and random indices generation success
[1] Tota

##### D dataset recreation

In [5]:
# D dataset: read the original file, sample the linked positions and write
# the duplicated dataset. These names are assigned at module level, where
# the functions called below look some of them up.
PATH_DATA = "../data/"
inputfile, outputfile = 'ePBRN_D_original', 'ePBRN_D_dup'

df, all_fields = preprocess(inputfile)
(list_double_linked,
 list_triple_linked,
 list_quad_linked) = sampling(df, count_shared)
data_synthesizer(
    df,
    list_double_linked,
    list_triple_linked,
    list_quad_linked,
    outputfile,
)

Data preprocess success
[1] Total records in  ePBRN_D_original  : 9250
[2] Preprocessed  ePBRN_D_original  data sample :
  rec_id given_name        surname  street_number         address_1  \
0      0      riley         cowley              2   ballarat street   
1      1   maddison          hiley              5  limestone avenue   
2      2     thomas        roberts             49   fortescue place   
3      3     nasyah  van der ploeg            395    kenyon circuit   
4      4       zara          denne             19                     

          address_2     suburb  postcode state day month  year  match_id  
0                       laguna      4218   vic                         0  
1        lochenfels     cobram      6415   nsw  14    04  1921         1  
2  dp 12750 talunga  leongatha      2911   qld  16    01  1907         2  
3                     oakleigh      2515   NaN                         3  
4         bestblock  kingsford      5290    sa  11    11  1911         4  
Sa