## Reproduce datasets for Scheme A

Paper: "Statistical supervised meta-ensemble algorithm for data linkage"

Kha Vo, Jitendra Jonnagaddala, Siaw-Teng Liaw

February 2019

Jounal of Biomedical Informatics

In [1]:
import recordlinkage as rl, pandas as pd, numpy as np
from recordlinkage.datasets import load_febrl4, load_febrl1, load_febrl2, load_febrl3
from IPython.display import clear_output

In [2]:
PATH_FILES =  "../data/FEBRL/"
OUTPUT_FILE_NAME = ['febrl3_rep.csv', 'febrl4_rep.csv']

In [17]:
## FOR DATASET febrl3

outputfile = PATH_FILES + OUTPUT_FILE_NAME[0]

df, true_links_ab = load_febrl3(return_links=True)

# WARNING: load_febrl2 and load_febrl3 does not suit this code, need further process

print("df:", len(df))
print("true_links_ab:", len(true_links_ab))

df: 5000
true_links_ab: 6538


In [18]:
## FOR DATASET febrl4

outputfile = PATH_FILES + OUTPUT_FILE_NAME[1]

df_a, df_b, true_links_ab = load_febrl4(return_links=True) # use this for load_febrl4
df = df_a.append(df_b) # use for load_febrl4

# WARNING: load_febrl2 and load_febrl3 does not suit this code, need further process

print("df:", len(df))
print("true_links_ab:", len(true_links_ab))

df: 10000
true_links_ab: 5000


In [15]:
df['date_of_birth']  = pd.to_datetime(df['date_of_birth'], errors = 'coerce')
df['day'] = df['date_of_birth'].dt.strftime('%d')
df['month'] = df['date_of_birth'].dt.strftime('%m')
df['year'] = df['date_of_birth'].dt.strftime('%Y')

df['postcode'] =   df['postcode'].fillna('0000')
df['postcode'] = df['postcode'].astype(int)

df['street_number'] =   df['street_number'].fillna('0')
df['street_number'] = df['street_number'].astype(int)

df = df.drop(["soc_sec_id",  "date_of_birth"], axis=1)

for col in ["surname", "given_name", "address_1", "address_2", "day", "month"]:
    df[col] = df[col].fillna('')
    df[col] = df[col].astype(str)


# df["match_id"] = range(len(df))

all_fields = df.columns.values.tolist()
#all_fields.remove('rec_id')

print("All fields:", all_fields)
df.head()

All fields: ['given_name', 'surname', 'street_number', 'address_1', 'address_2', 'suburb', 'postcode', 'state', 'day', 'month', 'year']


Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,day,month,year
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
rec-1496-org,mitchell,green,7,wallaby place,delmar,cleveland,2119,sa,9,4,1956
rec-552-dup-3,harley,mccarthy,177,pridhamstreet,milton,marsden,3165,nsw,19,4,1908
rec-988-dup-1,madeline,mason,54,hoseason street,lakefront retrmnt vlge,granville,4881,nsw,28,11,1908
rec-1716-dup-1,isabelle,,23,gundulu place,currin ga,utakarra,2193,wa,19,11,1992
rec-1213-org,taylor,hathaway,7,yuranigh court,brentwood vlge,,4220,nsw,7,12,1999


In [16]:
df['match_id'] = [-1]*len(df)

for i in range(len(true_links_ab)):
    k0  = true_links_ab[i][0]
    k1  = true_links_ab[i][1]
    df.loc[k0, "match_id"] = i
    df.loc[k1, "match_id"] = i
    print("Processed:", i, " - " , true_links_ab[i])


Processed: 0  -  ('rec-552-dup-1', 'rec-552-dup-3')
Processed: 1  -  ('rec-552-dup-0', 'rec-552-dup-3')
Processed: 2  -  ('rec-552-dup-0', 'rec-552-dup-1')
Processed: 3  -  ('rec-552-org', 'rec-552-dup-3')
Processed: 4  -  ('rec-552-org', 'rec-552-dup-1')
Processed: 5  -  ('rec-552-org', 'rec-552-dup-0')
Processed: 6  -  ('rec-552-dup-2', 'rec-552-dup-3')
Processed: 7  -  ('rec-552-dup-2', 'rec-552-dup-1')
Processed: 8  -  ('rec-552-dup-2', 'rec-552-dup-0')
Processed: 9  -  ('rec-552-dup-2', 'rec-552-org')
Processed: 10  -  ('rec-988-org', 'rec-988-dup-1')
Processed: 11  -  ('rec-988-dup-0', 'rec-988-dup-1')
Processed: 12  -  ('rec-988-dup-0', 'rec-988-org')
Processed: 13  -  ('rec-1716-dup-2', 'rec-1716-dup-1')
Processed: 14  -  ('rec-1716-dup-0', 'rec-1716-dup-1')
Processed: 15  -  ('rec-1716-dup-0', 'rec-1716-dup-2')
Processed: 16  -  ('rec-1716-org', 'rec-1716-dup-1')
Processed: 17  -  ('rec-1716-org', 'rec-1716-dup-2')
Processed: 18  -  ('rec-1716-org', 'rec-1716-dup-0')
Processed

In [12]:
#df.to_csv("febrl3_UNSW.csv", index=True)

df.to_csv(outputfile, index=True)
#df.to_csv("febrl4_UNSW.csv", index=True)