In [1]:
%load_ext autoreload
%autoreload 2
import sys
# do this to be able to import the custom python scripts
sys.path.insert(1, "../../../python_scripts")
import os

import dm_utils
import dm_file_checker

import dedupe
import json

## Get Appropriate Filepaths

In [2]:
# The task name is the name of the directory this notebook is run from.
saved_files_path = "../../../saved_files"
task_name = os.path.basename(os.getcwd())

is_data_source_deduped = dm_file_checker.check_is_data_source_deduped(task_name, saved_files_path)

# Paths of the files this notebook writes out, looked up in the order:
# model settings, labeled data, blocks.
settings_filepath, labeled_data_filepath, blocks_filepath = (
    dm_file_checker.get_filepath(task_name, artifact_name, saved_files_path)
    for artifact_name in ("model_settings", "labeled_data", "blocks")
)

# Recall value later passed to linker.train().
RECALL_TRAIN_VAL = dm_file_checker.get_task_info(task_name, "recall_train", saved_files_path)
print("Using recall value of {}".format(RECALL_TRAIN_VAL))

Using recall value of 1.0


In [3]:
# Display the flag that decides which branch the data-loading cell takes.
is_data_source_deduped

False

## Define parameters for model

In [4]:
# Sampling parameters used when preparing the linker for training.
# Both are set to the defaults noted by the notebook author:
# a sample size of 15000 and a blocked proportion of 50%.
SAMPLE_SIZE = 15_000  # alternative previously tried: 30_000
BLOCKED_PROPORTION = 0.5

## Reading in Data

In [5]:
if not is_data_source_deduped:
    # Sources are not flagged as deduped: read both preprocessed JSON files.
    unlabeled_data_1_filepath, unlabeled_data_2_filepath = (
        dm_file_checker.get_proper_unlabeled_data_filepath(task_name, saved_files_path)
    )
    print("Not using the canonicalized deduped dataset. Using the original preprocessed data.")

    numeric_fields_1, numeric_fields_2 = dm_file_checker.get_dataset_info(
        task_name, "numeric_fields", saved_files_path
    )
    print("Numeric fields 1 are {}".format(numeric_fields_1))
    print("Numeric fields 2 are {}".format(numeric_fields_2))

    unlabeled_data_1 = dm_utils.read_unlabeled_data_json(
        unlabeled_data_1_filepath, numeric_fields = numeric_fields_1
    )
    unlabeled_data_2 = dm_utils.read_unlabeled_data_json(
        unlabeled_data_2_filepath, numeric_fields = numeric_fields_2
    )
else:
    # Sources are flagged as deduped: load the deduped data for record linkage.
    unlabeled_data_1, unlabeled_data_2 = dm_utils.get_deduped_data_for_rl(task_name, saved_files_path)
    print("Using canonicalized deduped dataset instead of the original preprocessed data.")

Not using the canonicalized deduped dataset. Using the original preprocessed data.
Numeric fields 1 are []
Numeric fields 2 are []
converting 94 empty string values of column date_of_birth to None
converting 112 empty string values of column given_name to None
converting 48 empty string values of column surname to None
converting 158 empty string values of column street_number to None
converting 98 empty string values of column address_1 to None
converting 420 empty string values of column address_2 to None
converting 55 empty string values of column suburb to None
converting 0 empty string values of column postcode to None
converting 50 empty string values of column state to None
converting 0 empty string values of column soc_sec_id to None
converting 263 empty string values of column date_of_birth to None
converting 234 empty string values of column given_name to None
converting 102 empty string values of column surname to None
converting 287 empty string values of column street_numb

## Training the Model
- See the dedupe documentation for the syntax used to define fields: https://docs.dedupe.io/en/latest/Variable-definition.html

## Load Pre-Trained Model if it Already Exists

In [6]:
# Training is only needed when no settings file has been saved yet;
# otherwise the saved settings are loaded into a static linker.
if not os.path.exists(settings_filepath):
    skip_training = False
else:
    print('reading from', settings_filepath)
    with open(settings_filepath, 'rb') as settings_file:
        linker = dedupe.StaticRecordLink(settings_file)
    skip_training = True

## Set Data Fields for Model

In [7]:
if not skip_training:
    # Field definitions for the model are looked up from the task info.
    fields = dm_file_checker.get_task_info(task_name, "fields", saved_files_path)
    # One print call, newline-separated, produces the same two output lines.
    print("Using the following fields for the model", fields, sep = "\n")

    linker = dedupe.RecordLink(fields, num_cores = 4)


Using the following fields for the model
[{'field': 'street_number', 'type': 'ShortString', 'has missing': True}, {'field': 'postcode', 'type': 'ShortString'}, {'field': 'state', 'type': 'ShortString', 'has missing': True}, {'field': 'soc_sec_id', 'type': 'ShortString'}, {'field': 'given_name', 'type': 'String', 'has missing': True}, {'field': 'surname', 'type': 'String', 'has missing': True}, {'field': 'address_1', 'type': 'String', 'has missing': True}, {'field': 'address_2', 'type': 'String', 'has missing': True}, {'field': 'suburb', 'type': 'String', 'has missing': True}, {'field': 'date_of_birth', 'type': 'DateTime', 'has missing': True, 'fuzzy': False, 'yearfirst': True}]


## Load Data in Model (NOTE: this might take a while)

In [8]:
%%time
if not skip_training:
    # Sampling options come from the constants in the parameters cell.
    sampling_options = {
        "blocked_proportion": BLOCKED_PROPORTION,
        "sample_size": SAMPLE_SIZE,
    }
    dm_utils.prepare_training_linker(
        linker, unlabeled_data_1, unlabeled_data_2, labeled_data_filepath, **sampling_options
    )
    

INFO:dedupe.api:reading training from file


reading labeled examples from  ../../../saved_files/rl-febrl4a_febrl4b/training_output/labeled_data.json


INFO:dedupe.canopy_index:Removing stop word ce
INFO:dedupe.canopy_index:Removing stop word re
INFO:dedupe.canopy_index:Removing stop word  s
INFO:dedupe.canopy_index:Removing stop word ee
INFO:dedupe.canopy_index:Removing stop word st
INFO:dedupe.canopy_index:Removing stop word tr
INFO:dedupe.canopy_index:Removing stop word et
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, address_2), SimplePredicate: (suffixArray, given_name))
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinSearchPredicate: (2, postcode), LevenshteinSearchPredicate: (2, soc_sec_id))
INFO:dedupe.training:(SimplePredicate: (dayPredicate, date_of_birth), TfidfTextSearchPredicate: (0.2, address_1))
INFO:dedupe.training:(SimplePredicate: (commonIntegerPredicate, address_2), SimplePredicate: (sortedAcronym, address_1))
INFO:dedupe.training:(SimplePredicate: (commonIntegerPredicate, address_2), SimplePredicate: (commonTwoTokens, suburb))
INFO:dedu

CPU times: user 7min 11s, sys: 3.97 s, total: 7min 15s
Wall time: 7min 51s


## Labeling of Data Proper

## Watch Out For the Following When Tagging
1. Family Members
    - Family members usually have many fields in common (last name, address, landline number, place of birth, mother's maiden name). The key distinguishing fields are first name, birthdate, social security number (SSS), and tax identification number (TIN).
    - Young siblings are the trickiest case: the only distinguishing fields are first name and birthdate, because they have no SSS or TIN until they are of working age.
2. Businesses Mistagged as Individuals
    - If a pair of records shares the same social security number (SSS) and tax identification number (TIN), but one of them has no first name, middle name, or last name, then that record may be a business mistagged as an individual.
    - Still treat these records as separate.

In [9]:
%%time
if not skip_training:
    # Active learning: dedupe repeatedly shows the record pair it is least
    # certain about and asks whether the two records are duplicates.
    #   'y' / 'n' / 'u'  -> yes / no / unsure
    #   'f'              -> finished labeling
    print('starting active labeling...')
    dedupe.console_label(linker)

    # Train on the labeled examples and learn the blocking predicates,
    # using the recall value configured for this task.
    linker.train(recall = RECALL_TRAIN_VAL)

    # Hand the trained linker to the save helper together with the
    # labeled-data and settings file paths.
    dm_utils.save_trained_linker(linker, labeled_data_filepath, settings_filepath)

starting active labeling...


street_number : 26
postcode : 3042
state : tas
soc_sec_id : 8370048
given_name : kirra
surname : browne
address_1 : shoalhaven avenue
address_2 : loormeah park
suburb : surry hills
date_of_birth : 1943/04/07

street_number : 26
postcode : 3042
state : tas
soc_sec_id : 8221881
given_name : kirra
surname : browne
address_1 : shoalhaven avenue
address_2 : loormeah park
suburb : surry hills
date_of_birth : None

78/10 positive, 61/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


f


Finished labeling
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.000010, score 0.9431560492122969
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinSearchPredicate: (2, postcode), LevenshteinSearchPredicate: (2, soc_sec_id))
INFO:dedupe.training:(SimplePredicate: (dayPredicate, date_of_birth), TfidfTextSearchPredicate: (0.2, address_1))
INFO:dedupe.training:(SimplePredicate: (commonIntegerPredicate, address_2), SimplePredicate: (sortedAcronym, address_1))
INFO:dedupe.training:(SimplePredicate: (commonIntegerPredicate, address_2), SimplePredicate: (commonTwoTokens, suburb))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, suburb), TfidfNGramSearchPredicate: (0.4, surname))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address_2), SimplePredicate: (twoGramFingerprint, surname))


CPU times: user 1min 53s, sys: 2.15 s, total: 1min 55s
Wall time: 2min 12s


## Retrain Model if Needed

In [10]:
# Set to True to run the extra labeling and retraining cells that follow.
retrain_model = False

In [11]:
if retrain_model:
    # A linker restored from a settings file is a dedupe.StaticRecordLink, which
    # has no labeling or training API. Fail early with an actionable message
    # instead of an AttributeError raised from inside dedupe.
    if isinstance(linker, dedupe.StaticRecordLink):
        raise RuntimeError(
            "Cannot retrain: the linker was loaded from saved settings at {} and does not "
            "support labeling or training. Remove or rename that file and re-run the "
            "notebook to train a new model.".format(settings_filepath)
        )
    # Resume interactive labeling on the trainable linker.
    dedupe.console_label(linker)

In [12]:
if retrain_model:
    # Re-index each index field of the fingerprinter with every value that
    # appears for that field in either dataset.
    for field in linker.fingerprinter.index_fields:
        observed_values = {
            record[field]
            for dataset in (unlabeled_data_1, unlabeled_data_2)
            for record in dataset.values()
        }
        linker.fingerprinter.index(observed_values, field)

    # Retrain with the enlarged set of labeled examples and learn blocking predicates.
    linker.train(recall = RECALL_TRAIN_VAL)

    dm_utils.save_trained_linker(linker, labeled_data_filepath, settings_filepath)

## Write Blocks to Disk

In [13]:
# Pass the linker and both datasets to the block-writing helper, targeting blocks_filepath.
# NOTE(review): presumably this applies the learned blocking rules and persists the result — confirm in dm_utils.
dm_utils.write_linker_blocks(linker, unlabeled_data_1, unlabeled_data_2, blocks_filepath)

## Check performance of blocking method

In [14]:
# Read the blocks back and count the candidate pairs they produce.
blocked_data = dm_utils.read_linker_blocks(unlabeled_data_1, unlabeled_data_2, blocks_filepath)
num_candidate_pairs = dm_utils.count_blocked_pairs(linker, blocked_data)

# Reduction ratio: the share of the full cross product of the two datasets
# that blocking removed from consideration.
total_possible_pairs = len(unlabeled_data_1)*len(unlabeled_data_2)
reduction_ratio = 1 - (num_candidate_pairs/total_possible_pairs)

print("Number of candidate record pairs after blocking is {:,}".format(num_candidate_pairs))
print("Reduction ratio from blocking is {}%".format(reduction_ratio*100))

Number of candidate record pairs after blocking is 5,659
Reduction ratio from blocking is 99.97736400000001%
