In [1]:
%load_ext autoreload
%autoreload 2
import sys
# do this to be able to import the custom python scripts
sys.path.insert(1, "../../../python_scripts")
import os

import dm_utils
import dm_file_checker

import dedupe
import json

## Get Appropriate Filepaths

In [2]:
saved_files_path = "../../../saved_files"
task_name = os.path.basename(os.getcwd())
    
unlabeled_data_filepath = dm_file_checker.get_proper_unlabeled_data_filepath(task_name, saved_files_path)
settings_filepath =  dm_file_checker.get_filepath(task_name, "model_settings", saved_files_path)
labeled_data_filepath = dm_file_checker.get_filepath(task_name, "labeled_data", saved_files_path)
blocks_filepath = dm_file_checker.get_filepath(task_name, "blocks", saved_files_path)

numeric_fields = dm_file_checker.get_dataset_info(task_name, "numeric_fields", saved_files_path)
RECALL_TRAIN_VAL = dm_file_checker.get_task_info(task_name, "recall_train", saved_files_path)
print("Using recall value of {}".format(RECALL_TRAIN_VAL))
print("Numeric fields are {}".format(numeric_fields))

Using recall value of 1.0
Numeric fields are []


## Define parameters for model

In [3]:
# default blocked_proportion is 50%
BLOCKED_PROPORTION = 0.5
# default sample size is 15000
#SAMPLE_SIZE = 30_000
SAMPLE_SIZE = 15_000

## Reading in Data

In [4]:
unlabeled_data = dm_utils.read_unlabeled_data_json(unlabeled_data_filepath, numeric_fields = numeric_fields)

converting 190 empty string values of column date_of_birth to None
converting 156 empty string values of column given_name to None
converting 79 empty string values of column surname to None
converting 245 empty string values of column street_number to None
converting 154 empty string values of column address_1 to None
converting 693 empty string values of column address_2 to None
converting 85 empty string values of column suburb to None
converting 0 empty string values of column postcode to None
converting 85 empty string values of column state to None
converting 0 empty string values of column soc_sec_id to None


## Training the Model
- Link on syntax on how to define fields https://docs.dedupe.io/en/latest/Variable-definition.html

## Load Pre-Trained Model if it Already Exists

In [5]:
# If a settings file already exists, we'll just load that and skip training
if os.path.exists(settings_filepath):
    print('reading from', settings_filepath)
    with open(settings_filepath, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
    skip_training = True
else:
    skip_training = False

## Set Data Fields for Model

In [6]:
if not skip_training:
    fields = dm_file_checker.get_task_info(task_name, "fields", saved_files_path)
    print("Using the following fields for the model")
    print(fields)

    deduper = dedupe.Dedupe(fields, num_cores = 4)

Using the following fields for the model
[{'field': 'street_number', 'type': 'ShortString', 'has missing': True}, {'field': 'postcode', 'type': 'ShortString'}, {'field': 'state', 'type': 'ShortString', 'has missing': True}, {'field': 'soc_sec_id', 'type': 'ShortString'}, {'field': 'given_name', 'type': 'String', 'has missing': True}, {'field': 'surname', 'type': 'String', 'has missing': True}, {'field': 'address_1', 'type': 'String', 'has missing': True}, {'field': 'address_2', 'type': 'String', 'has missing': True}, {'field': 'suburb', 'type': 'String', 'has missing': True}, {'field': 'date_of_birth', 'type': 'DateTime', 'has missing': True, 'fuzzy': False, 'yearfirst': True}]


## Load Data in Model (NOTE: this might take a while)

In [7]:
%%time
if not skip_training:
    dm_utils.prepare_training_deduper(deduper, unlabeled_data, labeled_data_filepath, 
                                      blocked_proportion = BLOCKED_PROPORTION, 
                                      sample_size = SAMPLE_SIZE)

INFO:dedupe.api:reading training from file


reading labeled examples from  ../../../saved_files/dedup-febrl3/training_output/labeled_data.json


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonIntegerPredicate, address_2), SimplePredicate: (commonTwoTokens, suburb))
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (sameThreeCharStartPredicate, soc_sec_id)
INFO:dedupe.training:(SimplePredicate: (dayPredicate, date_of_birth), SimplePredicate: (hundredIntegersOddPredicate, postcode))
INFO:dedupe.training:(SimplePredicate: (fingerprint, address_2), SimplePredicate: (fingerprint, given_name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address_1), SimplePredicate: (fingerprint, given_name))


CPU times: user 1min 53s, sys: 1.99 s, total: 1min 55s
Wall time: 2min 5s


## Labeling of Data Proper

## Watch Out For the Following When Tagging
1. Family Members
    - Family members usually have many fields in common (last name, address, landline number, place of birth, mothers maiden name). Key distinguishing fields are first name, birthdate, social security number (SSS), and tax identification number (TIN)
    - Most tricky are young siblings because only distinguishing fields would be first name and birthdate. No SSS and TIN because they're not yet of working age.
2. Businesses Mistagged as Individuals
    - If there's a pair of records with the same social security number (SSS) and tax identification number (TIN) but one of the records has no first name, middle name, and last name, then one of the records may be a business mistagged as an individual.
    - Still treat these records as separate

In [8]:
%%time
if not skip_training:
    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print('starting active labeling...')
    dedupe.console_label(deduper)

    # Using the examples we just labeled, train the deduper and learn blocking predicates
    deduper.train(recall = RECALL_TRAIN_VAL)
    
    dm_utils.save_trained_deduper(deduper, labeled_data_filepath, settings_filepath)

starting active labeling...


street_number : 3
postcode : 2101
state : vic
soc_sec_id : 1092994
given_name : holly
surname : green
address_1 : larpent street
address_2 : brentwood vlge
suburb : ormond
date_of_birth : 1941/01/14

street_number : 3
postcode : 2101
state : vic
soc_sec_id : 8051408
given_name : matthew
surname : green
address_1 : larpent sztreet
address_2 : brentwodo vlge
suburb : diane lla
date_of_birth : None

66/10 positive, 60/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


f


Finished labeling
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.000100, score 0.9534646324666324
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (sameThreeCharStartPredicate, soc_sec_id)
INFO:dedupe.training:(SimplePredicate: (dayPredicate, date_of_birth), SimplePredicate: (hundredIntegersOddPredicate, postcode))
INFO:dedupe.training:(SimplePredicate: (fingerprint, address_2), SimplePredicate: (fingerprint, given_name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address_1), SimplePredicate: (fingerprint, given_name))


CPU times: user 25.7 s, sys: 994 ms, total: 26.7 s
Wall time: 30 s


## Retrain Model if Needed

In [9]:
retrain_model = False

In [10]:
if retrain_model:
    for field in deduper.fingerprinter.index_fields:
        field_data = set(record[field] for key,record in unlabeled_data.items())
        deduper.fingerprinter.index(field_data, field)
   
    dedupe.console_label(deduper)

In [11]:
if retrain_model:

    # Using the examples we just labeled, train the deduper and learn blocking predicates
    deduper.train(recall = RECALL_TRAIN_VAL)
    
    dm_utils.save_trained_deduper(deduper, labeled_data_filepath, settings_filepath)

## Write Blocks to Disk

In [12]:
dm_utils.write_deduper_blocks(deduper, unlabeled_data, blocks_filepath)

## Check performance of blocking method

In [13]:
blocked_data = dm_utils.read_deduper_blocks(unlabeled_data, blocks_filepath)
num_candidate_pairs = dm_utils.count_blocked_pairs(deduper, blocked_data)
reduction_ratio = 1 - (num_candidate_pairs/(len(unlabeled_data)**2))

print("Number of candidate record pairs after blocking is {:,}".format(num_candidate_pairs))
print("Reduction ratio from blocking is {}%".format(reduction_ratio*100))

Number of candidate record pairs after blocking is 21,011
Reduction ratio from blocking is 99.915956%


In [14]:
blocked_data = dm_utils.read_deduper_blocks(unlabeled_data, blocks_filepath)
dm_utils.check_block_sizes(blocked_data)

Sizes of top 10 biggest blocks are: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Record pair contributions from top 10 biggest blocks are : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
