In [1]:
%load_ext autoreload
%autoreload 2
import sys
# do this to be able to import the custom python scripts
sys.path.insert(1, "../../../python_scripts")
import os

import dm_utils
import dm_file_checker

import dedupe
import json
import pandas as pd
import csv

## Get Appropriate Filepaths

In [2]:
# Root folder handed to every dm_file_checker lookup below.
saved_files_path = "../../../saved_files"
# The task name is the name of the directory the notebook is run from;
# the dataset name is the second dash-separated token of that name.
task_name = os.path.split(os.getcwd())[1]
dataset_name = task_name.split("-")[1]

# Name of the primary key; later used as the column that holds the new IDs.
primary_key = dm_file_checker.get_dataset_info(task_name, "primary_key", saved_files_path)

# Input file (read below) and output file (written by the last cell), looked up in that order.
unlabeled_data_filepath, unlabeled_data_no_exact_filepath = (
    dm_file_checker.get_filepath(task_name, file_key, saved_files_path)
    for file_key in ("unlabeled_data", "unlabeled_data_no_exact")
)

# Fields that get the null -> "" / str() conversion after loading.
numeric_fields = dm_file_checker.get_dataset_info(task_name, "numeric_fields", saved_files_path)
print("Numeric fields are {}".format(numeric_fields))

Numeric fields are []


## Reading in Data

In [3]:
%%time
unlabeled_data = dm_utils.read_unlabeled_data_json(unlabeled_data_filepath, numeric_fields = numeric_fields,
                                                  empty_str_to_none = False)
unlabeled_data = pd.DataFrame.from_dict(unlabeled_data, orient = "index")
unlabeled_data.head()

CPU times: user 152 ms, sys: 8.12 ms, total: 160 ms
Wall time: 171 ms


Unnamed: 0,date_of_birth,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,soc_sec_id
febrl4a-rec-1070-org,1915/11/11,michaela,neumann,8,stanley street,miami,winston hills,4223,nsw,5304218
febrl4a-rec-1016-org,1916/12/14,courtney,painter,12,pinkerton circuit,bega flats,richlands,4560,vic,4066625
febrl4a-rec-4405-org,1948/09/30,charles,green,38,salkauskas crescent,kela,dapto,4566,nsw,4365168
febrl4a-rec-1288-org,1995/11/19,vanessa,parr,905,macquoid place,broadbridge manor,south grafton,2135,sa,9239102
febrl4a-rec-3585-org,1986/02/08,mikayla,malloney,37,randwick road,avalind,hoppers crossing,4552,vic,7207688


In [4]:
# For every numeric field: nulls become the empty string, then every value is
# passed through str() so the column holds strings only.
for field in numeric_fields:
    unlabeled_data[field] = unlabeled_data[field].fillna("").map(str)

## Checking for Duplicates in Data

In [5]:
# duplicated() flags each row that repeats an earlier row across all columns
# (the first occurrence is not flagged), so the sum counts the surplus copies.
num_duplicates = unlabeled_data.duplicated(keep="first").sum()
print("There are {} records that have duplicates".format(num_duplicates))

There are 0 records that have duplicates


In [6]:
# Deliberate stop: when the dataset has no exact duplicates, the remaining cells
# are not meant to run. Raised explicitly (instead of via `assert`) so the stop
# still happens when Python runs with assertions disabled (-O / PYTHONOPTIMIZE).
if not num_duplicates > 0:
    raise AssertionError("Since no exact duplicates found, not continuing further")

AssertionError: Since no exact duplicates found, not continuing further

## Assigning New ID to Same Record
- The new ID has the form `<dataset_name>-ex-<number>`; all records whose fields are exactly the same share one new ID.
- `ex` stands for "exact".

In [None]:
# Group rows on every column so identical records fall into the same group,
# number the groups starting at 1, and format the number as
# "<dataset_name>-ex-<number>". The result is a one-column frame (column named
# after the primary key) that keeps the original record index.
# NOTE(review): groupby excludes null keys by default, so a row with a null in
# any column may not receive a proper group number -- the data is assumed to be
# null-free at this point; confirm.
new_id_mapping = (
    unlabeled_data
    .groupby(unlabeled_data.columns.tolist())
    .ngroup()
    .add(1)
    .map(lambda group_number: "{}-ex-{}".format(dataset_name, group_number))
    .to_frame(name=primary_key)
)
new_id_mapping.head()

In [None]:
# Compare the number of original records with the number of distinct new IDs.
n_new_id = len(new_id_mapping[primary_key].unique())
print("Originally, there are {:,} IDs".format(len(new_id_mapping)))
print("There are {:,} remaining IDs after disregarding exact duplicates".format(n_new_id))

## Setting New ID as Primary Key

In [None]:
# Attach the new ID to each record by joining on the shared (old) index;
# validate="one_to_one" makes the merge fail if either side has repeated keys.
# The old index is then named "<primary_key>_old" and moved into a regular
# column, leaving a fresh positional index.
unlabeled_data = (
    new_id_mapping
    .merge(unlabeled_data, left_index=True, right_index=True, validate="one_to_one")
    .rename_axis("{}_old".format(primary_key))
    .reset_index()
)
unlabeled_data.head()

In [None]:
# Write out the mapping between the new primary key and the old one, then drop
# the old key and keep a single row per new primary key.
old_key_column = "{}_old".format(primary_key)

# The mapping file path is derived from the input file path.
primary_key_mapping_filepath = unlabeled_data_filepath.replace("unlabeled_data.json", "primary_key_mapping_exact_duplicates.csv")
# str.replace returns the path unchanged when the expected file name is absent;
# writing the CSV to that path would overwrite the input dataset, so stop here.
if primary_key_mapping_filepath == unlabeled_data_filepath:
    raise ValueError(
        "cannot derive the primary key mapping filepath from {!r}: "
        "it does not contain 'unlabeled_data.json'".format(unlabeled_data_filepath)
    )

primary_key_mapping = unlabeled_data.loc[:, [primary_key, old_key_column]]
primary_key_mapping.to_csv(primary_key_mapping_filepath, index = False, quoting = csv.QUOTE_ALL)
del primary_key_mapping

# the old key is no longer needed once the mapping is saved
unlabeled_data = unlabeled_data.drop(columns = old_key_column)

# rows sharing a new primary key are exact duplicates; keep only the first of each
same_key_bool = unlabeled_data[primary_key].duplicated(keep = "first")
print("Removing {} rows that have the same new primary key (i.e. exact duplicates)".format(same_key_bool.sum()))
unlabeled_data = unlabeled_data.loc[~same_key_bool,:].set_index(primary_key)

unlabeled_data.head()

## Write Out New Dataset to a JSON File

In [None]:
# making sure all values are strings before writing to json
# (types are collected column by column with Series.map, because
# DataFrame.applymap is deprecated since pandas 2.1)
assert unlabeled_data.apply(lambda column: column.map(type)).eq(str).all().all(), "not all values are strings!"

# no null values may remain anywhere in the frame
assert unlabeled_data.isnull().sum().sum() == 0, "still found a native Python null in the dataset!"

In [None]:
# Convert the frame to a nested dict keyed by its index (the new primary key),
# with one {column: value} dict per record, and serialise it as JSON.
# Note: `unlabeled_data` is a dict from here on, so this cell cannot be re-run
# without re-running the cells above.
unlabeled_data = unlabeled_data.to_dict(orient="index")

with open(unlabeled_data_no_exact_filepath, "w") as out_file:
    out_file.write(json.dumps(unlabeled_data))