# Dedupe Package
The dupe package helps you identify duplicated values in a data set column that should be unique.  This package was created to assist data visualization, science, and engineering hobbyists and professionals.  Improper joins, poorly constructed data sets, or faulty queries can change the level of detail of the data (for example, by repeating rows), which leads to inaccurate results.  Analyzing duplicated values by hand takes time, and this package aims to give that time back for more important tasks.

## Import required packages
This package relies on polars instead of pandas for its lazy evaluation option and its better speed.  There's no way to know how much data will be thrown at this package, so optimizing for speed up front is important for performance once it is out in the wild.

In [1]:
import numpy as np
import polars as pl
import time

## Dupe Class

In [12]:
## Create class object
class Dupe:
    """Locate duplicated values of a column that is supposed to be unique.

    The object wraps a dataframe (the methods rely on the polars DataFrame
    API: ``shape``, ``columns``, ``head``, ``filter`` and column access by
    name) together with the name of the "key" column, i.e. the column whose
    values should identify a row uniquely.

    Attributes:
        data: the dataframe being inspected.
        key: name of the column that should be unique, or None when unset.
        suggest_key: flag passed to the constructor; it is only stored here,
            no method of this class reads it.
        dupe_data: result of the last find_dupe_cols() call, None before that.
    """

    def __init__(self, data, key = None, suggest_key = False):
        self.data = data
        self.key = key
        self.suggest_key = suggest_key
        self.dupe_data = None

    def set_key(self, key):
        """Set the key, i.e. the name of the column that should be unique."""
        self.key = key

    def get_info(self):
        """Print the current key (if any) and the row and column counts."""
        if self.key is None:
            message = 'No key is currently assigned\n'
        else:
            message = f'Key value: {self.key}\n'

        shape = self.data.shape
        message += f'Your data includes {shape[0]} rows and {shape[1]} columns'
        print(message)

    def show_data(self, rows: int = 5):
        """Return the first `rows` rows of the data (five by default)."""
        return self.data.head(rows)

    def get_key_suggestion(self, n_suggestions: int = 3):
        """Print the columns that are the best candidates for a unique id.

        When one or more columns hold a distinct value on every row, all of
        them are listed.  Otherwise the `n_suggestions` columns with the most
        distinct values are listed, most distinct first.  Each line shows the
        share of distinct values relative to the row count.

        Args:
            n_suggestions: how many columns to list when no column is
                completely unique.
        """
        df_rows = self.data.shape[0]

        ## Without rows there is nothing to rank, and the percentage below
        ## would divide by zero
        if df_rows == 0:
            print('Your data includes no rows, so no unique id can be suggested')
            return

        ## Distinct value count per column, in dataframe column order
        unique_counts = {c: self.data[c].n_unique() for c in self.data.columns}

        ## Prefer columns that are completely unique
        suggested_keys = [c for c, n in unique_counts.items() if n == df_rows]
        if len(suggested_keys) == 1:
            message = 'Your data includes one unique id\n'
        elif suggested_keys:
            message = 'Your data includes unique ids\n'

        ## Otherwise, fall back to the top N columns by distinct value count
        else:
            ranked = sorted(unique_counts, key = unique_counts.get, reverse = True)
            suggested_keys = ranked[:n_suggestions]
            ## Report how many suggestions are actually listed; the data may
            ## have fewer columns than n_suggestions
            message = f'The top {len(suggested_keys)} suggestions for a unique id include:\n'

        key_messages = []
        for bullet, key in enumerate(suggested_keys, start = 1):
            unique_count = unique_counts[key]
            pct_uni = f'{unique_count / df_rows * 100:.2f}%'
            key_messages.append(
                f"\t{bullet}. {key}: {pct_uni} unique ({unique_count} of {df_rows} unique)"
            )

        print(message + '\n'.join(key_messages))

    def find_dupe_cols(self, ignore_cols = None):
        """Identify which columns make values of the key column repeat.

        For every key value found on more than one row, each remaining column
        is checked; a column holding more than one distinct value for that
        key value is recorded as a cause of the duplication.

        Args:
            ignore_cols: column name, or iterable of column names, to leave
                out of the check.  Defaults to checking every non-key column.

        Returns:
            A polars DataFrame, also stored in self.dupe_data, with one row
            per (key value, column) pair and the columns `<key>_value`,
            `dupe_col`, `dupe_count` (number of distinct values) and
            `dupe_vals` (the distinct values).

        Raises:
            ValueError: if no key has been assigned.
        """
        if self.key is None:
            raise ValueError('No key is currently assigned; call set_key() before find_dupe_cols()')

        print('Setting everything up')

        data = self.data
        key = self.key

        ## A single column name is accepted as well as a collection of names
        if ignore_cols is None:
            ignore_cols = []
        elif isinstance(ignore_cols, str):
            ignore_cols = [ignore_cols]
        skipped = set(ignore_cols)
        skipped.add(key)
        check_cols = [col for col in data.columns if col not in skipped]

        key_header = f'{key}_value'
        dupe_dict = {key_header: [], 'dupe_col': [], 'dupe_count': [], 'dupe_vals': []}

        uk_len = data[key].n_unique()
        print(f'Starting to process {uk_len} rows of data')

        ## Only key values found on more than one row can be duplicated, so
        ## restrict the expensive per-key work to those.  Null keys are
        ## dropped: an equality filter never matches them.
        dupe_keys = (
            data.filter(pl.col(key).is_duplicated())[key]
            .unique()
            .drop_nulls()
            .sort()
            .to_list()
        )

        for uk in dupe_keys:
            ## Filter once per key value and reuse the rows for every column
            uk_df = data.filter(pl.col(key) == uk)
            for col in check_cols:
                diff_vals = uk_df[col].unique().to_list()
                if len(diff_vals) > 1:
                    dupe_dict[key_header].append(uk)
                    dupe_dict['dupe_col'].append(col)
                    dupe_dict['dupe_count'].append(len(diff_vals))
                    dupe_dict['dupe_vals'].append(diff_vals)

        ## NOTE(review): dupe_vals can mix value types across columns (str,
        ## int, float); the resulting list dtype is left to polars - confirm
        ## this still builds on the polars version in use.
        self.dupe_data = pl.DataFrame(dupe_dict)
        print('Duplicates discovered')
        return self.dupe_data

# Example usage
## Import two data sets for testing
- superstore_with_5_dupes.csv is the Superstore data with 5 duplicated key values added
- superstore.csv is the classic Superstore data, roughly 10,000 rows, with no duplicates added

In [3]:
## dfd is the "duplicated dataframe": the Superstore data with 5 duplicates added
## Load it, then normalise the column names to lower case with _ in place of spaces
dfd = pl.read_csv('superstore_with_5_dupes.csv')

new_cols = {name: name.lower().replace(' ', '_') for name in dfd.columns}
dfd = dfd.rename(new_cols)

In [4]:
## df is the original Superstore data, without the added duplicates
## Load it, then normalise the column names to lower case with _ in place of spaces
df = pl.read_csv('superstore.csv')

new_cols = {name: name.lower().replace(' ', '_') for name in df.columns}
df = df.rename(new_cols)
# df.head(2)

## Create Dupe object
- To start, we'll use the duplicated superstore data (dfd)

In [5]:
## Wrap the duplicated dataframe in a Dupe object; no key is assigned yet
dup = Dupe(dfd)

In [6]:
## Print the assigned key (if any) and the row and column counts
dup.get_info()

No key is currently assigned
Your data includes 9999 rows and 21 columns


In [7]:
## Review the first three rows of the wrapped dataframe
dup.show_data(rows = 3)

row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,state,postal_code,region,product_id,category,sub-category,product_name,sales,quantity,discount,profit
i64,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,f64,i64,f64,f64
1,"""CA-2016-152156…","""2016-11-08""","""2016-11-11""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-BO-1000179…","""Furniture""","""Bookcases""","""Bush Somerset …",261.96,2,0.0,41.9136
2,"""CA-2016-152156…","""2016-11-08""","""2016-11-11""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-CH-1000045…","""Furniture""","""Chairs""","""Hon Deluxe Fab…",731.94,3,0.0,219.582
3,"""CA-2016-138688…","""2016-06-12""","""2016-06-16""","""Second Class""","""DV-13045""","""Darrin Van Huf…","""Corporate""","""United States""","""Los Angeles""","""California""",90036,"""West""","""OFF-LA-1000024…","""Office Supplie…","""Labels""","""Self-Adhesive …",14.62,2,0.0,6.8714


In [8]:
## Ask for key suggestions: completely unique columns if there are any,
## otherwise the 5 columns with the most distinct values
dup.get_key_suggestion(n_suggestions = 5)

The top 5 suggestions for a unique id include:
	1. row_id: 99.95% unique (9994 of 9999 unique)
	2. profit: 73.15% unique (7314 of 9999 unique)
	3. sales: 58.27% unique (5826 of 9999 unique)
	4. order_id: 50.10% unique (5009 of 9999 unique)
	5. product_id: 18.62% unique (1862 of 9999 unique)


In [9]:
## row_id is the closest to unique (9994 distinct values in 9999 rows)
## Let's set it as the key and find where and why it is being duplicated
dup.set_key('row_id')

In [10]:
## find_dupe_cols() examines every key value that appears on more than one row
## For each such value it checks every other column and counts the distinct values in it
## A column with more than one distinct value for the same key is captured as a cause of the duplicate
dupes = dup.find_dupe_cols()

Setting everything up
Starting to process 9994 rows of data
Duplicates discovered


In [11]:
## dupe_data has one row per duplicated key value and responsible column:
## the key value, the column, how many distinct values it holds, and the values themselves
dup.dupe_data

row_id_value,dupe_col,dupe_count,dupe_vals
i64,str,i64,list[str]
9990,"""segment""",2,"[""Consumer"", ""Different Value Causes Dupe""]"
9991,"""country""",2,"[""United States"", ""Different Value Causes Dupe""]"
9992,"""city""",2,"[""Costa Mesa"", ""Different Value Causes Dupe""]"
9993,"""postal_code""",2,"[""999999999"", ""92627""]"
9994,"""sales""",2,"[""243.16"", ""999999999.0""]"
