## Import required packages

In [1]:
import polars as pl
import numpy as np
from dedupe import Dupe

## Example usage with dataframe with duplicated data

In [2]:
## Load the example data for the "with duplicates" walkthrough
## dfd stands for "duplicated dataframe": the superstore data with 5 duplicate rows added
dfd = pl.read_csv('superstore_with_5_dupes.csv')

## Normalise the column names: lower case, with spaces replaced by underscores
## (a dict comprehension keeps the loop variable out of the notebook's global namespace)
new_cols = {c: c.lower().replace(' ', '_') for c in dfd.columns}

dfd = dfd.rename(new_cols)

In [3]:
## Preview the first two rows to confirm the renamed columns
dfd.head(n=2)

row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,state,postal_code,region,product_id,category,sub-category,product_name,sales,quantity,discount,profit
i64,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,f64,i64,f64,f64
1,"""CA-2016-152156…","""2016-11-08""","""2016-11-11""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-BO-1000179…","""Furniture""","""Bookcases""","""Bush Somerset …",261.96,2,0.0,41.9136
2,"""CA-2016-152156…","""2016-11-08""","""2016-11-11""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-CH-1000045…","""Furniture""","""Chairs""","""Hon Deluxe Fab…",731.94,3,0.0,219.582


In [4]:
## Wrap the dataframe in a Dupe object; the following cells call its methods through `dup`
dup = Dupe(dfd)

## Print a short summary: whether a key is assigned yet, plus the row and column counts
dup.get_info()

No key is currently assigned
Your data includes 9999 rows and 21 columns


In [5]:
## Display the first three rows of the data held by the Dupe object
dup.show_data(rows=3)

row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,state,postal_code,region,product_id,category,sub-category,product_name,sales,quantity,discount,profit
i64,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,f64,i64,f64,f64
1,"""CA-2016-152156…","""2016-11-08""","""2016-11-11""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-BO-1000179…","""Furniture""","""Bookcases""","""Bush Somerset …",261.96,2,0.0,41.9136
2,"""CA-2016-152156…","""2016-11-08""","""2016-11-11""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-CH-1000045…","""Furniture""","""Chairs""","""Hon Deluxe Fab…",731.94,3,0.0,219.582
3,"""CA-2016-138688…","""2016-06-12""","""2016-06-16""","""Second Class""","""DV-13045""","""Darrin Van Huf…","""Corporate""","""United States""","""Los Angeles""","""California""",90036,"""West""","""OFF-LA-1000024…","""Office Supplie…","""Labels""","""Self-Adhesive …",14.62,2,0.0,6.8714


In [6]:
## Ask for the top 5 candidate key columns, reported with their percentage of unique values
dup.get_key_suggestion(n_suggestions=5)

The top 5 suggestions for a unique id include:
	1. row_id: 99.95% unique (9994 of 9999 unique)
	2. profit: 73.15% unique (7314 of 9999 unique)
	3. sales: 58.27% unique (5826 of 9999 unique)
	4. order_id: 50.10% unique (5009 of 9999 unique)
	5. product_id: 18.62% unique (1862 of 9999 unique)


In [7]:
## row_id is the closest to unique (9994 of 9999 values are unique)
## Set it as the key so we can find where and why it is being duplicated
dup.set_key('row_id')

In [8]:
## find_dupe_cols() goes through each unique value in the key column
## For each unique value, it iterates through each column, removes duplicates and counts the rows remaining
## Any filtered dataframe with more than one row indicates a duplicate and is captured
## The results are reviewed below through dup.dupe_data
dupes = dup.find_dupe_cols()

Setting everything up
Starting to process 9994 rows of data
Duplicates discovered


In [9]:
## Review dupe_data: one row per duplicated key value, showing the column responsible,
## the count reported for it, and the conflicting values found in that column
dup.dupe_data

row_id_value,dupe_col,dupe_count,dupe_vals
i64,str,i64,list[str]
9990,"""segment""",2,"[""Consumer"", ""Different Value Causes Dupe""]"
9991,"""country""",2,"[""Different Value Causes Dupe"", ""United States""]"
9992,"""city""",2,"[""Different Value Causes Dupe"", ""Costa Mesa""]"
9993,"""postal_code""",2,"[""999999999"", ""92627""]"
9994,"""sales""",2,"[""243.16"", ""999999999.0""]"


## Example with no duplicated values

In [10]:
## Load the example data for the "no duplicates" walkthrough
## df is the original (non-duplicated) superstore data
df = pl.read_csv('superstore.csv')

## Normalise the column names: lower case, with spaces replaced by underscores
## (a dict comprehension keeps the loop variable out of the notebook's global namespace)
new_cols = {c: c.lower().replace(' ', '_') for c in df.columns}

df = df.rename(new_cols)

In [12]:
## Preview the first two rows to confirm the renamed columns
df.head(n=2)

row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,state,postal_code,region,product_id,category,sub-category,product_name,sales,quantity,discount,profit
i64,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,f64,i64,f64,f64
1,"""CA-2016-152156…","""2016-11-08""","""2016-11-11""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-BO-1000179…","""Furniture""","""Bookcases""","""Bush Somerset …",261.96,2,0.0,41.9136
2,"""CA-2016-152156…","""2016-11-08""","""2016-11-11""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-CH-1000045…","""Furniture""","""Chairs""","""Hon Deluxe Fab…",731.94,3,0.0,219.582


In [17]:
## Create a new Dupe object for the non-duplicated data
## NOTE: this rebinds `dup`, so the object from the duplicated example is no longer reachable by that name
dup = Dupe(df)

## Print a short summary: whether a key is assigned yet, plus the row and column counts
dup.get_info()

No key is currently assigned
Your data includes 9994 rows and 21 columns


In [18]:
## Display the first three rows of the data held by the Dupe object
dup.show_data(rows=3)

row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,state,postal_code,region,product_id,category,sub-category,product_name,sales,quantity,discount,profit
i64,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,f64,i64,f64,f64
1,"""CA-2016-152156…","""2016-11-08""","""2016-11-11""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-BO-1000179…","""Furniture""","""Bookcases""","""Bush Somerset …",261.96,2,0.0,41.9136
2,"""CA-2016-152156…","""2016-11-08""","""2016-11-11""","""Second Class""","""CG-12520""","""Claire Gute""","""Consumer""","""United States""","""Henderson""","""Kentucky""",42420,"""South""","""FUR-CH-1000045…","""Furniture""","""Chairs""","""Hon Deluxe Fab…",731.94,3,0.0,219.582
3,"""CA-2016-138688…","""2016-06-12""","""2016-06-16""","""Second Class""","""DV-13045""","""Darrin Van Huf…","""Corporate""","""United States""","""Los Angeles""","""California""",90036,"""West""","""OFF-LA-1000024…","""Office Supplie…","""Labels""","""Self-Adhesive …",14.62,2,0.0,6.8714


In [19]:
## Ask for 5 candidate key columns
## Here n_suggestions is ignored because the data already HAS a fully unique id (row_id)
dup.get_key_suggestion(n_suggestions=5)

Your data includes one unique id
	1. row_id: 100.00% unique (9994 of 9994 unique)


In [20]:
## row_id is 100% unique in this data (9994 of 9994 values)
## Set it as the key; the duplicate search that follows should find nothing
dup.set_key('row_id')

In [21]:
## find_dupe_cols() goes through each unique value in the key column
## For each unique value, it iterates through each column, removes duplicates and counts the rows remaining
## Any filtered dataframe with more than one row indicates a duplicate and is captured
## NOTE(review): this run still prints "Duplicates discovered" although row_id is 100% unique
## and dupe_data is empty - the message looks unconditional; confirm in the Dupe class
dupes = dup.find_dupe_cols()

Setting everything up
Starting to process 9994 rows of data
Duplicates discovered


In [22]:
## Review dupe_data to see which key values are duplicated and which column is responsible
## An empty dataframe (column headers only, no data rows) means no duplicates!  Rejoice!!
dup.dupe_data

row_id_value,dupe_col,dupe_count,dupe_vals
null,null,null,null
