In [1]:
import pandas as pd
from artificialdata import *
from stainer import *

# Data Example - Duplicated Rows (Manual Entry)

In [2]:
# Load the reduced retail dataset (only the first 5000 rows of the full data),
# asking pandas to parse the InvoiceDate column as datetimes.
data = pd.read_csv(
    "data/online_retail_small.csv",
    parse_dates=["InvoiceDate"],
)

In [38]:
# Preview the first few rows of the loaded frame.
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-01-12 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-01-12 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-01-12 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-01-12 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-01-12 08:26:00,3.39,17850.0,United Kingdom


In [4]:
# Show column dtypes and non-null counts for the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   InvoiceNo    5000 non-null   object        
 1   StockCode    5000 non-null   object        
 2   Description  4988 non-null   object        
 3   Quantity     5000 non-null   int64         
 4   InvoiceDate  5000 non-null   datetime64[ns]
 5   UnitPrice    5000 non-null   float64       
 6   CustomerID   3795 non-null   float64       
 7   Country      5000 non-null   object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 312.6+ KB


In [31]:
# Force rows 4950-4999 (the last 50 rows; range's upper bound is exclusive) to be duplicated.
# The probability argument is 0, so only the rows listed in fixed_row are targeted.
add_dup_specified = AddDuplicate(0, fixed_row=list(range(4950, 5000)))

In [32]:
# Random duplication stainer: every row has a 20% chance of being duplicated,
# and a duplicated row appears at most 4 times in the result.
# randomize_order=False presumably keeps the copies next to their source row -- TODO confirm.
add_dup = AddDuplicate(
    0.2,
    randomize_order=False,
    max_rep=4,
)

In [33]:
# Wrap the loaded frame in a DirtyDF so stainers can be applied to it.
# history=True presumably enables the per-step record read back later via
# get_full_history() -- TODO confirm against the DirtyDF implementation.
# NOTE(review): no random seed is set anywhere visible, so the randomly
# duplicated rows may differ between runs -- confirm whether a seed can be passed.
ddf = DirtyDF(data, history = True)

In [34]:
# Bundle the DirtyDF with the two stainers. The recorded history lists the
# specified-row duplication first and the 20% random duplication second,
# matching the order of this list.
c = Combiner(ddf, [add_dup_specified, add_dup])

In [35]:
# Apply the combiner's stainers; results are read back through get_finalDDF() afterwards.
c.transform_all()

In [44]:
# DF after the first step (only the specified-row duplication has been applied).
# summary[0] is the first recorded entry of the final DirtyDF; .df is its DataFrame.
# NOTE(review): the recorded output ends at index 5048 and its last row appears
# only once, even though 50 rows were specified -- confirm how many copies
# AddDuplicate produces per fixed row.
c.get_finalDDF().summary[0].df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-01-12 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-01-12 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-01-12 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-01-12 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-01-12 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
5045,536836,21539,RED RETROSPOT BUTTER DISH,3,2010-02-12 18:08:00,4.95,18168.0,United Kingdom
5046,536836,22198,LARGE POPCORN HOLDER,2,2010-02-12 18:08:00,1.65,18168.0,United Kingdom
5047,536836,22198,LARGE POPCORN HOLDER,2,2010-02-12 18:08:00,1.65,18168.0,United Kingdom
5048,536836,22197,SMALL POPCORN HOLDER,2,2010-02-12 18:08:00,0.85,18168.0,United Kingdom


In [36]:
# Final result (after the second step, i.e. after both duplication stainers have run).
c.get_finalDDF().get_finalDF()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-01-12 08:26:00,2.55,17850.0,United Kingdom
1,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-01-12 08:26:00,2.55,17850.0,United Kingdom
2,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-01-12 08:26:00,2.55,17850.0,United Kingdom
3,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-01-12 08:26:00,2.55,17850.0,United Kingdom
4,536365,71053,WHITE METAL LANTERN,6,2010-01-12 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
7073,536836,22198,LARGE POPCORN HOLDER,2,2010-02-12 18:08:00,1.65,18168.0,United Kingdom
7074,536836,22198,LARGE POPCORN HOLDER,2,2010-02-12 18:08:00,1.65,18168.0,United Kingdom
7075,536836,22198,LARGE POPCORN HOLDER,2,2010-02-12 18:08:00,1.65,18168.0,United Kingdom
7076,536836,22197,SMALL POPCORN HOLDER,2,2010-02-12 18:08:00,0.85,18168.0,United Kingdom


In [37]:
# Display the textual history, one message per stainer applied.
c.get_finalDDF().get_full_history()

['Added Duplicate Rows for 50 specified rows and 0% of the remaining rows. Each duplicated row should appear a maximum of 2 time',
 'Added Duplicate Rows for 0 specified rows and 20.0% of the remaining rows. Each duplicated row should appear a maximum of 4 time']