In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import matplotlib.pyplot as plt
from DirtyDF import *
from stainer import *

In [2]:
# Toy frame shared by every demo below: an integer id plus an animal label.
df = pd.DataFrame({
    'id': [0, 1, 2, 3, 4, 5],
    'class': ['Cat', 'Dog', 'Rabbit', 'Cat', 'Cat', 'Dog'],
})

In [3]:
# Store the label column as a pandas categorical (unordered, categories inferred from the values).
df["class"] = pd.Categorical(df["class"])

# Shuffle

In [4]:
# Stainer whose history entry reads "Order of rows randomized"; reused in several pipelines below.
stain = ShuffleStainer()

In [5]:
# Queue the same shuffle stainer twice, run the whole queue at once, and show the stained frame.
res = (
    DirtyDF(df, seed=214)
    .add_stainers(stain)
    .add_stainers(stain)
    .run_all_stainers()
)
res.get_df()

Unnamed: 0,id,class
2,2,Rabbit
5,5,Dog
4,4,Cat
1,1,Dog
0,0,Cat
3,3,Cat


In [6]:
# The input frame is left untouched: the shuffled rows live in `res`, while `df` keeps its original order.
df

Unnamed: 0,id,class
0,0,Cat
1,1,Dog
2,2,Rabbit
3,3,Cat
4,4,Cat
5,5,Dog


In [7]:
# Mapping stored for history entry 0 (the first shuffle). Going by the saved outputs, a 1 at [i, j]
# appears to mean "row i before the step sits at position j after it" — confirm against the DirtyDF docs.
res.get_map_from_history(0)

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [8]:
# Mapping of the most recently executed stainer — here presumably the second shuffle.
res.get_previous_map()

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [9]:
# Overall row mapping from the original frame to the final one; in the saved outputs it equals
# the product of the two per-step matrices shown above.
res.get_mapping(axis=0)

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [10]:
# Column mapping: shuffling rows leaves the two columns where they were, hence the identity matrix.
res.get_mapping(axis=1)

array([[1., 0.],
       [0., 1.]])

In [11]:
# Same pipeline executed one stainer at a time; with the same seed the saved output matches `res`.
new_ddf = (
    DirtyDF(df, seed=214)
    .add_stainers(stain)
    .run_stainer()
)
(
    new_ddf
    .add_stainers(stain)
    .run_stainer()
    .get_df()
)

Unnamed: 0,id,class
2,2,Rabbit
5,5,Dog
4,4,Cat
1,1,Dog
0,0,Cat
3,3,Cat


In [12]:
# Print one entry per executed stainer: name, description and time taken.
res.print_history()

1. Shuffle 
 Order of rows randomized 
 Time taken: 0.001995563507080078 

2. Shuffle 
 Order of rows randomized 
 Time taken: 0.0010259151458740234 



# Row Duplicate

In [13]:
# max_rep caps how often a duplicated row may appear (the history says "a maximum of 3 times").
# deg is presumably the fraction of rows to duplicate (history reports 3 of 6 rows) — confirm.
dup_stainer = RowDuplicateStainer(deg=0.6, max_rep=3)

In [14]:
# Run the duplicate stainer on its own and show the resulting frame.
res = (
    DirtyDF(df, seed=42)
    .add_stainers(dup_stainer)
    .run_all_stainers()
)
res.get_df()

Unnamed: 0,id,class
0,0,Cat
1,0,Cat
2,0,Cat
3,1,Dog
4,2,Rabbit
5,3,Cat
6,3,Cat
7,4,Cat
8,5,Dog
9,5,Dog


In [15]:
# Without an axis argument the row mapping is shown: each original row marks every copy made of it.
# NOTE(review): the saved matrix has 11 columns but the saved frame above lists only 10 rows —
# restart the kernel and run all cells to check whether the outputs are stale.
res.get_mapping()

array([[1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.]])

In [16]:
# Column mapping: duplicating rows leaves both columns in place, so this is the identity matrix.
res.get_mapping(axis=1)

array([[1., 0.],
       [0., 1.]])

In [17]:
# Chain two different stainers: duplicate rows first, then shuffle the enlarged frame.
res = (
    DirtyDF(df, seed=42)
    .add_stainers(dup_stainer)
    .add_stainers(stain)
    .run_all_stainers()
)
res.get_df()

Unnamed: 0,id,class
4,2,Rabbit
8,5,Dog
2,0,Cat
6,3,Cat
10,5,Dog
9,5,Dog
7,4,Cat
3,1,Dog
0,0,Cat
1,0,Cat


In [18]:
# Combined row mapping after duplicating and shuffling: each original row points at the final
# positions of all of its copies.
res.get_mapping()

array([[0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0.]])

In [19]:
# History now lists both steps in execution order: "Add Duplicates" followed by "Shuffle".
res.print_history()

1. Add Duplicates 
 Added Duplicate Rows for 3 rows. 
  Each duplicated row should appear a maximum of 3 times. 
  Rows added: 5 
 Time taken: 0.0009965896606445312 

2. Shuffle 
 Order of rows randomized 
 Time taken: 0.001996278762817383 



# Nullify

In [20]:
# No row/column restriction is given here, so values in either column can be blanked.
# deg presumably sets the fraction of cells to nullify — confirm.
nullify = NullifyStainer(deg=0.4)

In [21]:
# Apply the nullify stainer and show the frame with the blanked values.
res_nul = (
    DirtyDF(df, seed=214)
    .add_stainers(nullify)
    .run_all_stainers()
)
res_nul.get_df()

Unnamed: 0,id,class
0,,Cat
1,1.0,Dog
2,,
3,3.0,Cat
4,4.0,
5,5.0,Dog


In [22]:
# The history entry reports how many values were replaced.
res_nul.print_history()

1. Nullify 
 Replaced 4 values to become empty in specificed rows/cols. 
 Time taken: 0.002993345260620117 



In [23]:
# Nullifying only blanks values where they are, so both mappings stay identity matrices.
display(res_nul.get_mapping(axis=0))  # rows
res_nul.get_mapping(axis=1)           # columns

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]])

array([[1., 0.],
       [0., 1.]])

In [24]:
# Restrict nullification to column 0 and write the string "NONE" instead of an empty value.
# new_type=True presumably lets the column change dtype to hold the string — confirm.
nullify2 = NullifyStainer(
    deg=0.4,
    col_idx=[0],
    new_val="NONE",
    new_type=True,
)

In [25]:
# Apply the placeholder-writing nullifier; only the `id` column (column 0) is affected.
res_nul2 = (
    DirtyDF(df, seed=214)
    .add_stainers(nullify2)
    .run_all_stainers()
)
res_nul2.get_df()

Unnamed: 0,id,class
0,NONE,Cat
1,1,Dog
2,2,Rabbit
3,3,Cat
4,4,Cat
5,NONE,Dog


**Combining the duplicate, shuffle and nullify stainers**

In [26]:
# Target row index 0 / column index 0 only; deg=1 presumably nullifies every selected cell — confirm.
nullify3 = NullifyStainer(deg=1, row_idx=[0], col_idx=[0])

In [27]:
# add_stainers also accepts a list; the three stainers are passed in the order they should run.
pipeline = [dup_stainer, stain, nullify3]
res_nul3 = (
    DirtyDF(df, seed=214)
    .add_stainers(pipeline)
    .run_all_stainers()
)

In [28]:
# Final frame after duplicate -> shuffle -> nullify. In the saved output both copies of original
# row 0 have an empty `id`, which suggests row_idx refers to rows of the original frame — confirm.
res_nul3.get_df()

Unnamed: 0,id,class
9,5.0,Dog
8,4.0,Cat
1,,Cat
3,2.0,Rabbit
2,1.0,Dog
4,2.0,Rabbit
7,4.0,Cat
6,4.0,Cat
5,3.0,Cat
0,,Cat


In [29]:
# Row mapping across all three steps: original rows (6) against final positions (10).
res_nul3.get_mapping()

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 1., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [30]:
# History of the three-step pipeline.
# NOTE(review): the saved output shows "Time not updated" for the Nullify entry — check whether
# this stainer's timing is being recorded (the message points at update_history).
res_nul3.print_history()

1. Add Duplicates 
 Added Duplicate Rows for 3 rows. 
  Each duplicated row should appear a maximum of 3 times. 
  Rows added: 4 
 Time taken: 0.0009725093841552734 

2. Shuffle 
 Order of rows randomized 
 Time taken: 0.0019936561584472656 

3. Nullify 
 Replaced 2 values to become empty in specificed rows/cols. 
 Time taken: Time not updated. Use update_history to update time 

