In [21]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from DirtyDF import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
# Toy frame: six rows with an integer id and an animal label.
df = pd.DataFrame(
    {
        "id": [0, 1, 2, 3, 4, 5],
        "class": ["Cat", "Dog", "Rabbit", "Cat", "Cat", "Dog"],
    }
)

In [57]:
# Store the labels as a pandas categorical column.
df["class"] = pd.Categorical(df["class"])

# Shuffle

In [92]:
# Shuffle stainer built with no arguments; this same instance is reused by several pipelines in this notebook.
stain = ShuffleStainer()

In [295]:
# Queue the same shuffle stainer twice, run everything that is queued,
# then display the resulting frame.
res = (
    DirtyDF(df, seed=214)
    .add_stainers(stain)
    .add_stainers(stain)
    .run_all_stainers()
)
res.get_df()

Unnamed: 0,id,class
2,2,Rabbit
5,5,Dog
4,4,Cat
1,1,Dog
0,0,Cat
3,3,Cat


In [296]:
# The original `df` still has its initial row order: running the stainers did not modify the input frame.
df

Unnamed: 0,id,class
0,0,Cat
1,1,Dog
2,2,Rabbit
3,3,Cat
4,4,Cat
5,5,Dog


In [297]:
# Mapping stored for history entry 0. In the recorded outputs, this matrix multiplied by
# get_previous_map() equals get_mapping(axis=0), so it presumably belongs to the first of the two shuffles.
res.get_map_from_history(0)

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [298]:
# Presumably the map of the most recent step (the second shuffle): in the recorded outputs,
# get_map_from_history(0) @ this matrix equals get_mapping(axis=0).
res.get_previous_map()

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [299]:
# Overall row mapping across both shuffles. In the output, entry [i, j] == 1 means original row i
# sits at position j of res.get_df() (e.g. original row 0 ends up at position 4).
res.get_mapping(axis = 0)

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [300]:
# Column mapping: the 2x2 identity in the output shows that shuffling rows leaves both columns in place.
res.get_mapping(axis = 1)

array([[1., 0.],
       [0., 1.]])

In [301]:
# Same two shuffles as the run_all_stainers() pipeline, but executed one stainer at a time;
# with the same seed the displayed frame matches that earlier result.
new_ddf = (
    DirtyDF(df, seed=214)
    .add_stainers(stain)
    .run_stainer()
)
(
    new_ddf
    .add_stainers(stain)
    .run_stainer()
    .get_df()
)

Unnamed: 0,id,class
2,2,Rabbit
5,5,Dog
4,4,Cat
1,1,Dog
0,0,Cat
3,3,Cat


In [302]:
# Print the recorded history: one numbered entry per executed stainer with its name, message and time taken.
res.print_history()

1. Shuffle 
 Order of rows randomized 
 Time taken: 0.001967906951904297 

2. Shuffle 
 Order of rows randomized 
 Time taken: 0.0010044574737548828 



# Row Duplicate

In [129]:
# Row-duplication stainer. Judging by the history output further down ("Added Duplicate Rows for 3 rows",
# "maximum of 3 times"), deg is presumably the share of rows to duplicate (0.6 of 6 rows) and max_rep
# the maximum number of appearances per duplicated row — TODO confirm against the stainer's docs.
dup_stainer = RowDuplicateStainer(deg = 0.6, max_rep = 3)

In [177]:
# Apply only the duplicate stainer and display the enlarged frame.
res = (
    DirtyDF(df, seed=42)
    .add_stainers(dup_stainer)
    .run_all_stainers()
)
res.get_df()

Unnamed: 0,id,class
0,0,Cat
1,0,Cat
2,0,Cat
3,1,Dog
4,2,Rabbit
5,3,Cat
6,3,Cat
7,4,Cat
8,5,Dog
9,5,Dog


In [178]:
# Called without an axis, this returns the row mapping here: 6 original rows by 11 output rows,
# with a 1 for every output row derived from an original row (duplicated rows have several 1s).
res.get_mapping()

array([[1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.]])

In [179]:
# Column mapping is again the 2x2 identity: duplicating rows does not touch the columns.
res.get_mapping(axis = 1)

array([[1., 0.],
       [0., 1.]])

In [180]:
# Duplicate rows first, then shuffle the enlarged frame, and display the result.
res = (
    DirtyDF(df, seed=42)
    .add_stainers(dup_stainer)
    .add_stainers(stain)
    .run_all_stainers()
)
res.get_df()

Unnamed: 0,id,class
0,2,Rabbit
1,5,Dog
2,0,Cat
3,3,Cat
4,5,Dog
5,5,Dog
6,4,Cat
7,1,Dog
8,0,Cat
9,0,Cat


In [181]:
# Row mapping after duplicate + shuffle: still 6x11, but each original row's copies are now
# scattered across output positions instead of sitting next to each other.
res.get_mapping()

array([[0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0.]])

In [182]:
# History for the duplicate + shuffle pipeline.
# NOTE(review): in the recorded output the Shuffle entry shows "Time not updated. Use update_history to
# update time" instead of a duration — worth checking why the time was not recorded for that run.
res.print_history()

1. Add Duplicates 
 Added Duplicate Rows for 3 rows. 
  Each duplicated row should appear a maximum of 3 times. 
  Rows added: 5 
 Time taken: 0.0009968280792236328 

2. Shuffle 
 Order of rows randomized 
 Time taken: Time not updated. Use update_history to update time 



# Inflection

In [89]:
# Seeded generator; it is reused by a later cell, so its state carries over between cells.
rng = np.random.default_rng(42)

# 100 rows: a running id plus a label drawn at random (uniformly, by default) from three animals.
df2 = pd.DataFrame(
    list(zip(range(100), rng.choice(['Cat', 'Dog', 'Rabbit'], 100))),
    columns=['id', 'class'],
)

In [90]:
# Class balance of the randomly drawn labels, as a baseline before staining.
df2['class'].value_counts()

Rabbit    40
Dog       33
Cat       27
Name: class, dtype: int64

In [91]:
# Inflection stainer on column 1 ('class'). num_format=2 presumably limits how many of the listed formats
# are used (the output below shows only original and pluralized forms) — TODO confirm.
# NOTE(review): ignore_cats lists 'Dogs', but the values in df2['class'] are Cat/Dog/Rabbit and 'Dog' is
# still pluralized in the output, so this entry appears to have no effect — confirm whether 'Dog' was intended.
inflect = InflectionStainer(col_idx=[1], num_format = 2, formats=['original', 'lowercase', 'pluralize'], ignore_cats={1: ['Dogs']})

In [94]:
# Call the stainer directly, without a DirtyDF wrapper. `rng` is the generator that already produced
# the labels of df2, so the result depends on that advanced state (it is not re-seeded here).
res = inflect.transform(df2, rng)

In [95]:
# res[0] is the stained DataFrame; the counts now include pluralized variants next to the original labels.
res[0]['class'].value_counts()

Rabbit     23
Dogs       19
Rabbits    17
Cats       16
Dog        14
Cat        11
Name: class, dtype: int64

# DateFormat & DateSplit

In [96]:
import itertools
import datetime

In [97]:
# Daily minimum temperatures, fetched over the network with 'Date' parsed as datetimes.
# Shifted date copies and constant columns are appended (in this exact order) so the date
# stainers below have several date and non-date columns to work with.
df3 = pd.read_csv(
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-min-temperatures.csv",
    parse_dates=['Date'],
).assign(
    date_copy_1=lambda frame: frame['Date'] + datetime.timedelta(days=1),
    date_copy_2=lambda frame: frame['Date'] + datetime.timedelta(days=-1),
    zero_col=0,
    date_copy_3=lambda frame: frame['Date'] + datetime.timedelta(days=2),
    one_col=1,
)

In [98]:
# Sanity check: 7 columns are expected (Date, Temp and the five columns added when df3 was built).
df3.shape

(3650, 7)

In [99]:
# Peek at the first rows rather than rendering the whole frame.
df3.head(3)

Unnamed: 0,Date,Temp,date_copy_1,date_copy_2,zero_col,date_copy_3,one_col
0,1981-01-01,20.7,1981-01-02,1980-12-31,0,1981-01-03,1
1,1981-01-02,17.9,1981-01-03,1981-01-01,0,1981-01-04,1
2,1981-01-03,18.8,1981-01-04,1981-01-02,0,1981-01-05,1


In [100]:
# Date-format stainer targeting column 0 ('Date'). num_format=5 is presumably the number of distinct
# date formats to mix — the transformed output shows five different patterns — TODO confirm.
date_format = DateFormatStainer(col_idx=[0], num_format=5)

In [101]:
# df3 once more: constructing the stainer has not changed the data (same output as the previous head(3)).
df3.head(3)

Unnamed: 0,Date,Temp,date_copy_1,date_copy_2,zero_col,date_copy_3,one_col
0,1981-01-01,20.7,1981-01-02,1980-12-31,0,1981-01-03,1
1,1981-01-02,17.9,1981-01-03,1981-01-01,0,1981-01-04,1
2,1981-01-03,18.8,1981-01-04,1981-01-02,0,1981-01-05,1


In [102]:
# Fresh seeded generator, so this transform does not depend on how far `rng` was advanced by earlier cells.
rng = np.random.default_rng(42)
# Direct call on the stainer; the following cell reads the stained DataFrame from res[0].
res = date_format.transform(df3, rng)

In [103]:
# Stained frame: only 'Date' (column 0) now holds mixed-format date strings; the other columns are unchanged.
res[0]

Unnamed: 0,Date,Temp,date_copy_1,date_copy_2,zero_col,date_copy_3,one_col
0,"1981,Jan,01",20.7,1981-01-02,1980-12-31,0,1981-01-03,1
1,1981/02/Jan,17.9,1981-01-03,1981-01-01,0,1981-01-04,1
2,19810103,18.8,1981-01-04,1981-01-02,0,1981-01-05,1
3,1981-04-Jan,14.6,1981-01-05,1981-01-03,0,1981-01-06,1
4,"1981,Jan,05",15.8,1981-01-06,1981-01-04,0,1981-01-07,1
...,...,...,...,...,...,...,...
3645,1990-27-Dec,14.0,1990-12-28,1990-12-26,0,1990-12-29,1
3646,1990-28-Dec,13.6,1990-12-29,1990-12-27,0,1990-12-30,1
3647,"1990,Dec,29",13.5,1990-12-30,1990-12-28,0,1990-12-31,1
3648,1990/30/December,15.7,1990-12-31,1990-12-29,0,1991-01-01,1


In [104]:
# Date-split stainer over columns 0, 2, 3 and 5, i.e. Date, date_copy_1, date_copy_2 and date_copy_3 in df3.
# prob=0.75 is presumably the chance that each candidate column gets split (only two of the four
# were split in the run recorded below) — TODO confirm.
date_split = DateSplitStainer([0,2,3,5], prob=0.75)

In [105]:
# Re-seed before transforming so the outcome is independent of earlier draws from `rng`.
rng = np.random.default_rng(42)
# Direct call on the stainer; later cells read res[0] (stained DataFrame) and res[2] (a mapping matrix).
res = date_split.transform(df3, rng)

In [106]:
# Stained frame: date_copy_1 and date_copy_2 are replaced by *_day / *_month / *_year columns,
# while Date and date_copy_3 were left intact in this run.
res[0]

Unnamed: 0,Date,Temp,date_copy_1_day,date_copy_1_month,date_copy_1_year,date_copy_2_day,date_copy_2_month,date_copy_2_year,zero_col,date_copy_3,one_col
0,1981-01-01,20.7,02,January,81,31,12,1980,0,1981-01-03,1
1,1981-01-02,17.9,03,January,81,01,01,1981,0,1981-01-04,1
2,1981-01-03,18.8,04,January,81,02,01,1981,0,1981-01-05,1
3,1981-01-04,14.6,05,January,81,03,01,1981,0,1981-01-06,1
4,1981-01-05,15.8,06,January,81,04,01,1981,0,1981-01-07,1
...,...,...,...,...,...,...,...,...,...,...,...
3645,1990-12-27,14.0,28,December,90,26,12,1990,0,1990-12-29,1
3646,1990-12-28,13.6,29,December,90,27,12,1990,0,1990-12-30,1
3647,1990-12-29,13.5,30,December,90,28,12,1990,0,1990-12-31,1
3648,1990-12-30,15.7,31,December,90,29,12,1990,0,1991-01-01,1


In [107]:
# History of the stainer's last transform as a (name, message, time) tuple; the float is presumably elapsed seconds.
date_split.get_history()

('Date Split',
 'Split the following date columns: date_copy_1, date_copy_2',
 0.24338221549987793)

In [108]:
# Third element of the transform result: a 7x11 matrix relating the 7 original columns to the 11 output
# columns. Each of the two split date columns maps to three output columns; every other column maps to one.
res[2]

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [109]:
# Called on the Stainer class directly: turns {original index: [new indices]} into a 0/1 matrix.
# In the output, key 3 with an empty list yields an all-zero row, and the matrix has 6 columns
# to cover the largest new index (5).
Stainer.convert_mapper_dct_to_array({0:[0], 1:[1,2], 2:[3,4], 3:[], 4:[5]})

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0.],
       [0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])