In [1]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from ddf.DirtyDF import *
from ddf.stainer import *

In [2]:
# Six-row toy frame used by the first demos: integer ids 0-5 plus an animal label.
df = pd.DataFrame({
    'id': [0, 1, 2, 3, 4, 5],
    'class': ['Cat', 'Dog', 'Rabbit', 'Cat', 'Cat', 'Dog'],
})

In [3]:
# Store the animal label as a pandas categorical column.
df = df.astype({"class": "category"})

# Shuffle

In [4]:
# Row-shuffling stainer, reused by the DirtyDF pipelines below.
stain = ShuffleStainer()

In [5]:
# Queue the same shuffle stainer twice, run both in one go, then show the stained frame.
res = DirtyDF(df, seed = 214).add_stainers(stain).add_stainers(stain).run_all_stainers()
res.get_df()

Unnamed: 0,id,class
2,2,Rabbit
5,5,Dog
4,4,Cat
1,1,Dog
0,0,Cat
3,3,Cat


In [6]:
# Original DF is unmodified: `df` still lists its rows in the original 0-5 order.
df

Unnamed: 0,id,class
0,0,Cat
1,1,Dog
2,2,Rabbit
3,3,Cat
4,4,Cat
5,5,Dog


In [7]:
# Row map stored for history entry 0 — presumably the first shuffle; confirm the indexing convention.
res.get_map_from_history(0)

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [8]:
# Row map of the most recent stainer, judging by the method name — TODO confirm.
res.get_previous_map()

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [9]:
# Row mapping (axis = 0) — presumably the combined effect of both shuffles; verify against DirtyDF.
res.get_mapping(axis = 0)

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [10]:
# Column mapping (axis = 1); the saved output is a 2x2 identity matrix.
res.get_mapping(axis = 1)

array([[1., 0.],
       [0., 1.]])

In [11]:
# Same pipeline, but stepping through one stainer at a time with run_stainer().
# The saved output matches the run_all_stainers() result for the same seed (214).
new_ddf = DirtyDF(df, seed = 214).add_stainers(stain).run_stainer()
new_ddf.add_stainers(stain).run_stainer().get_df()

Unnamed: 0,id,class
2,2,Rabbit
5,5,Dog
4,4,Cat
1,1,Dog
0,0,Cat
3,3,Cat


In [12]:
# Print the numbered log of applied stainers with their messages and timings.
res.print_history()

1. Shuffle 
 Order of rows randomized 
 Time taken: 0.0009133815765380859 

2. Shuffle 
 Order of rows randomized 
 Time taken: 0.0020072460174560547 



# Row Duplicate

In [13]:
# Duplicate-row stainer. With max_rep = 3 the saved history message states that each duplicated
# row should appear at most 3 times; see RowDuplicateStainer for the exact meaning of `deg`.
dup_stainer = RowDuplicateStainer(deg = 0.6, max_rep = 3)

In [14]:
# Run the duplicate stainer on its own (seed 42) and show the stained frame.
res = DirtyDF(df, seed = 42).add_stainers(dup_stainer).run_all_stainers()
res.get_df()

Unnamed: 0,id,class
0,0,Cat
1,0,Cat
2,0,Cat
3,1,Dog
4,2,Rabbit
5,3,Cat
6,3,Cat
7,4,Cat
8,5,Dog
9,5,Dog


In [15]:
# Mapping with the default axis after duplication; the saved output is a 6 x 11 array
# (6 matches the number of rows in the original frame).
res.get_mapping()

array([[1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.]])

In [16]:
# Column mapping (axis = 1); the saved output is again a 2x2 identity matrix.
res.get_mapping(axis = 1)

array([[1., 0.],
       [0., 1.]])

In [17]:
# Chain two different stainers: duplicate rows first, then shuffle, and show the result.
res = DirtyDF(df, seed = 42).add_stainers(dup_stainer).add_stainers(stain).run_all_stainers()
res.get_df()

Unnamed: 0,id,class
4,2,Rabbit
8,5,Dog
2,0,Cat
6,3,Cat
10,5,Dog
9,5,Dog
7,4,Cat
3,1,Dog
0,0,Cat
1,0,Cat


In [18]:
# Mapping (default axis) after both the duplicate and shuffle stainers have run.
res.get_mapping()

array([[0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0.]])

In [19]:
# Print the log for the duplicate-then-shuffle pipeline.
res.print_history()

1. Add Duplicates 
 Added Duplicate Rows for 3 rows. 
  Each duplicated row should appear a maximum of 3 times. 
  Rows added: 5 
 Time taken: 0.000997781753540039 

2. Shuffle 
 Order of rows randomized 
 Time taken: 0.0019941329956054688 



# Inflection

In [17]:
# Seeded generator so the random labels below are the same on every run.
rng = np.random.default_rng(42)

# 100-row frame with two independently drawn label columns
# ('class' is drawn before 'class2', which fixes the order of the random draws).
n_rows = 100
class_values = rng.choice(['Cat', 'Dog', 'Rabbit'], n_rows)
class2_values = rng.choice(['Cow', 'Sheep', 'Goat'], n_rows)
df2 = pd.DataFrame(
    zip(range(n_rows), class_values, class2_values),
    columns=('id', 'class', 'class2'),
)

In [18]:
# Preview the first rows of the generated frame.
df2.head()

Unnamed: 0,id,class,class2
0,0,Cat,Goat
1,1,Rabbit,Cow
2,2,Dog,Goat
3,3,Dog,Cow
4,4,Dog,Goat


In [19]:
# Baseline label counts for 'class' before any staining, for comparison with the stained counts.
df2['class'].value_counts()

Rabbit    40
Dog       33
Cat       27
Name: class, dtype: int64

In [40]:
# Inflection stainer over columns 1 and 2 ('class', 'class2').
# NOTE(review): ignore_cats lists 'Dogs', but the values in column 1 are 'Cat'/'Dog'/'Rabbit',
# and the saved history shows 'Dog' being inflected. If the intent is to leave 'Dog' untouched,
# this should probably be 'Dog' — confirm against InflectionStainer's documentation.
inflect = InflectionStainer(col_idx=[1, 2], num_format = 2, formats=['original', 'lowercase', 'pluralize'], ignore_cats={1: ['Dogs']})

In [41]:
# Apply the stainer directly, outside a DirtyDF; element 0 of the result is the stained frame (used below).
res = inflect.transform(df2, rng)

In [42]:
# Label counts for 'class' after staining.
res[0]['class'].value_counts()

Dogs       21
rabbit     20
Rabbits    20
Cats       16
dog        12
cat        11
Name: class, dtype: int64

In [43]:
# Print element 1 of the stainer's history, which per the saved output is the message
# describing the inflections applied to each column.
print(inflect.get_history()[1])

Categorical inflections on:
{1: {'Cat': ['Cats', 'cat'], 'Rabbit': ['Rabbits', 'rabbit'], 'Dog': ['dog', 'Dogs']}, 2: {'Goat': ['Goat', 'Goats'], 'Cow': ['Cow', 'Kine'], 'Sheep': ['Sheep']}}


In [23]:
# Side-by-side view of the first 10 rows: original columns on the left, stained ones on the right.
pd.concat([df2.head(10), res[0].head(10)], axis=1)

Unnamed: 0,id,class,class2,id.1,class.1,class2.1
0,0,Cat,Goat,0,Cat,Goats
1,1,Rabbit,Cow,1,Rabbits,Kine
2,2,Dog,Goat,2,Dog,goat
3,3,Dog,Cow,3,Dog,cow
4,4,Dog,Goat,4,Dog,Goats
5,5,Rabbit,Goat,5,Rabbit,Goats
6,6,Cat,Goat,6,Cats,goat
7,7,Rabbit,Sheep,7,Rabbits,sheep
8,8,Cat,Sheep,8,Cats,sheep
9,9,Cat,Goat,9,Cat,goat


# DateFormat & DateSplit

In [2]:
import itertools
import datetime

In [3]:
# Daily minimum temperatures dataset, with 'Date' parsed as datetimes on load.
DAILY_MIN_TEMPS_URL = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-min-temperatures.csv"

# Add shifted copies of 'Date' interleaved with constant integer columns, so that the
# datetime columns are not contiguous. Keyword order below fixes the resulting column order.
df3 = pd.read_csv(DAILY_MIN_TEMPS_URL, parse_dates=['Date']).assign(
    date_copy_1=lambda frame: frame['Date'] + datetime.timedelta(days=1),
    date_copy_2=lambda frame: frame['Date'] + datetime.timedelta(days=-1),
    zero_col=0,
    date_copy_3=lambda frame: frame['Date'] + datetime.timedelta(days=2),
    one_col=1,
)

In [4]:
# Sanity-check the frame's dimensions (rows, columns).
df3.shape

(3650, 7)

In [47]:
# Preview the first three rows, including the derived date columns.
df3.head(3)

Unnamed: 0,Date,Temp,date_copy_1,date_copy_2,zero_col,date_copy_3,one_col
0,1981-01-01,20.7,1981-01-02,1980-12-31,0,1981-01-03,1
1,1981-01-02,17.9,1981-01-03,1981-01-01,0,1981-01-04,1
2,1981-01-03,18.8,1981-01-04,1981-01-02,0,1981-01-05,1


In [49]:
# Date-format stainer on column 0 ('Date') with num_format = 5; the saved history
# lists five format strings for that column.
date_format = DateFormatStainer(col_idx=[0], num_format=5)

In [30]:
# Show the input frame once more, immediately before the stainer is applied.
df3.head(3)

Unnamed: 0,Date,Temp,date_copy_1,date_copy_2,zero_col,date_copy_3,one_col
0,1981-01-01,20.7,1981-01-02,1980-12-31,0,1981-01-03,1
1,1981-01-02,17.9,1981-01-03,1981-01-01,0,1981-01-04,1
2,1981-01-03,18.8,1981-01-04,1981-01-02,0,1981-01-05,1


In [50]:
# Fresh seeded generator, then apply the date-format stainer directly to df3.
rng = np.random.default_rng(42)
res = date_format.transform(df3, rng)

In [51]:
# Element 0 of the result is the stained frame; in the saved output only 'Date' changed format.
res[0]

Unnamed: 0,Date,Temp,date_copy_1,date_copy_2,zero_col,date_copy_3,one_col
0,"1981,Jan,01",20.7,1981-01-02,1980-12-31,0,1981-01-03,1
1,1981/02/Jan,17.9,1981-01-03,1981-01-01,0,1981-01-04,1
2,19810103,18.8,1981-01-04,1981-01-02,0,1981-01-05,1
3,1981-04-Jan,14.6,1981-01-05,1981-01-03,0,1981-01-06,1
4,"1981,Jan,05",15.8,1981-01-06,1981-01-04,0,1981-01-07,1
...,...,...,...,...,...,...,...
3645,1990-27-Dec,14.0,1990-12-28,1990-12-26,0,1990-12-29,1
3646,1990-28-Dec,13.6,1990-12-29,1990-12-27,0,1990-12-30,1
3647,"1990,Dec,29",13.5,1990-12-30,1990-12-28,0,1990-12-31,1
3648,1990/30/December,15.7,1990-12-31,1990-12-29,0,1991-01-01,1


In [52]:
# History tuple; per the saved output it holds (name, message, time taken).
date_format.get_history()

('Date Formats',
 "Date Formats used:\n{0: ['%Y/%d/%B', '%Y-%d-%b', '%Y/%d/%b', '%Y,%b,%d', '%Y%m%d']}",
 0.044852256774902344)

In [33]:
# Datetime-format stainer over columns 0, 2 and 3 ('Date', 'date_copy_1', 'date_copy_2'), num_format = 3.
datetime_format = DatetimeFormatStainer(col_idx = [0, 2, 3], num_format=3)

In [11]:
# Reset the generator to the same seed, then apply the datetime-format stainer to df3.
rng = np.random.default_rng(42)
res = datetime_format.transform(df3, rng)

In [12]:
# Stained frame; in the saved output the three selected columns carry a time component,
# while 'date_copy_3' (not selected) is unchanged.
res[0]

Unnamed: 0,Date,Temp,date_copy_1,date_copy_2,zero_col,date_copy_3,one_col
0,"1981,Jan,01 00:00:00",20.7,"02, Jan, 1981 00:00:00","1980, 31, December 00:00:00",0,1981-01-03,1
1,"1981,Jan,02 00:00:00",17.9,03 January 1981 00:00:00,1981-Jan-01 00:00:00,0,1981-01-04,1
2,January/03/1981 00:00:00,18.8,"04, January, 1981 00:00:00",1981-Jan-02 00:00:00,0,1981-01-05,1
3,Jan/04/1981 00:00:00,14.6,05 January 1981 00:00:00,"1981, 03, January 00:00:00",0,1981-01-06,1
4,January/05/1981 00:00:00,15.8,"06, Jan, 1981 00:00:00",1981-Jan-04 00:00:00,0,1981-01-07,1
...,...,...,...,...,...,...,...
3645,December/27/1990 00:00:00,14.0,"28, Dec, 1990 00:00:00",1990-Dec-26 00:00:00,0,1990-12-29,1
3646,December/28/1990 00:00:00,13.6,"29, Dec, 1990 00:00:00",1990-Dec-27 00:00:00,0,1990-12-30,1
3647,December/29/1990 00:00:00,13.5,"30, December, 1990 00:00:00",1990-Dec-28 00:00:00,0,1990-12-31,1
3648,Dec/30/1990 00:00:00,15.7,"31, December, 1990 00:00:00",1990-Dec-29 00:00:00,0,1991-01-01,1


In [18]:
# Two splitters over the same candidate columns (0, 2, 3, 5), differing only in keep_time.
# With prob = 0.75 the saved history reports that only date_copy_1 and date_copy_2 were split.
date_split = DatetimeSplitStainer([0,2,3,5], keep_time = False, prob=0.75)
datetime_split = DatetimeSplitStainer([0,2,3,5], keep_time = True, prob = 0.75)

In [19]:
# Seeded generator, then apply the date-only splitter (keep_time = False).
rng = np.random.default_rng(42)
res = date_split.transform(df3, rng)

In [16]:
# Stained frame: each split column is replaced by _day / _month / _year columns in the saved output.
res[0]

Unnamed: 0,Date,Temp,date_copy_1_day,date_copy_1_month,date_copy_1_year,date_copy_2_day,date_copy_2_month,date_copy_2_year,zero_col,date_copy_3,one_col
0,1981-01-01,20.7,02,January,81,31,12,1980,0,1981-01-03,1
1,1981-01-02,17.9,03,January,81,01,01,1981,0,1981-01-04,1
2,1981-01-03,18.8,04,January,81,02,01,1981,0,1981-01-05,1
3,1981-01-04,14.6,05,January,81,03,01,1981,0,1981-01-06,1
4,1981-01-05,15.8,06,January,81,04,01,1981,0,1981-01-07,1
...,...,...,...,...,...,...,...,...,...,...,...
3645,1990-12-27,14.0,28,December,90,26,12,1990,0,1990-12-29,1
3646,1990-12-28,13.6,29,December,90,27,12,1990,0,1990-12-30,1
3647,1990-12-29,13.5,30,December,90,28,12,1990,0,1990-12-31,1
3648,1990-12-30,15.7,31,December,90,29,12,1990,0,1991-01-01,1


In [20]:
# Same seed again, this time with the splitter that keeps the time parts (keep_time = True).
rng = np.random.default_rng(42)
res = datetime_split.transform(df3, rng)

In [22]:
# Stained frame: the saved output additionally has _hour / _minute / _second columns per split column.
res[0]

Unnamed: 0,Date,Temp,date_copy_1_day,date_copy_1_month,date_copy_1_year,date_copy_1_hour,date_copy_1_minute,date_copy_1_second,date_copy_2_day,date_copy_2_month,date_copy_2_year,date_copy_2_hour,date_copy_2_minute,date_copy_2_second,zero_col,date_copy_3,one_col
0,1981-01-01,20.7,02,January,81,00,00,00,31,12,1980,00,00,00,0,1981-01-03,1
1,1981-01-02,17.9,03,January,81,00,00,00,01,01,1981,00,00,00,0,1981-01-04,1
2,1981-01-03,18.8,04,January,81,00,00,00,02,01,1981,00,00,00,0,1981-01-05,1
3,1981-01-04,14.6,05,January,81,00,00,00,03,01,1981,00,00,00,0,1981-01-06,1
4,1981-01-05,15.8,06,January,81,00,00,00,04,01,1981,00,00,00,0,1981-01-07,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3645,1990-12-27,14.0,28,December,90,00,00,00,26,12,1990,00,00,00,0,1990-12-29,1
3646,1990-12-28,13.6,29,December,90,00,00,00,27,12,1990,00,00,00,0,1990-12-30,1
3647,1990-12-29,13.5,30,December,90,00,00,00,28,12,1990,00,00,00,0,1990-12-31,1
3648,1990-12-30,15.7,31,December,90,00,00,00,29,12,1990,00,00,00,0,1991-01-01,1


In [21]:
# History of the date-only splitter, naming the columns that were actually split.
date_split.get_history()

('Date Split',
 'Split the following date columns: date_copy_1, date_copy_2',
 0.14164233207702637)

In [37]:
# Element 2 of the transform result — presumably the column mapping (original columns x new columns).
# NOTE(review): read top to bottom, `res` here comes from datetime_split (17 output columns), but the
# saved 7 x 11 array matches the date_split result (11 columns). The cell was evidently run out of
# order; re-run the notebook from a fresh kernel to refresh this output.
res[2]

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [38]:
# Convert a {source index: [target indices]} dict into a 0/1 array.
# In the saved output the empty list for key 3 yields an all-zero row.
Stainer.convert_mapper_dct_to_array({0:[0], 1:[1,2], 2:[3,4], 3:[], 4:[5]})

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0.],
       [0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

# BinningStainer

In [53]:
# Two columns of uniform [0, 1) samples; 'idx2' is sorted ascending.
# Drawn from a dedicated seeded generator so the frame — and every binning result derived
# from it — is identical on each Restart & Run All. (The original used the unseeded global
# np.random state, so previously saved outputs will differ until the notebook is re-run.)
df4_rng = np.random.default_rng(42)
df4 = pd.DataFrame({"idx1": df4_rng.random(100), "idx2": np.sort(df4_rng.random(100))})

In [54]:
# Blank out 15% of the rows in each column. Each column gets its own fixed seed
# (0, 10, ...) so the chosen rows are reproducible and are drawn independently per column.
# enumerate() replaces the hand-maintained counter, which otherwise lingers in the namespace.
for col_pos, col in enumerate(df4.columns):
    nan_rows = df4.sample(frac=0.15, random_state=col_pos * 10).index
    df4.loc[nan_rows, col] = np.nan

In [55]:
# Preview the frame with the injected NaN values.
df4.head(15)

Unnamed: 0,idx1,idx2
0,0.419831,0.003535
1,0.328642,
2,,
3,0.569462,
4,0.115702,0.02755
5,0.546893,0.038613
6,0.344285,0.05019
7,,0.055147
8,0.397974,0.063638
9,0.574699,0.069837


In [56]:
# Binning stainer over both columns (indices 0 and 1), using its default settings.
binner = BinningStainer([0,1])

In [57]:
# Seeded generator, then apply the binning stainer directly to df4.
rng = np.random.default_rng(42)
res = binner.transform(df4, rng)

In [58]:
# Stained frame: values are replaced by interval labels and NaNs stay NaN in the saved output.
# NOTE(review): the saved output shows row 0 of 'idx2' as "[0.9784, 0.003535)", whose lower bound
# exceeds its upper bound. That value equals the lowest cutpoint, so this looks like a wrap-around
# at the first bin edge — worth checking in BinningStainer.
res[0].head(15)

Unnamed: 0,idx1,idx2
0,"[0.1994, 0.4423)","[0.9784, 0.003535)"
1,"[0.1994, 0.4423)",
2,,
3,"[0.4423, 0.6265)",
4,"[0.001575, 0.1994)","[0.003535, 0.1942)"
5,"[0.4423, 0.6265)","[0.003535, 0.1942)"
6,"[0.1994, 0.4423)","[0.003535, 0.1942)"
7,,"[0.003535, 0.1942)"
8,"[0.1994, 0.4423)","[0.003535, 0.1942)"
9,"[0.4423, 0.6265)","[0.003535, 0.1942)"


In [59]:
# History tuple for the binning run; the message lists the cutpoints used for each column.
binner.get_history()

('Binning',
 'Binning using the following cutpoints:\n{0: [0.001575, 0.1994, 0.4423, 0.6265, 0.7707, 0.9905], 1: [0.003535, 0.1942, 0.3181, 0.6002, 0.801, 0.9784]}',
 0.00395655632019043)