In [58]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from numpy.random import default_rng
from time import time
from DirtyDF import *
from stainer import *
from history import *

In [2]:
# Six-row toy frame (integer id + animal class) used by the shuffle demo.
df = pd.DataFrame(
    list(enumerate(['Cat', 'Dog', 'Rabbit', 'Cat', 'Cat', 'Dog'])),
    columns=('id', 'class'),
)

# Shuffle

In [26]:
# One ShuffleStainer built with default arguments; the cells below add this
# same instance to a DirtyDF more than once.
stain = ShuffleStainer()

In [27]:
# Queue the same stainer twice on a seeded DirtyDF, run everything that was
# queued, then display the resulting frame.
res = (
    DirtyDF(df, seed=214)
    .add_stainers(stain)
    .add_stainers(stain)
    .run_all_stainers()
)
res.get_df()

Unnamed: 0,id,class
0,2,Rabbit
1,5,Dog
2,4,Cat
3,1,Dog
4,0,Cat
5,3,Cat


In [21]:
# Original DF is unmodified: `df` still lists ids 0-5 in their original order
# even though the stainers were run on a DirtyDF built from it.
df

Unnamed: 0,id,class
0,0,Cat
1,1,Dog
2,2,Rabbit
3,3,Cat
4,4,Cat
5,5,Dog


In [6]:
# Display the map stored at history index 0. The output is a 6x6 0/1 array
# with a single 1 per row (presumably the row permutation applied by the
# first shuffle — confirm against the DirtyDF implementation).
res.get_map_from_history(0)

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [7]:
# Display the "previous" map of `res`; the output is another 6x6 0/1 array
# (presumably the map of the most recently run stainer — confirm).
res.get_previous_map()

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [8]:
# Mapping along axis 0: a 6x6 0/1 array, matching the six rows of `df`
# (presumably the combined original-row -> final-row map — confirm).
res.get_mapping(axis = 0)

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [9]:
# Mapping along axis 1: the output is the 2x2 identity, matching the two
# columns of `df`.
res.get_mapping(axis = 1)

array([[1., 0.],
       [0., 1.]])

In [10]:
# Same seed and the same two shuffles as the run_all_stainers() example, but
# executed one stainer at a time with run_stainer(). The displayed frame is
# identical to the earlier result.
new_ddf = (
    DirtyDF(df, seed=214)
    .add_stainers(stain)
    .run_stainer()
)
(
    new_ddf
    .add_stainers(stain)
    .run_stainer()
    .get_df()
)

Unnamed: 0,id,class
0,2,Rabbit
1,5,Dog
2,4,Cat
3,1,Dog
4,0,Cat
5,3,Cat


In [11]:
# Print one entry per stainer that was run: its name, a description, and a
# "Time taken" line (the second entry reports that its time was not updated).
res.print_history()

1. Shuffle 
 Order of rows randomized 
 Time taken: 0.0009691715240478516 

2. Shuffle 
 Order of rows randomized 
 Time taken: Time not updated. Use update_history to update time 



# Inflection

In [82]:
# Seeded generator so the 100 sampled class labels are reproducible.
# `default_rng` is the name the notebook's import cell binds explicitly; the
# `np` alias is never imported there, so going through it only works if some
# star import happens to leak it.
rng = default_rng(42)

# 100-row frame: sequential ids paired with randomly drawn animal classes.
# The zip is materialised so the constructor receives a concrete list of rows.
df2 = pd.DataFrame(list(zip(range(100), rng.choice(['Cat', 'Dog', 'Rabbit'], 100))),
                   columns=('id', 'class'))

In [83]:
# Class frequencies before staining: three distinct labels summing to 100 rows.
df2['class'].value_counts()

Rabbit    40
Dog       33
Cat       27
Name: class, dtype: int64

In [131]:
# Configure an InflectionStainer for column index 1 (`class`).
# NOTE(review): the frame's categories are 'Cat'/'Dog'/'Rabbit', so the ignore
# entry 'Dogs' matches none of them — confirm whether 'Dog' was intended.
inflect = InflectionStainer(
    col_idx=[1],
    num_format=2,
    formats=['original', 'lowercase', 'pluralize'],
    ignore_cats={1: ['Dogs']},  # the closing ']' was missing, making this cell a SyntaxError
)

In [132]:
# Apply the stainer directly (no DirtyDF wrapper) with the seeded generator.
# NOTE: this rebinds `res`, which earlier held a DirtyDF; from here on it is
# an indexable result whose element 0 is the stained frame (see next cell).
res = inflect.transform(df2, rng)

In [133]:
# Class frequencies after staining: each original label now appears in two
# inflected spellings (six distinct values in total).
res[0]['class'].value_counts()

rabbit     22
Rabbits    18
dog        18
cat        15
Dogs       15
Cats       12
Name: class, dtype: int64

# DateFormat & DateSplit

In [44]:
import itertools
import datetime

In [49]:
# Daily minimum temperatures (fetched over the network) extended with three
# shifted copies of the Date column and two constant columns. Keyword order
# in assign() fixes the resulting column order.
df3 = pd.read_csv(
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-min-temperatures.csv",
    parse_dates=['Date'],
).assign(
    date_copy_1=lambda frame: frame['Date'] + datetime.timedelta(days=1),
    date_copy_2=lambda frame: frame['Date'] + datetime.timedelta(days=-1),
    zero_col=0,
    date_copy_3=lambda frame: frame['Date'] + datetime.timedelta(days=2),
    one_col=1,
)

In [50]:
# Sanity check: 3650 rows and 7 columns (Date, Temp and the five added columns).
df3.shape

(3650, 7)

In [51]:
# Preview the first three rows, including the shifted date copies and the
# constant columns.
df3.head(3)

Unnamed: 0,Date,Temp,date_copy_1,date_copy_2,zero_col,date_copy_3,one_col
0,1981-01-01,20.7,1981-01-02,1980-12-31,0,1981-01-03,1
1,1981-01-02,17.9,1981-01-03,1981-01-01,0,1981-01-04,1
2,1981-01-03,18.8,1981-01-04,1981-01-02,0,1981-01-05,1


In [9]:
# Date-format stainer targeting column index 0 (`Date`) with num_format=5
# (presumably the number of distinct output formats to draw from — confirm).
date_format = DateFormatStainer(col_idx=[0], num_format=5)

In [10]:
# Preview of the input frame before staining.
# NOTE(review): the saved output below shows only Date/Temp, so it predates
# the extra columns added to df3 — re-run the notebook to refresh it.
df3.head(3)

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8


In [12]:
# Fresh seeded generator so the chosen date formats are reproducible. Uses
# the explicitly imported `default_rng` instead of an `np` alias that the
# notebook's import cell never binds.
rng = default_rng(42)
# Rebinds `res` to the date-format result; element 0 is the stained frame.
res = date_format.transform(df3, rng)

In [13]:
# Stained frame: the Date column now holds strings in several different
# date layouts.
# NOTE(review): the saved output shows only Date/Temp, so it was produced
# before the extra columns were added to df3.
res[0]

Unnamed: 0,Date,Temp
0,"1981,Jan,01",20.7
1,1981/02/Jan,17.9
2,19810103,18.8
3,1981-04-Jan,14.6
4,"1981,Jan,05",15.8
...,...,...
3645,1990-27-Dec,14.0
3646,1990-28-Dec,13.6
3647,"1990,Dec,29",13.5
3648,1990/30/December,15.7


In [76]:
# Date-split stainer over column indices 0, 2, 3 and 5 — the four date
# columns of df3 (Date, date_copy_1, date_copy_2, date_copy_3).
# prob=0.75 is presumably the per-column chance of being split — confirm.
date_split = DateSplitStainer([0,2,3,5], prob=0.75)

In [77]:
# Fresh seeded generator so the set of split columns is reproducible. Uses
# the explicitly imported `default_rng` instead of an `np` alias that the
# notebook's import cell never binds.
rng = default_rng(42)
# Rebinds `res` to the date-split result; elements 0 and 2 are inspected below.
res = date_split.transform(df3, rng)

In [78]:
# Stained frame: date_copy_1 and date_copy_2 were each replaced by
# _day/_month/_year columns, while Date and date_copy_3 were left intact
# even though they were also listed in the stainer's column indices.
res[0]

Unnamed: 0,Date,Temp,date_copy_1_day,date_copy_1_month,date_copy_1_year,date_copy_2_day,date_copy_2_month,date_copy_2_year,zero_col,date_copy_3,one_col
0,1981-01-01,20.7,02,January,81,31,12,1980,0,1981-01-03,1
1,1981-01-02,17.9,03,January,81,01,01,1981,0,1981-01-04,1
2,1981-01-03,18.8,04,January,81,02,01,1981,0,1981-01-05,1
3,1981-01-04,14.6,05,January,81,03,01,1981,0,1981-01-06,1
4,1981-01-05,15.8,06,January,81,04,01,1981,0,1981-01-07,1
...,...,...,...,...,...,...,...,...,...,...,...
3645,1990-12-27,14.0,28,December,90,26,12,1990,0,1990-12-29,1
3646,1990-12-28,13.6,29,December,90,27,12,1990,0,1990-12-30,1
3647,1990-12-29,13.5,30,December,90,28,12,1990,0,1990-12-31,1
3648,1990-12-30,15.7,31,December,90,29,12,1990,0,1991-01-01,1


In [79]:
# History of the last transform as a 3-tuple: stainer name, a message naming
# the columns that were split, and a float (apparently elapsed seconds).
date_split.get_history()

('Date Split',
 'Split the following date columns: date_copy_1, date_copy_2',
 0.15059494972229004)

In [80]:
# Element 2 of the result: a 7x11 0/1 array. Rows 2 and 3 each carry three
# 1s (presumably mapping each split original column to its three output
# columns — confirm against the stainer implementation).
res[2]

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [13]:
# Demonstrate the dict -> array conversion: each key becomes a row, and the
# positions listed for that key are set to 1. Key 3 maps to an empty list and
# therefore produces an all-zero row in the 5x6 output.
Stainer.convert_mapper_dct_to_array({0:[0], 1:[1,2], 2:[3,4], 3:[], 4:[5]})

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0.],
       [0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

# BinningStainer

In [92]:
# Scratch check: 100 uniform draws in [0, 1), sorted ascending.
# NOTE(review): uses the unseeded global NumPy RNG, so the values change on
# every run; `np` is also not imported explicitly in the import cell
# (presumably it arrives via a star import — confirm).
np.sort(np.random.rand(100))

array([0.00990821, 0.02726183, 0.0477136 , 0.06415641, 0.06851776,
       0.07106917, 0.07732596, 0.07867519, 0.09253264, 0.11013822,
       0.11328192, 0.11418847, 0.12814846, 0.14502817, 0.14543668,
       0.16637548, 0.18753771, 0.18978233, 0.19532428, 0.20946734,
       0.21711606, 0.22123744, 0.22431716, 0.23352213, 0.24880601,
       0.26813267, 0.27515103, 0.30270832, 0.30618358, 0.31036434,
       0.3123504 , 0.33185948, 0.34107737, 0.36147949, 0.36507598,
       0.38422853, 0.38536794, 0.4068727 , 0.42114936, 0.42227402,
       0.43462161, 0.44126257, 0.44638249, 0.45461733, 0.45950476,
       0.46285402, 0.46792329, 0.4873443 , 0.49885345, 0.50954646,
       0.52376529, 0.53575077, 0.54306557, 0.55123501, 0.55259108,
       0.57030669, 0.58238156, 0.60031099, 0.60873287, 0.6097665 ,
       0.62429239, 0.6281018 , 0.62982452, 0.63249367, 0.63892298,
       0.6490726 , 0.68943399, 0.68976978, 0.70134065, 0.70389656,
       0.70956751, 0.72646319, 0.73764366, 0.73891416, 0.74303

In [96]:
# Two numeric columns for the binning demo: idx1 is 100 unsorted uniform
# draws, idx2 is 100 uniform draws sorted ascending.
# NOTE(review): drawn from the unseeded global NumPy RNG, so the frame (and
# the bin edges shown later) differ on every run.
df4 = pd.DataFrame({"idx1": np.random.rand(100), "idx2": np.sort(np.random.rand(100))})

In [97]:
# Preview the first three rows of the binning input.
df4.head(3)

Unnamed: 0,idx1,idx2
0,0.737674,0.004419
1,0.957328,0.018594
2,0.578615,0.031676


In [106]:
# Binning stainer over column indices 0 and 1 (idx1 and idx2), with every
# other argument left at its default.
binner = BinningStainer([0,1])

In [107]:
# Fresh seeded generator for the binning run. Uses the explicitly imported
# `default_rng` instead of an `np` alias that the notebook's import cell
# never binds.
rng = default_rng(42)
# Rebinds `res` to the binning result; element 0 is the stained frame.
res = binner.transform(df4, rng)

In [108]:
# The input frame still holds the raw floats after transform() was called.
df4.head(5)

Unnamed: 0,idx1,idx2
0,0.737674,0.004419
1,0.957328,0.018594
2,0.578615,0.031676
3,0.222338,0.032989
4,0.874617,0.039108


In [109]:
# Stained frame: each numeric value was replaced by an interval label such as
# "[0.5787, 0.7806)"; the last bin is closed on the right ("[0.7806, 0.998]").
res[0].head(5)

Unnamed: 0,idx1,idx2
0,"[0.5787, 0.7806)","[0.004419, 0.1373)"
1,"[0.7806, 0.998]","[0.004419, 0.1373)"
2,"[0.3812, 0.5787)","[0.004419, 0.1373)"
3,"[0.1886, 0.3812)","[0.004419, 0.1373)"
4,"[0.7806, 0.998]","[0.004419, 0.1373)"
