In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from numpy.random import default_rng
from time import time
from DirtyDF import *
from stainer import *
from history import *

In [23]:
# Toy input frame: six rows with a running integer id and an animal label.
df = pd.DataFrame(
    {
        'id': [0, 1, 2, 3, 4, 5],
        'class': ['Cat', 'Dog', 'Rabbit', 'Cat', 'Cat', 'Dog'],
    }
)

# Shuffle

In [26]:
# One stainer instance, constructed with defaults; the cell below registers this
# same object twice on a single DirtyDF.
stain = ShuffleStainer()

In [27]:
# Wrap `df` with a fixed seed, register the shuffle stainer twice, run everything
# that was registered, then display the resulting frame.
res = (
    DirtyDF(df, seed=214)
    .add_stainers(stain)
    .add_stainers(stain)
    .run_all_stainers()
)
res.get_df()

Unnamed: 0,id,class
0,2,Rabbit
1,5,Dog
2,4,Cat
3,1,Dog
4,0,Cat
5,3,Cat


In [21]:
# The input frame was not modified by the stainers above: its rows are still
# in the original id order 0-5.
df

Unnamed: 0,id,class
0,0,Cat
1,1,Dog
2,2,Rabbit
3,3,Cat
4,4,Cat
5,5,Dog


In [6]:
# Row-mapping matrix stored for history entry 0. The output is a 6x6 0/1 array
# with exactly one 1 per row and per column (a permutation matrix).
# NOTE(review): presumably entry 0 is the first shuffle — confirm against DirtyDF.
res.get_map_from_history(0)

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [7]:
# Mapping of the most recent step; the output is again a 6x6 permutation matrix.
# NOTE(review): presumably this is the second shuffle — confirm against DirtyDF.
res.get_previous_map()

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [8]:
# Overall row mapping. In the outputs shown here it equals
# get_map_from_history(0) @ get_previous_map(), and entry [i, j] == 1 matches
# original row i ending up at position j of res.get_df().
res.get_mapping(axis = 0)

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [9]:
# Overall column mapping: the output is the 2x2 identity, i.e. the two columns
# kept their positions (only rows were shuffled).
res.get_mapping(axis = 1)

array([[1., 0.],
       [0., 1.]])

In [10]:
# Step-by-step variant: register and run one stainer at a time with the same seed.
# The displayed frame matches the run_all_stainers() result shown earlier.
new_ddf = (
    DirtyDF(df, seed=214)
    .add_stainers(stain)
    .run_stainer()
)
(
    new_ddf
    .add_stainers(stain)
    .run_stainer()
    .get_df()
)

Unnamed: 0,id,class
0,2,Rabbit
1,5,Dog
2,4,Cat
3,1,Dog
4,0,Cat
5,3,Cat


In [11]:
# Print the log of applied stainers: per step a name, a description and the
# time taken (or a note that the time has not been updated yet).
res.print_history()

1. Shuffle 
 Order of rows randomized 
 Time taken: 0.0009691715240478516 

2. Shuffle 
 Order of rows randomized 
 Time taken: Time not updated. Use update_history to update time 



# Inflection

In [82]:
# Seeded generator so the sampled labels are reproducible. `default_rng` is the
# name imported in the first cell; no explicit `import numpy as np` is visible in
# this notebook, so `np.` would only resolve if a star-import happened to leak it.
rng = default_rng(42)

# 100 rows: a running id plus a label drawn uniformly from three animal classes.
df2 = pd.DataFrame(zip(range(100), rng.choice(['Cat','Dog','Rabbit'], 100)),
                  columns=('id', 'class'))

In [83]:
# Class distribution of the generated labels before any staining.
df2['class'].value_counts()

Rabbit    40
Dog       33
Cat       27
Name: class, dtype: int64

In [131]:
# Inflection stainer targeting column index 1 (the 'class' column of df2).
# NOTE(review): num_format presumably caps how many of the listed formats are
# used per category — confirm against the InflectionStainer docs.
inflect = InflectionStainer(
    col_idx=[1],
    num_format=2,
    formats=['original', 'lowercase', 'pluralize'],
    # The list's closing ']' was missing, which made this cell a SyntaxError.
    ignore_cats={1: ['Dogs']},
)

In [132]:
# Call the stainer's transform directly on the frame with the seeded generator.
# The result is indexable; element 0 is the stained DataFrame (used below).
# NOTE(review): this rebinds `res`, which earlier held the DirtyDF that cells
# such as res.get_mapping(...) rely on — re-running those afterwards will not work.
res = inflect.transform(df2, rng)

In [133]:
# Distribution after staining: each original label now appears in two variants
# (e.g. 'rabbit' and 'Rabbits'), as the counts below show.
res[0]['class'].value_counts()

rabbit     22
Rabbits    18
dog        18
cat        15
Dogs       15
Cats       12
Name: class, dtype: int64

# DateFormat & DateSplit

In [6]:
import itertools

In [7]:
# Daily minimum temperatures dataset, fetched over HTTP (needs network access);
# the 'Date' column is parsed into datetimes at load time.
df3 = pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-min-temperatures.csv", parse_dates=['Date'])

In [8]:
# Sanity-check the size of the loaded frame as (rows, columns).
df3.shape

(3650, 2)

In [9]:
# Date-format stainer targeting column index 0 (the 'Date' column of df3).
# NOTE(review): num_format=5 presumably is the number of distinct date formats
# to mix — confirm against the DateFormatStainer docs.
date_format = DateFormatStainer(col_idx=[0], num_format=5)

In [10]:
# Peek at the first three rows before staining.
df3.head(3)

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8


In [12]:
# Fresh seeded generator so the stained output is reproducible. Uses the
# `default_rng` name imported in the first cell; no explicit `import numpy as np`
# is visible in this notebook, so `np.` is not guaranteed to be defined.
rng = default_rng(42)
# The result is indexable; element 0 (shown in the next cell) is the stained frame.
res = date_format.transform(df3, rng)

In [13]:
# Stained frame: 'Date' is now rendered as strings in a mix of formats.
res[0]

Unnamed: 0,Date,Temp
0,"1981,Jan,01",20.7
1,1981/02/Jan,17.9
2,19810103,18.8
3,1981-04-Jan,14.6
4,"1981,Jan,05",15.8
...,...,...
3645,1990-27-Dec,14.0
3646,1990-28-Dec,13.6
3647,"1990,Dec,29",13.5
3648,1990/30/December,15.7


In [14]:
# Date-split stainer. NOTE(review): the positional [0] is presumably col_idx,
# i.e. the 'Date' column of df3 — confirm against the DateSplitStainer signature.
date_split = DateSplitStainer([0])

In [16]:
# Fresh seeded generator so the stained output is reproducible. Uses the
# `default_rng` name imported in the first cell; no explicit `import numpy as np`
# is visible in this notebook, so `np.` is not guaranteed to be defined.
rng = default_rng(42)
# The result is indexable; element 0 (shown in the next cell) is the stained frame.
res = date_split.transform(df3, rng)

In [17]:
# Stained frame: 'Date' has been replaced by Date_day / Date_month / Date_year,
# with month and year written in varying styles.
res[0]

Unnamed: 0,Temp,Date_day,Date_month,Date_year
0,20.7,01,January,1981
1,17.9,02,January,81
2,18.8,03,January,81
3,14.6,04,Jan,1981
4,15.8,05,01,81
...,...,...,...,...
3645,14.0,27,December,1990
3646,13.6,28,Dec,1990
3647,13.5,29,Dec,90
3648,15.7,30,12,90
