In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from numpy.random import default_rng
from time import time
from DirtyDF import *
from stainer import *
from history import *

In [2]:
# Toy frame for the shuffle demo: six animals with sequential integer ids.
df = pd.DataFrame(
    {
        'id': [0, 1, 2, 3, 4, 5],
        'class': ['Cat', 'Dog', 'Rabbit', 'Cat', 'Cat', 'Dog'],
    }
)

# Shuffle

In [26]:
# Single ShuffleStainer instance; the DirtyDF examples in this section add it twice.
stain = ShuffleStainer()

In [27]:
# Queue the same shuffle stainer twice on a seeded DirtyDF, run everything
# in one go, then display the resulting (stained) frame.
res = (
    DirtyDF(df, seed=214)
    .add_stainers(stain)
    .add_stainers(stain)
    .run_all_stainers()
)
res.get_df()

Unnamed: 0,id,class
0,2,Rabbit
1,5,Dog
2,4,Cat
3,1,Dog
4,0,Cat
5,3,Cat


In [21]:
# Sanity check: the source frame `df` keeps its original row order after the
# stainers ran (the shuffled rows only appear in `res.get_df()`).
df

Unnamed: 0,id,class
0,0,Cat
1,1,Dog
2,2,Rabbit
3,3,Cat
4,4,Cat
5,5,Dog


In [6]:
# Row map stored for history entry 0. In the saved output this is a 6x6
# permutation matrix that matches the first of the two shuffles.
res.get_map_from_history(0)

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [7]:
# Row map of the most recent step; in the saved output it corresponds to the
# second shuffle (map(0) composed with this one equals get_mapping(axis=0)).
res.get_previous_map()

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [8]:
# Overall row mapping across both shuffles: in the saved output, entry (i, j)
# is 1 when original row i ends up at row j of the final frame.
res.get_mapping(axis = 0)

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [9]:
# Overall column mapping: the saved output is a 2x2 identity, i.e. shuffling
# rows left the two columns in place.
res.get_mapping(axis = 1)

array([[1., 0.],
       [0., 1.]])

In [10]:
# Same two shuffles as above, but executed one stainer at a time with
# run_stainer(); the saved output matches the run_all_stainers() result.
new_ddf = (
    DirtyDF(df, seed=214)
    .add_stainers(stain)
    .run_stainer()
)
(
    new_ddf
    .add_stainers(stain)
    .run_stainer()
    .get_df()
)

Unnamed: 0,id,class
0,2,Rabbit
1,5,Dog
2,4,Cat
3,1,Dog
4,0,Cat
5,3,Cat


In [11]:
# Print one entry per executed stainer: name, description and time taken.
# NOTE(review): in the saved output the second entry reports
# "Time not updated" rather than a duration -- confirm whether that is expected.
res.print_history()

1. Shuffle 
 Order of rows randomized 
 Time taken: 0.0009691715240478516 

2. Shuffle 
 Order of rows randomized 
 Time taken: Time not updated. Use update_history to update time 



# Inflection

In [82]:
# Seeded generator for the inflection demo. Uses the `default_rng` imported at
# the top of the notebook: `np` itself is never imported here, so
# `np.random.default_rng` only works if a wildcard import happens to leak `np`.
rng = default_rng(42)

# 100 rows: sequential ids plus a class drawn uniformly from three animals.
df2 = pd.DataFrame(
    {
        'id': range(100),
        'class': rng.choice(['Cat', 'Dog', 'Rabbit'], 100),
    }
)

In [83]:
# Baseline class distribution before any inflection is applied.
df2['class'].value_counts()

Rabbit    40
Dog       33
Cat       27
Name: class, dtype: int64

In [131]:
# Inflection stainer for column 1 ('class'). The original cell was a syntax
# error: the list inside `ignore_cats` was missing its closing bracket.
# NOTE(review): 'Dogs' is not one of the original categories
# ('Cat', 'Dog', 'Rabbit'), and the saved output still shows 'Dog' being
# inflected -- confirm whether 'Dog' was the intended value to ignore.
inflect = InflectionStainer(
    col_idx=[1],
    num_format=2,
    formats=['original', 'lowercase', 'pluralize'],
    ignore_cats={1: ['Dogs']},
)

In [132]:
# Apply the stainer directly (no DirtyDF wrapper).
# NOTE: this rebinds `res`; from here on it holds transform()'s return value,
# which the following cells index as res[0], not the DirtyDF from the shuffle section.
res = inflect.transform(df2, rng)

In [133]:
# res[0] is the stained frame; in the saved output each original class now
# appears in two inflected forms (e.g. 'rabbit' / 'Rabbits').
res[0]['class'].value_counts()

rabbit     22
Rabbits    18
dog        18
cat        15
Dogs       15
Cats       12
Name: class, dtype: int64

# DateFormat & DateSplit

In [44]:
import itertools
import datetime

In [49]:
# Daily minimum temperatures, with 'Date' parsed as datetimes. Three shifted
# copies of the date column are interleaved with two constant columns so the
# date stainers below have several date columns at non-adjacent positions.
df3 = pd.read_csv(
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-min-temperatures.csv",
    parse_dates=['Date'],
).assign(
    date_copy_1=lambda frame: frame['Date'] + datetime.timedelta(days=1),
    date_copy_2=lambda frame: frame['Date'] + datetime.timedelta(days=-1),
    zero_col=0,
    date_copy_3=lambda frame: frame['Date'] + datetime.timedelta(days=2),
    one_col=1,
)

In [50]:
# Expect 7 columns: Date, Temp and the five columns added above.
df3.shape

(3650, 7)

In [51]:
# Peek at the first rows to confirm the column order (date columns at positions 0, 2, 3, 5).
df3.head(3)

Unnamed: 0,Date,Temp,date_copy_1,date_copy_2,zero_col,date_copy_3,one_col
0,1981-01-01,20.7,1981-01-02,1980-12-31,0,1981-01-03,1
1,1981-01-02,17.9,1981-01-03,1981-01-01,0,1981-01-04,1
2,1981-01-03,18.8,1981-01-04,1981-01-02,0,1981-01-05,1


In [9]:
# Date-format stainer targeting column 0 ('Date').
# NOTE(review): num_format=5 presumably caps the number of distinct date
# formats mixed into the column -- confirm against DateFormatStainer.
date_format = DateFormatStainer(col_idx=[0], num_format=5)

In [10]:
# Input frame before date formatting.
# NOTE(review): the saved output here lists only Date and Temp, so it predates
# the extra columns added to df3 above -- re-run to refresh it.
df3.head(3)

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8


In [12]:
# Fresh seeded generator so this demo does not depend on how much randomness
# earlier cells consumed. Uses the `default_rng` imported at the top of the
# notebook; `np` itself is never imported here.
rng = default_rng(42)
res = date_format.transform(df3, rng)

In [13]:
# Stained frame: in the saved output the 'Date' column now holds strings in a mix of formats.
res[0]

Unnamed: 0,Date,Temp
0,"1981,Jan,01",20.7
1,1981/02/Jan,17.9
2,19810103,18.8
3,1981-04-Jan,14.6
4,"1981,Jan,05",15.8
...,...,...
3645,1990-27-Dec,14.0
3646,1990-28-Dec,13.6
3647,"1990,Dec,29",13.5
3648,1990/30/December,15.7


In [76]:
# Date-split stainer over the four date columns of df3 (positions 0, 2, 3, 5).
# NOTE(review): prob=0.75 is presumably the per-column chance of being split --
# the history shown below reports only two of the four columns split; confirm.
date_split = DateSplitStainer([0,2,3,5], prob=0.75)

In [77]:
# Fresh seeded generator so the split choice is reproducible. Uses the
# `default_rng` imported at the top of the notebook; `np` itself is never
# imported here.
rng = default_rng(42)
res = date_split.transform(df3, rng)

In [78]:
# Stained frame: in the saved output date_copy_1 and date_copy_2 are each
# replaced by _day/_month/_year columns; the other columns are unchanged.
res[0]

Unnamed: 0,Date,Temp,date_copy_1_day,date_copy_1_month,date_copy_1_year,date_copy_2_day,date_copy_2_month,date_copy_2_year,zero_col,date_copy_3,one_col
0,1981-01-01,20.7,02,January,81,31,12,1980,0,1981-01-03,1
1,1981-01-02,17.9,03,January,81,01,01,1981,0,1981-01-04,1
2,1981-01-03,18.8,04,January,81,02,01,1981,0,1981-01-05,1
3,1981-01-04,14.6,05,January,81,03,01,1981,0,1981-01-06,1
4,1981-01-05,15.8,06,January,81,04,01,1981,0,1981-01-07,1
...,...,...,...,...,...,...,...,...,...,...,...
3645,1990-12-27,14.0,28,December,90,26,12,1990,0,1990-12-29,1
3646,1990-12-28,13.6,29,December,90,27,12,1990,0,1990-12-30,1
3647,1990-12-29,13.5,30,December,90,28,12,1990,0,1990-12-31,1
3648,1990-12-30,15.7,31,December,90,29,12,1990,0,1991-01-01,1


In [79]:
# History of the last transform; the saved output is a
# (stainer name, description, elapsed time) tuple.
date_split.get_history()

('Date Split',
 'Split the following date columns: date_copy_1, date_copy_2',
 0.15059494972229004)

In [80]:
# Column map from the 7 original columns (rows) to the 11 output columns; in
# the saved output each split date column maps to three consecutive outputs.
res[2]

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [13]:
# Helper demo: dict of {source index: [target indices]} -> 0/1 matrix.
# Per the saved output, a 1 is placed at (source, target) and an empty list
# (key 3) yields an all-zero row.
Stainer.convert_mapper_dct_to_array({0:[0], 1:[1,2], 2:[3,4], 3:[], 4:[5]})

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0.],
       [0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

# BinningStainer

In [38]:
# Two uniform(0, 1) columns for the binning demo; idx2 is sorted ascending.
# A seeded generator (via the `default_rng` imported at the top) replaces the
# global `np.random` calls: `np` is never imported in this notebook, and the
# unseeded draws made the frame differ on every run. Saved outputs produced
# by the old unseeded cell will differ until the notebook is re-run.
data_rng = default_rng(0)
idx1_values = data_rng.random(100)
idx2_values = data_rng.random(100)
idx2_values.sort()  # in-place ascending sort of the ndarray
df4 = pd.DataFrame({"idx1": idx1_values, "idx2": idx2_values})

In [39]:
# Blank out a reproducible 15% of the rows in each column. The sampling seed
# is 0 for the first column and 10 for the second, so the columns lose
# different rows. `float("nan")` replaces `np.nan` because `np` is never
# imported in this notebook.
for col_position, col in enumerate(df4.columns):
    rows_to_blank = df4.sample(frac=0.15, random_state=10 * col_position).index
    df4.loc[rows_to_blank, col] = float("nan")

In [40]:
# Inspect the first rows, including some of the injected missing values.
df4.head(15)

Unnamed: 0,idx1,idx2
0,0.164806,0.023758
1,0.114614,
2,,
3,0.852072,
4,0.822083,0.062122
5,0.780226,0.069423
6,0.152427,0.083772
7,,0.088495
8,0.343147,0.100023
9,0.266744,0.115947


In [41]:
# Binning stainer applied to columns 0 and 1 (idx1 and idx2).
binner = BinningStainer([0,1])

In [42]:
# Fresh seeded generator for the binning run. Uses the `default_rng` imported
# at the top of the notebook; `np` itself is never imported here.
rng = default_rng(42)
res = binner.transform(df4, rng)

In [45]:
# Stained frame: values are replaced by interval labels, missing values stay missing.
# NOTE(review): the saved output shows an idx2 bin "[0.9918, 0.02376)" whose
# lower bound exceeds its upper bound -- worth checking BinningStainer's edge labels.
res[0].head(15)

Unnamed: 0,idx1,idx2
0,"[0.004129, 0.2061)","[0.9918, 0.02376)"
1,"[0.004129, 0.2061)",
2,,
3,"[0.8003, 0.968]",
4,"[0.8003, 0.968]","[0.02376, 0.2647)"
5,"[0.6083, 0.8003)","[0.02376, 0.2647)"
6,"[0.004129, 0.2061)","[0.02376, 0.2647)"
7,,"[0.02376, 0.2647)"
8,"[0.2061, 0.413)","[0.02376, 0.2647)"
9,"[0.2061, 0.413)","[0.02376, 0.2647)"
