# Preprocessing

In [1]:
import os, sys
import itertools
import random
import numpy as np
import pandas as pd

In [7]:
class Example(object):
    """Context manager that reloads the global ``dataset`` for one example.

    Constructing an instance re-reads the CSV into the module-level name
    ``dataset``, so every example starts from the same data.  When
    ``inspecting`` is true, the frame is printed on entering and again on
    leaving the ``with`` block.
    """

    def __init__(self, inspecting = True, path = 'datasets/processing_examples.csv'):
        """ Reset dataset for each example.

        Args:
            inspecting: print ``dataset`` before and after the ``with`` body.
            path: CSV file to load; defaults to the notebook's example data.
        """
        global dataset
        dataset = pd.read_csv(path)
        self.inspecting = inspecting
        self.path = path

    def __enter__(self):
        """Print the freshly loaded dataset (if inspecting) and return self."""
        if self.inspecting:
            print('====== before ======')
            print(dataset)
        # Returning self makes `with Example() as ex:` bind the instance.
        return self

    def __exit__(self, type, value, traceback):
        """Print the dataset as the example left it (if inspecting).

        Returns False so an exception raised inside the ``with`` body
        still propagates to the caller.
        """
        if self.inspecting:
            print('====== after ======')
            print(dataset)
        return False

This shows an example of zeroing out all elements in a dataset. 

In [8]:
with Example():
    # Assign the scalar 0 to every row/column position of the freshly loaded frame.
    dataset.iloc[:, :] = 0

      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
3  0.550071    8     No  2017-10-10           B
4  0.382113    9     No  2017-10-11           C
5  0.921326    1     No  2017-10-03           B
6  0.691557    9    Yes  2017-10-11           C
7  0.526204    1     No  2017-10-03           B
   float  int  yes/no  date  categorical
0      0    0       0     0            0
1      0    0       0     0            0
2      0    0       0     0            0
3      0    0       0     0            0
4      0    0       0     0            0
5      0    0       0     0            0
6      0    0       0     0            0
7      0    0       0     0            0


## Delete a column

In [12]:
with Example():
    # Without inplace=True, drop() hands back a new frame lacking the column;
    # `dataset` itself is left as it was, which the "after" printout shows.
    ret = dataset.drop("float", axis = 1)
    print("\n \n ======returns=======")
    print(ret)

      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
3  0.550071    8     No  2017-10-10           B
4  0.382113    9     No  2017-10-11           C
5  0.921326    1     No  2017-10-03           B
6  0.691557    9    Yes  2017-10-11           C
7  0.526204    1     No  2017-10-03           B

 
   int yes/no        date categorical
0    4     No  2017-10-06           E
1    5    Yes  2017-10-07           F
2    6    Yes  2017-10-08           G
3    8     No  2017-10-10           B
4    9     No  2017-10-11           C
5    1     No  2017-10-03           B
6    9    Yes  2017-10-11           C
7    1     No  2017-10-03           B
      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
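3  0.550071    8     No  2017-10-10           B
4  0.382113    9     No  2017-10-11           C
5  0.921326    1     No  2017-10-03           B
6  0.691557    9    Yes  2017-10-11           C
7  0.526204    1     No  2017-10-03           B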

In [14]:
with Example():
    # With inplace=True the column is removed from `dataset` itself and
    # drop() returns None, so `ret` prints as None.
    ret = dataset.drop("float", inplace = True, axis = 1)
    print("\n \n =========returns===========")
    print(ret)

      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
3  0.550071    8     No  2017-10-10           B
4  0.382113    9     No  2017-10-11           C
5  0.921326    1     No  2017-10-03           B
6  0.691557    9    Yes  2017-10-11           C
7  0.526204    1     No  2017-10-03           B

 
None
   int yes/no        date categorical
0    4     No  2017-10-06           E
1    5    Yes  2017-10-07           F
2    6    Yes  2017-10-08           G
3    8     No  2017-10-10           B
4    9     No  2017-10-11           C
5    1     No  2017-10-03           B
6    9    Yes  2017-10-11           C
7    1     No  2017-10-03           B


### Recommended way:

In [15]:
with Example():
    # `del` removes the column from `dataset` directly; there is no return value to handle.
    del dataset["float"]

      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
3  0.550071    8     No  2017-10-10           B
4  0.382113    9     No  2017-10-11           C
5  0.921326    1     No  2017-10-03           B
6  0.691557    9    Yes  2017-10-11           C
7  0.526204    1     No  2017-10-03           B
   int yes/no        date categorical
0    4     No  2017-10-06           E
1    5    Yes  2017-10-07           F
2    6    Yes  2017-10-08           G
3    8     No  2017-10-10           B
4    9     No  2017-10-11           C
5    1     No  2017-10-03           B
6    9    Yes  2017-10-11           C
7    1     No  2017-10-03           B


## Convert column to binary

In [16]:
with Example():
    # Encode each answer as its position in ["Yes", "No"]: "Yes" -> 0, "No" -> 1.
    dataset["yes/no"] = dataset["yes/no"].apply(lambda answer: ["Yes", "No"].index(answer))

      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
3  0.550071    8     No  2017-10-10           B
4  0.382113    9     No  2017-10-11           C
5  0.921326    1     No  2017-10-03           B
6  0.691557    9    Yes  2017-10-11           C
7  0.526204    1     No  2017-10-03           B
      float  int  yes/no        date categorical
0  0.283405    4       1  2017-10-06           E
1  0.034334    5       0  2017-10-07           F
2  0.773453    6       0  2017-10-08           G
3  0.550071    8       1  2017-10-10           B
4  0.382113    9       1  2017-10-11           C
5  0.921326    1       1  2017-10-03           B
6  0.691557    9       0  2017-10-11           C
7  0.526204    1       1  2017-10-03           B


In [17]:
with Example():
    # Built-in map() looks up each value's position in ["Yes", "No"]
    # ("Yes" -> 0, "No" -> 1); the resulting list replaces the column.
    dataset["yes/no"] = list(map(["Yes", "No"].index, dataset["yes/no"]))

      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
3  0.550071    8     No  2017-10-10           B
4  0.382113    9     No  2017-10-11           C
5  0.921326    1     No  2017-10-03           B
6  0.691557    9    Yes  2017-10-11           C
7  0.526204    1     No  2017-10-03           B
      float  int  yes/no        date categorical
0  0.283405    4       1  2017-10-06           E
1  0.034334    5       0  2017-10-07           F
2  0.773453    6       0  2017-10-08           G
3  0.550071    8       1  2017-10-10           B
4  0.382113    9       1  2017-10-11           C
5  0.921326    1       1  2017-10-03           B
6  0.691557    9       0  2017-10-11           C
7  0.526204    1       1  2017-10-03           B


`map()` takes a function and an iterable and applies the function to every element of that iterable. In Python 3 it returns a lazy iterator, so wrap it in `list()` to see the results.

In [19]:
# Small list used as input for the map() demonstration.
a = [0, 2, 4]
a

[0, 2, 4]

In [22]:
# Apply the lambda (add one) to each element of `a` and collect the results in a list.
list(map(lambda x: x + 1, a))

[1, 3, 5]

## Convert column into np.datetime64

In [23]:
with Example():
    # pd.to_datetime parses the whole column in one vectorized call and yields a
    # datetime64 column, instead of calling np.datetime64 once per element.
    dataset["date"] = pd.to_datetime(dataset["date"])

      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
3  0.550071    8     No  2017-10-10           B
4  0.382113    9     No  2017-10-11           C
5  0.921326    1     No  2017-10-03           B
6  0.691557    9    Yes  2017-10-11           C
7  0.526204    1     No  2017-10-03           B
      float  int yes/no       date categorical
0  0.283405    4     No 2017-10-06           E
1  0.034334    5    Yes 2017-10-07           F
2  0.773453    6    Yes 2017-10-08           G
3  0.550071    8     No 2017-10-10           B
4  0.382113    9     No 2017-10-11           C
5  0.921326    1     No 2017-10-03           B
6  0.691557    9    Yes 2017-10-11           C
7  0.526204    1     No 2017-10-03           B


In [24]:
# Inspect the type of one element of the date column. This relies on `dataset`
# still holding the converted column left behind by the preceding cell.
type(dataset["date"][0])

pandas._libs.tslib.Timestamp

In [25]:
with Example():
    # Parse the column once, vectorized, into a datetime64 column.
    dataset["date"] = pd.to_datetime(dataset["date"])
    print("=======day==========")
    # The .dt accessor extracts the day of month for the whole column at once,
    # replacing a per-element lambda.
    print(dataset["date"].dt.day)

      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
3  0.550071    8     No  2017-10-10           B
4  0.382113    9     No  2017-10-11           C
5  0.921326    1     No  2017-10-03           B
6  0.691557    9    Yes  2017-10-11           C
7  0.526204    1     No  2017-10-03           B
0     6
1     7
2     8
3    10
4    11
5     3
6    11
7     3
Name: date, dtype: int64
      float  int yes/no       date categorical
0  0.283405    4     No 2017-10-06           E
1  0.034334    5    Yes 2017-10-07           F
2  0.773453    6    Yes 2017-10-08           G
3  0.550071    8     No 2017-10-10           B
4  0.382113    9     No 2017-10-11           C
5  0.921326    1     No 2017-10-03           B
6  0.691557    9    Yes 2017-10-11           C
7  0.526204    1     No 2017-10-03           B


In [26]:
with Example():
    # Parse the column once, vectorized, into a datetime64 column.
    dataset["date"] = pd.to_datetime(dataset["date"])
    print("=======month========")
    # The .dt accessor extracts the month number for the whole column at once,
    # replacing a per-element lambda.
    print(dataset["date"].dt.month)

      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
3  0.550071    8     No  2017-10-10           B
4  0.382113    9     No  2017-10-11           C
5  0.921326    1     No  2017-10-03           B
6  0.691557    9    Yes  2017-10-11           C
7  0.526204    1     No  2017-10-03           B
0    10
1    10
2    10
3    10
4    10
5    10
6    10
7    10
Name: date, dtype: int64
      float  int yes/no       date categorical
0  0.283405    4     No 2017-10-06           E
1  0.034334    5    Yes 2017-10-07           F
2  0.773453    6    Yes 2017-10-08           G
3  0.550071    8     No 2017-10-10           B
4  0.382113    9     No 2017-10-11           C
5  0.921326    1     No 2017-10-03           B
6  0.691557    9    Yes 2017-10-11           C
7  0.526204    1     No 2017-10-03           B


## Find unique values in a column

In [27]:
# Called only for its side effect of reloading the global `dataset`; it is not
# used as a context manager here, so nothing is printed.
Example(inspecting = False)
# Distinct values of the column, returned as a NumPy array.
np.unique(dataset["categorical"])

array(['B', 'C', 'E', 'F', 'G'], dtype=object)

## Iterating through columns

In [28]:
# Reload the global `dataset`; Example is used only for that side effect here.
Example(inspecting = False)

# Select columns by listing their names explicitly and show the first rows of each.
for column_name in ["int", "categorical"]:
    print(dataset[column_name].head())

0    4
1    5
2    6
3    8
4    9
Name: int, dtype: int64
0    E
1    F
2    G
3    B
4    C
Name: categorical, dtype: object


In [29]:
# Reload the global `dataset`; Example is used only for that side effect here.
Example(inspecting = False)

# Iterating over dataset.columns visits every column name in frame order.
for column_name in dataset.columns:
    print(dataset[column_name].head())

0    0.283405
1    0.034334
2    0.773453
3    0.550071
4    0.382113
Name: float, dtype: float64
0    4
1    5
2    6
3    8
4    9
Name: int, dtype: int64
0     No
1    Yes
2    Yes
3     No
4     No
Name: yes/no, dtype: object
0    2017-10-06
1    2017-10-07
2    2017-10-08
3    2017-10-10
4    2017-10-11
Name: date, dtype: object
0    E
1    F
2    G
3    B
4    C
Name: categorical, dtype: object


In [30]:
# Reload the global `dataset` so the cell starts from the original data.
Example(inspecting = False)

# Pick columns by integer position: entries 2 and 4 of the column-name array.
for column_name in dataset.columns.values[[2, 4]]:
    print(dataset[column_name].head())

0     No
1    Yes
2    Yes
3     No
4     No
Name: yes/no, dtype: object
0    E
1    F
2    G
3    B
4    C
Name: categorical, dtype: object


In [31]:
# Reload the global `dataset` so the cell starts from the original data.
Example(inspecting = False)

# Pick columns with a boolean mask: True keeps the name at that position.
for column_name in dataset.columns.values[[False, True, False, False, True]]:
    print(dataset[column_name].head())

0    4
1    5
2    6
3    8
4    9
Name: int, dtype: int64
0    E
1    F
2    G
3    B
4    C
Name: categorical, dtype: object


## Iterating through columns with exclusions

In [35]:
# Reload the global `dataset`; Example is used only for that side effect here.
Example(inspecting = False)
exclusion = ["float", "yes/no", "date"]

# Filter the column index itself rather than taking a set difference: sets have no
# defined iteration order, so the columns could print in a different order per run.
for column_name in [name for name in dataset.columns if name not in exclusion]:
    print(dataset[column_name].head())

0    4
1    5
2    6
3    8
4    9
Name: int, dtype: int64
0    E
1    F
2    G
3    B
4    C
Name: categorical, dtype: object


In [37]:
# Reload the global `dataset`; Example is used only for that side effect here.
Example(inspecting = False)
exclusion = [0, 2, 3]

# np.delete removes the excluded positions and keeps the remaining names in their
# original order; a set difference would iterate in arbitrary order.
for column_name in np.delete(np.array(dataset.columns), exclusion):
    print(dataset[column_name].head())

0    4
1    5
2    6
3    8
4    9
Name: int, dtype: int64
0    E
1    F
2    G
3    B
4    C
Name: categorical, dtype: object


In [38]:
# Reload the global `dataset` so the cell starts from the original data.
Example(inspecting = False)

exclusion = [0, 2, 3]

# Walk the columns together with their positions and keep only the names whose
# position is not listed in `exclusion`.
for column_name in (name for position, name in enumerate(dataset.columns) if position not in exclusion):
    print(dataset[column_name].head())

0    4
1    5
2    6
3    8
4    9
Name: int, dtype: int64
0    E
1    F
2    G
3    B
4    C
Name: categorical, dtype: object


In [40]:
# Reload the global `dataset` so the cell starts from the original data.
Example(inspecting = False)

exclusion = [True, False, True, True, False]

# Invert the exclusion mask so True marks the columns to keep, then index with it.
for column_name in np.array(dataset.columns)[np.logical_not(exclusion)]:
    print(dataset[column_name].head())

0    4
1    5
2    6
3    8
4    9
Name: int, dtype: int64
0    E
1    F
2    G
3    B
4    C
Name: categorical, dtype: object


## Drop rows from dataframe

In [41]:
with Example():
    # Remove the rows labelled 3, 4 and 5 from `dataset` itself; the surviving
    # rows keep their original index labels (0, 1, 2, 6, 7 in the printout).
    dataset.drop([3, 4, 5], inplace = True)

      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
3  0.550071    8     No  2017-10-10           B
4  0.382113    9     No  2017-10-11           C
5  0.921326    1     No  2017-10-03           B
6  0.691557    9    Yes  2017-10-11           C
7  0.526204    1     No  2017-10-03           B
      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
6  0.691557    9    Yes  2017-10-11           C
7  0.526204    1     No  2017-10-03           B


In [42]:
with Example():
    # Remove the rows labelled 3, 4 and 5 from `dataset` itself.
    dataset.drop([3, 4, 5], inplace = True)
    # Renumber the remaining rows from 0; drop=True discards the old labels
    # instead of keeping them as a new column.
    dataset.reset_index(drop = True, inplace = True)

      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
3  0.550071    8     No  2017-10-10           B
4  0.382113    9     No  2017-10-11           C
5  0.921326    1     No  2017-10-03           B
6  0.691557    9    Yes  2017-10-11           C
7  0.526204    1     No  2017-10-03           B
      float  int yes/no        date categorical
0  0.283405    4     No  2017-10-06           E
1  0.034334    5    Yes  2017-10-07           F
2  0.773453    6    Yes  2017-10-08           G
3  0.691557    9    Yes  2017-10-11           C
4  0.526204    1     No  2017-10-03           B


## One hot encoding string column

In [43]:
from sklearn.preprocessing import LabelBinarizer

In [45]:
# Reload the global `dataset` so the cell starts from the original data.
Example(inspecting = False)

# Learn the set of labels from the column first, then turn each row into a
# one-hot vector with one position per distinct label.
encoder = LabelBinarizer().fit(dataset["categorical"])
onehot = encoder.transform(dataset["categorical"])
print(onehot)

[[0 0 1 0 0]
 [0 0 0 1 0]
 [0 0 0 0 1]
 [1 0 0 0 0]
 [0 1 0 0 0]
 [1 0 0 0 0]
 [0 1 0 0 0]
 [1 0 0 0 0]]
