<a href="https://colab.research.google.com/github/ibribr/ML/blob/master/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Numerical arrays and plotting.
import numpy as np
import matplotlib.pyplot as plt
# Plot styling: apply the 'ggplot' style sheet, then switch on matplotlib's xkcd
# sketch mode. The order matters because both calls modify the active plot settings.
# NOTE(review): no cell visible in this notebook draws a figure, so these plotting
# settings may be unused here -- confirm before removing them.
plt.style.use('ggplot')
plt.xkcd()
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# scikit-learn preprocessing helpers used by the cells below.
from sklearn.preprocessing import scale, normalize, MinMaxScaler, Binarizer
from sklearn.impute import SimpleImputer  

In [15]:
# Standardization: rescale every feature (column) to zero mean and unit variance.
# `x` is the small demo matrix reused by the following cells.
x = np.array([
    [1, -2, 2],
    [3, 0, 0],
    [0, 1, -1],
])
print(x)

# axis=0 is scale()'s default; it is spelled out so the column-wise direction is explicit.
x_scaled = scale(x, axis=0)
print(x_scaled)

[[ 1 -2  2]
 [ 3  0  0]
 [ 0  1 -1]]
[[-0.26726124 -1.33630621  1.33630621]
 [ 1.33630621  0.26726124 -0.26726124]
 [-1.06904497  1.06904497 -1.06904497]]


In [16]:
# Normalization: rescale each sample (row) so that its norm equals 1.
#   'l1' -> the absolute values of a row sum to 1 (Manhattan norm)
#   'l2' -> a row has unit Euclidean length
# x_norm is rebound on every pass, so it ends up holding the 'l2' result.
for norm_kind in ('l1', 'l2'):
    x_norm = normalize(x, norm=norm_kind)
    print(x_norm)

[[ 0.2 -0.4  0.4]
 [ 1.   0.   0. ]
 [ 0.   0.5 -0.5]]
[[ 0.33333333 -0.66666667  0.66666667]
 [ 1.          0.          0.        ]
 [ 0.          0.70710678 -0.70710678]]


In [19]:
# Min-max feature scaling: map each feature (column) linearly onto a target range,
# so the column minimum lands on the lower bound and the maximum on the upper bound.

# Target range [0, 1]: learn the per-column min/max first, then apply the mapping.
min_max_scalar = MinMaxScaler(feature_range=(0, 1)).fit(x)
x_min_max = min_max_scalar.transform(x)
print(x_min_max)

# Target range [-1, 1]: both names are rebound, so they end up holding this variant.
min_max_scalar = MinMaxScaler(feature_range=(-1, 1)).fit(x)
x_min_max = min_max_scalar.transform(x)
print(x_min_max)

[[0.33333333 0.         1.        ]
 [1.         0.66666667 0.33333333]
 [0.         1.         0.        ]]
[[-0.33333333 -1.          1.        ]
 [ 1.          0.33333333 -0.33333333]
 [-1.          1.         -1.        ]]


In [20]:
# binarizing features: turn every entry of x into 0 or 1 by comparing it with a threshold
binarizer = Binarizer(threshold=0.5)  # values > 0.5 will be converted into 1s, rest are 0s.
# transform() is called directly, without a prior fit(); the recorded output below
# shows that this works for Binarizer.
x_bin = binarizer.transform(x)
print(x_bin)

[[1 0 1]
 [1 0 0]
 [0 1 0]]


In [26]:
# Handling missing values: fill NaN entries with a per-column statistic.
# NOTE: this rebinds `x` (until now the 3x3 integer demo matrix), so the cells above
# must be re-run before they are executed again after this one.
x = np.array([[np.nan, 0, 3],
              [2, 9, -8],
              [1, np.nan, 1],
              [5, 2, 4],
              [7, 6, -3]])

# SimpleImputer replaces every NaN with a statistic computed from the observed values
# of the same column (it has no axis option; imputation is always column-wise):
#   'mean'          -> column mean
#   'median'        -> column median
#   'most_frequent' -> column mode (the smallest value when several are tied)
# `imp` and `x_imp` are rebound on each pass, so after the loop they hold the
# 'most_frequent' imputer and its result.
for strategy in ('mean', 'median', 'most_frequent'):
    imp = SimpleImputer(missing_values=np.nan, strategy=strategy)
    x_imp = imp.fit_transform(x)
    print(x_imp)



[[ 3.75  0.    3.  ]
 [ 2.    9.   -8.  ]
 [ 1.    4.25  1.  ]
 [ 5.    2.    4.  ]
 [ 7.    6.   -3.  ]]
[[ 3.5  0.   3. ]
 [ 2.   9.  -8. ]
 [ 1.   4.   1. ]
 [ 5.   2.   4. ]
 [ 7.   6.  -3. ]]
[[ 1.  0.  3.]
 [ 2.  9. -8.]
 [ 1.  0.  1.]
 [ 5.  2.  4.]
 [ 7.  6. -3.]]
