In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import Normalizer, normalize
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Seeded generator so that Restart & Run All always produces the same demo matrix.
SEED = 42
rng = np.random.default_rng(SEED)

# 4 samples x 4 features of integers drawn from [1, 20). Values are strictly positive.
X = rng.integers(1, 20, size=(4, 4))
print("raw data : \n", X)

raw data : 
 [[ 9  5 15 18]
 [14 16  3  6]
 [ 4  3  1 10]
 [18 15 14 10]]


# Standardization


## Why do we use standard scaler?
StandardScaler removes the mean and scales each feature/variable to unit variance. This operation is performed feature-wise in an independent way. StandardScaler can be influenced by outliers (if they exist in the dataset) since it involves the estimation of the empirical mean and standard deviation of each feature.

```class sklearn.preprocessing.StandardScaler(*, copy=True, with_mean=True, with_std=True)```

The standard score of a sample $x$ is calculated as $z = (x - u) / s$,
where $u$ is the mean of the training samples (or zero if `with_mean=False`) and $s$ is the standard deviation of the training samples (or one if `with_std=False`).


## Why do we use min max scaler?
Transform features by scaling each feature to a given range. This estimator scales and translates each feature individually such that it is in the given range on the training set, e.g. between zero and one.


## Why do we use maxabsscaler?
MaxAbsScaler is well suited to sparse data. It scales each feature by dividing it by that feature's maximum absolute value, so the result lies in [-1, 1]; because it does not shift or center the data, zeros stay zero. For example, if an input variable has the original values [2, -1, 0, 1], MaxAbsScaler scales them to [1, -0.5, 0, 0.5]: each value is divided by the largest absolute value, i.e. 2.


## Should I use MinMaxScaler or StandardScaler?
StandardScaler is useful for features that approximately follow a Normal distribution. It centers each feature at mean = 0 and scales it to unit variance.
MinMaxScaler may be used when the upper and lower boundaries are well known from domain knowledge. MinMaxScaler preserves the shape of the original distribution. It doesn't meaningfully change the information embedded in the original data. Note that MinMaxScaler doesn't reduce the importance of outliers.

In [3]:
# StandardScaler: each feature (column) is shifted to zero mean and scaled to unit variance.
# The result is NOT confined to a fixed interval such as [-1, +1]; values beyond +/-1 are normal.
sc = StandardScaler()

# Learn the per-column mean/std from X and apply them in one step.
X_scale = sc.fit_transform(X)
print("\nafter StandardScaler transform : \n", X_scale)


after StandardScaler transform : 
 [[-0.4276029  -0.81838794  1.07146232  1.60591014]
 [ 0.52262577  1.07682623 -0.83335958 -1.14707867]
 [-1.37783158 -1.16297233 -1.1508299  -0.22941573]
 [ 1.28280871  0.90453403  0.91272716 -0.22941573]]


In [4]:
# Statistics of the standardized array, computed over all elements (no axis is given).
for label, value in (
    ("Mean = ", X_scale.mean()),
    ("Max = ", X_scale.max()),
    ("Min = ", X_scale.min()),
    ("Std = ", X_scale.std()),
):
    print(label, value)

Mean =  -2.7755575615628914e-17
Max =  1.6059101370939322
Min =  -1.3778315806221817
Std =  1.0


In [5]:
# Map the standardized values back onto the original feature scale using the fitted scaler.
revert_standardization = sc.inverse_transform(X_scale)
print(
    "revert StandardScaler :\n",
    revert_standardization,
)

revert StandardScaler :
 [[ 9.  5. 15. 18.]
 [14. 16.  3.  6.]
 [ 4.  3.  1. 10.]
 [18. 15. 14. 10.]]


In [6]:
# MinMaxScaler: linearly rescale every feature (column) onto the interval [0, 1].
sc = MinMaxScaler(feature_range=(0, 1)).fit(X)

# Apply the per-column min/max learned above.
X_scale = sc.transform(X)
print("after MinMaxScaler transform : \n", X_scale)

after MinMaxScaler transform : 
 [[0.35714286 0.15384615 1.         1.        ]
 [0.71428571 1.         0.14285714 0.        ]
 [0.         0.         0.         0.33333333]
 [1.         0.92307692 0.92857143 0.33333333]]


In [7]:
# Statistics of the min-max scaled array, computed over all elements (no axis is given).
for label, value in (
    ("Mean = ", X_scale.mean()),
    ("Max = ", X_scale.max()),
    ("Min = ", X_scale.min()),
    ("Std = ", X_scale.std()),
):
    print(label, value)

Mean =  0.4929029304029304
Max =  1.0
Min =  0.0
Std =  0.41315230618238763


In [8]:
# Undo the min-max scaling with the fitted scaler to recover the original values.
# The variable name is kept from the StandardScaler cell; `sc` is a MinMaxScaler here,
# so the printed label names the scaler that is actually being inverted.
revert_standardization = sc.inverse_transform(X_scale)
print("revert MinMaxScaler :\n", revert_standardization)

revert StandardScaler :
 [[ 9.  5. 15. 18.]
 [14. 16.  3.  6.]
 [ 4.  3.  1. 10.]
 [18. 15. 14. 10.]]


In [9]:
# MaxAbsScaler [-1, 1]
# This scaler is meant for data that is already centered at zero or sparse data.
# It does not shift/center the data, and thus does not destroy any sparsity.
sc = MaxAbsScaler().fit(X)

# Apply the per-column scale factors learned above.
X_scale = sc.transform(X)
print("after MaxAbsScaler transform : \n", X_scale)

after MaxAbsScaler transform : 
 [[0.5        0.3125     1.         1.        ]
 [0.77777778 1.         0.2        0.33333333]
 [0.22222222 0.1875     0.06666667 0.55555556]
 [1.         0.9375     0.93333333 0.55555556]]


In [10]:
# Statistics of the max-abs scaled array, computed over all elements (no axis is given).
for label, value in (
    ("Mean = ", X_scale.mean()),
    ("Max = ", X_scale.max()),
    ("Min = ", X_scale.min()),
    ("Std = ", X_scale.std()),
):
    print(label, value)

Mean =  0.5988715277777777
Max =  1.0
Min =  0.06666666666666667
Std =  0.33742005038342393


In [11]:
# Undo the max-abs scaling with the fitted scaler to recover the original values.
# The variable name is kept from the StandardScaler cell; `sc` is a MaxAbsScaler here,
# so the printed label names the scaler that is actually being inverted.
revert_standardization = sc.inverse_transform(X_scale)
print("revert MaxAbsScaler :\n", revert_standardization)

revert StandardScaler :
 [[ 9.  5. 15. 18.]
 [14. 16.  3.  6.]
 [ 4.  3.  1. 10.]
 [18. 15. 14. 10.]]


In [12]:
# PowerTransformer: apply a power transform feature-wise to make data more Gaussian-like.
#
# Signature: PowerTransformer(method='yeo-johnson', *, standardize=True, copy=True)
#
#   yeo-johnson => works with positive and negative values
#   box-cox     => only works with strictly positive values
#
# This is often described as removing skewness in the distribution, although more
# generally it is described as stabilizing the variance of the distribution.
sc = PowerTransformer(method="yeo-johnson", standardize=True)

X_scale = sc.fit_transform(X)
print("after PowerTransformer transform : \n", X_scale)

after PowerTransformer transform : 
 [[-0.38044128 -0.70358814  1.02043694  1.42647651]
 [ 0.54874683  1.03777587 -0.65767331 -1.40173416]
 [-1.41456211 -1.25616325 -1.29052012 -0.01237117]
 [ 1.24625655  0.92197552  0.92775649 -0.01237117]]


In [13]:
# Statistics of the power-transformed array, computed over all elements (no axis is given).
for label, value in (
    ("Mean = ", X_scale.mean()),
    ("Max = ", X_scale.max()),
    ("Min = ", X_scale.min()),
    ("Std = ", X_scale.std()),
):
    print(label, value)

Mean =  8.326672684688674e-17
Max =  1.4264765124294436
Min =  -1.4145621072079244
Std =  0.9999999999999999


# Normalization

In [14]:
# Normalizer: rescales each SAMPLE (row) independently to unit norm (L2 by default).
# It is not a per-feature range scaler. The values fall in [0, 1] here only because X is
# non-negative; with negative inputs the result lies in [-1, 1].
nc = Normalizer()

# Normalizer is stateless, so fit does not learn anything from X; fit_transform simply normalizes each row.
X_scale = nc.fit_transform(X)
print("after Normalizer transform : \n", X_scale)

after Normalizer transform : 
 [[0.3516591  0.19536617 0.5860985  0.7033182 ]
 [0.62798583 0.71769809 0.13456839 0.26913678]
 [0.35634832 0.26726124 0.08908708 0.89087081]
 [0.61921882 0.51601569 0.48161464 0.34401046]]


In [15]:
# Statistics of the row-normalized array, computed over all elements (no axis is given).
for label, value in (
    ("Mean = ", X_scale.mean()),
    ("Max = ", X_scale.max()),
    ("Min = ", X_scale.min()),
    ("Std = ", X_scale.std()),
):
    print(label, value)

Mean =  0.44689113195705926
Max =  0.8908708063747479
Min =  0.0890870806374748
Std =  0.22425056561386467


In [16]:
# Function form of the same operation: scale each row (axis=1) of X to unit L2 norm
# without creating an estimator object.
X_scale = normalize(X, norm="l2", axis=1)
print(
    "after normalize transform : \n",
    X_scale,
)

after normalize transform : 
 [[0.3516591  0.19536617 0.5860985  0.7033182 ]
 [0.62798583 0.71769809 0.13456839 0.26913678]
 [0.35634832 0.26726124 0.08908708 0.89087081]
 [0.61921882 0.51601569 0.48161464 0.34401046]]


In [17]:
# Statistics of the array returned by normalize(), computed over all elements (no axis is given).
for label, value in (
    ("Mean = ", X_scale.mean()),
    ("Max = ", X_scale.max()),
    ("Min = ", X_scale.min()),
    ("Std = ", X_scale.std()),
):
    print(label, value)

Mean =  0.44689113195705926
Max =  0.8908708063747479
Min =  0.0890870806374748
Std =  0.22425056561386467


# Categorical Features

In [2]:
# Toy categorical samples, one row per sample with three string-valued columns
# (the values look like gender, region and browser).
x_cat = [
    ["male", "US", "Safari"],
    ["female", "Europe", "Firefox"],
    ["female", "Asia", "Chrome"],
]

In [3]:
# One-hot encode the categorical columns at positions 0 and 1;
# every other column is passed through unchanged (remainder='passthrough').
one_hot_step = ("encoder", OneHotEncoder(), [0, 1])
ct = ColumnTransformer(transformers=[one_hot_step], remainder="passthrough")

X_encoded = ct.fit_transform(x_cat)
print("after OneHotEncoder transform : \n", X_encoded)

after OneHotEncoder transform : 
 [[0.0 1.0 0.0 0.0 1.0 'Safari']
 [1.0 0.0 0.0 1.0 0.0 'Firefox']
 [1.0 0.0 1.0 0.0 0.0 'Chrome']]


In [20]:
# Ordinal-encode the categorical columns at positions 0 and 1 (one numeric code per category);
# every other column is passed through unchanged (remainder='passthrough').
ordinal_step = ("encoder", OrdinalEncoder(), [0, 1])
ct = ColumnTransformer(transformers=[ordinal_step], remainder="passthrough")

X_encoded = ct.fit_transform(x_cat)
print("after OrdinalEncoder transform : \n", X_encoded)

after OrdinalEncoder transform : 
 [[1.0 2.0 'Safari']
 [0.0 1.0 'Firefox']
 [0.0 0.0 'Chrome']]


# Custom transformers

In [21]:
# Wrap a plain NumPy function (base-2 logarithm) so it can be used through the
# scikit-learn transformer interface.
transformer = FunctionTransformer(func=np.log2)

# Last expression of the cell: the element-wise log2 of X is shown as the cell output.
transformer.fit_transform(X)

array([[3.169925  , 2.32192809, 3.9068906 , 4.169925  ],
       [3.80735492, 4.        , 1.5849625 , 2.5849625 ],
       [2.        , 1.5849625 , 0.        , 3.32192809],
       [4.169925  , 3.9068906 , 3.80735492, 3.32192809]])