In [3]:
# Learn the preprocessing utilities of the scikit-learn library
from sklearn import preprocessing
import numpy as np

In [4]:
# Standardization (mean removal and variance scaling).
# Small 3x3 float matrix used as the sample data in the cells below.
X = np.array([
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
])

In [5]:
# scale() standardizes each column (feature) independently. The default is axis=0,
# i.e. the mean and standard deviation are computed down the rows of every column.
X_scaled = preprocessing.scale(X)

In [6]:
# Show the standardized matrix.
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [8]:
# Column means of the scaled data (the recorded output is all zeros).
X_scaled.mean(axis=0)

array([ 0.,  0.,  0.])

In [9]:
# Column standard deviations of the scaled data (the recorded output is all ones).
X_scaled.std(axis=0)

array([ 1.,  1.,  1.])

In [10]:
# StandardScaler is the Transformer API for the same operation: fit() computes the mean and
# standard deviation on the training set so that the identical transformation can later be
# applied to the testing set.
scaler = preprocessing.StandardScaler().fit(X)

In [11]:
# Show the fitted scaler and its constructor parameters.
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [12]:
# Per-column mean learned by fit()
scaler.mean_

array([ 1.        ,  0.        ,  0.33333333])

In [13]:
# Per-column standard deviation learned by fit(); transform() divides by it
scaler.scale_

array([ 0.81649658,  0.81649658,  1.24721913])

In [14]:
# Transforming the training data reproduces the preprocessing.scale(X) result.
scaler.transform(X)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [15]:
# Apply the mean and standard deviation learned from X to a new, unseen sample.
scaler.transform([[-1.,1.,0.]])

array([[-2.44948974,  1.22474487, -0.26726124]])

In [16]:
# Scaling features to a range; this is also done column by column.
X_train = np.array([
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
])

In [18]:
# Create a MinMaxScaler with its default settings (not fitted yet).
min_max_scaler = preprocessing.MinMaxScaler()

In [19]:
# Show the scaler's parameters; the repr lists the default feature_range.
min_max_scaler

MinMaxScaler(copy=True, feature_range=(0, 1))

In [20]:
# Learn each column's minimum and maximum from the training data and rescale it in one step.
X_train_minmax = min_max_scaler.fit_transform(X_train)

In [21]:
# Every training column now spans exactly [0, 1].
X_train_minmax

array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  0.5       ,  0.33333333],
       [ 0.        ,  1.        ,  0.        ]])

In [22]:
# A single test sample; its first and third values lie outside the training range.
X_test = np.array([[-3.0, -1.0, 4.0]])

In [23]:
# Apply the scaling learned on the training data to the test data (no refitting).
X_test_minmax = min_max_scaler.transform(X_test)

In [24]:
# Test values outside the training minimum/maximum map outside [0, 1].
X_test_minmax

array([[-1.5       ,  0.        ,  1.66666667]])

In [26]:
# Have a look at the scale and shift used by the transformation
# (transformed = X * scale_ + min_).
min_max_scaler.scale_

array([ 0.5       ,  0.5       ,  0.33333333])

In [27]:
# Per-column additive shift applied after multiplying by scale_.
min_max_scaler.min_

array([ 0.        ,  0.5       ,  0.33333333])

In [28]:
# MaxAbsScaler divides each column by its maximum absolute value, so the training data
# is rescaled into the range [-1, 1].
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
X_train_maxabs

array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])

In [30]:
# Reuse the per-column maxima learned on the training data; test values larger in
# magnitude than those maxima fall outside [-1, 1].
X_test_maxabs = max_abs_scaler.transform(X_test)
X_test_maxabs

array([[-1.5, -1. ,  2. ]])

In [31]:
# Normalization: scale each sample (row) individually to unit L2 norm
X_normalized = preprocessing.normalize(X, norm='l2')
X_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [32]:
# Normalizer exposes the same row-wise normalization through the Transformer API.
# It is stateless: every row is scaled by its own norm, so fit() learns nothing from the
# training set and is only called for API consistency.
normalizer = preprocessing.Normalizer().fit(X)
normalizer

Normalizer(copy=True, norm='l2')

In [33]:
# Same result as preprocessing.normalize(X, norm='l2').
normalizer.transform(X)

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [34]:
# A new sample is normalized by its own L2 norm, independently of X.
normalizer.transform([[-1.,1.,0.]])

array([[-0.70710678,  0.70710678,  0.        ]])

In [35]:
# Binarization
# Feature binarization is the process of thresholding numerical features to get boolean values.
binarizer = preprocessing.Binarizer().fit(X) # fit does nothing here
binarizer

Binarizer(copy=True, threshold=0.0)

In [36]:
# With the default threshold of 0.0, values greater than 0 become 1 and all others become 0.
binarizer.transform(X)

array([[ 1.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])

In [37]:
# Raise the threshold: only values greater than 1.1 map to 1.
binarizer = preprocessing.Binarizer(threshold=1.1)
binarizer.transform(X)

array([[ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [38]:
# Encoding categorical features:
# each categorical feature with m possible values is turned into m binary features,
# exactly one of which is active per sample.
enc = preprocessing.OneHotEncoder()
enc.fit([
    [0, 0, 3],
    [1, 1, 0],
    [0, 2, 1],
    [1, 0, 2],
])

OneHotEncoder(categorical_features='all', dtype=<type 'float'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [40]:
# The three fitted columns have 2, 3 and 4 categories, so the encoded row has
# 2 + 3 + 4 = 9 binary entries.
enc.transform([[0,1,3]]).toarray()

array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])

In [41]:
# Imputation of missing values
from sklearn.preprocessing import Imputer

In [42]:
# Replace missing entries (encoded as NaN) with the mean of their column (axis=0).
# NOTE(review): Imputer was removed from sklearn.preprocessing in scikit-learn 0.22 in
# favour of sklearn.impute.SimpleImputer -- confirm the scikit-learn version this targets.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit([
    [1, 2],
    [np.nan, 3],
    [7, 6],
])

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [43]:
# Fill the NaNs in new data with the column means learned during fit.
X_test1 = [[np.nan,2],[6,np.nan],[7,6]]
print(imp.transform(X_test1))

[[ 4.          2.        ]
 [ 6.          3.66666667]
 [ 7.          6.        ]]


In [44]:
# Imputer also supports sparse matrices.
# Build the training data in CSC sparse format.
import scipy.sparse as sp
X_train1 = sp.csc_matrix([
    [1, 2],
    [0, 3],
    [7, 6],
])

In [45]:
# Treat 0 as the missing-value marker and fill it with the column mean.
imp = Imputer(missing_values=0, strategy='mean', axis=0)
imp.fit(X_train1)

Imputer(axis=0, copy=True, missing_values=0, strategy='mean', verbose=0)

In [46]:
# Sparse test matrix for the imputer.
# NOTE: this rebinds X_test, which earlier held the dense 1x3 sample used with the scalers;
# re-running those earlier cells after this one would pick up the sparse matrix instead.
X_test = sp.csc_matrix([
    [0, 2],
    [6, 0],
    [7, 6],
])

In [47]:
# Zeros in the test matrix are replaced by the column means learned from X_train1.
print(imp.transform(X_test))

[[ 4.          2.        ]
 [ 6.          3.66666667]
 [ 7.          6.        ]]


In [50]:
# Generating polynomial features
from sklearn.preprocessing import PolynomialFeatures
# Integers 0..5 laid out as 3 rows of 2 columns.
X_train2 = np.arange(6).reshape((3, 2))
X_train2

array([[0, 1],
       [2, 3],
       [4, 5]])

In [51]:
# Degree-2 polynomial expansion: (x1, x2) => (1, x1, x2, x1^2, x1*x2, x2^2)
poly = PolynomialFeatures(2)

In [53]:
# Fit on X_train2 and expand it into the polynomial features in one call.
poly.fit_transform(X_train2)

array([[  1.,   0.,   1.,   0.,   0.,   1.],
       [  1.,   2.,   3.,   4.,   6.,   9.],
       [  1.,   4.,   5.,  16.,  20.,  25.]])

In [54]:
# Integers 0..8 laid out as 3 rows of 3 columns.
X_train3 = np.arange(9).reshape((3, 3))
X_train3

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [55]:
# With interaction_only=True only products of distinct features are generated (no powers):
# (x1, x2, x3) => (1, x1, x2, x3, x1*x2, x1*x3, x2*x3, x1*x2*x3)
poly1 = PolynomialFeatures(degree=3,interaction_only=True)

In [57]:
# Fit on X_train3 and generate the interaction features.
poly1.fit_transform(X_train3)

array([[   1.,    0.,    1.,    2.,    0.,    0.,    2.,    0.],
       [   1.,    3.,    4.,    5.,   12.,   15.,   20.,   60.],
       [   1.,    6.,    7.,    8.,   42.,   48.,   56.,  336.]])

In [58]:
# custom transformers
from sklearn.preprocessing import FunctionTransformer

In [59]:
# Wrap a plain callable as a transformer; here np.log1p, i.e. log(1 + x).
transformer = FunctionTransformer(np.log1p)

In [60]:
# Small 2x2 integer matrix to feed through the custom transformer.
X_train4 = np.array([[0, 1], [2, 3]])

In [62]:
# Applies np.log1p element-wise, e.g. log1p(1) = ln(2) ~= 0.693.
transformer.transform(X_train4)

array([[ 0.        ,  0.69314718],
       [ 1.09861229,  1.38629436]])