In [1]:
import numpy as np

from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)

dataset = load_boston()
X_full, y_full = dataset.data, dataset.target
n_samples = X_full.shape[0]
n_features = X_full.shape[1]

# Estimate the score on the entire dataset, with no missing values
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_full, y_full).mean()
print("Score with the entire dataset = %.2f" % score)

# Add missing values in 75% of the lines
missing_rate = 0.75
n_missing_samples = np.floor(n_samples * missing_rate)
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=np.bool),
                             np.ones(n_missing_samples,
                                     dtype=np.bool)))
rng.shuffle(missing_samples)
missing_features = rng.randint(0, n_features, n_missing_samples)

# Estimate the score without the lines containing missing values
X_filtered = X_full[~missing_samples, :]
y_filtered = y_full[~missing_samples]
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_filtered, y_filtered).mean()
print("Score without the samples containing missing values = %.2f" % score)

# Estimate the score after imputation of the missing values
X_missing = X_full.copy()
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()
estimator = Pipeline([("imputer", Imputer(missing_values=0,
                                          strategy="mean",
                                          axis=0)),
                      ("forest", RandomForestRegressor(random_state=0,
                                                       n_estimators=100))])
score = cross_val_score(estimator, X_missing, y_missing).mean()
print("Score after imputation of the missing values = %.2f" % score)

Score with the entire dataset = 0.56


  a = empty(shape, dtype, order)


Score without the samples containing missing values = 0.48
Score after imputation of the missing values = 0.57


In [25]:
from sklearn import preprocessing
import numpy as np
X = np.array([[ 1., -1.,  2.],
              [ 2.,  0.,  0.],
              [ 0.,  1., -1.]])
X_scaled = preprocessing.scale(X)

X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [26]:
np.average(X, axis=0)

array([ 1.        ,  0.        ,  0.33333333])

In [28]:
aveX = X - np.average(X, axis=0)

In [29]:
aveX

array([[ 0.        , -1.        ,  1.66666667],
       [ 1.        ,  0.        , -0.33333333],
       [-1.        ,  1.        , -1.33333333]])

In [30]:
np.std(aveX)

0.98130676292531638

In [31]:
np.std(aveX, axis=0)

array([ 0.81649658,  0.81649658,  1.24721913])

In [32]:
aveX / np.std(aveX, axis=0)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [33]:
X

array([[ 1., -1.,  2.],
       [ 2.,  0.,  0.],
       [ 0.,  1., -1.]])

In [34]:
from sklearn import preprocessing

In [35]:
X_scaled = preprocessing.scale(X)

In [36]:
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [37]:
print X_scaled.mean(axis=0)
print X_scaled.mean(axis=1)
print X_scaled.mean()

[ 0.  0.  0.]
[ 0.03718711  0.31916121 -0.35634832]
4.93432455389e-17


In [39]:
print X_scaled.std(axis=0)
print X_scaled.std(axis=1)
print X_scaled.std()

[ 1.  1.  1.]
[ 1.04587533  0.64957343  1.11980724]
1.0


In [41]:
scaler = preprocessing.StandardScaler().fit(X)
print scaler
print scaler.mean_                                      
print scaler.scale_                                       
scaler.transform(X)                               

StandardScaler(copy=True, with_mean=True, with_std=True)
[ 1.          0.          0.33333333]
[ 0.81649658  0.81649658  1.24721913]


array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [42]:
scaler.transform([-1, 1, 0])



array([-2.44948974,  1.22474487, -0.26726124])

In [46]:
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])

min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_train_minmax

array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  0.5       ,  0.33333333],
       [ 0.        ,  1.        ,  0.        ]])

In [47]:
min_max_scaler.scale_

array([ 0.5       ,  0.5       ,  0.33333333])

In [48]:
min_max_scaler.min_

array([ 0.        ,  0.5       ,  0.33333333])

In [49]:
min_max_scaler.max_

AttributeError: 'MinMaxScaler' object has no attribute 'max_'