In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
data = load_boston()
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'])

In [2]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

In [3]:
scaler = StandardScaler()
pca = PCA()
ridge = Ridge()

In [4]:
X_train = scaler.fit_transform(X_train)
X_train = pca.fit_transform(X_train)
ridge.fit(X_train, y_train) # Too repetitive! ...

Ridge()

In [5]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regressor', Ridge())
])

In [6]:
pipe = pipe.fit(X_train, y_train)
print('Testing score: ', pipe.score(X_test, y_test))

Testing score:  -2993.39988784317


In [7]:
print(pipe.steps[1][1].explained_variance_)

[1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455
 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455]


In [8]:
# concerning PCA, let's evaluate accuracy variance with # components from 1,10
import numpy as np
n_features_to_test = np.arange(1, 11)

In [9]:
alpha_to_test = 2.0**np.arange(-6, +6)

In [10]:
params = {'reduce_dim__n_components': n_features_to_test,\
              'regressor__alpha': alpha_to_test}

In [11]:
from sklearn.model_selection import GridSearchCV
gridsearch = GridSearchCV(pipe, params, verbose=1).fit(X_train, y_train)
print('Final score is: ', gridsearch.score(X_test, y_test))

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Final score is:  -4912.210820032559


In [12]:
gridsearch.best_params_

{'reduce_dim__n_components': 10, 'regressor__alpha': 32.0}

In [13]:
# pipeline tuning - advanced
scalers_to_test = [StandardScaler(), RobustScaler(), QuantileTransformer()]

In [14]:
params = {'scaler': scalers_to_test,
        'reduce_dim__n_components': n_features_to_test,\
        'regressor__alpha': alpha_to_test}

In [15]:
params = [
        {'scaler': scalers_to_test,
         'reduce_dim': [PCA()],
         'reduce_dim__n_components': n_features_to_test,\
         'regressor__alpha': alpha_to_test},

        {'scaler': scalers_to_test,
         'reduce_dim': [SelectKBest(f_regression)],
         'reduce_dim__k': n_features_to_test,\
         'regressor__alpha': alpha_to_test}
        ]


In [17]:
# launch gridsearch again
gridsearch = GridSearchCV(pipe, params, verbose=1).fit(X_train, y_train)
print('Final score is: ', gridsearch.score(X_test, y_test));

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


























































Final score is:  -3223.242748031395




In [18]:
gridsearch.best_params_

{'reduce_dim': SelectKBest(score_func=<function f_regression at 0x7fb7a271c0d0>),
 'reduce_dim__k': 10,
 'regressor__alpha': 8.0,
 'scaler': StandardScaler()}