In [1]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Using Pipelines with Grid-Search

## Feature selection and regression without pipelines

In [2]:
from sklearn.datasets import make_regression

# Build a synthetic regression problem. effective_rank=90 gives the feature
# matrix an approximately low-rank structure, and the fixed seed keeps the
# generated data reproducible across runs.
X, y = make_regression(effective_rank=90, random_state=42)
print(X.shape)

(100, 100)


In [4]:
from sklearn.model_selection import train_test_split

# Hold out half of the samples for evaluation; the fixed seed keeps the
# partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, random_state=42
)

In [5]:
from sklearn.feature_selection import SelectFpr, f_regression
from sklearn.linear_model import Ridge

# Univariate feature selection: the selector is fitted on the training split
# only, then the same fitted selector is applied to both splits.
fpr = SelectFpr(score_func=f_regression).fit(X_train, y_train)
X_train_fpr, X_test_fpr = (fpr.transform(part) for part in (X_train, X_test))

print(X_train_fpr.shape)

(50, 5)


In [6]:
# Fit a default Ridge model on the selected features, then report its score
# on the held-out split (the cell's displayed value).
ridge = Ridge().fit(X_train_fpr, y_train)
ridge.score(X_test_fpr, y_test)

0.12185693594910485

## With pipelines

In [8]:
from sklearn.pipeline import make_pipeline

# Same selection + regression as the manual version, but chained into a single
# estimator so that fit/score run both steps in order.
pipe = make_pipeline(
    SelectFpr(score_func=f_regression),
    Ridge(),
)

# fit() returns the pipeline itself, so the held-out score can be chained.
pipe.fit(X_train, y_train).score(X_test, y_test)

0.12185693594910485

## Grid-Searching alpha in Ridge

In [11]:
from sklearn.model_selection import GridSearchCV
# Without a pipeline, the grid is keyed by the estimator's own parameter name.
# Candidate values are the eight powers of ten from 1e-3 up to 1e4.
param_grid_no_pipeline = dict(alpha=10.0 ** np.arange(-3, 5))

In [12]:
# Show the step names of the pipeline; these names are the prefixes used to
# address a step's parameters (as in '<step>__<param>').
pipe.named_steps.keys()

dict_keys(['selectfpr', 'ridge'])

In [14]:
# With a pipeline, each parameter is addressed as '<step name>__<parameter>'.
param_grid = dict(ridge__alpha=10.0 ** np.arange(-3, 5))
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10)

# Run the 10-fold cross-validated search over every candidate alpha.
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('selectfpr', SelectFpr(alpha=0.05, score_func=<function f_regression at 0x10f004d90>)), ('ridge', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'ridge__alpha': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03,   1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [15]:
# Score the fitted grid search on the held-out split.
grid.score(X_test, y_test)

0.53885349326377441

In [16]:
# Parameter setting selected by the cross-validated search.
grid.best_params_

{'ridge__alpha': 0.10000000000000001}

## Selecting parameters of the preprocessing steps

In [18]:
# Search jointly over the Ridge regularisation strength and the threshold
# (alpha) of the SelectFpr feature-selection step.
param_grid = dict(
    ridge__alpha=10.0 ** np.arange(-3, 5),
    selectfpr__alpha=[0.01, 0.02, 0.05, 0.1, 0.3],
)

# fit() returns the search object itself, so construction and fitting chain.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10).fit(X_train, y_train)
grid.score(X_test, y_test)

0.99438534427129266

In [19]:
# Parameter combination (one value per searched parameter) selected by the
# cross-validated search.
grid.best_params_

{'ridge__alpha': 0.001, 'selectfpr__alpha': 0.3}

In [20]:
# Pull the fitted feature-selection step out of the best pipeline found by the
# grid search. get_support() returns a boolean mask over the input features,
# with True marking each feature the selector kept.
final_selectfpr = grid.best_estimator_.named_steps['selectfpr']
final_selectfpr.get_support()

array([False, False, False, False, False, False, False,  True,  True,
       False,  True, False, False, False, False,  True,  True, False,
       False,  True, False,  True, False, False, False, False,  True,
       False,  True, False, False, False,  True, False,  True, False,
       False, False, False, False, False, False,  True,  True, False,
       False,  True, False, False,  True, False,  True, False, False,
        True,  True, False, False, False, False,  True,  True, False,
        True, False,  True, False, False, False, False, False, False,
       False, False,  True, False, False,  True, False, False, False,
        True, False,  True, False, False, False, False, False, False,
       False, False, False,  True,  True, False, False, False,  True, False], dtype=bool)