In [1]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [18]:
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides library deprecation notices, so re-enable warnings when upgrading
# dependencies.
warnings.filterwarnings('ignore')

# Using Pipelines with Grid-Search

## Feature selection and regression without pipelines

In [3]:
from sklearn.datasets import make_regression

# Synthetic regression problem with a fixed seed so the notebook is
# reproducible; effective_rank=90 requests an approximately low-rank
# feature matrix.
X, y = make_regression(effective_rank=90, random_state=42)
X.shape

(100, 100)

In [4]:
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import train_test_split

# Hold out half of the samples for the final evaluation. The fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, random_state=42
)

In [6]:
from sklearn.feature_selection import SelectFpr, f_regression
from sklearn.linear_model import Ridge

# Univariate feature selection scored with f_regression. The selector is
# fitted on the training split only and then applied to both splits, so no
# information from the test data leaks into the selection.
fpr = SelectFpr(score_func=f_regression)
X_train_fpr = fpr.fit(X_train, y_train).transform(X_train)
X_test_fpr = fpr.transform(X_test)

X_train_fpr.shape

(50, 5)

In [7]:
# Ridge regression with its default settings, trained on the selected
# features only and scored on the matching columns of the test split.
ridge = Ridge().fit(X_train_fpr, y_train)
ridge.score(X_test_fpr, y_test)

0.12185693594910552

## With pipelines

In [8]:
from sklearn.pipeline import make_pipeline

# Chain the selector and the regressor into one estimator: fit() runs the
# selection and the regression in sequence, and score() applies the fitted
# selector to the test data before scoring.
pipe = make_pipeline(
    SelectFpr(score_func=f_regression),
    Ridge(),
)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.12185693594910552

## Grid search alpha in Ridge

In [10]:
# GridSearchCV lives in sklearn.model_selection; the former
# sklearn.grid_search module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV

# Without a pipeline the estimator parameter is addressed by its bare name.
# Candidate values are the powers of ten from 1e-3 up to 1e4.
param_grid_no_pipeline = {'alpha': 10. ** np.arange(-3, 5)}

In [12]:
# List the step names of the pipeline; these names are the prefixes used in
# '<step>__<parameter>' keys when building a parameter grid for the pipeline.
pipe.named_steps.keys()

dict_keys(['ridge', 'selectfpr'])

In [13]:
# With a pipeline, each grid key is '<step name>__<parameter name>'.
param_grid = {'ridge__alpha': 10. ** np.arange(-3, 5)}

# 10-fold cross-validated search over the ridge penalty; the whole pipeline
# (selection + regression) is refitted inside every fold.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('selectfpr', SelectFpr(alpha=0.05,
     score_func=<function f_regression at 0x0000023D0CE480D0>)), ('ridge', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'ridge__alpha': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03,   1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [15]:
# Held-out score of the search in which only ridge__alpha was tuned.
grid.score(X_test, y_test)

0.53885349326377441

In [16]:
# Parameter combination that achieved the best cross-validation score.
grid.best_params_

{'ridge__alpha': 0.10000000000000001}

## Selecting parameters of the preprocessing steps

In [19]:
# Tune the preprocessing step together with the model: the selector's alpha
# threshold is searched jointly with the ridge penalty.
param_grid = {
    'ridge__alpha': 10. ** np.arange(-3, 5),
    'selectfpr__alpha': [0.01, 0.02, 0.05, 0.1, 0.3],
}
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('selectfpr', SelectFpr(alpha=0.05,
     score_func=<function f_regression at 0x0000023D0CE480D0>)), ('ridge', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'selectfpr__alpha': [0.01, 0.02, 0.05, 0.1, 0.3], 'ridge__alpha': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03,   1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [20]:
# Held-out score of the search in which both ridge__alpha and
# selectfpr__alpha were tuned.
grid.score(X_test, y_test)

0.99438534427129177

In [21]:
# Best combination found by the joint search over both steps.
grid.best_params_

{'ridge__alpha': 0.001, 'selectfpr__alpha': 0.3}

In [23]:
# Pull the fitted selector out of the best pipeline found by the search
# (steps are looked up by name through named_steps).
final_selectfpr = grid.best_estimator_.named_steps['selectfpr']
# Boolean mask over the input features: True marks a feature the selector kept.
final_selectfpr.get_support()

array([False, False, False, False, False, False, False,  True,  True,
       False,  True, False, False, False, False,  True,  True, False,
       False,  True, False,  True, False, False, False, False,  True,
       False,  True, False, False, False,  True, False,  True, False,
       False, False, False, False, False, False,  True,  True, False,
       False,  True, False, False,  True, False,  True, False, False,
        True,  True, False, False, False, False,  True,  True, False,
        True, False,  True, False, False, False, False, False, False,
       False, False,  True, False, False,  True, False, False, False,
        True, False,  True, False, False, False, False, False, False,
       False, False, False,  True,  True, False, False, False,  True, False], dtype=bool)