In [1]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Why pipelines for preprocessing

- Cross-validation has to cover the whole process (every preprocessing step together with the final model), not just the last step
- To do this, wrap all of the steps in a `Pipeline` and cross-validate the pipeline

In [2]:
# Synthetic regression problem. Only the noise level and the seed are set here;
# the sample and feature counts are whatever make_regression defaults to.
from sklearn.datasets import make_regression

X, y = make_regression(noise=100, random_state=42)
print(X.shape)

(100, 100)


In [3]:
# Hold out half of the samples as a test set (seeded so the split is repeatable).
# train_test_split is imported from sklearn.model_selection: the older
# sklearn.cross_validation module was removed from scikit-learn in 0.20,
# and the grid-search cell in this notebook already uses model_selection.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=.5)



In [4]:
# Univariate feature selection
from sklearn.feature_selection import SelectFpr, f_regression
from sklearn.linear_model import Ridge

# f_regression tests each feature on its own against the response;
# SelectFpr uses alpha as the p-value cut-off for keeping a feature, which
# controls the false positive rate. The selector is fitted on the training
# split only (fit returns the fitted selector itself).
fpr = SelectFpr(score_func=f_regression, alpha=.05).fit(X_train, y_train)

# Reduce both splits to the columns that were chosen on the training data
X_train_fpr, X_test_fpr = fpr.transform(X_train), fpr.transform(X_test)

# (training samples, features that passed the test)
print(X_train_fpr.shape)

(50, 6)


In [5]:
# Fit a default Ridge regressor on the reduced training set, then evaluate it
# on the reduced test set. For scikit-learn regressors score() is R^2.
ridge = Ridge().fit(X_train_fpr, y_train)
ridge.score(X_test_fpr, y_test)

0.40868089541721303

## How not to do grid-searches


In [8]:
# ANTI-PATTERN -- DON'T DO THIS.
# X_train_fpr comes from a selector that was fitted on the complete training
# split, labels included. Every validation fold GridSearchCV carves out of it
# has therefore already influenced which features exist: the folds are
# contaminated (leaked) before the search even starts.

from sklearn.model_selection import GridSearchCV
param_grid = {'alpha': 10. ** np.arange(-3, 5)}
grid = GridSearchCV(ridge, param_grid, cv=5).fit(X_train_fpr, y_train)
# The label says "accuracy", but the value is the regressor's score (R^2)
print("test set accuracy: %.2f" % grid.score(X_test_fpr, y_test))

test set accuracy: 0.41


## A more extreme example

In [9]:
# Overwrite the target with seeded random numbers drawn without any reference
# to the values in X, so there is no real relationship left for a model to learn.
rng = np.random.RandomState(0)
y = rng.rand(len(X))
# Re-split: 60% of the samples for training, the remainder for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.6, random_state=0)

In [10]:
# NOTE(review): SelectKBest is imported here but not used in this cell
from sklearn.feature_selection import SelectKBest

# Same univariate selection as before, this time with alpha left at its
# default and fitted against the random target.
fpr = SelectFpr(score_func=f_regression).fit(X_train, y_train)

X_train_fpr, X_test_fpr = fpr.transform(X_train), fpr.transform(X_test)

# Any feature that survives does so by chance: the target is random
X_train_fpr.shape

(60, 7)

In [11]:
# DON'T DO THIS:
# The features in X_train_fpr were selected using the labels of the whole
# training split, so the validation folds inside the grid search are not
# independent of the selection step. Compare the cross-validation score
# with the score on the untouched test split.
#
# GridSearchCV is imported from sklearn.model_selection: the older
# sklearn.grid_search module was removed from scikit-learn in 0.20, and the
# earlier grid-search cell in this notebook already uses model_selection.
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': 10. ** np.arange(-3, 3)}
grid = GridSearchCV(ridge, param_grid, cv=5)
grid.fit(X_train_fpr, y_train)

# Both numbers are R^2 values (Ridge's default score), despite the "accuracy" label
print("best cross-validation score: %.2f" % grid.best_score_)
print("test set accuracy: %.2f" % grid.score(X_test_fpr, y_test))

best cross-validation score: 0.24
test set accuracy: -0.50


<img src="figures/pipeline_cross_validation.svg" width=40%>