In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.datasets import load_digits

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Fall back to
# the legacy module only so the notebook still runs on pre-0.18 installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Digits dataset bundled with scikit-learn (features in .data, labels in .target).
digits = load_digits()

# Hold out a test split; the fixed random_state keeps the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, random_state=0)

## Without Pipelines

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Manual workflow, step 1: fit the scaler on the training split and transform
# that same split in one call.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Step 2: train the SVM on the scaled training features.
# The trailing semicolon keeps the cell output-free, as in the one-line form.
svm = SVC()
svm.fit(X_train_scaled, y_train);

In [7]:
# Reuse the scaler that was fitted on the training split: only transform() is
# called here, so the scaler is not re-fitted on the test data.
X_test_scaled = scaler.transform(X_test)
# Last expression of the cell, so the score is displayed as the cell output.
svm.score(X_test_scaled, y_test)

0.98444444444444446

## With Pipelines

In [8]:
from sklearn.pipeline import Pipeline

# Explicit constructor: each step is a (name, estimator) pair, so the step
# names ("my_scaler", "my_svm") are chosen by hand.
pipe = Pipeline(
    steps=[
        ("my_scaler", StandardScaler()),
        ("my_svm", SVC()),
    ]
)
# The raw (unscaled) training split goes straight into the pipeline.
pipe.fit(X_train, y_train)

Pipeline(steps=[('my_scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('my_svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [9]:
# X_test is passed unscaled; no separate scaler.transform() call is needed here.
pipe.score(X_test, y_test)

0.98444444444444446

#### Easy way to make a pipeline

In [10]:
from sklearn.pipeline import make_pipeline

# make_pipeline takes bare estimators; no step names are supplied by hand.
# fit() returns the fitted pipeline, so construction and fitting can be chained.
pipe = make_pipeline(StandardScaler(), SVC()).fit(X_train, y_train)
pipe.score(X_test, y_test)

0.98444444444444446

## Longer Pipelines

In [11]:
from sklearn.feature_selection import SelectFdr, VarianceThreshold

# A pipeline can chain more than two steps: here three preprocessing /
# feature-selection steps run before the final SVC, in the order listed.
pipe = make_pipeline(
    VarianceThreshold(),
    StandardScaler(),
    SelectFdr(),
    SVC()
)

In [12]:
# Fit on the training split, then score on the held-out split; fit() returns
# the pipeline itself, so the score is the single value displayed by this cell.
pipe.fit(X_train, y_train).score(X_test, y_test)

0.98666666666666669

## Unsupervised Pipelines

In [13]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Unsupervised pipeline: PCA with 10 components followed by KMeans with 10
# clusters. No labels are passed to fit().
# NOTE(review): KMeans is given no random_state, so the cluster labels this
# cell displays are presumably not reproducible across runs — consider a seed.
cluster_pipe = make_pipeline(PCA(n_components=10), KMeans(n_clusters=10))
cluster_pipe.fit(X_train)
# Last expression of the cell: the predicted cluster labels for X_train are displayed.
cluster_pipe.predict(X_train)

array([4, 2, 2, ..., 9, 9, 3])

## Accessing attributes

In [17]:
# cluster_pipe was built with make_pipeline and no explicit step names; the PCA
# step is looked up under the key 'pca'.
cluster_pipe.named_steps['pca']

PCA(copy=True, n_components=10, whiten=False)

In [18]:
# cluster_pipe was built with make_pipeline and no explicit step names; the
# KMeans step is looked up under the key 'kmeans'.
cluster_pipe.named_steps['kmeans']

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=10, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)