In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

# Pipelines

In [4]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, random_state=0)

## Without pipelines

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC # Support Vector Classifier

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

svm = SVC().fit(X_train_scaled, y_train)


In [7]:
X_test_scaled  = scaler.transform(X_test)
svm.score(X_test_scaled, y_test)

0.98444444444444446

## With pipelines

In [10]:
from sklearn.pipeline import Pipeline

# verbose constructor

# First step my_scaler, provide it an instance of StandardScaler
# Second step my_svm, provide it an instance of SVC

pipe = Pipeline([("my_scaler", StandardScaler()), ("my_svm", SVC())])
pipe.fit(X_train, y_train);

In [11]:
pipe.score(X_test, y_test)

0.98444444444444446

<img src="figures/pipeline.svg" width="60%">

In [12]:
pipe.decision_function(X_train).shape



(1347, 45)

In [14]:
from sklearn.pipeline import make_pipeline

# shortcut
pipe = make_pipeline(StandardScaler(), SVC())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.98444444444444446

## Longer pipelines

In [15]:
from sklearn.feature_selection import SelectFdr, VarianceThreshold

pipe = make_pipeline(VarianceThreshold(), StandardScaler(), SelectFdr(), SVC())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.98666666666666669

## Unsupervised pipelines

In [17]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Uses PCA for dimensionality reduction
# Clustering with KMeans
cluster_pipe = make_pipeline(PCA(n_components=10), KMeans(n_clusters=10))
cluster_pipe.fit(X_train)
cluster_pipe.predict(X_train)

array([2, 3, 3, ..., 8, 8, 7], dtype=int32)

## Accessing attributes

In [18]:
cluster_pipe.named_steps['pca']

PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [19]:
cluster_pipe.named_steps['pca'].components_.shape

(10, 64)

In [20]:
cluster_pipe.named_steps['kmeans']

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [21]:
cluster_pipe.named_steps['kmeans'].cluster_centers_.shape

(10, 10)

In [22]:
cluster_pipe = Pipeline([("my_pca",PCA(n_components=10)), ("my_clustering", KMeans(n_clusters=10))])
cluster_pipe.fit(X_train)

Pipeline(steps=[('my_pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('my_clustering', KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0))])

In [23]:
print(cluster_pipe.named_steps['my_pca'])
print(cluster_pipe.named_steps['my_clustering'])

PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
