# ch6 Algorithm Chains and Pipelines

In [1]:
# Common imports
import pandas as pd
import numpy as np

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Discrete colormaps built from one shared palette (blue, red, green):
# cm2 uses the first two colours, cm3 all three.
palette = ['#0000aa', '#ff2020', '#50ff50']
cm2 = ListedColormap(palette[:2])
cm3 = ListedColormap(palette)

# Font sizes for axis labels and tick labels, set in a single update
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

import mglearn

`Pipeline`类可以将多个处理步骤合并为单个scikit-learn estimator。`Pipeline`类最常见的用例是将预处理步骤（比如缩放数据）与一个监督模型（如分类器）链接起来。

In [2]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load the breast-cancer data and split it (test size 0.25)
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# Fit the MinMaxScaler on the training split only, then rescale that split
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train)
X_train_scaled = min_max_scaler.transform(X_train)

# Train the SVC on the rescaled training data
svc = SVC(random_state=42)
svc.fit(X_train_scaled, y_train)

# Rescale the test split with the already-fitted scaler and score the model on it
X_test_scaled = min_max_scaler.transform(X_test)
svc.score(X_test_scaled, y_test)

0.95104895104895104

## 6.1 构建pipeline

In [4]:
from sklearn.pipeline import Pipeline

# Each step is a (name, estimator) pair; the steps run in the order listed
pipeline_steps = [
    ("min_max_scaler", MinMaxScaler()),
    ("svm", SVC()),
]
pipe = Pipeline(pipeline_steps)
pipe

Pipeline(memory=None,
     steps=[('min_max_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

`pipe.fit`首先对第一个步骤调用`fit`，然后使用该缩放器对训练数据进行变换，最后用缩放后的数据拟合SVM。

In [5]:
# Fit the whole chain: the scaler is fitted on X_train, X_train is transformed
# with it, and the SVM is then fitted on the scaled data.
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('min_max_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [6]:
# Evaluate on the test set (the pipeline is given the raw, unscaled X_test)
pipe.score(X_test, y_test)

0.95104895104895104

## 6.2 在网格搜索中使用pipeline

为pipeline定义网格参数时，为每个参数指定步骤名称，后面加上`__`（双下划线），然后是参数名称，例如`svm__C`。

In [7]:
# Both SVM parameters are searched over the same candidate values.
# Keys follow the "<step name>__<parameter name>" convention.
candidate_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {
    "svm__C": list(candidate_values),
    "svm__gamma": list(candidate_values),
}

In [8]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated grid search over the pipeline's SVM parameters
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('min_max_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'svm__C': [0.001, 0.01, 0.1, 1, 10, 100], 'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [9]:
grid_search.best_params_

{'svm__C': 1, 'svm__gamma': 1}

In [10]:
grid_search.best_score_

0.98122065727699526

In [11]:
grid_search.score(X_test, y_test)

0.97202797202797198

**在交叉验证过程中，应该在进行任何预处理之前完成数据集的划分。任何从数据集中提取信息的过程都应该仅应用于数据集的训练部分，因此，任何交叉验证都应该位于处理过程的“最外层循环”。**

本例中，对于交叉验证每次划分来说，仅使用训练部分对`MinMaxScaler`进行拟合，测试部分的信息没有泄露到参数搜索中。

在交叉验证中，信息泄露的影响大小取决于处理步骤的性质。使用测试部分来估计数据的范围，通常不会产生可怕的影响，但在特征提取和特征选择中使用测试部分，则会导致结果的显著差异。

**信息泄露的例子**：

回归任务，包含从高斯分布中独立采样的100个样本和10000个特征，还从高斯分布中对响应进行采样。

In [12]:
# 100 samples x 10,000 features of Gaussian noise, followed by an
# independently drawn Gaussian target (same generator, same draw order).
n_samples, n_features = 100, 10000
rnd = np.random.RandomState(0)
X = rnd.normal(size=(n_samples, n_features))
y = rnd.normal(size=n_samples)

> 考虑到创建数据集的方式，数据X和目标y之间没有任何关系（它们是独立的），所以不可能从数据集中学到任何内容。

In [13]:
# Use SelectPercentile to pick the most informative of the 10,000 features
from sklearn.feature_selection import SelectPercentile, f_regression

# Keep 5% of the features.
# NOTE: the selector is deliberately fitted on ALL of X and y, i.e. outside
# cross-validation, to demonstrate information leakage.
select = SelectPercentile(score_func=f_regression, percentile=5)
select.fit(X, y)
X_selected = select.transform(X)
X_selected.shape

(100, 500)

In [14]:
# Evaluate Ridge regression with 5-fold cross-validation on the pre-selected features
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge

selected_scores = cross_val_score(Ridge(), X_selected, y, cv=5, n_jobs=-1)
selected_scores.mean()

0.90579530652398221

$R^2$为0.906，这表示是一个非常好的模型。这显然不对，因为数据是完全随机的。由于**在交叉验证之外对特征选择进行拟合，所以能找到在训练部分和测试部分都相关的特征**。

In [15]:
# Same selector and Ridge model as above, but chained in a Pipeline so that,
# when cross-validated, selection is fitted on each split's training part only.
feature_selector = SelectPercentile(score_func=f_regression, percentile=5)
pipe = Pipeline([
    ("select", feature_selector),
    ("ridge", Ridge()),
])

In [16]:
# Mean 5-fold cross-validation score of the select + ridge pipeline on the raw features
pipeline_scores = cross_val_score(pipe, X, y, cv=5, n_jobs=-1)
pipeline_scores.mean()

-0.24655422384952813

得到了负的$R^2$，表示模型很差。**利用pipeline，特征选择现在位于交叉验证循环内部**。也就是说，**仅使用数据的训练部分来选择特征，而不使用测试部分**。特征选择找到的特征在训练集中与目标有关，但数据是完全随机的，这些特征在测试集中并不与目标有关。

## 6.3 通用的pipeline接口

### 6.3.1 访问步骤属性

In [17]:
# Two named steps: "scaler" feeds "svm"
scaler_step = ("scaler", MinMaxScaler())
svm_step = ("svm", SVC())
pipe = Pipeline([scaler_step, svm_step])

In [18]:
pipe

Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

使用`set_params`设置参数：

In [19]:
pipe.set_params(svm__C=10)  # updates the "svm" step of pipe in place; the pipeline itself is displayed

Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svm', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [20]:
pipe.set_params(svm__C=100, scaler__feature_range=(10, 100))  # updates both steps of pipe in place

Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(10, 100))), ('svm', SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [21]:
pipe

Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(10, 100))), ('svm', SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

通过`steps`属性查看步骤：

In [22]:
pipe.steps  # a list of (name, estimator) tuples

[('scaler', MinMaxScaler(copy=True, feature_range=(10, 100))),
 ('svm', SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

通过`named_steps`查看步骤：

In [23]:
pipe.named_steps  # a dictionary keyed by step name

{'scaler': MinMaxScaler(copy=True, feature_range=(10, 100)),
 'svm': SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False)}

### 6.3.2 用`make_pipeline`创建pipeline

In [24]:
from sklearn.pipeline import make_pipeline

# No step names are passed: make_pipeline names each step after its class
pipe = make_pipeline(MinMaxScaler(), SVC())
pipe

Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

使用`make_pipeline`时不需要为每一个步骤提供用户指定的名称，它会创建管道并根据每个步骤所属的类自动为其命名。

In [25]:
pipe.named_steps  # a dictionary keyed by the automatically generated step names

{'minmaxscaler': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'svc': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False)}

In [26]:
pipe.steps  # a list of (name, estimator) tuples

[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
 ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

一般来说，步骤名称只是类名称的小写。如果多个步骤同属于一个类，则会附加一个数字。

In [27]:
from sklearn.decomposition import PCA

# MinMaxScaler appears twice, so the two steps receive numbered names
# (minmaxscaler-1 and minmaxscaler-2)
pipe = make_pipeline(MinMaxScaler(), PCA(), MinMaxScaler())
pipe

Pipeline(memory=None,
     steps=[('minmaxscaler-1', MinMaxScaler(copy=True, feature_range=(0, 1))), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('minmaxscaler-2', MinMaxScaler(copy=True, feature_range=(0, 1)))])

In [28]:
pipe.steps

[('minmaxscaler-1', MinMaxScaler(copy=True, feature_range=(0, 1))),
 ('pca',
  PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)),
 ('minmaxscaler-2', MinMaxScaler(copy=True, feature_range=(0, 1)))]

In [29]:
pipe.named_steps

{'minmaxscaler-1': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'minmaxscaler-2': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'pca': PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
   svd_solver='auto', tol=0.0, whiten=False)}

这种情况下，使用具有明确名称的Pipeline构建可能会更好，以便为每一个步骤提供更具语义的名称。

### 6.3.3 访问网格搜索pipeline中的属性

使用pipeline的主要原因之一就是进行网格搜索。一个常见的任务是在网格搜索内访问pipeline的某些步骤。

In [30]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [31]:
# Standardise the features, then fit a logistic regression; the steps are
# auto-named "standardscaler" and "logisticregression"
pipe = make_pipeline(StandardScaler(), LogisticRegression())

In [32]:
pipe

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [33]:
# Grid-search settings: candidate values of C for the "logisticregression" step
c_candidates = [0.01, 0.1, 1, 10, 100]
param_grid = {"logisticregression__C": c_candidates}

In [34]:
# Split the cancer data set (random_state=4, unlike the random_state=0 split used earlier)
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=4)

In [35]:
# 5-fold cross-validated grid search over C for the scaler + logistic regression pipeline
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'logisticregression__C': [0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [36]:
grid_search.best_estimator_

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [37]:
# Steps of the best model found by the search
grid_search.best_estimator_.named_steps

{'logisticregression': LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True)}

In [38]:
# Inspect the logistic regression step of the best pipeline
grid_search.best_estimator_.named_steps.logisticregression

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [39]:
# Inspect the coefficients of the fitted logistic regression step
grid_search.best_estimator_.named_steps.logisticregression.coef_

array([[-0.38856355, -0.37529972, -0.37624793, -0.39649439, -0.11519359,
         0.01709608, -0.3550729 , -0.38995414, -0.05780518,  0.20879795,
        -0.49487753, -0.0036321 , -0.37122718, -0.38337777, -0.04488715,
         0.19752816,  0.00424822, -0.04857196,  0.21023226,  0.22444999,
        -0.54669761, -0.52542026, -0.49881157, -0.51451071, -0.39256847,
        -0.12293451, -0.38827425, -0.4169485 , -0.32533663, -0.13926972]])

## 6.4 网格搜索预处理步骤与模型参数

可以利用pipeline将机器学习工作流程中的所有处理步骤封装成一个scikit-learn estimator。好处在于，可以使用监督任务（回归或分类）的输出来调节预处理参数。

In [40]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures

# Load and split data
# NOTE(review): load_boston was deprecated and later removed in newer
# scikit-learn releases — confirm the installed version still provides it.
boston = load_boston()
# This overwrites X_train / X_test / y_train / y_test from the cancer split.
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, random_state=4)

# Standardise -> expand into polynomial features -> Ridge regression, as one estimator
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(), Ridge())

理想情况下，可以根据回归结果（即模型的预测性能）来选择`degree`参数和`alpha`参数。

In [41]:
# Search jointly over a preprocessing parameter (polynomial degree)
# and a model parameter (Ridge alpha)
degree_candidates = [1, 2, 3]
alpha_candidates = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {
    "polynomialfeatures__degree": degree_candidates,
    "ridge__alpha": alpha_candidates,
}

grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('polynomialfeatures', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('ridge', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'polynomialfeatures__degree': [1, 2, 3], 'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [42]:
grid_search.cv_results_["mean_test_score"]

array([  0.71968569,   0.71968753,   0.71970532,   0.71983084,
         0.7185514 ,   0.69012615,   0.59358219,   0.61528112,
         0.63883664,   0.67724913,   0.77126913,   0.80052727,
       -94.40956971, -37.04651126, -18.79946344, -12.42511096,
        -1.37891559,   0.74011704])

In [43]:
# Best parameters
grid_search.best_params_

{'polynomialfeatures__degree': 2, 'ridge__alpha': 100}

In [44]:
grid_search.score(X_test, y_test)

0.8351426717313708

为了对比，运行一个没有多项式特征的网格搜索：

In [45]:
# Baseline for comparison: the same alpha search, but the pipeline has no
# PolynomialFeatures step
pipe = make_pipeline(StandardScaler(), Ridge())
param_grid = {
    "ridge__alpha": [0.001, 0.01, 0.1, 1, 10, 100],
}
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Score of the baseline search on the held-out test split
grid_search.score(X_test, y_test)

0.73142012469101525

> 不使用多项式特征处理得到了明显更差的结果。

**同时搜索预处理参数和模型参数**是一个非常强大的策略。`GridSearchCV`会尝试指定参数所有可能组合。因此，向网格中添加更多参数，需要构建的模型数量将呈指数增长。

## 6.5 网格搜索使用哪个模型

在cancer数据集上比较RF和SVC。

+ 分类器是RF还是SVC
+ SVC需要对数据进行缩放，因此还要搜索是否使用数据缩放
+ 各个分类器的参数

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
# Two named steps; the search grid defined below substitutes other
# estimators (or None) for "preprocessing" and "clf"
pipe = Pipeline([("preprocessing", StandardScaler()), ("clf", SVC())])

In [48]:
from sklearn.datasets import load_breast_cancer

# Re-create the cancer train/test split: X_train, X_test, y_train and y_test
# were overwritten by the Boston split in section 6.4
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target,
                                                    random_state=0)

In [49]:
# Search space given as a list of two grids, so that each classifier is only
# combined with the parameters that apply to it.
param_grid = [
    # SVC: tune C and gamma, with and without standardisation
    {"clf": [SVC()],
     "preprocessing": [StandardScaler(), None],
     "clf__gamma": [0.001, 0.01, 0.1, 1, 10, 100],
     "clf__C": [0.001, 0.01, 0.1, 1, 10, 100]},

    # Random forest: no preprocessing step; tune max_features.
    # random_state is fixed so that the forest, and therefore the
    # cross-validation scores of this grid, are reproducible across runs.
    {"clf": [RandomForestClassifier(n_estimators=100, random_state=0)],
     "preprocessing": [None],
     "clf__max_features": [1, 2, 3]}]

In [50]:
# 5-fold cross-validated search over both grids (SVC and random forest candidates)
grid_search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'clf': [SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)], 'preprocessing': [StandardScaler(copy=True, with_mean=T...verbose=0,
            warm_start=False)], 'preprocessing': [None], 'clf__max_features': [1, 2, 3]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [51]:
# best params
grid_search.best_params_

{'clf': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False),
 'clf__C': 10,
 'clf__gamma': 0.01,
 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

In [52]:
# best cv score
grid_search.best_score_

0.9859154929577465

In [53]:
# test set score
grid_search.score(X_test, y_test)

0.97902097902097907