In [1]:
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import load_breast_cancer

# Load scikit-learn's bundled breast-cancer dataset and pull out the
# feature matrix (X) and the target labels (y) used by every later cell.
data = load_breast_cancer()
X, y = data.data, data.target

In [3]:
from sklearn.model_selection import ShuffleSplit

# One shuffled 80% / 20% split; the fixed random_state keeps it repeatable.
ss = ShuffleSplit(n_splits=1, train_size=0.8, test_size=0.2, random_state=0)

# split() is a generator of (train indices, test indices) pairs;
# with n_splits=1 the first pair is the only one.
train_index, test_index = next(ss.split(X, y))

# Index the arrays with the selected rows, training part first.
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

In [4]:
from sklearn.decomposition import PCA

# Fit the whitening PCA on the training rows only (fit() returns the
# estimator itself), then project both partitions with that same fit.
pca = PCA(whiten=True).fit(X_train)
X_train_pca, X_test_pca = map(pca.transform, (X_train, X_test))

In [5]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings; the following cells fit this
# same object first on the raw features and then on the PCA features.
clf = LogisticRegression()

In [6]:
# Baseline: train on the untransformed features and display the test score.
# fit() returns the estimator, so the two calls can be chained.
clf.fit(X_train, y_train).score(X_test, y_test)

0.95614035087719296

In [7]:
# Same classifier, refit on the PCA-whitened features; display the test score.
clf.fit(X_train_pca, y_train).score(X_test_pca, y_test)

0.96491228070175439

In [8]:
from sklearn.pipeline import Pipeline

# Each step is a (name, estimator) pair; the names are arbitrary labels.
estimators = [
    ('pca', PCA(whiten=True)),
    ('clf', LogisticRegression()),
]

# A Pipeline runs the preprocessing and then trains (or predicts) in one call.
pipe = Pipeline(steps=estimators)

In [9]:
# Fit every step of the pipeline on the training data.
pipe.fit(X=X_train, y=y_train)

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [10]:
# Score the fitted pipeline on the held-out test data.
pipe.score(X=X_test, y=y_test)

0.96491228070175439

In [11]:
# The `steps` attribute shows the processing flow as (name, estimator) pairs.
pipe.steps

[('pca',
  PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=True)),
 ('clf',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False))]

In [12]:
# named_steps['<step name>'] gives access to an individual step by its name.
pipe.named_steps['pca']

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=True)

In [13]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# Rebind `estimators` and `pipe` to a second pipeline:
# min-max scaling followed by an RBF-kernel SVC with C=1e10.
estimators = [
    ('mms', MinMaxScaler()),
    ('clf', SVC(C=1e10, kernel='rbf')),
]
pipe = Pipeline(steps=estimators)

In [14]:
# Fit the scaler + SVC pipeline on the training data.
pipe.fit(X=X_train, y=y_train)

Pipeline(memory=None,
     steps=[('mms', MinMaxScaler(copy=True, feature_range=(0, 1))), ('clf', SVC(C=10000000000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [15]:
# Score the scaler + SVC pipeline on the held-out test data.
pipe.score(X=X_test, y=y_test)

0.98245614035087714

In [16]:
# Rebuild the PCA + logistic-regression pipeline (unfitted) so it can be
# handed to the grid search in the next cell.
estimators = [
    ('pca', PCA(whiten=True)),
    ('clf', LogisticRegression()),
]
pipe = Pipeline(steps=estimators)

In [17]:
from sklearn.model_selection import GridSearchCV

# Pipeline parameters are addressed as '<step name>__<parameter>';
# 'clf__C' is therefore the C of the 'clf' step.
param = dict(clf__C=[1e-5, 1e-3, 1e-2, 1, 1e2, 1e5, 1e10])

# Exhaustively cross-validate every candidate C on the training data.
gs = GridSearchCV(pipe, param_grid=param)
gs.fit(X=X_train, y=y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__C': [1e-05, 0.001, 0.01, 1, 100.0, 100000.0, 10000000000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [18]:
# Winning parameter setting, its cross-validation score, and the estimator
# built with that setting.
(
    gs.best_params_,
    gs.best_score_,
    gs.best_estimator_,
)

({'clf__C': 1}, 0.95604395604395609, Pipeline(memory=None,
      steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
   svd_solver='auto', tol=0.0, whiten=True)), ('clf', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False))]))

In [19]:
# Score the search object on the held-out test data.
gs.score(X=X_test, y=y_test)

0.96491228070175439

In [22]:
from sklearn.svm import SVC

# PCA followed by an SVC; their hyper-parameters are left at the defaults
# here because the randomized search below overrides them per candidate.
estimators = [('pca', PCA()),
              ('clf', SVC())]

pipe = Pipeline(estimators)


from sklearn.model_selection import RandomizedSearchCV

C_range = [1e-3, 1e-2, 1, 1e2, 1e3]

# Search space spanning BOTH pipeline steps, using '<step name>__<parameter>'
# keys: SVC regularization and kernel, plus PCA whitening and dimensionality.
param = {'clf__C': C_range, 
         'clf__kernel': ['linear', 'rbf'], 
         'pca__whiten': [True, False], 
         'pca__n_components': [30, 20, 10]}

# n_iter defaults to 10, i.e. only 10 randomly drawn combinations are tried.
# random_state pins which combinations are drawn; without it every run
# evaluates a different set of candidates and the reported best parameters
# and scores cannot be reproduced.
# NOTE(review): the linear kernel on unwhitened components with large C
# appears to be slow to fit -- confirm the runtime is acceptable.
gs = RandomizedSearchCV(pipe, param, n_jobs=-1, verbose=2, random_state=0)
gs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] pca__whiten=True, pca__n_components=20, clf__kernel=rbf, clf__C=100.0 
[CV] pca__whiten=True, pca__n_components=20, clf__kernel=rbf, clf__C=100.0 
[CV] pca__whiten=True, pca__n_components=20, clf__kernel=rbf, clf__C=100.0 
[CV] pca__whiten=False, pca__n_components=10, clf__kernel=linear, clf__C=1 
[CV] pca__whiten=False, pca__n_components=10, clf__kernel=linear, clf__C=1 
[CV] pca__whiten=False, pca__n_components=10, clf__kernel=linear, clf__C=1 
[CV] pca__whiten=False, pca__n_components=30, clf__kernel=linear, clf__C=1000.0 
[CV] pca__whiten=False, pca__n_components=30, clf__kernel=linear, clf__C=1000.0 


Process ForkPoolWorker-36:
Process ForkPoolWorker-33:
Process ForkPoolWorker-34:
Process ForkPoolWorker-35:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/yohei/.pyenv/versions/anaconda3-5.0.0/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/yohei/.pyenv/versions/anaconda3-5.0.0/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/Users/yohei/.pyenv/versions/anaconda3-5.0.0/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/yohei/.pyenv/versions/anaconda3-5.0.0/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/yohei/.pyenv/versions/anaconda3-5.0.0/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/yohei/.pyenv/ver

KeyboardInterrupt: 

In [28]:
# Best sampled parameter combination, its cross-validation score, and the
# pipeline built with that combination.
(
    gs.best_params_,
    gs.best_score_,
    gs.best_estimator_,
)

({'clf__C': 100.0,
  'clf__kernel': 'linear',
  'pca__n_components': 20,
  'pca__whiten': True},
 0.94505494505494503,
 Pipeline(memory=None,
      steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=20, random_state=None,
   svd_solver='auto', tol=0.0, whiten=True)), ('clf', SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False))]))

```python
gs.fit(X_train, y_train)
```
は，前処理のPCAを含めて最も良いパラメータの組み合わせを見つけるだけでなく，（デフォルトの`refit=True`により）その最良の組み合わせで訓練データ`X_train, y_train`を全て用いた再学習まで行ってくれる．

In [29]:
# Score the randomized-search object on the held-out test data.
gs.score(X=X_test, y=y_test)

0.97368421052631582

In [30]:
import pandas as pd

# Show the search's cv_results_ as a table, one row per evaluated candidate.
pd.DataFrame(data=gs.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_clf__C,param_clf__kernel,param_pca__n_components,param_pca__whiten,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.082976,0.006917,0.945055,0.992317,100.0,linear,20,True,"{'clf__kernel': 'linear', 'pca__whiten': True,...",1,0.914474,0.993399,0.947368,1.0,0.97351,0.983553,0.046458,0.00214,0.024144,0.006758
1,0.02941,0.005966,0.637363,1.0,100.0,rbf,20,False,"{'clf__kernel': 'rbf', 'pca__whiten': False, '...",8,0.638158,1.0,0.638158,1.0,0.635762,1.0,0.003877,0.001928,0.001128,0.0
2,0.027077,0.007591,0.936264,0.957154,0.001,linear,30,False,"{'clf__kernel': 'linear', 'pca__whiten': False...",4,0.947368,0.960396,0.907895,0.963696,0.953642,0.947368,0.008483,0.00456,0.020255,0.007049
3,0.012931,0.001948,0.942857,0.939563,0.01,linear,30,True,"{'clf__kernel': 'linear', 'pca__whiten': True,...",2,0.947368,0.940594,0.940789,0.940594,0.940397,0.9375,0.004131,0.000123,0.003199,0.001459
4,0.015093,0.007347,0.931868,1.0,100.0,rbf,20,True,"{'clf__kernel': 'rbf', 'pca__whiten': True, 'c...",5,0.940789,1.0,0.921053,1.0,0.933775,1.0,0.006033,0.001451,0.008178,0.0
5,0.03352,0.003385,0.925275,1.0,1000.0,linear,30,True,"{'clf__kernel': 'linear', 'pca__whiten': True,...",6,0.901316,1.0,0.927632,1.0,0.94702,1.0,0.004047,0.000332,0.018723,0.0
6,0.015682,0.00131,0.896703,0.895609,0.01,linear,10,True,"{'clf__kernel': 'linear', 'pca__whiten': True,...",7,0.875,0.884488,0.901316,0.910891,0.913907,0.891447,0.003224,0.000109,0.016208,0.011173
7,0.019914,0.003851,0.637363,0.637362,0.001,rbf,30,True,"{'clf__kernel': 'rbf', 'pca__whiten': True, 'c...",8,0.638158,0.636964,0.638158,0.636964,0.635762,0.638158,0.010646,0.000466,0.001128,0.000563
8,0.005695,0.001636,0.942857,0.974726,1.0,rbf,10,True,"{'clf__kernel': 'rbf', 'pca__whiten': True, 'c...",2,0.973684,0.970297,0.927632,0.980198,0.927152,0.973684,0.001343,0.000272,0.021835,0.004109
9,0.011509,0.003726,0.637363,0.637362,0.01,rbf,30,True,"{'clf__kernel': 'rbf', 'pca__whiten': True, 'c...",8,0.638158,0.636964,0.638158,0.636964,0.635762,0.638158,0.002028,0.000732,0.001128,0.000563


グリッドサーチになぜパイプラインを用いると良いのか（前処理を交差検証の各分割の訓練部分だけで学習させ，検証部分の情報が前処理に漏れるのを防げる）については，以下を参照．

[Pythonではじめる機械学習 ―scikit-learnで学ぶ特徴量エンジニアリングと機械学習の基礎](https://www.amazon.co.jp/Python%E3%81%A7%E3%81%AF%E3%81%98%E3%82%81%E3%82%8B%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92-scikit-learn%E3%81%A7%E5%AD%A6%E3%81%B6%E7%89%B9%E5%BE%B4%E9%87%8F%E3%82%A8%E3%83%B3%E3%82%B8%E3%83%8B%E3%82%A2%E3%83%AA%E3%83%B3%E3%82%B0%E3%81%A8%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92%E3%81%AE%E5%9F%BA%E7%A4%8E-Andreas-C-Muller/dp/4873117984/ref=sr_1_1?ie=UTF8&qid=1504934791&sr=8-1&keywords=python%E3%81%A7%E3%81%AF%E3%81%98%E3%82%81%E3%82%8B%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92)のP300~306