## データの準備

In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
housing = load_boston()
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target,
                                                    train_size=0.75, test_size=0.25)

## 例1)PCA -> SVRのPipelineで解析する

### estimatorの設定

In [2]:
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
pl1 = Pipeline([['pca', PCA(random_state=2018)],
                ['svr', SVR(kernel='linear')]])

### parameter candidateの設定

In [3]:
prms1 = {'pca__n_components': [0.1, 0.5, 0.9],
         'svr__C': [0.1, 0.5, 1.],
         'svr__epsilon': [0.05, 0.10, 0.20]}

### Grid Search 実行

In [4]:
from sklearn.model_selection import GridSearchCV
gs1 = GridSearchCV(pl1, prms1, n_jobs=-1, return_train_score=True, cv=5, verbose=10)
gs1.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  78 out of 135 | elapsed:    3.4s remaining:    2.4s
[Parallel(n_jobs=-1)]: Done  92 out of 135 | elapsed:    3.7s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done 106 out of 135 | elapsed:    3.9s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done 120 out of 135 | elapsed:    4.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:    5.2s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[['pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=2018,
  svd_solver='auto', tol=0.0, whiten=False)], ['svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)]]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'pca__n_components': [0.1, 0.5, 0.9], 'svr__C': [0.1, 0.5, 1.0], 'svr__epsilon': [0.05, 0.1, 0.2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=10)

### 精度確認

In [5]:
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
print('R2 = {:.3f}'.format(r2_score(gs1.predict(X_test), y_test)))
print('r = {:.3f} (p = {:.3f})'.format(*pearsonr(gs1.predict(X_test), y_test)))

R2 = -3.075
r = 0.527 (p = 0.000)


## 例2) PCAなしでSVRをする

In [6]:
prms2 = {'C': [0.1, 0.5, 1.],
        'epsilon': [0.05, 0.10, 0.20]}
gs2 = GridSearchCV(SVR(kernel='linear'), prms2, n_jobs=-1, return_train_score=True, cv=5, verbose=10)
gs2.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  45 | elapsed:    0.3s remaining:    4.2s
[Parallel(n_jobs=-1)]: Done   9 out of  45 | elapsed:    0.4s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done  14 out of  45 | elapsed:    0.8s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done  19 out of  45 | elapsed:    1.2s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  24 out of  45 | elapsed:    2.1s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done  29 out of  45 | elapsed:    2.2s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  34 out of  45 | elapsed:    2.6s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  39 out of  45 | elapsed:    3.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    4.8s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': [0.1, 0.5, 1.0], 'epsilon': [0.05, 0.1, 0.2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=10)

In [7]:
print('R2 = {:.3f}'.format(r2_score(gs2.predict(X_test), y_test)))
print('r = {:.3f} (p = {:.3f})'.format(*pearsonr(gs2.predict(X_test), y_test)))

R2 = 0.607
r = 0.832 (p = 0.000)


## 都合のいいPCAクラス

### クラスを作る

### クラスをインポート

In [8]:
from mypca import myPCA

### クラスを試す

#### PCAのちSVRのPipelineを試した時と同じ条件

In [9]:
prms3 = prms1
pl3 = Pipeline([['pca', myPCA(random_state=2018)],
                ['svr', SVR(kernel='linear')]])
gs3 = GridSearchCV(pl3, prms3, n_jobs=-1, return_train_score=True, cv=5, verbose=10)
gs3.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1875s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  36 out of 135 | elapsed:    0.6s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  50 out of 135 | elapsed:    0.9s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done  64 out of 135 | elapsed:    1.0s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  78 out of 135 | elapsed:    1.2s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:    2.8s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[['pca', myPCA(n_components=0, random_state=2018)], ['svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)]]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'pca__n_components': [0.1, 0.5, 0.9], 'svr__C': [0.1, 0.5, 1.0], 'svr__epsilon': [0.05, 0.1, 0.2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=10)

In [10]:
print('R2 = {:.3f}'.format(r2_score(gs3.predict(X_test), y_test)))
print('r = {:.3f} (p = {:.3f})'.format(*pearsonr(gs3.predict(X_test), y_test)))

R2 = -3.075
r = 0.527 (p = 0.000)


#### PCAなしでのSVRも含めてGrid Search

In [11]:
prms4 = {'pca__n_components': [0, 0.1, 0.5, 0.9],
         'svr__C': [0.1, 0.5, 1.],
         'svr__epsilon': [0.05, 0.10, 0.20]}
pl4 = pl3
gs4 = GridSearchCV(pl4, prms4, n_jobs=-1, return_train_score=True, cv=5, verbose=10)
gs4.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 128 out of 180 | elapsed:    4.0s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done 147 out of 180 | elapsed:    4.4s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done 166 out of 180 | elapsed:    4.9s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    6.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[['pca', myPCA(n_components=0, random_state=2018)], ['svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)]]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'pca__n_components': [0, 0.1, 0.5, 0.9], 'svr__C': [0.1, 0.5, 1.0], 'svr__epsilon': [0.05, 0.1, 0.2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=10)

In [12]:
print('R2 = {:.3f}'.format(r2_score(gs4.predict(X_test), y_test)))
print('r = {:.3f} (p = {:.3f})'.format(*pearsonr(gs4.predict(X_test), y_test)))

R2 = 0.607
r = 0.832 (p = 0.000)
