# モデルの評価とハイパーパラメータのチューニング

データセットの読み込み

In [2]:
import pandas as pd

# Load the WDBC data file straight from the UCI repository URL.
# header=None: no row is used as column names, so the columns get
# integer labels (0, 1, 2, ...).
df = pd.read_csv(
    filepath_or_buffer="https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",
    header=None,
)

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# Preview the first three rows to check the layout of the loaded frame.
df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


元々のクラスラベル表現（MとB）を整数値へとエンコーディング

In [3]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix: every column from the third one onward, as a NumPy array.
# The frame was read with header=None, so column labels equal column
# positions and positional slicing selects the same columns as label 2 onward.
X = df.iloc[:, 2:].values
X

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [4]:
# Column 1 holds the class labels as strings ('M' / 'B'); fit the encoder on
# them and keep only the resulting integer codes in y.
le = LabelEncoder()
y = le.fit_transform(df[1].values)
# Display the original labels in the order the encoder indexed them.
le.classes_

array(['B', 'M'], dtype=object)

In [5]:
# Check which integer code the fitted encoder assigns to each class label.
le.transform(['B', 'M'])

array([0, 1])

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set. stratify=y splits in a
# stratified fashion using the class labels; random_state fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
    stratify=y,
)

・データの標準化  
・データの次元削減  
・モデルの訓練  
を、パイプラインを使うと1回の呼び出しでまとめて（順番に）実行できる！

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Chain the three stages so a single fit/predict call runs them in order:
# standardize -> PCA down to 2 components -> logistic regression.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(random_state=1),
)
# Pipeline.fit returns the fitted pipeline itself, so predict can be chained.
y_pred = pipe_lr.fit(X_train, y_train).predict(X_test)

  return f(*args, **kwds)


In [14]:
# Show the labels the pipeline predicted for X_test.
y_pred

array([1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1])

In [15]:
# Score the fitted pipeline on the held-out test split.
pipe_lr.score(X_test, y_test)

0.956140350877193

### 層化k分割交差検証を試してみる

In [1]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

  return f(*args, **kwds)
  return f(*args, **kwds)


In [14]:
# Build the 10 stratified folds over the training split.
# random_state is intentionally not passed: StratifiedKFold does not shuffle
# by default, so a seed has no effect on the folds, and scikit-learn >= 0.24
# raises a ValueError when random_state is set while shuffle=False.
# NOTE: split() returns a one-shot generator; re-run this cell before
# iterating over the folds a second time.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)
# Collects one accuracy value per fold.
scores = []

In [15]:
# split() produces the folds lazily, so this displays a generator object.
kfold

<generator object _BaseKFold.split at 0x7fd61b6eeed0>

In [17]:
# For every fold: refit the pipeline on the fold's training part, score it on
# the held-out part, and report the class counts of the training part.
# `train` and `test` are index arrays into X_train / y_train.
for k, (train, test) in enumerate(kfold):
    X_fold_train, y_fold_train = X_train[train], y_train[train]
    # fit() returns the fitted pipeline, so it can be scored right away.
    score = pipe_lr.fit(X_fold_train, y_fold_train).score(X_train[test], y_train[test])
    scores.append(score)
    class_dist = np.bincount(y_fold_train)
    print("Fold: {}, Class dist: {}, Acc: {}".format(k + 1, class_dist, score))

Fold: 1, Class dist: [256 153], Acc: 0.9347826086956522
Fold: 2, Class dist: [256 153], Acc: 0.9347826086956522
Fold: 3, Class dist: [256 153], Acc: 0.9565217391304348
Fold: 4, Class dist: [256 153], Acc: 0.9565217391304348
Fold: 5, Class dist: [256 153], Acc: 0.9347826086956522
Fold: 6, Class dist: [257 153], Acc: 0.9555555555555556
Fold: 7, Class dist: [257 153], Acc: 0.9777777777777777
Fold: 8, Class dist: [257 153], Acc: 0.9333333333333333
Fold: 9, Class dist: [257 153], Acc: 0.9555555555555556
Fold: 10, Class dist: [257 153], Acc: 0.9555555555555556




### 6.3 学習曲線と検証曲線