In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")


%matplotlib inline

In [2]:
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

## 1. 問題の定義

### 解くべき問題は何か？
- ある腫瘍のデータを見て、悪性(Malignant)か良性(Benign)を区別すると

### 現状の予測手法はあるか？
- 専門家(医者)が診断をしている
    
### どういう問題として処理するか？
- 教師あり学習
- 2クラス分類問題


## 2. データの取得・観察


In [3]:
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

In [4]:
# データセットの説明を確認
print(cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [5]:
# 扱いやすくするためデータフレームに変換する
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)

y = pd.Series(cancer.target)

In [6]:
# 1:悪性(Malignant)  / 0: 良性(Benign)
y.value_counts()

1    357
0    212
dtype: int64

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
mean radius                569 non-null float64
mean texture               569 non-null float64
mean perimeter             569 non-null float64
mean area                  569 non-null float64
mean smoothness            569 non-null float64
mean compactness           569 non-null float64
mean concavity             569 non-null float64
mean concave points        569 non-null float64
mean symmetry              569 non-null float64
mean fractal dimension     569 non-null float64
radius error               569 non-null float64
texture error              569 non-null float64
perimeter error            569 non-null float64
area error                 569 non-null float64
smoothness error           569 non-null float64
compactness error          569 non-null float64
concavity error            569 non-null float64
concave points error       569 non-null float64
symmetry error             569 

### データをざっと見て
- 569行のレコード
- 全ての値がfloat
- 欠損値はない

In [8]:
#  それぞれのカラムにおける統計量をある程度の確認することもできる
X.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


## 3. 評価指標と評価プロトコル

### 評価指標
- クラスのバランスが不均衡(212 - Malignant, 357 - Benign)
- 間違いの重要度が異なる(Malignantを陽性とすると)
    - 良性を悪性と診断してしまう(偽陰性)
    - 悪性を良性と診断してしまう(偽陽性)
- F1-scoreを使ってみよう    


### 評価プロトコル
- データの量が多いわけではないので交差検証で行うべきだろう

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
# 分割後の形状確認(学習データ)
X_train.shape, y_train.shape

((426, 30), (426,))

In [11]:
# 分割後の形状確認(テストデータ)
X_test.shape, y_test.shape

((143, 30), (143,))

## 4. データの前処理・特徴量エンジニアリング

### データの観察と研究
- 今回は割愛
- ひとまずmodelを作って評価してみよう

## 5. モデルの訓練・評価

### モデルの訓練 と モデルの評価
- 交差検証でモデルと評価を行う
- 機械学習アルゴリズムはいくつか試す    

In [12]:
# 交差検証のための関数をimport
from sklearn.model_selection import cross_val_score

### その1: KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier

cv_score_knn = cross_val_score(KNeighborsClassifier(), X_train, y_train, cv=5)

print(f"Avg score: {np.mean(cv_score_knn):.3f}")
print(f"Std: {cv_score_knn.std():.3f}")

Avg score: 0.920
Std: 0.043


###  その2: LogisticRegression

In [14]:
from sklearn.linear_model import LogisticRegression

cv_score_logreg = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)


print(f"Avg score: {np.mean(cv_score_logreg):.3f}")
print(f"Std: {cv_score_logreg.std():.3f}")

Avg score: 0.948
Std: 0.027


### その3: SupportVectorMachine(スケーリングなし)

In [15]:
from sklearn.svm import SVC

cv_score_svm = cross_val_score(SVC(), X_train, y_train, cv=5)


print(f"Avg score: {np.mean(cv_score_svm):.3f}")
print(f"Std: {cv_score_svm.std():.3f}")

Avg score: 0.629
Std: 0.001


### その4: SVM(スケーリングあり)
- SVMはスケーリングされていないデータに弱いことを思い出す！

In [16]:
# スケーリングをしてみる
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

cv_score_svm_scaled = cross_val_score(SVC(), X_train_scaled, y_train, cv=5)

print(f"Avg score: {np.mean(cv_score_svm_scaled):.3f}")
print(f"Std: {cv_score_svm_scaled.std():.3f}")

Avg score: 0.939
Std: 0.012


### その5:  KNN(スケーリングあり)

In [17]:
cv_score_knn_scaled = cross_val_score(
    KNeighborsClassifier(), X_train_scaled, y_train, cv=5)

print(f"Avg score: {np.mean(cv_score_knn_scaled):.3f}")
print(f"Std: {cv_score_knn_scaled.std():.3f}")

Avg score: 0.953
Std: 0.017


### その6:  LogisticRegression(スケーリングあり)

In [18]:
cv_score_logreg_scaled = cross_val_score(
    LogisticRegression(), X_train_scaled, y_train, cv=5)


print(f"Avg score: {np.mean(cv_score_logreg_scaled):.3f}")
print(f"Std: {cv_score_logreg_scaled.std():.3f}")

Avg score: 0.953
Std: 0.015


### それぞれの結果を並べてみる


In [19]:
print(f"Avg score(KNN): {np.mean(cv_score_knn):.3f}")
print(f"Avg score(KNN_scaled): {np.mean(cv_score_knn_scaled):.3f}")

print(f"Avg score(Logreg): {np.mean(cv_score_logreg):.3f}")
print(f"Avg score(Logreg_scaled): {np.mean(cv_score_logreg_scaled):.3f}")

print(f"Avg score(SVM): {np.mean(cv_score_svm):.3f}")
print(f"Avg score(SVM_scaled): {np.mean(cv_score_svm_scaled):.3f}")

Avg score(KNN): 0.920
Avg score(KNN_scaled): 0.953
Avg score(Logreg): 0.948
Avg score(Logreg_scaled): 0.953
Avg score(SVM): 0.629
Avg score(SVM_scaled): 0.939


## チューニングその1: Logreg(スケーリングなし)

In [20]:
from sklearn.model_selection import GridSearchCV

param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100]}

grid_logreg = GridSearchCV(LogisticRegression(), param_grid, cv=5)

grid_logreg.fit(X_train, y_train)

print(f"{grid_logreg.best_score_:.3f}")
print(grid_logreg.best_params_)

0.962
{'C': 100}




## チューニングその2:  SVM(スケーリングあり)

In [21]:
from sklearn.model_selection import GridSearchCV

param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100],
              "gamma": [0.001, 0.01, 0.1, 1, 10, 100]}

grid_svm = GridSearchCV(SVC(), param_grid, cv=5)

grid_svm.fit(X_train_scaled, y_train)

print(f"{grid_svm.best_score_:.3f}")
print(grid_svm.best_params_)

0.979
{'C': 1, 'gamma': 1}


###  混同行列をみてみる

In [22]:
from sklearn.metrics import confusion_matrix

pred_grid_logreg = grid_logreg.predict(X_test)
confusion_matrix(y_test, pred_grid_logreg)

array([[52,  2],
       [ 3, 86]])

In [23]:
X_test_scaled = scaler.transform(X_test)

pred_grid_svm = grid_svm.predict(X_test_scaled)
confusion_matrix(y_test, pred_grid_svm)

array([[52,  2],
       [ 1, 88]])

### F値で評価してみよう

In [24]:
from sklearn.metrics import f1_score

print(f"LogisticRegression: {f1_score(y_test, pred_grid_logreg):.3f}")
print(f"SurpportVectorMachine: {f1_score(y_test, pred_grid_svm):.3f}")

LogisticRegression: 0.972
SurpportVectorMachine: 0.983


### テストデータの最終的な評価

In [25]:
# SVMはスケーリングしていたのでテストデータもスケーリングするのを忘れずに！
X_test_scaled = scaler.transform(X_test)

print(f"{grid_svm.score(X_test_scaled, y_test):.3f}")

0.979
