## 4.5 調整

### 共通事前處理

In [1]:
# 取得中文字型
!wget 'https://github.com/flyingpath/electron-hand-dicom/raw/master/TaipeiSansTCBeta-Regular.ttf'

--2021-12-24 02:05:07--  https://github.com/flyingpath/electron-hand-dicom/raw/master/TaipeiSansTCBeta-Regular.ttf
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/flyingpath/electron-hand-dicom/master/TaipeiSansTCBeta-Regular.ttf [following]
--2021-12-24 02:05:08--  https://raw.githubusercontent.com/flyingpath/electron-hand-dicom/master/TaipeiSansTCBeta-Regular.ttf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20659344 (20M) [application/octet-stream]
Saving to: ‘TaipeiSansTCBeta-Regular.ttf’


2021-12-24 02:05:09 (282 MB/s) - ‘TaipeiSansTCBeta-Regular.ttf’ saved [20659344/20659344]



In [2]:
# Common preprocessing shared by the rest of the notebook

# Silence warnings that are not needed here
import warnings
warnings.filterwarnings('ignore')

# Libraries used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm

# Helper for rendering DataFrames
from IPython.display import display

# Register the downloaded font file with matplotlib
fm.fontManager.addfont('./TaipeiSansTCBeta-Regular.ttf')

# --- Display options ---
# NumPy: four decimal places, no scientific notation
np.set_printoptions(precision=4, suppress=True)
# pandas: four decimal places for floats, and never truncate the column list
pd.options.display.float_format = '{:.4f}'.format
pd.options.display.max_columns = None
# matplotlib: default font size and default font family for figures
plt.rcParams.update({
    "font.size": 14,
    'font.family': 'Taipei Sans TC Beta',
})

# Seed reused wherever a random_state is required
random_seed = 123

#### 載入範例資料

In [3]:
# Load the sample data (breast cancer dataset)
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

# x: input features (30 dimensions)
# y: ground-truth labels
x, y = cancer.data, cancer.target

In [4]:
# Split the sample data into training and test sets
from sklearn.model_selection import train_test_split

# Fraction of the samples held out for testing
test_size = 0.1

# Stratify on y so both splits keep the same class proportions
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=test_size,
    random_state=random_seed,
    stratify=y,
)

# Check the shapes: full data, training part, test part
for array in (x, x_train, x_test):
    print(array.shape)

(569, 30)
(512, 30)
(57, 30)


### 4.5.1 選擇演算法

In [5]:
# Candidate classifiers whose accuracy will be compared.
# Every estimator receives the same random_state so results are repeatable.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Logistic regression
algorithm1 = LogisticRegression(random_state=random_seed)

# Support vector machine (kernel method, RBF kernel)
algorithm2 = SVC(random_state=random_seed, kernel='rbf')

# Decision tree
algorithm3 = DecisionTreeClassifier(random_state=random_seed)

# Random forest
algorithm4 = RandomForestClassifier(random_state=random_seed)

# XGBoost
algorithm5 = XGBClassifier(random_state=random_seed)

# Collect the candidates in comparison order
algorithms = [
    algorithm1,
    algorithm2,
    algorithm3,
    algorithm4,
    algorithm5,
]

In [6]:
# Compare the accuracy of each candidate algorithm
for algorithm in algorithms:
    # Train on the training split (fit returns the estimator itself),
    # then measure accuracy on the held-out split
    score = algorithm.fit(x_train, y_train).score(x_test, y_test)

    # Class name of the estimator, used as its label in the report
    name = algorithm.__class__.__name__

    # Report accuracy next to the algorithm name
    print(f'score: {score:.4f}  {name}')

score: 0.9649  LogisticRegression
score: 0.8947  SVC
score: 0.9474  DecisionTreeClassifier
score: 0.9298  RandomForestClassifier
score: 0.9825  XGBClassifier


### 4.5.2 最佳化超參數

In [15]:
# Inspect the hyperparameters of an RBF-kernel SVC left at its defaults;
# the returned dict is shown as the cell output
algorithm = SVC(random_state=random_seed, kernel='rbf')
algorithm.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 123,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [8]:
# Tune gamma: refit the same SVC once per candidate value
gammas = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
algorithm = SVC(kernel='rbf', random_state=random_seed)

for gamma in gammas:
    # Swap in the candidate value, then train and score in one chain
    algorithm.set_params(gamma=gamma)
    score = algorithm.fit(x_train, y_train).score(x_test, y_test)
    print(f'score: {score:.4f}  gamma: {gamma}')

score: 0.6316  gamma: 100
score: 0.6316  gamma: 10
score: 0.6316  gamma: 1
score: 0.6316  gamma: 0.1
score: 0.6316  gamma: 0.01
score: 0.9474  gamma: 0.001
score: 0.9474  gamma: 0.0001
score: 0.9474  gamma: 1e-05


In [9]:
# Tune C, keeping gamma fixed at 0.001 (the value picked in the gamma sweep)
Cs = [1, 10, 100, 1000, 10000]

for C in Cs:
    # Build a fresh estimator for each candidate C
    algorithm = SVC(
        C=C,
        gamma=0.001,
        kernel='rbf',
        random_state=random_seed,
    )
    score = algorithm.fit(x_train, y_train).score(x_test, y_test)
    print(f'score: {score:.4f}  C: {C}')

score: 0.9474  C: 1
score: 0.9298  C: 10
score: 0.9298  C: 100
score: 0.9298  C: 1000
score: 0.9298  C: 10000


### 4.5.3 交叉驗證

In [10]:
# Cross-validate one specific algorithm
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Estimator under evaluation
algorithm = SVC(
    C=1,
    gamma=0.001,
    kernel='rbf',
    random_state=random_seed,
)

# StratifiedKFold keeps the label distribution balanced across the 3 folds
stratifiedkfold = StratifiedKFold(n_splits=3)

# One score per fold
scores = cross_val_score(algorithm, x_train, y_train, cv=stratifiedkfold)

# Average over the folds
mean = scores.mean()

# Show the mean score followed by the per-fold scores
print(f'平均分數 : {mean:.4f}  個別分數 : {scores}')

平均分數 : 0.9141  個別分數 : [0.8889 0.9181 0.9353]


In [11]:
# Rebuild the list of candidate algorithms; the SVC now uses gamma=0.001, C=1
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

algorithm1 = LogisticRegression(random_state=random_seed)
algorithm2 = SVC(
    C=1,
    gamma=0.001,
    kernel='rbf',
    random_state=random_seed,
)
algorithm3 = DecisionTreeClassifier(random_state=random_seed)
algorithm4 = RandomForestClassifier(random_state=random_seed)
algorithm5 = XGBClassifier(random_state=random_seed)

algorithms = [
    algorithm1,
    algorithm2,
    algorithm3,
    algorithm4,
    algorithm5,
]

In [12]:
# Compare the cross-validated accuracy of each candidate algorithm
from sklearn.model_selection import StratifiedKFold, cross_val_score

# StratifiedKFold keeps the label distribution balanced across the 3 folds
stratifiedkfold = StratifiedKFold(n_splits=3)

for algorithm in algorithms:
    # Per-fold scores for this estimator
    scores = cross_val_score(
        algorithm, x_train, y_train, cv=stratifiedkfold
    )
    # Mean over folds, reported next to the estimator's class name
    score = scores.mean()
    name = type(algorithm).__name__
    print(f'平均分數 : {score:.4f}  個別分數 : {scores}  {name}')

平均分數 : 0.9473  個別分數 : [0.9415 0.9474 0.9529]  LogisticRegression
平均分數 : 0.9141  個別分數 : [0.8889 0.9181 0.9353]  SVC
平均分數 : 0.9062  個別分數 : [0.8713 0.9415 0.9059]  DecisionTreeClassifier
平均分數 : 0.9629  個別分數 : [0.9649 0.9591 0.9647]  RandomForestClassifier
平均分數 : 0.9590  個別分數 : [0.9591 0.9649 0.9529]  XGBClassifier


### 4.5.4 網格搜尋

In [13]:
# Search for the best parameters by combining grid search with cross-validation
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Candidate values for each hyperparameter
params = {
    'C': [1, 10, 100, 1000, 10000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
}

algorithm = SVC(random_state=random_seed)
stratifiedkfold = StratifiedKFold(n_splits=3)

# Evaluate every parameter combination with 3-fold stratified CV
gs = GridSearchCV(algorithm, params, cv=stratifiedkfold)
gs.fit(x_train, y_train)

# Take the best model found and classify the held-out data with it
best = gs.best_estimator_
best_pred = best.predict(x_test)
print(best)

SVC(C=1000, gamma=1e-05, random_state=123)


In [14]:
from sklearn.metrics import confusion_matrix

# Accuracy of the best model on the held-out data
score = best.score(x_test, y_test)
print(f'分數 : {score:.4f}')

# Confusion matrix of true labels against the best model's predictions,
# printed after a blank separator line and a heading
matrix = confusion_matrix(y_test, best_pred)
print()
print('混淆矩陣')
print(matrix)

分數 : 0.9825

混淆矩陣
[[20  1]
 [ 0 36]]
