In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_breast_cancer
# Load the breast cancer dataset bundled with scikit-learn
# (its .data and .target attributes are used below).
dataset = load_breast_cancer()

In [3]:
# Feature matrix (x) and target labels (t) taken from the loaded dataset
x, t = dataset.data, dataset.target

In [4]:
# Check the dimensions of the feature matrix and the label vector
x.shape, t.shape

((569, 30), (569,))

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the remainder (train + validation)
# is split again in the next cell. The seed is fixed so the split is repeatable.
x_train_val, x_test, t_train_val, t_test = train_test_split(x, t, test_size=0.2, random_state=1)

In [6]:
# Validation set : training set = 30 : 70
x_train, x_val, t_train, t_val = train_test_split(x_train_val, t_train_val, test_size=0.3, random_state=1)

In [7]:
# Shapes of the training, validation and test feature matrices
x_train.shape, x_val.shape, x_test.shape

((318, 30), (137, 30), (114, 30))

In [8]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree: no hyperparameters set apart from a fixed seed
dtree = DecisionTreeClassifier(random_state=0)

In [9]:
# Fit the baseline tree on the training split only
dtree.fit(x_train, t_train)

In [10]:
# Compare the score on the training split with the score on the validation split
print('train score : ', dtree.score(x_train, t_train))
print('validation score : ', dtree.score(x_val, t_val))

train score :  1.0
validation score :  0.927007299270073


In [11]:
# Define the model with explicitly chosen hyperparameters
# (depth capped at 10, at least 30 samples required to split a node)
dtree = DecisionTreeClassifier(max_depth=10, min_samples_split=30, random_state=0)

# Refit on the same training split as the baseline
dtree.fit(x_train, t_train)

In [12]:
# Scores of the manually tuned tree on the training and validation splits
print('train score : ', dtree.score(x_train, t_train))
print('validation score : ', dtree.score(x_val, t_val))

train score :  0.9308176100628931
validation score :  0.9562043795620438


In [13]:
# Score of the manually tuned tree on the held-out test split
print('test score :', dtree.score(x_test, t_test))

test score : 0.9298245614035088


In [14]:
# Import the GridSearchCV class
from sklearn.model_selection import GridSearchCV

In [15]:
# Define the algorithm used for training
estimator = DecisionTreeClassifier(random_state=0)

In [16]:
# Hyperparameters to search and the candidate values for each
param_grid = [dict(max_depth=[3, 20, 50], min_samples_split=[3, 20, 30])]

In [17]:
# Number of splits of the dataset used for cross-validation
cv = 5

In [18]:
# Define the model using the GridSearchCV class
tuned_model = GridSearchCV(estimator=estimator,
                           param_grid=param_grid,
                           cv=cv, return_train_score=False)

In [19]:
# Train and validate the model (cross-validation runs on the train + validation data)
tuned_model.fit(x_train_val, t_train_val)

In [20]:
# Inspect the validation results (transposed: one column per parameter combination)
pd.DataFrame(tuned_model.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,0.005405,0.004438,0.004515,0.005282,0.005267,0.005161,0.005318,0.006254,0.00883
std_fit_time,0.00174,0.000122,0.000169,0.00026,0.000261,0.000253,0.000261,0.000971,0.002193
mean_score_time,0.000822,0.00072,0.000727,0.000745,0.000788,0.000745,0.000728,0.001129,0.001374
std_score_time,0.00016,0.000031,0.000057,0.000027,0.000125,0.000036,0.000019,0.000312,0.000144
param_max_depth,3,3,3,20,20,20,50,50,50
param_min_samples_split,3,20,30,3,20,30,3,20,30
params,"{'max_depth': 3, 'min_samples_split': 3}","{'max_depth': 3, 'min_samples_split': 20}","{'max_depth': 3, 'min_samples_split': 30}","{'max_depth': 20, 'min_samples_split': 3}","{'max_depth': 20, 'min_samples_split': 20}","{'max_depth': 20, 'min_samples_split': 30}","{'max_depth': 50, 'min_samples_split': 3}","{'max_depth': 50, 'min_samples_split': 20}","{'max_depth': 50, 'min_samples_split': 30}"
split0_test_score,0.923077,0.912088,0.912088,0.956044,0.912088,0.912088,0.956044,0.912088,0.912088
split1_test_score,0.901099,0.901099,0.901099,0.912088,0.901099,0.901099,0.912088,0.901099,0.901099
split2_test_score,0.934066,0.934066,0.934066,0.923077,0.934066,0.934066,0.923077,0.934066,0.934066


In [21]:
# Second grid search: same estimator and fold count, narrower candidate values
cv = 5
estimator = DecisionTreeClassifier(random_state=0)
param_grid = [dict(max_depth=[5, 10, 15], min_samples_split=[10, 12, 15])]

In [22]:
# Define the model
tuned_model = GridSearchCV(estimator=estimator,
                           param_grid=param_grid,
                           cv=cv, return_train_score=False)

# Train the model
tuned_model.fit(x_train_val, t_train_val)

In [23]:
# Inspect the training results (transposed: one column per parameter combination)
pd.DataFrame(tuned_model.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,0.01062,0.00761,0.009406,0.012019,0.008548,0.00897,0.007527,0.008162,0.007285
std_fit_time,0.003442,0.000909,0.004472,0.003985,0.002883,0.002352,0.000434,0.000798,0.000228
mean_score_time,0.001502,0.001287,0.002715,0.001339,0.001316,0.001498,0.001341,0.001894,0.001383
std_score_time,0.00016,0.000024,0.00228,0.000068,0.000165,0.000313,0.000048,0.001095,0.000084
param_max_depth,5,5,5,10,10,10,15,15,15
param_min_samples_split,10,12,15,10,12,15,10,12,15
params,"{'max_depth': 5, 'min_samples_split': 10}","{'max_depth': 5, 'min_samples_split': 12}","{'max_depth': 5, 'min_samples_split': 15}","{'max_depth': 10, 'min_samples_split': 10}","{'max_depth': 10, 'min_samples_split': 12}","{'max_depth': 10, 'min_samples_split': 15}","{'max_depth': 15, 'min_samples_split': 10}","{'max_depth': 15, 'min_samples_split': 12}","{'max_depth': 15, 'min_samples_split': 15}"
split0_test_score,0.967033,0.923077,0.912088,0.967033,0.923077,0.912088,0.967033,0.923077,0.912088
split1_test_score,0.912088,0.901099,0.901099,0.912088,0.901099,0.901099,0.912088,0.901099,0.901099
split2_test_score,0.923077,0.934066,0.934066,0.923077,0.934066,0.934066,0.923077,0.934066,0.934066


In [24]:
# Check the hyperparameters that achieved the highest predictive accuracy
tuned_model.best_params_

{'max_depth': 5, 'min_samples_split': 10}

In [25]:
# Take over the model that achieved the highest predictive accuracy
best_model = tuned_model.best_estimator_

# Evaluate the model: score on the train + validation data, then on the test data
print(best_model.score(x_train_val, t_train_val))
print(best_model.score(x_test, t_test))

0.9934065934065934
0.956140350877193


In [26]:
# Import the RandomizedSearchCV class
from sklearn.model_selection import RandomizedSearchCV

In [27]:
# Algorithm used for training
estimator = DecisionTreeClassifier(random_state=0)

In [28]:
# Quick look at how range(start, stop, step) expands (stop is exclusive)
list(range(1, 10, 2))

[1, 3, 5, 7, 9]

In [29]:
# Ranges of hyperparameter values the randomized search samples from
param_distributions = dict(
    max_depth=list(range(5, 100, 2)),        # odd values 5, 7, ..., 99
    min_samples_split=list(range(2, 50)),    # every integer 2, 3, ..., 49
)

In [30]:
# Number of trials (parameter settings sampled by the randomized search)
n_iter = 100

In [31]:
# Number of cross-validation splits passed to the search below
cv = 5

In [32]:
# Define the model (random_state fixes which parameter settings are sampled)
tuned_model = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=param_distributions,
    n_iter=n_iter, cv=cv,
    random_state=0, return_train_score=False
)

In [33]:
# Train and validate the model
tuned_model.fit(x_train_val, t_train_val)

In [34]:
# Inspect the training results (sorted by rank, best score first)
pd.DataFrame(tuned_model.cv_results_).sort_values('rank_test_score').T

Unnamed: 0,47,77,82,90,42,19,28,12,11,62,...,40,41,98,50,55,58,60,67,31,99
mean_fit_time,0.005219,0.005322,0.005177,0.005502,0.006311,0.009335,0.007456,0.009238,0.007605,0.0057,...,0.00534,0.005181,0.005022,0.005171,0.005129,0.00521,0.005242,0.005151,0.012287,0.005249
std_fit_time,0.000219,0.000238,0.000234,0.000539,0.000993,0.002759,0.000423,0.003781,0.000955,0.000412,...,0.000571,0.000317,0.000249,0.000251,0.000242,0.000408,0.000412,0.000303,0.006822,0.000475
mean_score_time,0.000779,0.000905,0.000739,0.00086,0.000932,0.001387,0.001262,0.002538,0.001312,0.000884,...,0.000883,0.00072,0.000681,0.000839,0.000811,0.000807,0.000893,0.000871,0.001842,0.000796
std_score_time,0.000035,0.00022,0.000048,0.00011,0.00019,0.000064,0.000058,0.002088,0.000082,0.000113,...,0.000294,0.00005,0.000016,0.000127,0.000027,0.000056,0.000159,0.000105,0.001182,0.000141
param_min_samples_split,10,10,4,4,7,9,11,2,8,7,...,49,31,45,27,43,36,36,47,44,39
param_max_depth,23,65,95,39,15,37,7,87,29,7,...,87,23,19,99,27,27,47,75,95,87
params,"{'min_samples_split': 10, 'max_depth': 23}","{'min_samples_split': 10, 'max_depth': 65}","{'min_samples_split': 4, 'max_depth': 95}","{'min_samples_split': 4, 'max_depth': 39}","{'min_samples_split': 7, 'max_depth': 15}","{'min_samples_split': 9, 'max_depth': 37}","{'min_samples_split': 11, 'max_depth': 7}","{'min_samples_split': 2, 'max_depth': 87}","{'min_samples_split': 8, 'max_depth': 29}","{'min_samples_split': 7, 'max_depth': 7}",...,"{'min_samples_split': 49, 'max_depth': 87}","{'min_samples_split': 31, 'max_depth': 23}","{'min_samples_split': 45, 'max_depth': 19}","{'min_samples_split': 27, 'max_depth': 99}","{'min_samples_split': 43, 'max_depth': 27}","{'min_samples_split': 36, 'max_depth': 27}","{'min_samples_split': 36, 'max_depth': 47}","{'min_samples_split': 47, 'max_depth': 75}","{'min_samples_split': 44, 'max_depth': 95}","{'min_samples_split': 39, 'max_depth': 87}"
split0_test_score,0.967033,0.967033,0.967033,0.967033,0.967033,0.967033,0.967033,0.956044,0.967033,0.967033,...,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088
split1_test_score,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.901099,0.912088,0.912088,0.912088,...,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099
split2_test_score,0.923077,0.923077,0.912088,0.912088,0.912088,0.912088,0.923077,0.923077,0.912088,0.912088,...,0.945055,0.934066,0.945055,0.934066,0.945055,0.945055,0.945055,0.945055,0.945055,0.945055


In [35]:
# Check the hyperparameters that achieved the highest predictive accuracy
tuned_model.best_params_

{'min_samples_split': 10, 'max_depth': 23}

In [36]:
# Take over the model that achieved the highest predictive accuracy
best_model = tuned_model.best_estimator_

In [37]:
# Evaluate the model: score on the train + validation data, then on the test data
print(best_model.score(x_train_val, t_train_val))
print(best_model.score(x_test, t_test))

0.9934065934065934
0.956140350877193


In [38]:
# optuna のインストール
!pip install optuna

Collecting optuna
  Downloading optuna-3.2.0-py3-none-any.whl (390 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.6/390.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1 (from optuna)
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.11.1 cmaes-0.9.1 colorlog-6.7.0 optuna-3.2.0


In [39]:
import optuna

In [40]:
from sklearn.model_selection import cross_val_score

In [41]:
def objective(trial, x, t, cv, random_state=0):
    """Optuna objective: mean cross-validated score of a decision tree.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        x: Feature matrix forwarded to ``cross_val_score``.
        t: Target labels forwarded to ``cross_val_score``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
        random_state: Seed for the decision tree, so that a given pair of
            hyperparameters always yields the same score. Defaults to 0,
            the seed used for the trees defined earlier in this notebook.

    Returns:
        The mean of the per-fold scores returned by ``cross_val_score``.
    """
    # 1. Search range for each hyperparameter (integers from 2 to 100)
    max_depth = trial.suggest_int('max_depth', 2, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 100)

    # 2. Algorithm used for training; seeded so trials are comparable and
    #    repeatable instead of varying with the tree's internal randomness
    estimator = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state,
    )

    # 3. Show the sampled parameters, run cross-validation, return the mean score
    print('Current_params : ', trial.params)
    accuracy = cross_val_score(estimator, x, t, cv=cv).mean()
    return accuracy

In [42]:
# Create the study object (maximize the objective).
# The sampler is seeded so that the sequence of suggested hyperparameters,
# and therefore the best trial, is reproducible from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=0),
)

[I 2023-07-10 16:43:58,641] A new study created in memory with name: no-name-013aced5-03e2-439d-b1d3-064295a8fbb6


In [43]:
# K for K-fold cross-validation
cv = 5
# Optimize the objective function (10 trials on the train + validation data)
study.optimize(lambda trial: objective(trial, x_train_val, t_train_val, cv), n_trials=10)

# Show the full record of the best trial
print(study.best_trial)

[I 2023-07-10 16:44:09,269] Trial 0 finished with value: 0.9252747252747253 and parameters: {'max_depth': 74, 'min_samples_split': 25}. Best is trial 0 with value: 0.9252747252747253.
[I 2023-07-10 16:44:09,308] Trial 1 finished with value: 0.9274725274725275 and parameters: {'max_depth': 62, 'min_samples_split': 19}. Best is trial 1 with value: 0.9274725274725275.
[I 2023-07-10 16:44:09,346] Trial 2 finished with value: 0.9472527472527472 and parameters: {'max_depth': 43, 'min_samples_split': 3}. Best is trial 2 with value: 0.9472527472527472.
[I 2023-07-10 16:44:09,385] Trial 3 finished with value: 0.9186813186813187 and parameters: {'max_depth': 57, 'min_samples_split': 84}. Best is trial 2 with value: 0.9472527472527472.
[I 2023-07-10 16:44:09,422] Trial 4 finished with value: 0.9186813186813187 and parameters: {'max_depth': 44, 'min_samples_split': 57}. Best is trial 2 with value: 0.9472527472527472.


Current_params :  {'max_depth': 74, 'min_samples_split': 25}
Current_params :  {'max_depth': 62, 'min_samples_split': 19}
Current_params :  {'max_depth': 43, 'min_samples_split': 3}
Current_params :  {'max_depth': 57, 'min_samples_split': 84}
Current_params :  {'max_depth': 44, 'min_samples_split': 57}
Current_params :  {'max_depth': 62, 'min_samples_split': 33}


[I 2023-07-10 16:44:09,491] Trial 5 finished with value: 0.9208791208791209 and parameters: {'max_depth': 62, 'min_samples_split': 33}. Best is trial 2 with value: 0.9472527472527472.
[I 2023-07-10 16:44:09,573] Trial 6 finished with value: 0.9208791208791209 and parameters: {'max_depth': 19, 'min_samples_split': 44}. Best is trial 2 with value: 0.9472527472527472.
[I 2023-07-10 16:44:09,668] Trial 7 finished with value: 0.9428571428571428 and parameters: {'max_depth': 99, 'min_samples_split': 11}. Best is trial 2 with value: 0.9472527472527472.


Current_params :  {'max_depth': 19, 'min_samples_split': 44}
Current_params :  {'max_depth': 99, 'min_samples_split': 11}
Current_params :  {'max_depth': 19, 'min_samples_split': 18}


[I 2023-07-10 16:44:09,722] Trial 8 finished with value: 0.9274725274725275 and parameters: {'max_depth': 19, 'min_samples_split': 18}. Best is trial 2 with value: 0.9472527472527472.
[I 2023-07-10 16:44:09,772] Trial 9 finished with value: 0.9186813186813187 and parameters: {'max_depth': 49, 'min_samples_split': 75}. Best is trial 2 with value: 0.9472527472527472.


Current_params :  {'max_depth': 49, 'min_samples_split': 75}
FrozenTrial(number=2, state=TrialState.COMPLETE, values=[0.9472527472527472], datetime_start=datetime.datetime(2023, 7, 10, 16, 44, 9, 309528), datetime_complete=datetime.datetime(2023, 7, 10, 16, 44, 9, 346734), params={'max_depth': 43, 'min_samples_split': 3}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=100, log=False, low=2, step=1), 'min_samples_split': IntDistribution(high=100, log=False, low=2, step=1)}, trial_id=2, value=None)


In [44]:
# Check the hyperparameters that achieved the highest predictive accuracy
study.best_params

{'max_depth': 43, 'min_samples_split': 3}

In [45]:
# Define the model with the best hyperparameters found by the study.
# The seed is fixed (random_state=0, as for the trees defined earlier in this
# notebook) so that the scores printed below are the same on every run.
best_model = DecisionTreeClassifier(**study.best_params, random_state=0)

# Train the model
best_model.fit(x_train_val, t_train_val)

# Evaluate the model: score on the train + validation data, then on the test data
print(best_model.score(x_train_val, t_train_val))
print(best_model.score(x_test, t_test))

1.0
0.9385964912280702
