<a href="https://colab.research.google.com/github/jason96819/Studying/blob/main/sklearn/7_Hyper_parameter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 하이퍼파라미터 튜닝

In [None]:
import numpy as np
import pandas as pd

## GridSearchCV

In [None]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Load the iris dataset and hold out 30% of the samples for the final test.
# random_state pins the shuffle so the split is the same on every run.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=42,
)

# Display the shape of each partition as the cell output.
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((105, 4), (45, 4), (105,), (45,))

In [None]:
# Base estimator for the grid search. The seed is fixed because the tree
# permutes features at each split; without it, repeated runs can produce
# different CV scores and a different "best" parameter set.
dt_clf = DecisionTreeClassifier(random_state=42)

# Candidate values to try: 3 depths x 2 split thresholds = 6 combinations.
parameters = {'max_depth': [1, 2, 3],
              'min_samples_split': [2, 3]
              }

In [None]:
# Exhaustive search over `parameters` using 3-fold cross-validation.
# refit=True retrains the best configuration on the full training set afterwards.
grid_tree = GridSearchCV(
    estimator=dt_clf,
    param_grid=parameters,
    cv=3,
    refit=True,
)

In [None]:
# Run the search on the training split only; the test split stays untouched.
grid_tree.fit(X=X_train, y=y_train)

In [None]:
# Tabulate the raw cross-validation results, one row per parameter combination.
scores_df = pd.DataFrame(grid_tree.cv_results_)

# Show only the columns needed to compare candidates: the parameters tried,
# the mean score and its rank, and the individual scores of the 3 folds.
scores_df.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
]]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.619048,5,0.628571,0.571429,0.657143
1,"{'max_depth': 1, 'min_samples_split': 3}",0.619048,5,0.628571,0.571429,0.657143
2,"{'max_depth': 2, 'min_samples_split': 2}",0.92381,1,0.971429,0.857143,0.942857
3,"{'max_depth': 2, 'min_samples_split': 3}",0.904762,4,0.914286,0.857143,0.942857
4,"{'max_depth': 3, 'min_samples_split': 2}",0.92381,1,0.942857,0.885714,0.942857
5,"{'max_depth': 3, 'min_samples_split': 3}",0.92381,1,0.942857,0.885714,0.942857


In [None]:
# Report the winning parameter combination and its mean cross-validated score,
# rounded to 4 decimal places for display.
best_cv_score = np.round(grid_tree.best_score_, 4)
print('GridSearchCV의 최적 파라미터 :', grid_tree.best_params_)
print('GridSearchCV의 최고 정확도 :', best_cv_score)

GridSearchCV의 최적 파라미터 : {'max_depth': 2, 'min_samples_split': 2}
GridSearchCV의 최고 정확도 : 0.9238


In [None]:
# best_estimator_ is available because the search was built with refit=True.
estimator = grid_tree.best_estimator_

# Score the refitted model on the held-out test split.
pred = estimator.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print('테스트 데이터 세트 정확도 :', np.round(test_accuracy, 4))

테스트 데이터 세트 정확도 : 0.9778


## 베이지안 최적화

In [None]:
pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-23.9.7-py3-none-any.whl (23 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.9.7 scikit-optimize-0.9.0


In [None]:
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Reuse the iris dataset loaded earlier in the notebook.
X, y = iris.data, iris.target

# Hold out 20% of the samples; the search below only sees the training split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter search space. In skopt's shorthand a (low, high) tuple is a
# numeric range and a list is a set of categorical choices.
param_dist = {
    'n_estimators': (10, 200),
    'max_depth': (1, 20),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 20),
    'bootstrap': [True, False]
}

# Seed the forest so each candidate's score does not vary between runs.
model = RandomForestClassifier(random_state=42)

# Bayesian optimisation: 10 sampled configurations, each scored with 5-fold CV.
# random_state seeds the optimiser so the sequence of candidates is reproducible.
bayes_search = BayesSearchCV(model, param_dist, n_iter=10, cv=5, n_jobs=-1, random_state=42)
bayes_search.fit(X_train, y_train)

# Report the best configuration found and its mean cross-validated score.
print("Best Parameters:", bayes_search.best_params_)
print("Best Score:", bayes_search.best_score_)

Best Parameters: OrderedDict([('bootstrap', False), ('max_depth', 10), ('min_samples_leaf', 5), ('min_samples_split', 16), ('n_estimators', 153)])
Best Score: 0.95


## Optuna

In [None]:
pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.8/226.8 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.12.1 colorlog-6.7.0 optuna-3.4.0


In [None]:
import optuna
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier

# Split once, outside the objective: every trial is then scored on the same
# data, and the held-out test split is never used to choose hyperparameters.
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def objective(trial):
    """Return the cross-validated error rate for one Optuna trial.

    Args:
        trial: The Optuna trial used to sample the random-forest hyperparameters.

    Returns:
        float: 1 - mean 5-fold cross-validated accuracy on the training split
        (lower is better).
    """
    model = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 10, 200),
        max_depth=trial.suggest_int('max_depth', 1, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 20),
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
        random_state=42,  # so a trial's score depends only on its hyperparameters
    )

    # Score with cross-validation on the training data only. Scoring trials on
    # X_test would leak the test set into model selection and make the final
    # test accuracy optimistic.
    accuracy = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()

    # Optuna can either minimise or maximise; this study minimises the error
    # rate, so return 1 - accuracy.
    return float(1 - accuracy)


# Seed the sampler so the sequence of suggested hyperparameters is reproducible.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=10)  # run 10 trials

# Report the best hyperparameters and the lowest objective value found.
print("Best Parameters:", study.best_params)
print("Best Objective Value:", study.best_value)

# Only now touch the test split: refit the best configuration on the full
# training split and report its accuracy on unseen data.
best_model = RandomForestClassifier(**study.best_params, random_state=42)
best_model.fit(X_train, y_train)
print("Test Accuracy:", best_model.score(X_test, y_test))

[I 2023-11-23 08:55:42,195] A new study created in memory with name: no-name-2e1c5e21-757f-42ad-8350-5c019c571e9f
[I 2023-11-23 08:55:42,340] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 91, 'max_depth': 5, 'min_samples_split': 17, 'min_samples_leaf': 6, 'bootstrap': True}. Best is trial 0 with value: 0.0.
[I 2023-11-23 08:55:42,478] Trial 1 finished with value: 0.0 and parameters: {'n_estimators': 90, 'max_depth': 9, 'min_samples_split': 13, 'min_samples_leaf': 10, 'bootstrap': True}. Best is trial 0 with value: 0.0.
[I 2023-11-23 08:55:42,528] Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 33, 'max_depth': 15, 'min_samples_split': 19, 'min_samples_leaf': 7, 'bootstrap': False}. Best is trial 0 with value: 0.0.
[I 2023-11-23 08:55:42,627] Trial 3 finished with value: 0.033333333333333326 and parameters: {'n_estimators': 63, 'max_depth': 1, 'min_samples_split': 13, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 0 with value: 0.0.
[I 202

Best Parameters: {'n_estimators': 91, 'max_depth': 5, 'min_samples_split': 17, 'min_samples_leaf': 6, 'bootstrap': True}
Best Objective Value: 0.0
