In [1]:
# Hyperparameter search with hyperopt: print the installed library version first.
import hyperopt

print(hyperopt.__version__)

0.2.7


In [3]:
# Goal: find the x and y that minimise x**2 - 20*y.
# Step 1: define the search space.
from hyperopt import hp

search_space = {
      'x': hp.quniform('x', -10, 10, 1) # -10 <= x <= 10 in steps of 1 (bounds can be sampled; values come back as floats)
    , 'y': hp.quniform('y', -15, 15, 1) # -15 <= y <= 15 in steps of 1 (bounds can be sampled; values come back as floats)
}

In [4]:
from hyperopt import STATUS_OK

def objective_func(search_space):
    """Score one sampled point of the toy problem x**2 - 20*y.

    ``search_space`` is the dict handed in for a single trial; its 'x' and
    'y' entries are the sampled values. The computed number is the quantity
    being minimised, so it has to be returned for the search to work.
    """
    x_val, y_val = search_space['x'], search_space['y']
    return x_val ** 2 - 20 * y_val

In [5]:
from hyperopt import fmin, tpe, Trials
import numpy as np

# Create a Trials object; it is handed to fmin to record the tried inputs and their results.
trial_val = Trials()

# Search for the input values that minimise the objective function,
# using 5 evaluations (max_evals=5). rstate is seeded so the run is repeatable.
best_01 = fmin(fn=objective_func, space=search_space, algo=tpe.suggest, max_evals=5
               , trials=trial_val, rstate=np.random.default_rng(seed=0))
print('best:', best_01)

100%|██████████| 5/5 [00:00<00:00, 500.35trial/s, best loss: -224.0]
best: {'x': np.float64(-4.0), 'y': np.float64(12.0)}


In [6]:
# Sanity check: plug the best point of the 5-evaluation run (x=-4, y=12) into x**2 - 20*y.
(-4) ** 2 - 20 * 12

-224

In [7]:
# New Trials object for this run (rebinds trial_val).
trial_val = Trials()

# Re-run the search with max_evals raised to 20 (same seed as the 5-evaluation run).
best_02 = fmin(fn=objective_func, space=search_space, algo=tpe.suggest, max_evals=20
               , trials=trial_val, rstate=np.random.default_rng(seed=0))
print('best:', best_02)

100%|██████████| 20/20 [00:00<00:00, 526.16trial/s, best loss: -296.0]
best: {'x': np.float64(2.0), 'y': np.float64(15.0)}


In [10]:
# New Trials object for this run (rebinds trial_val).
trial_val = Trials()

# Re-run the search with max_evals raised to 200 (same seed as the earlier runs).
# NOTE(review): the result is stored in best_02 again, replacing the 20-evaluation result.
best_02 = fmin(fn=objective_func, space=search_space, algo=tpe.suggest, max_evals=200
               , trials=trial_val, rstate=np.random.default_rng(seed=0))
print('best:', best_02)

100%|██████████| 200/200 [00:01<00:00, 101.94trial/s, best loss: -300.0]
best: {'x': np.float64(0.0), 'y': np.float64(15.0)}


In [11]:
# Hyperparameter optimisation of XGBoost with hyperopt.
# The code below already appeared earlier, so it was not printed in the book.
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
# Suppress all warnings from here on so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

dataset = load_breast_cancer() # breast cancer dataset

# Put the features in a DataFrame and append the label as the last column, 'target'.
cancer_df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
cancer_df['target']= dataset.target
X_features = cancer_df.iloc[:, :-1] # features: every column except the last ('target')
y_label = cancer_df.iloc[:, -1] # labels: the last column ('target')

In [12]:
# Split the data into training (X_tr), validation (X_val) and test (X_test) sets.
# First split: 80% of all rows for training, 20% held out as the test set.
X_train, X_test, y_train, y_test=train_test_split(X_features, y_label, test_size=0.2, random_state=156 )

# Second split: divide the training portion again, 90% for fitting (X_tr) and 10% for validation (X_val).
X_tr, X_val, y_tr, y_val= train_test_split(X_train, y_train, test_size=0.1, random_state=156 )

In [13]:
# Define the search space.
from hyperopt import hp

# max_depth: 5 to 20 in steps of 1; min_child_weight: 1 to 2 in steps of 1.
# colsample_bytree: between 0.5 and 1; learning_rate: between 0.01 and 0.2.
# Both continuous parameters use hp.uniform, i.e. they are sampled uniformly (not from a normal distribution).
xgb_search_space = {'max_depth': hp.quniform('max_depth', 5, 20, 1), # 5 to 20 in steps of 1
                    'min_child_weight': hp.quniform('min_child_weight', 1, 2, 1), # 1 or 2
                    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2), # uniform between 0.01 and 0.2
                    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1) # column subsampling ratio: 0.5 to 1
                   }

In [14]:
from sklearn.model_selection import cross_val_score # 교차 검증 정확도 점수만 출력
from xgboost import XGBClassifier # 분류 모델
from hyperopt import STATUS_OK 

def objective_func(search_space):
    """Objective for the XGBoost search: negative mean 3-fold CV accuracy.

    ``search_space`` is the dict of sampled values for one trial. Every
    sampled value arrives as a float, so the integer-typed XGBClassifier
    parameters (max_depth, min_child_weight) are cast with int().

    Higher accuracy is better, while the search minimises the loss, so the
    mean accuracy is multiplied by -1 before it is returned.

    Returns a dict with the 'loss' and a 'status' of STATUS_OK.
    """
    # n_estimators is kept at 100 to shorten the run time of each trial.
    model_params = {
        'n_estimators': 100,
        'max_depth': int(search_space['max_depth']),
        'min_child_weight': int(search_space['min_child_weight']),
        'learning_rate': search_space['learning_rate'],
        'colsample_bytree': search_space['colsample_bytree'],
        'eval_metric': 'logloss',
    }
    candidate = XGBClassifier(**model_params)

    # One accuracy score per fold (cv=3), computed on X_train / y_train.
    fold_scores = cross_val_score(candidate, X_train, y_train, scoring='accuracy', cv=3)

    mean_accuracy = np.mean(fold_scores)
    return {'loss': -1 * mean_accuracy, 'status': STATUS_OK}

In [15]:
from hyperopt import fmin, tpe, Trials

trial_val = Trials() 
# fmin returns the best hyperparameter values it found as a dict {parameter name: value, ...}.
best = fmin(fn=objective_func,
            space=xgb_search_space,
            algo=tpe.suggest,
            max_evals=50, # maximum number of evaluations
            trials=trial_val, rstate=np.random.default_rng(seed=9))
print('best:', best) # show the best parameter values found

100%|██████████| 50/50 [00:21<00:00,  2.33trial/s, best loss: -0.9714621819449286]
best: {'colsample_bytree': np.float64(0.5036717216371022), 'learning_rate': np.float64(0.19808959234346474), 'max_depth': np.float64(12.0), 'min_child_weight': np.float64(1.0)}


In [16]:
# 모델 생성 성능 평가 : get_clf_eval(원래답, 예측값, 예측확률)
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Args:
        y_test: True labels.
        pred: Predicted class labels. Required; the ``None`` default is kept
            only so the signature stays unchanged.
        pred_proba: Scores or probabilities passed to ``roc_auc_score``.
            When ``None``, the AUC is left out of the printed report.

    Raises:
        ValueError: If ``pred`` is None.
    """
    # Fail early with a clear message instead of an error from inside the metric functions.
    if pred is None:
        raise ValueError('pred is required: pass the predicted class labels')

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)    # accuracy
    precision = precision_score(y_test, pred)  # precision
    recall = recall_score(y_test, pred)        # recall
    f1 = f1_score(y_test, pred)                # harmonic mean of precision and recall
    # AUC needs scores/probabilities; compute it before printing, and only when they were given.
    roc_auc = None if pred_proba is None else roc_auc_score(y_test, pred_proba)

    print('오차 행렬')
    print(confusion)
    # The four spaces before "F1" are part of the original report text and are kept as-is.
    report = '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},    F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        report += ', AUC:{0:.4f}'.format(roc_auc)
    print(report)

In [18]:
# Final model: plug the best values found by the search into XGBClassifier.
# The values in `best` are floats, so the integer-typed parameters are cast with int()
# and the continuous ones are rounded to 5 decimal places.
# NOTE: n_estimators is 400 here, while the search objective used 100.
xgb_wrapper = XGBClassifier(n_estimators=400,
                            learning_rate=round(best['learning_rate'], 5),
                            max_depth=int(best['max_depth']),
                            min_child_weight=int(best['min_child_weight']),
                            colsample_bytree=round(best['colsample_bytree'], 5),
                            early_stopping_rounds=50, eval_metric='logloss'
                           )

# Log loss is reported for both sets during fitting:
# validation_0 = (X_tr, y_tr), validation_1 = (X_val, y_val).
evals = [(X_tr, y_tr), (X_val, y_val)]
xgb_wrapper.fit(X_tr, y_tr,
                eval_set=evals, verbose=True)

preds = xgb_wrapper.predict(X_test) # predicted labels for the test set
pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1] # predicted probability: column 1 of predict_proba

# Print the confusion matrix and metrics on the held-out test set.
get_clf_eval(y_test, preds, pred_proba)

[0]	validation_0-logloss:0.51050	validation_1-logloss:0.54091
[1]	validation_0-logloss:0.40643	validation_1-logloss:0.46300
[2]	validation_0-logloss:0.32978	validation_1-logloss:0.40989
[3]	validation_0-logloss:0.27058	validation_1-logloss:0.37503
[4]	validation_0-logloss:0.22553	validation_1-logloss:0.35076
[5]	validation_0-logloss:0.19073	validation_1-logloss:0.32314
[6]	validation_0-logloss:0.16231	validation_1-logloss:0.30427
[7]	validation_0-logloss:0.13942	validation_1-logloss:0.29606
[8]	validation_0-logloss:0.12122	validation_1-logloss:0.28180
[9]	validation_0-logloss:0.10559	validation_1-logloss:0.27067
[10]	validation_0-logloss:0.09315	validation_1-logloss:0.26813
[11]	validation_0-logloss:0.08209	validation_1-logloss:0.26516
[12]	validation_0-logloss:0.07277	validation_1-logloss:0.26080
[13]	validation_0-logloss:0.06506	validation_1-logloss:0.25770
[14]	validation_0-logloss:0.05796	validation_1-logloss:0.25109
[15]	validation_0-logloss:0.05245	validation_1-logloss:0.24523
[1