# 베이지안 최적화 개요와 HyperOpt 사용법

In [1]:
import hyperopt

print(hyperopt.__version__)

0.2.7


In [2]:
from hyperopt import hp

# -10 ~ 10까지 1간격을 가지는 입력 변수 x 집합값과 -15 ~ 15까지 1간격을 가지는 입력 변수  y 집합값 설정.
search_space = {'x' : hp.quniform('x', -10, 10, 1), 'y':hp.quniform('y', -15, 15, 1)}

In [3]:
search_space
# x, y는 객체로 되어 있음

{'x': <hyperopt.pyll.base.Apply at 0x1ce007a6b20>,
 'y': <hyperopt.pyll.base.Apply at 0x1ce7deaa700>}

In [4]:
search_space['x']

<hyperopt.pyll.base.Apply at 0x1ce007a6b20>

In [5]:
from hyperopt import STATUS_OK

# 목적 함수를 생성. 입력 변수값과 입력 변수 검색 범위를 가지는 딕셔너리를 인자로 받고, 특정 값을 반환
def objective_func(search_space):
    x = search_space['x']
    y = search_space['y']
    retval = x**2 - 20*y
    
    return retval # 권장되는 return 방식 : return {'loss': retval, 'status':STATUS_OK}

In [6]:
# tpe는 "tree structured parzen estimator"을 뜻함
# 일반적으로 베이지안 최적화는 가우시안 최적화를 사용하지만 hyperopt는 tpe 알고리즘을 사용 
from hyperopt import fmin, tpe, Trials
import numpy as np

# 입력 결괏값을 저장한 Trials 객체값 생성.
trial_val = Trials()

# 목적 함수의 최솟값을 반환하는 최적 입력 변숫값을 5번의 입력값 시도(max_evals=5)로 찾아냄. 순차적으로 안들어가고 랜덤하게 들어감
# algo = tpe.suggest는 이 형태 그대로 들어감
# iteration이 돌아갈 때 마다 들어간 입력값과 결과값이 모두 trials에 저장됨
# "rstate=np.random.default_rng(seed=0)"을 안해주면 값이 고정이 안됨
    # 하지만 실제로는 rstate를 안해주는게 결과가 더 좋음. 실제에서는 rstate 사용을 권장하지 않음
best_01 = fmin(fn=objective_func, space = search_space, algo = tpe.suggest, 
              max_evals = 5, trials = trial_val, rstate=np.random.default_rng(seed=0))
print('best:', best_01)

# "x = -4, y = 12"일 때 objeci function은 최소가 됨 

100%|████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1000.12trial/s, best loss: -224.0]
best: {'x': -4.0, 'y': 12.0}


In [7]:
trial_val = Trials()

# max_evals를 20회로 늘려서 재테스트
best_02 = fmin(fn = objective_func, space = search_space, algo = tpe.suggest,
              max_evals = 20, trials = trial_val, rstate=np.random.default_rng(seed=0))

print('best:', best_02)

100%|██████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 1193.75trial/s, best loss: -296.0]
best: {'x': 2.0, 'y': 15.0}


In [8]:
trial_val

<hyperopt.base.Trials at 0x1ce007c8040>

In [9]:
# fmin( )에 인자로 들어가는 Trials 객체의 result 속성에 파이썬 리스트로 목적 함수 반환값들이 저장됨
# 리스트 내부의 개별 원소는 {'loss':함수 반환값, 'status':반환 상태값} 와 같은 딕셔너리임. 
print(type(trial_val.results))
print("")
print(trial_val.results)

<class 'list'>

[{'loss': -64.0, 'status': 'ok'}, {'loss': -184.0, 'status': 'ok'}, {'loss': 56.0, 'status': 'ok'}, {'loss': -224.0, 'status': 'ok'}, {'loss': 61.0, 'status': 'ok'}, {'loss': -296.0, 'status': 'ok'}, {'loss': -40.0, 'status': 'ok'}, {'loss': 281.0, 'status': 'ok'}, {'loss': 64.0, 'status': 'ok'}, {'loss': 100.0, 'status': 'ok'}, {'loss': 60.0, 'status': 'ok'}, {'loss': -39.0, 'status': 'ok'}, {'loss': 1.0, 'status': 'ok'}, {'loss': -164.0, 'status': 'ok'}, {'loss': 21.0, 'status': 'ok'}, {'loss': -56.0, 'status': 'ok'}, {'loss': 284.0, 'status': 'ok'}, {'loss': 176.0, 'status': 'ok'}, {'loss': -171.0, 'status': 'ok'}, {'loss': 0.0, 'status': 'ok'}]


In [10]:
# Trials 객체의 vals 속성에 {'입력변수명':개별 수행 시마다 입력된 값 리스트} 형태로 저장됨
print(type(trial_val.vals))
print("")
print(trial_val.vals)

<class 'dict'>

{'x': [-6.0, -4.0, 4.0, -4.0, 9.0, 2.0, 10.0, -9.0, -8.0, -0.0, -0.0, 1.0, 9.0, 6.0, 9.0, 2.0, -2.0, -4.0, 7.0, -0.0], 'y': [5.0, 10.0, -2.0, 12.0, 1.0, 15.0, 7.0, -10.0, 0.0, -5.0, -3.0, 2.0, 4.0, 10.0, 3.0, 3.0, -14.0, -8.0, 11.0, -0.0]}


In [11]:
import pandas as pd 

# results에서 loss 키값에 해당하는 밸류들을 추출하여 list로 생성. 
losses = [loss_dict['loss'] for loss_dict in trial_val.results]
print(losses)

[-64.0, -184.0, 56.0, -224.0, 61.0, -296.0, -40.0, 281.0, 64.0, 100.0, 60.0, -39.0, 1.0, -164.0, 21.0, -56.0, 284.0, 176.0, -171.0, 0.0]


In [12]:
# DataFrame으로 생성. 
result_df = pd.DataFrame({'x' : trial_val.vals['x'],
                         'y' : trial_val.vals['y'],
                         'losses' : losses})
result_df

Unnamed: 0,x,y,losses
0,-6.0,5.0,-64.0
1,-4.0,10.0,-184.0
2,4.0,-2.0,56.0
3,-4.0,12.0,-224.0
4,9.0,1.0,61.0
5,2.0,15.0,-296.0
6,10.0,7.0,-40.0
7,-9.0,-10.0,281.0
8,-8.0,0.0,64.0
9,-0.0,-5.0,100.0


 # HyperOpt를 XGBoost 하이퍼 파라미터 튜닝에 적용

In [13]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

cancer_df = pd.DataFrame(data = load_breast_cancer().data, columns = load_breast_cancer().feature_names)
cancer_df['target'] = load_breast_cancer().target
X_features = cancer_df.iloc[:, :-1]
y_label = cancer_df.iloc[:, -1]

# 전체 데이터 중 80%는 학습용 데이터, 20%는 테스트용 데이터 추출
X_train, X_test, y_train, y_test=train_test_split(X_features, y_label,
                                         test_size=0.2, random_state=156 )

# 학습 데이터를 다시 학습과 검증 데이터로 분리 
X_tr, X_val, y_tr, y_val= train_test_split(X_train, y_train,
                                         test_size=0.1, random_state=156 )

In [14]:
from hyperopt import hp

# max_depth는 5에서 20까지 1간격으로, min_child_weight는 1에서 2까지 1간격으로
# colsample_bytree는 0.5에서 1사이, learning_rate는 0.01에서 0.2사이 정규 분포된 값으로 검색. 

xgb_search_space = {'max_depth' : hp.quniform('max_depth', 5, 20, 1),
                   'min_child_weight' : hp.quniform('min_child_weight', 1, 2, 1),
                   'learning_rate' : hp.uniform('learnig_rate', 0.01, 0.2),
                    'colsample_bytree' : hp.uniform('colsample_bytree', 0.5 ,1)
                   }

In [15]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from hyperopt import STATUS_OK

# fmin()에서 입력된 search_space값으로 입력된 모든 값은 실수형임. 
# XGBClassifier의 정수형 하이퍼 파라미터는 정수형 변환을 해줘야 함. 
# 정확도는 높은 수록 더 좋은 수치임. -1* 정확도를 곱해서 큰 정확도 값일 수록 최소가 되도록 변환

def objective_func(search_space):
    xgb_clf = XGBClassifier(n_estimators = 100, max_depth = int(search_space['max_depth']),
                           min_child_weight = int(search_space['min_child_weight']),
                           learning_rate = search_space['learning_rate'],
                           colsample_bytree = search_space['colsample_bytree'],
                           eval_metric='logloss')
    
    accuracy = cross_val_score(xgb_clf, X_train, y_train, scoring='accuracy', cv=3)
    
    # accuracy는 cv=3 개수만큼의 정확도 결과를 가지므로 이를 평균해서 반환하되 -1을 곱해줌.
    return {'loss' : -1*np.mean(accuracy), 'status' : STATUS_OK}

In [16]:
from hyperopt import fmin, tpe, Trials

trial_val = Trials()
best = fmin(fn=objective_func,
           space = xgb_search_space,
           algo = tpe.suggest, # "algo = tpe.suggest"는 고정입니다.
           max_evals = 50, # max_evals : 최대 반복 횟수를 지정합니다.
           trials = trial_val, rstate = np.random.default_rng(seed = 9))
print('best :', best)

# best loss: -0.9670616939700244 : 정확도가 0.96
# 최소 loss일 때의 하이퍼 파라미터들을 보여줍니다.

100%|███████████████████████████████████████████████| 50/50 [00:07<00:00,  6.30trial/s, best loss: -0.9670616939700244]
best : {'colsample_bytree': 0.5424149213362504, 'learnig_rate': 0.12601372924444681, 'max_depth': 17.0, 'min_child_weight': 2.0}


In [17]:
# 반환된 best 객체는 dictionary입니다.
best

{'colsample_bytree': 0.5424149213362504,
 'learnig_rate': 0.12601372924444681,
 'max_depth': 17.0,
 'min_child_weight': 2.0}

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    f1 = f1_score(y_test,pred)
    # ROC-AUC 추가 
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [19]:
xgb_wrapper = XGBClassifier(n_estimators = 400, learning_rate = round(best['learnig_rate'], 5),
                           max_depth = int(best['max_depth']),
                           min_child_weight = int(best['min_child_weight']),
                           colsamplt_bytree = round(best['colsample_bytree'], 5)
                           )

In [20]:
# evals = [(X_val, y_val)]  : 검증데이터 세트만 넣어줘도 됨

evals = [(X_tr, y_tr), (X_val, y_val)]
xgb_wrapper.fit(X_tr, y_tr, early_stopping_rounds = 50, eval_metric='logloss',
               eval_set = evals, verbose = True)

Parameters: { "colsamplt_bytree" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-logloss:0.59003	validation_1-logloss:0.62146
[1]	validation_0-logloss:0.50919	validation_1-logloss:0.56750
[2]	validation_0-logloss:0.44256	validation_1-logloss:0.51487
[3]	validation_0-logloss:0.38849	validation_1-logloss:0.48449
[4]	validation_0-logloss:0.34293	validation_1-logloss:0.44620
[5]	validation_0-logloss:0.30368	validation_1-logloss:0.41606
[6]	validation_0-logloss:0.27108	validation_1-logloss:0.38899
[7]	validation_0-logloss:0.24259	validation_1-logloss:0.36614
[8]	validation_0-logloss:0.21929	validation_1-logloss:0.35055
[9]	validation_0-logloss:0.19740	validation_1-logloss:0.33662
[10]	validation_0-logloss:0.17946	validation_1-logloss:0.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, colsamplt_bytree=0.54241,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.12601, max_delta_step=0, max_depth=17,
              min_child_weight=2, missing=nan, monotone_constraints='()',
              n_estimators=400, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [21]:
preds = xgb_wrapper.predict(X_test)
pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1]

get_clf_eval(y_test, preds, pred_proba)

오차 행렬
[[34  3]
 [ 3 74]]
정확도: 0.9474, 정밀도: 0.9610, 재현율: 0.9610,    F1: 0.9610, AUC:0.9898


In [37]:
# Trials 객체의 result 속성에 파이썬 리스트로 목적 함수 반환값들이 저장됨
# 리스트 내부의 개별 원소는 {'loss':함수 반환값, 'status':반환 상태값} 와 같은 딕셔너리임. 
trial_val.results

[{'loss': -0.9472958057395143, 'status': 'ok'},
 {'loss': -0.9604827466016034, 'status': 'ok'},
 {'loss': -0.9582897641454631, 'status': 'ok'},
 {'loss': -0.9604682235389799, 'status': 'ok'},
 {'loss': -0.9626612059951203, 'status': 'ok'},
 {'loss': -0.9582752410828395, 'status': 'ok'},
 {'loss': -0.9560967816893227, 'status': 'ok'},
 {'loss': -0.9604682235389799, 'status': 'ok'},
 {'loss': -0.9626612059951203, 'status': 'ok'},
 {'loss': -0.9560967816893227, 'status': 'ok'},
 {'loss': -0.9560822586266992, 'status': 'ok'},
 {'loss': -0.9604682235389799, 'status': 'ok'},
 {'loss': -0.9582752410828395, 'status': 'ok'},
 {'loss': -0.9494742651330311, 'status': 'ok'},
 {'loss': -0.9494887881956547, 'status': 'ok'},
 {'loss': -0.958289764145463, 'status': 'ok'},
 {'loss': -0.9494887881956547, 'status': 'ok'},
 {'loss': -0.9626466829324968, 'status': 'ok'},
 {'loss': -0.9604682235389799, 'status': 'ok'},
 {'loss': -0.964868711513884, 'status': 'ok'},
 {'loss': -0.9582752410828395, 'status': '

In [33]:
# Trials 객체의 vals 속성에 {'입력변수명':개별 수행 시마다 입력된 값 리스트} 형태로 저장됨
print(trial_val.vals)

{'colsample_bytree': [0.5852347138193622, 0.7271863641855161, 0.9599446282177103, 0.9500116932342133, 0.6743364060621724, 0.8637740285716389, 0.9575208672481308, 0.6950178418953206, 0.684441779397407, 0.5921156978948783, 0.6147984356971752, 0.7767383218403126, 0.5147724605153474, 0.949782533600956, 0.9261209894715933, 0.5709901136927109, 0.8845494605701275, 0.548301545497125, 0.9102783821000844, 0.5325012994457242, 0.6448903552593086, 0.7809149353740567, 0.5101224389618698, 0.8221652003014476, 0.6474442723326842, 0.5424149213362504, 0.5381596018104068, 0.50646273858041, 0.6161621007772744, 0.5645004363767054, 0.7338255028647506, 0.5011019025007462, 0.597852955568476, 0.5019506063145684, 0.7091699780599466, 0.9994329972450053, 0.6515381726798483, 0.8399880223170878, 0.7651789401279038, 0.6134026515660492, 0.6665125226118613, 0.5595464026956128, 0.5274152029858842, 0.5882900366390131, 0.8049782668083675, 0.6968777525552029, 0.5249013078645705, 0.7258961569139915, 0.6309004947313197, 0.67

In [38]:
losses = [loss_dict['loss'] for loss_dict in trial_val.results]
losses

[-0.9472958057395143,
 -0.9604827466016034,
 -0.9582897641454631,
 -0.9604682235389799,
 -0.9626612059951203,
 -0.9582752410828395,
 -0.9560967816893227,
 -0.9604682235389799,
 -0.9626612059951203,
 -0.9560967816893227,
 -0.9560822586266992,
 -0.9604682235389799,
 -0.9582752410828395,
 -0.9494742651330311,
 -0.9494887881956547,
 -0.958289764145463,
 -0.9494887881956547,
 -0.9626466829324968,
 -0.9604682235389799,
 -0.964868711513884,
 -0.9582752410828395,
 -0.9604682235389799,
 -0.9604827466016034,
 -0.9472958057395143,
 -0.9363163703961893,
 -0.9670616939700244,
 -0.9626757290577438,
 -0.9407168583710934,
 -0.9582752410828395,
 -0.9429243638898571,
 -0.9516962937144186,
 -0.9670616939700244,
 -0.9604537004763566,
 -0.9626757290577438,
 -0.9604537004763566,
 -0.9604537004763566,
 -0.9604537004763566,
 -0.9582752410828395,
 -0.9560532125014524,
 -0.958289764145463,
 -0.9626612059951203,
 -0.9604827466016034,
 -0.9670616939700244,
 -0.9626612059951203,
 -0.9516962937144186,
 -0.964854188

In [40]:
result_df = pd.DataFrame({'max_depth' : trial_val.vals['max_depth'],
                         'min_child_weight' : trial_val.vals['min_child_weight'],
                         'colsample_bytree' : trial_val.vals['colsample_bytree'],
                         'learning_rate' : trial_val.vals['learnig_rate'],
                         'losses' : losses})
result_df

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,learning_rate,losses
0,19.0,2.0,0.585235,0.033688,-0.947296
1,5.0,2.0,0.727186,0.105956,-0.960483
2,6.0,2.0,0.959945,0.154804,-0.95829
3,6.0,2.0,0.950012,0.120686,-0.960468
4,16.0,2.0,0.674336,0.142392,-0.962661
5,8.0,2.0,0.863774,0.106579,-0.958275
6,14.0,2.0,0.957521,0.079111,-0.956097
7,19.0,2.0,0.695018,0.095213,-0.960468
8,9.0,2.0,0.684442,0.14752,-0.962661
9,8.0,1.0,0.592116,0.081179,-0.956097
