In [1]:
pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.4.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier                      #이정도면 50분 걸림 (6core cpu), 교차검증하면 6시간

### train 준비.  평가지표는 eval_metric 이랑 roc_auc_score부분 수정.

## 옵션 1 (cv 없이) 사이킷런 래퍼 사용

In [None]:
# Separate the label from the features.
# If the competition is still running, set aside a dedicated validation set first.
y = train['TARGET']
X = train.drop(columns=['TARGET'])

# Fixed 70/30 hold-out split; random_state=0 keeps it reproducible.
train_x, valid_x, train_y, valid_y = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [None]:
# Search range (lower, upper) for each hyperparameter explored by the optimizer.
# Names follow the scikit-learn wrapper (LGBMClassifier).
bayesian_params = dict(
    max_depth=(6, 16),            # author's note: "or more" -- presumably the range may be raised; confirm
    num_leaves=(24, 64),
    min_child_samples=(10, 200),
    min_child_weight=(1, 50),
    subsample=(0.5, 1.0),         # author's note: "or more"
    colsample_bytree=(0.5, 1.0),  # author's note: "fixed"
    max_bin=(10, 500),            # author's note: "or more"
    reg_lambda=(0.001, 10),
    reg_alpha=(0.01, 50),
)

In [None]:
def lgb_roc_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample, 
                colsample_bytree,max_bin, reg_lambda, reg_alpha):
    """Objective for the optimizer: hold-out ROC AUC of one LightGBM fit.

    Every argument arrives as a float sampled from the search bounds, so the
    values are rounded or clamped before being handed to LGBMClassifier.
    The model is trained on the module-level train_x/train_y and scored on
    valid_x/valid_y.

    Returns:
        ROC AUC of the positive-class probabilities on the validation split.
    """
    def as_int(value):
        # The optimizer proposes floats; integer hyperparameters must be cast.
        return int(round(value))

    def clip_unit(value):
        # Keep fraction-type hyperparameters inside [0, 1].
        return max(min(value, 1), 0)

    params = dict(n_estimators=500, learning_rate=0.02)
    params['max_depth'] = as_int(max_depth)
    params['num_leaves'] = as_int(num_leaves)
    params['min_child_samples'] = as_int(min_child_samples)
    params['min_child_weight'] = as_int(min_child_weight)
    # NOTE(review): no subsample_freq is set; per the LightGBM docs bagging is
    # only active when the bagging frequency is non-zero -- confirm this
    # dimension actually affects the model.
    params['subsample'] = clip_unit(subsample)
    params['colsample_bytree'] = clip_unit(colsample_bytree)
    params['max_bin'] = max(as_int(max_bin), 10)    # at least 10 is typical
    params['reg_lambda'] = max(reg_lambda, 0)       # non-negative is typical
    params['reg_alpha'] = max(reg_alpha, 0)
    print(params)

    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        eval_metric= 'auc',
        verbose= 100,
        early_stopping_rounds= 100,
    )

    # AUC is computed on the probability of the positive class (column 1).
    positive_proba = lgb_model.predict_proba(valid_x)[:, 1]
    return roc_auc_score(valid_y, positive_proba)

In [None]:
# Maximise the objective: 5 random warm-up points, then 25 guided iterations.
lgbBO = BayesianOptimization(
    f=lgb_roc_eval,
    pbounds=bayesian_params,
    random_state=0,
)
lgbBO.maximize(init_points=5, n_iter=25)


In [None]:
#lgbBO.res                #수행기록, target은 내가 지정한 평가지표 값

In [None]:
# Extract the best trial. Each entry of lgbBO.res holds the sampled parameters and
# its 'target', i.e. the metric value returned by the objective function.
target_list = [result['target'] for result in lgbBO.res]

# Builtins only (no numpy needed); list.index returns the first occurrence of the
# maximum, which matches np.argmax when several trials tie.
max_dict = lgbBO.res[target_list.index(max(target_list))]
print(max_dict)

## 옵션2 (CV 수행), lgbm 래퍼 사용 (하이퍼파라미터 단어가 좀 다름)

In [None]:
# Separate the label from the features; no hold-out split is made in this cell.
# If the competition is still running, set aside a dedicated validation set first.
y = train['TARGET']
X = train.drop(columns=['TARGET'])

In [None]:
# Search range (lower, upper) per hyperparameter, using the native LightGBM names.
# The trailing comments give the scikit-learn wrapper name where it differs.
bayesian_params = dict(
    max_depth=(6, 16),
    num_leaves=(24, 64),
    min_data_in_leaf=(10, 200),    # min_child_samples
    min_child_weight=(1, 50),
    bagging_fraction=(0.5, 1.0),   # subsample
    feature_fraction=(0.5, 1.0),   # colsample_bytree
    max_bin=(10, 500),
    lambda_l2=(0.001, 10),         # reg_lambda
    lambda_l1=(0.01, 50),          # reg_alpha
)

In [None]:
import lightgbm as lgb

# Build the Dataset from the feature matrix X only. Passing `train` would keep the
# TARGET column among the features and leak the label into every CV fold.
# free_raw_data=False keeps the raw data attached to the Dataset -- presumably needed
# because dataset-level parameters such as max_bin change between trials; confirm.
train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)

def lgb_roc_eval_cv(max_depth, num_leaves, min_data_in_leaf, min_child_weight, bagging_fraction, 
                 feature_fraction,  max_bin, lambda_l2, lambda_l1):   
    """Objective for the optimizer: best mean AUC of a 3-fold LightGBM CV run.

    Every argument arrives as a float sampled from the search bounds, so
    integer hyperparameters are rounded and fractions are clamped to [0, 1]
    before being passed to lgb.cv together with the module-level train_data.

    Early stopping is configured in one place only, the 'early_stopping_rounds'
    entry of params. The original call also passed early_stopping_rounds=50 as
    a keyword argument, but LightGBM uses the value found in params, so the
    keyword was a misleading no-op and has been removed.

    Returns:
        The highest value in the 'auc-mean' series of the CV result.
    """
    params = {
        "num_iterations":500, "learning_rate":0.02,
        'early_stopping_rounds':100, 'metric':'auc',
        # The optimizer proposes floats; integer hyperparameters must be cast.
        'max_depth': int(round(max_depth)),
        'num_leaves': int(round(num_leaves)), 
        'min_data_in_leaf': int(round(min_data_in_leaf)),
        'min_child_weight': int(round(min_child_weight)),
        # NOTE(review): no bagging_freq is set; per the LightGBM docs bagging is
        # only active when bagging_freq is non-zero -- confirm this dimension
        # actually affects the model.
        'bagging_fraction': max(min(bagging_fraction, 1), 0), 
        'feature_fraction': max(min(feature_fraction, 1), 0),
        'max_bin':  max(int(round(max_bin)),10),
        'lambda_l2': max(lambda_l2,0),
        'lambda_l1': max(lambda_l1, 0)
    }
    # Use LightGBM's own cv(); the author notes that scikit-learn's cross_val_score
    # reportedly does not support early stopping.
    cv_result = lgb.cv(params, train_data, nfold=3, seed=0,  verbose_eval =100, metrics=['auc'])
    return max(cv_result['auc-mean'])   

In [None]:
# Run the optimizer for option 2. Without this step the cell would read `lgbBO` and
# `target_list` left over from option 1 and report that run's optimum instead.
# NOTE: every trial is a full 3-fold CV, so this takes far longer than option 1.
lgbBO = BayesianOptimization(lgb_roc_eval_cv, bayesian_params, random_state=0)
lgbBO.maximize(init_points=5, n_iter=25)

# Best trial = entry with the highest 'target' (mean CV AUC).
max_dict = max(lgbBO.res, key=lambda result: result['target'])
print(max_dict)

#### 최종평가. 최적의 파라미터로 다시 예측 (train 준비) -> 이거 fmin 자동화코드 갖고 오자

## 옵션1 용 

In [None]:
def train_apps_all(train):
    """Fit the final LightGBM classifier with hand-entered hyperparameters.

    The frame is split 70/30 (random_state=0) into training and validation
    parts; the validation part drives AUC-based early stopping.

    Args:
        train: DataFrame holding the feature columns plus a 'TARGET' column.

    Returns:
        The fitted LGBMClassifier.
    """
    features = train.drop(columns=['TARGET'])
    labels = train['TARGET']
    train_x, valid_x, train_y, valid_y = train_test_split(
        features, labels, test_size=0.3, random_state=0
    )

    # Hyperparameters are filled in by hand.
    tuned_params = {
        'nthread': 4,
        'n_estimators': 1000,
        'learning_rate': 0.02,
        'max_depth': 13,
        'num_leaves': 57,
        'colsample_bytree': 0.638,
        'subsample': 0.682,
        'max_bin': 435,
        'reg_alpha': 0.936,
        'reg_lambda': 4.533,
        'min_child_weight': 25,
        'min_child_samples': 166,
        'silent': -1,
        'verbose': -1,
    }
    clf = LGBMClassifier(**tuned_params)

    # Log every 100 rounds; stop once validation AUC has not improved for 100 rounds.
    clf.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        eval_metric= 'auc',
        verbose= 100,
        early_stopping_rounds= 100,
    )
    return clf

## 옵션2 용

In [None]:
# def train_apps_all(apps_all_train):
    
#     X = train.drop(['TARGET'], axis=1)
#     y =  train['TARGET']
#     train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.3, random_state=0)
    
#     clf = LGBMClassifier(
#                 nthread=4,       #tn
#                 n_estimators=1000,
#                 learning_rate=0.02,
#                 max_depth = 10,
#                 num_leaves=60,
#                 colsample_bytree=0.511,
#                 subsample=0.785,
#                 max_bin=208,
#                 reg_alpha=7.009,
#                 reg_lambda=6.579,
#                 min_child_weight=40,
#                 min_child_samples=91,
#                 silent=-1,
#                 verbose=-1,
#                 )

#     clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
#                 early_stopping_rounds= 100)
    
#     return clf

In [None]:
# Train the final model on `train`; train_apps_all returns the fitted classifier.
clf = train_apps_all(train)

In [None]:
# Predict on the test set (submit the result if desired).
# ROC AUC is a ranking metric, so it must be fed the positive-class probability;
# hard labels from predict() collapse the ranking to two values and distort the score.
preds = clf.predict_proba(x_test)[:, 1]
roc_auc = roc_auc_score(y_test, preds)
print(roc_auc)