In [1]:
import numpy as np
import random
import os
def seed_everything(seed: int = 24):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
seed_everything(24)

import pandas as pd
import sklearn
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split as tts
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

### train 준비.  이 코드는 LGBM-이진분류-f1 용이다.

# 기본 성능 파악

In [39]:
train=pd.read_csv("train_binary_연습.csv")

In [40]:
X = train.drop(['TARGET'], axis=1)        #대회진행중이면 val 따로 뺴놓고 갖고오자
y = train['TARGET']

train_x, valid_x, train_y, valid_y = tts(X, y, test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [6]:
# lgbm_clf = LGBMClassifier(n_estimators=500, objective='multiclass')               # macro f1 

# # 모델 학습
# eval_set = [(train_x, train_y), (valid_x, valid_y)]
# lgbm_clf.fit(train_x, train_y, early_stopping_rounds=50, eval_metric="multi_logloss", eval_set=eval_set)

# # F1 Score 계산
# f1 = f1_score(valid_y, lgbm_clf.predict(valid_x), average='macro')
# print('Macro F1 Score: {0:.4f}'.format(f1))

In [41]:
lgbm_clf = LGBMClassifier(n_estimators=500, objective='binary')                   # f1

# 모델 학습
eval_set = [(train_x, train_y), (valid_x, valid_y)]
early_stopping_callback = lgb.early_stopping(stopping_rounds=50)
lgbm_clf.fit(train_x, train_y,  callbacks=[early_stopping_callback], eval_metric="binary_logloss", eval_set=eval_set)

# F1 Score 계산
pred = lgbm_clf.predict(valid_x)
f1 = f1_score(valid_y, pred, average='binary')  
print('F1 Score: {0:.4f}'.format(f1))

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	training's binary_logloss: 0.642252	valid_1's binary_logloss: 0.679709
F1 Score: 0.7234


In [11]:
# lgbm_clf = LGBMClassifier(n_estimators=500, objective='binary')                   #roc auc

# # 모델 학습
# eval_set = [(train_x, train_y), (valid_x, valid_y)]
# lgbm_clf.fit(train_x, train_y, early_stopping_rounds=50, eval_metric="auc", eval_set=eval_set)

# # AUC 계산
# pred_prob = lgbm_clf.predict_proba(valid_x)[:, 1]  
# auc = roc_auc_score(valid_y, pred_prob)
# print('AUC Score: {0:.4f}'.format(auc))

[1]	training's auc: 0.683333	training's binary_logloss: 0.670304	valid_1's auc: 0.563348	valid_1's binary_logloss: 0.68034
[2]	training's auc: 0.825417	training's binary_logloss: 0.652452	valid_1's auc: 0.515837	valid_1's binary_logloss: 0.682139
[3]	training's auc: 0.800417	training's binary_logloss: 0.642252	valid_1's auc: 0.556561	valid_1's binary_logloss: 0.679709
[4]	training's auc: 0.825417	training's binary_logloss: 0.627829	valid_1's auc: 0.515837	valid_1's binary_logloss: 0.682631
[5]	training's auc: 0.800417	training's binary_logloss: 0.619506	valid_1's auc: 0.556561	valid_1's binary_logloss: 0.681332
[6]	training's auc: 0.825417	training's binary_logloss: 0.607728	valid_1's auc: 0.515837	valid_1's binary_logloss: 0.685053
[7]	training's auc: 0.825417	training's binary_logloss: 0.600884	valid_1's auc: 0.515837	valid_1's binary_logloss: 0.684602
[8]	training's auc: 0.825417	training's binary_logloss: 0.591186	valid_1's auc: 0.515837	valid_1's binary_logloss: 0.688882
[9]	train

# 옵션 1 (cv 없이) 사이킷런 래퍼 사용

In [42]:
# parameter 별로 search할 범위를 설정. 
bayesian_params = {
    'learning_rate': (0.01, 0.2),
    'max_depth': (6, 16),                 #이상
    'num_leaves': (24, 64), 
    'min_child_samples': (10, 200), 
    'min_child_weight':(1, 50),    
    'subsample':(0.5, 1.0),               #이상 
    'colsample_bytree': (0.5, 1.0),       #고정
    'max_bin':(10, 500),                  #이상 
    'reg_lambda':(0.001, 10),             
    'reg_alpha': (0.01, 50) 
}

In [45]:
def lgb_roc_eval(learning_rate, max_depth, num_leaves, min_child_samples, min_child_weight, subsample, 
                colsample_bytree,max_bin, reg_lambda, reg_alpha):
    params = {
        "n_estimators":500, 
        "learning_rate": max(round(learning_rate),0.01),
        'max_depth': int(round(max_depth)),  #  호출 시 실수형 값이 들어오므로 정수형 하이퍼 파라미터는 정수형으로 변경 
        'num_leaves': int(round(num_leaves)), 
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0), 
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin':  max(int(round(max_bin)),10),            #10이상이 일반적
        'reg_lambda': max(reg_lambda,0),                    #0이상이 일반적
        'reg_alpha': max(reg_alpha, 0)
    }
    print(params)
    early_stopping_callback = lgb.early_stopping(stopping_rounds=100)
    log_callback = lgb.log_evaluation(period=100)   #중간과정 출력
    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= "binary_logloss",  
                callbacks=[early_stopping_callback,log_callback])
    pred = lgb_model.predict(valid_x)
    f1 = f1_score(valid_y, pred, average='binary')
    
    return f1   

In [46]:
lgbBO = BayesianOptimization(lgb_roc_eval, bayesian_params , random_state=0)
lgbBO.maximize(init_points=5, n_iter=25)                         #init_points(초기 지정 파라미터개수) (5~20), n_iter(한 셋 반복수) (10~100)


|   iter    |  target   | colsam... | learni... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------
{'n_estimators': 500, 'learning_rate': 0.01, 'max_depth': 11, 'num_leaves': 42, 'min_child_samples': 90, 'min_child_weight': 33, 'subsample': 0.6917207594128889, 'colsample_bytree': 0.7744067519636624, 'max_bin': 305, 'reg_lambda': 9.636663942249792, 'reg_alpha': 44.58973230909617}
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.692872	valid_1's binary_logloss: 0.695612
Early stopping, best iteration is:
[1]	training's binary_logloss: 0.692872	valid_1's binary_logloss: 0.695612
| [0m1        [0m | [0m0.7234   [0m | [0m0.7744   [0m | [0m0.1459   [0m | [0m305.4    [0m | [0m11.45    [0m | [0m90.49    [0m | [0m32.65    [0m | [0m41.5     

In [None]:
#lgbBO.res                #수행기록, target은 내가 지정한 평가지표 값

In [21]:
target_list = []                   #최적의 파라미터를 추출
for result in lgbBO.res:
    target = result['target']
    target_list.append(target)

best = lgbBO.res[np.argmax(np.array(target_list))]  
print(best)
best=best['params']

{'target': 0.7234042553191489, 'params': {'colsample_bytree': 0.7744067519636624, 'learning_rate': 0.14588597961075972, 'max_bin': 305.3540542751055, 'max_depth': 11.448831829968968, 'min_child_samples': 90.4944118743919, 'min_child_weight': 32.64881154026615, 'num_leaves': 41.5034884505077, 'reg_alpha': 44.58973230909617, 'reg_lambda': 9.636663942249792, 'subsample': 0.6917207594128889}}


## 최종평가. 최적의 파라미터로 다시 학습

In [23]:
lgbm_clf = LGBMClassifier(#n_jobs= 6,                  #cpu 코어수 
                          n_estimators=700, 
                          learning_rate=round(best['learning_rate'], 5),
                          max_depth=int(best['max_depth']),
                          num_leaves=int(best['num_leaves']),
                          colsample_bytree = round(best['colsample_bytree'], 5),
                          subsample=round(best['subsample'], 5),
                          max_bin=int(best['max_bin']),
                          reg_alpha=round(best['reg_alpha'], 5),
                          reg_lambda=round(best['reg_lambda'], 5),
                          min_child_weight= int(best['min_child_weight']), 
                          min_child_samples=int(best['min_child_samples']), 
                          verbose=-1,                #학습정보 미출력                          
                          objective='binary')                            

# 모델 학습
early_stopping_callback = lgb.early_stopping(stopping_rounds=50)
eval_set = [(train_x, train_y), (valid_x, valid_y)]                                                                    
lgbm_clf.fit(train_x, train_y, callbacks=[early_stopping_callback], eval_metric="binary_logloss", eval_set=eval_set)

# f1 계산
pred = lgbm_clf.predict(valid_x)
f1 = f1_score(valid_y, pred, average='binary')  
print('F1 Score: {0:.4f}'.format(f1))

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's binary_logloss: 0.692872	valid_1's binary_logloss: 0.695612
F1 Score: 0.7234


# 옵션2 (CV 수행),오래걸림, 좀더 일반화된 성능

In [None]:
X = train.drop(columns=['TARGET'])
y = train['TARGET']

In [63]:
bayesian_params = {
    'learning_rate': (0.01, 0.2),
    'max_depth': (6, 16),
    'num_leaves': (24, 64),
    'min_data_in_leaf': (10, 200),  # Equivalent to min_child_samples
    'min_child_weight': (1, 50),
    'bagging_fraction': (0.5, 1.0),  # Equivalent to subsample
    'feature_fraction': (0.5, 1.0),  # Equivalent to colsample_bytree
    'max_bin': (10, 500),
    'lambda_l2': (0.001, 10),  # Equivalent to reg_lambda
    'lambda_l1': (0.01, 50)  # Equivalent to reg_alpha
}

In [64]:
# Wrapper function for cross-validation using KFold
def lgb_kfold_eval(learning_rate, max_depth, num_leaves, min_data_in_leaf, min_child_weight, 
                   bagging_fraction, feature_fraction, max_bin, lambda_l2, lambda_l1):
    params = {
        'learning_rate': round(learning_rate, 2),
        'max_depth': int(max_depth),
        'num_leaves': int(num_leaves),
        'min_child_samples': int(min_data_in_leaf),
        'min_child_weight': int(min_child_weight),
        'subsample': round(bagging_fraction, 2),
        'colsample_bytree': round(feature_fraction, 2),
        'max_bin': int(max_bin),
        'reg_lambda': round(lambda_l2, 2),
        'reg_alpha': round(lambda_l1, 2),
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'n_estimators': 500,  # Considering fixed number of estimators
        'n_jobs': -1  # Using all CPU cores
    }
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    log_loss_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        lgb_model = LGBMClassifier(**params)
        
        early_stopping_callback = lgb.early_stopping(stopping_rounds=100)            #조기중단
        log_callback = lgb.log_evaluation(period=100)                                #결과 중간 출력       
        lgb_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],callbacks=[early_stopping_callback,log_callback])
        
        preds = lgb_model.predict_proba(X_valid)[:, 1]
        log_loss_score = log_loss(y_valid, preds)
        log_loss_scores.append(log_loss_score)
    
    return -np.mean(log_loss_scores)  # Negative because BayesianOptimization maximizes the function

In [65]:
lgbBO = BayesianOptimization(f=lgb_kfold_eval, pbounds=bayesian_params, random_state=0)
lgbBO.maximize(init_points=5, n_iter=25)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | learni... |  max_bin  | max_depth | min_ch... | min_da... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 66, number of used features: 0
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 67, number of used features: 0
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 67, number of used features: 0
[LightGBM] [Info] Start training from score 0.575758
[LightGBM] [Info] Start training from score 0.567164
[LightGBM] [Info] Start training from score 0.567164
Training until validation scores don't improve for 100 rounds
[100]	cv_agg's binary_logloss: 14.7731 + 0.27276
Did not meet early stopping. Best iteration is:
[1]	cv_agg's binary_logloss: 1

In [66]:
best = lgbBO.res[np.argmax(np.array(target_list))]
print(best)
best=best['params']

{'target': 14.773116029403882, 'params': {'bagging_fraction': 0.7744067519636624, 'feature_fraction': 0.8575946831862098, 'lambda_l1': 30.142141169821482, 'lambda_l2': 5.449286946785972, 'learning_rate': 0.09049441187439189, 'max_bin': 326.4881154026615, 'max_depth': 10.375872112626926, 'min_child_weight': 44.69687703832191, 'min_data_in_leaf': 193.09592449519556, 'num_leaves': 39.33766075303111}}


## 최종평가. 최적의 파라미터로 다시 예측

#### 옵션2 전용 (파라미터 단어가 조금 다르기때문에 구분해서 실행해야함)

In [67]:
lgbm_clf = LGBMClassifier(#n_jobs= 6,                  #cpu 코어수 
                          n_estimators=700, 
                          learning_rate=round(best['learning_rate'], 5),
                          max_depth=int(best['max_depth']),
                          num_leaves=int(best['num_leaves']),
                          colsample_bytree = round(best['feature_fraction'], 5),
                          subsample=round(best['bagging_fraction'], 5),
                          max_bin=int(best['max_bin']),
                          reg_alpha=round(best['lambda_l1'], 5),
                          reg_lambda=round(best['lambda_l2'], 5),
                          min_child_weight= int(best['min_child_weight']), 
                          min_child_samples=int(best['min_data_in_leaf']), 
                          verbose=-1,                #학습정보 미출력                          
                          objective='binary')                            

# 모델 학습
early_stopping_callback = lgb.early_stopping(stopping_rounds=50)
eval_set = [(train_x, train_y), (valid_x, valid_y)]                                                                    
lgbm_clf.fit(train_x, train_y, callbacks=[early_stopping_callback], eval_metric="binary_logloss", eval_set=eval_set)

# f1 계산
pred = lgbm_clf.predict(valid_x)
f1 = f1_score(valid_y, pred, average='binary')  
print('F1 Score: {0:.4f}'.format(f1))

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's binary_logloss: 0.692872	valid_1's binary_logloss: 0.695612
F1 Score: 0.7234
