In [10]:
# LightGBM의 파이썬 패키지인 lightgbm에서 LGBMClassifier 임포트
from lightgbm import LGBMClassifier

import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

dataset = load_breast_cancer()
ftr = dataset.data
target = dataset.target

# 전체 데이터 중 80%는 학습용 데이터, 20%는 테스트용 데이터 추출
X_train, X_test, y_train, y_test=train_test_split(ftr, target, test_size=0.2, random_state=156 )

# 앞서 XGBoost와 동일하게 n_estimators는 400 설정. 
lgbm_wrapper = LGBMClassifier(n_estimators=400)

# LightGBM도 XGBoost와 동일하게 조기 중단 수행 가능. 
evals = [(X_test, y_test)]
lgbm_wrapper.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="logloss", 
                 eval_set=evals, verbose=True)
preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)

[1]	valid_0's binary_logloss: 0.565079	valid_0's binary_logloss: 0.565079
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.507451	valid_0's binary_logloss: 0.507451
[3]	valid_0's binary_logloss: 0.458489	valid_0's binary_logloss: 0.458489
[4]	valid_0's binary_logloss: 0.417481	valid_0's binary_logloss: 0.417481
[5]	valid_0's binary_logloss: 0.385507	valid_0's binary_logloss: 0.385507
[6]	valid_0's binary_logloss: 0.355846	valid_0's binary_logloss: 0.355846
[7]	valid_0's binary_logloss: 0.330897	valid_0's binary_logloss: 0.330897
[8]	valid_0's binary_logloss: 0.306923	valid_0's binary_logloss: 0.306923
[9]	valid_0's binary_logloss: 0.28776	valid_0's binary_logloss: 0.28776
[10]	valid_0's binary_logloss: 0.26917	valid_0's binary_logloss: 0.26917
[11]	valid_0's binary_logloss: 0.250954	valid_0's binary_logloss: 0.250954
[12]	valid_0's binary_logloss: 0.23847	valid_0's binary_logloss: 0.23847
[13]	valid_0's binary_logloss: 0.225865	valid_0's bi

>튜닝

In [7]:
from sklearn.model_selection import GridSearchCV

param = {
    'min_child_samples':range(10, 400, 50),
    'num_leaves': range(10, 200, 30),
    'reg_alpha': [0.1,0.3,  0.5,0.7, 1],
    'reg_lambda': [0.1,0.3,  0.5,0.7, 1]
    }

grid_cv = GridSearchCV(lgbm_wrapper, param_grid=param, cv=2, verbose=1, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print('최적 하이퍼 파라미터: \n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

Fitting 2 folds for each of 1400 candidates, totalling 2800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 888 tasks      | elapsed:   37.8s


최적 하이퍼 파라미터: 
 {'min_child_samples': 10, 'num_leaves': 10, 'reg_alpha': 0.1, 'reg_lambda': 0.3}
최고 예측 정확도: 0.9648


[Parallel(n_jobs=-1)]: Done 2800 out of 2800 | elapsed:   56.0s finished


> min_child_samples는 최종 결정 클래스인 Leaf Node가 되기 위해서 최소한으로 필요한 데이터 개체의 수를 의미하며, 과적합을 제어하는 파라미터이다. 이 파라미터의 최적값은 훈련 데이터의 개수와 num_leaves에 의해 결정된다. 너무 큰 숫자로 설정하면 예측률이 떨어지는 과소적합(under-fitting)이 일어날 수 있으며, 아주 큰 데이터셋이라면 적어도 수백~수천 정도로 가정하는 것이 편리하다

> num_leaves는 개별 트리가 가질 수 있는 최대 리프의 개수이고 LightGBM 모델의 복잡도를 제어하는 주요 파라미터이다. 일반적으로 계수를 높이면 정확도가 올라가지만 트리의 깊이가 깊어지고 보델이 복잡도가 커져 과적합이 될 가능성이 높다.

> reg_alpha, reg_lambda는 피처 개수가 많을 경우 적용을 검토하며 값이 클수록 과적합 감소 효과가 있다.

In [8]:
from sklearn.model_selection import GridSearchCV

param = {
    'min_child_samples':range(5, 30, 5),
    'num_leaves': range(5, 30, 5),
    'reg_alpha': [0.05, 0.07, 0.09, 0.1, 0.13, 0.15],
    'reg_lambda': [0.25, 0.27, 0.29, 0.3, 0.33, 0.35]
    }

grid_cv = GridSearchCV(lgbm_wrapper, param_grid=param, cv=2, verbose=1, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print('최적 하이퍼 파라미터: \n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

Fitting 2 folds for each of 900 candidates, totalling 1800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 888 tasks      | elapsed:   56.8s
[Parallel(n_jobs=-1)]: Done 1588 tasks      | elapsed:  1.7min


최적 하이퍼 파라미터: 
 {'min_child_samples': 10, 'num_leaves': 5, 'reg_alpha': 0.05, 'reg_lambda': 0.25}
최고 예측 정확도: 0.9648


[Parallel(n_jobs=-1)]: Done 1800 out of 1800 | elapsed:  1.9min finished


In [9]:
from sklearn.model_selection import GridSearchCV

param = {
    'min_child_samples':[10],
    'num_leaves': range(1, 5, 1),
    'reg_alpha': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06],
    'reg_lambda': [0.21, 0.22, 0.23, 0.24, 0.25, 0.26]
    }

grid_cv = GridSearchCV(lgbm_wrapper, param_grid=param, cv=2, verbose=1, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print('최적 하이퍼 파라미터: \n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

Fitting 2 folds for each of 144 candidates, totalling 288 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 244 tasks      | elapsed:   10.1s


최적 하이퍼 파라미터: 
 {'min_child_samples': 10, 'num_leaves': 4, 'reg_alpha': 0.01, 'reg_lambda': 0.21}
최고 예측 정확도: 0.9627


[Parallel(n_jobs=-1)]: Done 288 out of 288 | elapsed:   13.1s finished


In [11]:
# 앞서 XGBoost와 동일하게 n_estimators는 400 설정. 
lgbm_wrapper = LGBMClassifier(n_estimators=400, min_child_samples= 10, num_leaves= 10, reg_alpha= 0.1, reg_lambda= 0.3)

# LightGBM도 XGBoost와 동일하게 조기 중단 수행 가능. 
evals = [(X_test, y_test)]
lgbm_wrapper.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="logloss", 
                 eval_set=evals, verbose=True)
preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)

[1]	valid_0's binary_logloss: 0.565791	valid_0's binary_logloss: 0.565791
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.506052	valid_0's binary_logloss: 0.506052
[3]	valid_0's binary_logloss: 0.460279	valid_0's binary_logloss: 0.460279
[4]	valid_0's binary_logloss: 0.41831	valid_0's binary_logloss: 0.41831
[5]	valid_0's binary_logloss: 0.383392	valid_0's binary_logloss: 0.383392
[6]	valid_0's binary_logloss: 0.352581	valid_0's binary_logloss: 0.352581
[7]	valid_0's binary_logloss: 0.327076	valid_0's binary_logloss: 0.327076
[8]	valid_0's binary_logloss: 0.302658	valid_0's binary_logloss: 0.302658
[9]	valid_0's binary_logloss: 0.283297	valid_0's binary_logloss: 0.283297
[10]	valid_0's binary_logloss: 0.264996	valid_0's binary_logloss: 0.264996
[11]	valid_0's binary_logloss: 0.249311	valid_0's binary_logloss: 0.249311
[12]	valid_0's binary_logloss: 0.234398	valid_0's binary_logloss: 0.234398
[13]	valid_0's binary_logloss: 0.222063	valid_0'

In [14]:
# 앞서 XGBoost와 동일하게 n_estimators는 400 설정. 
lgbm_wrapper = LGBMClassifier(n_estimators=400, min_child_samples= 10, num_leaves= 5, reg_alpha= 0.05, reg_lambda= 0.25)

# LightGBM도 XGBoost와 동일하게 조기 중단 수행 가능. 
evals = [(X_test, y_test)]
lgbm_wrapper.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="logloss", 
                 eval_set=evals, verbose=True)
preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)

[1]	valid_0's binary_logloss: 0.566416	valid_0's binary_logloss: 0.566416
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.506513	valid_0's binary_logloss: 0.506513
[3]	valid_0's binary_logloss: 0.45719	valid_0's binary_logloss: 0.45719
[4]	valid_0's binary_logloss: 0.417845	valid_0's binary_logloss: 0.417845
[5]	valid_0's binary_logloss: 0.384396	valid_0's binary_logloss: 0.384396
[6]	valid_0's binary_logloss: 0.356069	valid_0's binary_logloss: 0.356069
[7]	valid_0's binary_logloss: 0.329164	valid_0's binary_logloss: 0.329164
[8]	valid_0's binary_logloss: 0.304588	valid_0's binary_logloss: 0.304588
[9]	valid_0's binary_logloss: 0.285247	valid_0's binary_logloss: 0.285247
[10]	valid_0's binary_logloss: 0.266957	valid_0's binary_logloss: 0.266957
[11]	valid_0's binary_logloss: 0.2503	valid_0's binary_logloss: 0.2503
[12]	valid_0's binary_logloss: 0.236263	valid_0's binary_logloss: 0.236263
[13]	valid_0's binary_logloss: 0.223186	valid_0's bi

> 튜닝전 결과
- [47]	valid_0's binary_logloss: 0.126108	valid_0's binary_logloss: 0.126108
> 튜닝 후 결과 두 가지
- [58]	valid_0's binary_logloss: 0.114421	valid_0's binary_logloss: 0.114421
- [80]	valid_0's binary_logloss: 0.10751	valid_0's binary_logloss: 0.10751

> 결국 다음과 같이 파라미터를 설정하는 것이 예측률을 높일 수 있다.
- min_child_samples= 10, num_leaves= 5, reg_alpha= 0.05, reg_lambda= 0.25