In [241]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from bayes_opt import BayesianOptimization
from datetime import datetime
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, roc_auc_score, f1_score, roc_curve, auc,precision_recall_curve
from sklearn import metrics
from sklearn import preprocessing
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [2]:
#加载数据集
dataset = datasets.load_breast_cancer()

In [3]:
X = dataset['data']
y = dataset['target']

In [4]:
X.shape, Counter(y)

((569, 30), Counter({0: 212, 1: 357}))

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=2)

In [6]:
X_train.shape, X_test.shape, Counter(y_train), Counter(y_test)

((455, 30), (114, 30), Counter({1: 288, 0: 167}), Counter({1: 69, 0: 45}))

In [216]:
# 使用默认参数
params = {'num_leaves':31, 'max_depth':-1, 'learning_rate':0.1, 'n_estimators':100, 'subsample_for_bin':200000, 'objective':'binary', 
         'min_split_gain':0.0, 'min_child_weight':0.001, 'min_child_samples':20, 'subsample':1.0, 'subsample_freq':0, 'colsample_bytree':1.0, 
         'reg_alpha':0.0, 'reg_lambda':0.0, 'random_state':6, 'metric':'auc'}
lgbc = lgb.LGBMClassifier(**params)

# 网格搜索（Grid Search）

## 全局搜索

- 对全部需要调节的参数设置范围和步长，对所有的参数组合进行搜索，找出最优

In [217]:
def GridSearch(clf, params, X, y):
    cscv = GridSearchCV(clf, params, scoring='roc_auc', n_jobs=-1, cv=5)
    cscv.fit(X, y)
    return cscv

In [None]:
%%time
adj_params = {'num_leaves': range(6, 100, 5),
              'max_depth': range(3, 15, 3),
              'min_child_weight': np.arange(0.001, 0.010, 0.001),
              'min_child_samples': np.arange(4, 30, 2),
              'subsample': [round(i,1) for i in np.arange(0.4,1.1,0.2)],
              'subsample_freq': range(0,6,1),
              'colsample_bytree': [round(i,1) for i in np.arange(0.4,1.1,0.2)],
              'reg_alpha': [round(i,2) for i in np.arange(0.0,0.1,0.01)],
              'reg_lambda': [round(i,2) for i in np.arange(0.0,0.1,0.01)]
             }
cscv = GridSearch(lgbc , adj_params , X_train, y_train)

## 手动搜索

In [218]:
train_set = lgb.Dataset( X_train, y_train)

cv_result =lgb.cv(params=params,train_set=train_set,num_boost_round=1000,nfold=5,metrics='auc',early_stopping_rounds=50)

len(cv_result['auc-mean']), cv_result['auc-mean'][-1]

(28, 0.9929415323298103)

In [219]:
params.update({'n_estimators':28})
lgbc = lgb.LGBMClassifier(**params)

In [220]:
%%time
adj_params = {'num_leaves': range(6, 100, 5),
             'max_depth': range(3, 15, 3)
             }
cscv = GridSearch(lgbc , adj_params , X_train, y_train)
print(cscv.best_score_, cscv.best_params_)

0.9922602147861035 {'max_depth': 6, 'num_leaves': 11}
Wall time: 2.75 s


In [221]:
%%time
adj_params = {'num_leaves': [8,11,14],
             'max_depth': [5, 6, 7]
             }
cscv = GridSearch(lgbc , adj_params , X_train, y_train)
print(cscv.best_score_, cscv.best_params_)

0.9922602147861035 {'max_depth': 6, 'num_leaves': 11}
Wall time: 388 ms


In [222]:
params.update({'max_depth': 6, 'num_leaves': 11})
lgbc = lgb.LGBMClassifier(**params)

In [223]:
%%time
adj_params = {'min_child_weight': np.arange(0.001, 0.010, 0.001),
             'min_child_samples': np.arange(4, 30, 2)
             }
cscv = GridSearch(lgbc , adj_params , X_train, y_train)
print(cscv.best_score_, cscv.best_params_)

0.9932855941610078 {'min_child_samples': 16, 'min_child_weight': 0.001}
Wall time: 3.77 s


In [224]:
params.update({'min_child_weight': 0.001, 'min_child_samples': 16})
lgbc = lgb.LGBMClassifier(**params)

In [225]:
%%time
adj_params = {'subsample': [round(i,1) for i in np.arange(0.4,1.1,0.2)],
             'subsample_freq': range(0,6,1),
             'colsample_bytree': [round(i,1) for i in np.arange(0.4,1.1,0.2)]
             }
cscv = GridSearch(lgbc , adj_params , X_train, y_train)
print(cscv.best_score_, cscv.best_params_)

0.9934524960467098 {'colsample_bytree': 0.4, 'subsample': 0.4, 'subsample_freq': 0}
Wall time: 2.31 s


In [226]:
params.update({'colsample_bytree': 0.4, 'subsample': 0.4, 'subsample_freq': 0})
lgbc = lgb.LGBMClassifier(**params)

In [227]:
%%time
adj_params = {'reg_alpha': [round(i,2) for i in np.arange(0.0,0.1,0.01)],
              'reg_lambda': [round(i,2) for i in np.arange(0.0,0.1,0.01)]
             }
cscv = GridSearch(lgbc , adj_params , X_train, y_train)
print(cscv.best_score_, cscv.best_params_)

0.9938586803751743 {'reg_alpha': 0.04, 'reg_lambda': 0.07}
Wall time: 1.69 s


In [249]:
params.update({'reg_alpha': 0.04, 'reg_lambda': 0.07})

In [230]:
print(params)

{'num_leaves': 11, 'max_depth': 6, 'learning_rate': 0.1, 'n_estimators': 28, 'subsample_for_bin': 200000, 'objective': 'binary', 'min_split_gain': 0.0, 'min_child_weight': 0.001, 'min_child_samples': 16, 'subsample': 0.4, 'subsample_freq': 0, 'colsample_bytree': 0.4, 'reg_alpha': 0.04, 'reg_lambda': 0.07, 'random_state': 6, 'metric': 'auc'}


In [250]:
lgbc = lgb.LGBMClassifier(**params)

In [251]:
cv_score = cross_val_score(estimator=lgbc,X=X,y=y,scoring='roc_auc',cv=5,n_jobs=-1).mean()
print(cv_score)

0.9915777611404863


In [252]:
lgbc.fit(X_train,y_train)

probs = lgbc.predict_proba(X_test)[:,1]

test_score = roc_auc_score(y_true=y_test,y_score=probs)
print(test_score)

0.9826086956521739


# 随机搜索（Randomized Search）

In [233]:
params = {'num_leaves':31, 'max_depth':-1, 'learning_rate':0.1, 'n_estimators':100, 'subsample_for_bin':200000, 'objective':'binary', 
         'min_split_gain':0.0, 'min_child_weight':0.001, 'min_child_samples':20, 'subsample':1.0, 'subsample_freq':0, 'colsample_bytree':1.0, 
         'reg_alpha':0.0, 'reg_lambda':0.0, 'random_state':6, 'metric':'auc'}

In [206]:
def RandomSearch(clf, params, X, y, n_iter):
    cscv = RandomizedSearchCV(clf, params, n_iter=n_iter, scoring='roc_auc', n_jobs=-1, cv=5)
    cscv.fit(X, y)
    return cscv

In [234]:
params.update({'n_estimators':28})
lgbc = lgb.LGBMClassifier(**params)

In [235]:
%%time
adj_params = {'num_leaves': range(6, 100, 5),
              'max_depth': range(3, 15, 3),
              'min_child_weight': np.arange(0.001, 0.010, 0.001),
              'min_child_samples': np.arange(4, 30, 2),
              'subsample': [round(i,1) for i in np.arange(0.4,1.1,0.2)],
              'subsample_freq': range(0,6,1),
              'colsample_bytree': [round(i,1) for i in np.arange(0.4,1.1,0.2)],
              'reg_alpha': [round(i,2) for i in np.arange(0.0,0.1,0.01)],
              'reg_lambda': [round(i,2) for i in np.arange(0.0,0.1,0.01)]
             }
cscv = RandomSearch(lgbc , adj_params , X_train, y_train, 1000)

Wall time: 22.5 s


In [236]:
print(cscv.best_params_, cscv.best_score_)

{'subsample_freq': 3, 'subsample': 0.8, 'reg_lambda': 0.03, 'reg_alpha': 0.07, 'num_leaves': 36, 'min_child_weight': 0.002, 'min_child_samples': 8, 'max_depth': 9, 'colsample_bytree': 0.6} 0.9943397800022118


In [237]:
params.update(cscv.best_params_)

In [238]:
print(params)

{'num_leaves': 36, 'max_depth': 9, 'learning_rate': 0.1, 'n_estimators': 28, 'subsample_for_bin': 200000, 'objective': 'binary', 'min_split_gain': 0.0, 'min_child_weight': 0.002, 'min_child_samples': 8, 'subsample': 0.8, 'subsample_freq': 3, 'colsample_bytree': 0.6, 'reg_alpha': 0.07, 'reg_lambda': 0.03, 'random_state': 6, 'metric': 'auc'}


In [255]:
lgbc = lgb.LGBMClassifier(**params)

In [256]:
cv_score = cross_val_score(estimator=lgbc,X=X,y=y,scoring='roc_auc',cv=5,n_jobs=-1).mean()
print(cv_score)

0.9916970817150969


In [257]:
lgbc.fit(X_train,y_train)
probs = lgbc.predict_proba(X_test)[:,1]
roc_auc_score(y_true=y_test,y_score=probs)

0.9864734299516907

# 贝叶斯优化（Bayesian Optimization）

In [292]:
def BayesianSearch(clf, params):
    """贝叶斯优化器"""
    # 迭代次数
    num_iter = 25
    init_points = 5
    # 创建一个贝叶斯优化对象，输入为自定义的模型评估函数与超参数的范围
    bayes = BayesianOptimization(clf, params)
    # 开始优化
    bayes.maximize(init_points=init_points, n_iter=num_iter)
    return bayes

In [274]:
def GBM_evaluate(num_leaves, max_depth, min_child_weight, min_child_samples, subsample, subsample_freq, colsample_bytree, reg_alpha, reg_lambda):
    """自定义的模型评估函数"""

    # 模型固定的超参数
    param = {
        'objective': 'binary',
        'n_estimators': 28,
        'metric': 'auc',
        'learning_rate':0.1,
        'random_state': 6}

    # 贝叶斯优化器生成的超参数
    param['min_child_weight'] = int(num_leaves)
    param['max_depth'] = int(max_depth)
    param['min_child_weight'] = float(min_child_weight)
    param['min_child_samples'] = int(min_child_samples)
    param['subsample'] = float(subsample)
    param['subsample_freq'] = int(subsample_freq)
    param['colsample_bytree'] = float(colsample_bytree)
    param['reg_lambda'] = float(reg_lambda)
    param['reg_alpha'] = float(reg_alpha)
    
    # 5-flod 交叉检验，注意BayesianOptimization会向最大评估值的方向优化，因此对于回归任务需要取负数。
    val = cross_val_score(lgb.LGBMClassifier(**param),X_train, y_train ,scoring='roc_auc', cv=5).mean()
    return val

In [275]:
%%time
# 调参范围
adj_params = {'num_leaves': (6, 100),
              'max_depth': (3, 15),
              'min_child_weight': (0.001, 0.01),
              'min_child_samples': (4, 30),
              'subsample': (0.4, 1.0),
              'subsample_freq': (0, 6),
              'colsample_bytree': (0.4, 1.0),
              'reg_alpha': (0.0, 0.1),
              'reg_lambda': (0.0, 0.1)
             }
# 调用贝叶斯优化
bayesian_result = BayesianSearch(GBM_evaluate, adj_params)

|   iter    |  target   | colsam... | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample | subsam... |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.992   [0m | [0m 0.6131  [0m | [0m 6.529   [0m | [0m 21.09   [0m | [0m 0.008365[0m | [0m 12.43   [0m | [0m 0.07952 [0m | [0m 0.03076 [0m | [0m 0.8642  [0m | [0m 2.161   [0m |
| [95m 2       [0m | [95m 0.992   [0m | [95m 0.6005  [0m | [95m 14.37   [0m | [95m 21.94   [0m | [95m 0.001349[0m | [95m 59.36   [0m | [95m 0.01041 [0m | [95m 0.08223 [0m | [95m 0.8218  [0m | [95m 3.593   [0m |
| [95m 3       [0m | [95m 0.9922  [0m | [95m 0.7575  [0m | [95m 7.302   [0m | [95m 23.36   [0m | [95m 0.009218[0m | [95m 52.97   [0m | [95m 0.08787 [0m | [95m 0.0914  [0m | [95m 0.8791  [0m | [95m 3.026   [0m |
| [0m 4       [0m | [0m 0.989   [0m | [

In [291]:
print('Best bayesian score: %s\n'%bayesian_result.max['target'])
print('Best bayesian params: %s'%str(bayesian_result.max['params']))

Best bayesian score: 0.9924000871315913

Best bayesian params: {'colsample_bytree': 0.4089206496876992, 'max_depth': 14.773819480466189, 'min_child_samples': 4.480196571648507, 'min_child_weight': 0.004031333374775868, 'num_leaves': 99.97968887651052, 'reg_alpha': 0.03214605103841293, 'reg_lambda': 0.08320635117193971, 'subsample': 0.7843722149752079, 'subsample_freq': 2.770536702654322}


In [281]:
params.update(bayesian_result.max['params'])

In [283]:
for p in ['num_leaves','max_depth','min_child_samples','subsample_freq']:
    params[p] = int(params.get(p))

In [284]:
print(params)

{'num_leaves': 99, 'max_depth': 14, 'learning_rate': 0.1, 'n_estimators': 28, 'subsample_for_bin': 200000, 'objective': 'binary', 'min_split_gain': 0.0, 'min_child_weight': 0.004031333374775868, 'min_child_samples': 4, 'subsample': 0.7843722149752079, 'subsample_freq': 2, 'colsample_bytree': 0.4089206496876992, 'reg_alpha': 0.03214605103841293, 'reg_lambda': 0.08320635117193971, 'random_state': 6, 'metric': 'auc'}


In [285]:
lgbc = lgb.LGBMClassifier(**params)

In [286]:
cv_score = cross_val_score(estimator=lgbc,X=X,y=y,scoring='roc_auc',cv=5,n_jobs=-1).mean()
print(cv_score)

0.9910362692953587


In [287]:
lgbc.fit(X_train,y_train)
probs = lgbc.predict_proba(X_test)[:,1]
roc_auc_score(y_true=y_test,y_score=probs)

0.9832528180354266