In [None]:
import lightgbm as lgb
from lightgbm import LGBMClassifier,LGBMRegressor
from sklearn.model_selection import GridSearchCV
# Baseline booster parameters, used only to pick the number of boosting rounds.
# The objective is 'regression' so that it agrees with the 'rmse' metric passed
# to lgb.cv below and with the objective='regression' used by every later cell.
# NOTE(review): this assumes y_train is a continuous target -- confirm.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'learning_rate': 0.1,
    'num_leaves': 50,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}

# The number of estimators (i.e. the number of boosting iterations / residual
# trees) goes by the names n_estimators / num_iterations / num_round /
# num_boost_round. Set it to a large value first, then read the best iteration
# count off the cross-validation results.
data_train = lgb.Dataset(df_train, y_train, silent=True)
cv_results = lgb.cv(params, data_train, num_boost_round=1000, nfold=5,
                    stratified=False, shuffle=True, metrics='rmse',
                    early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=0)

# Report the iteration count selected by early stopping and its CV score, so
# the n_estimators value used in later cells can be read from this output.
print('best n_estimators:', len(cv_results['rmse-mean']))
print('best cv score:', cv_results['rmse-mean'][-1])

In [None]:
# Build the scikit-learn style LightGBM model using the (learning rate,
# number of estimators) pair chosen from the cross-validation run above.
model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=50,
                              learning_rate=0.1, n_estimators=43, max_depth=6,
                              metric='rmse', bagging_fraction=0.8, feature_fraction=0.8)


def _report_search(search):
    """Print each grid point's CV score, then the best parameters and score.

    Reads ``cv_results_`` instead of ``grid_scores_``: the latter was removed
    from scikit-learn, while ``cv_results_`` is available in every release that
    ships ``sklearn.model_selection``. Printing is used because a bare
    expression in the middle of a notebook cell is never displayed.

    Parameters
    ----------
    search : fitted GridSearchCV
        The search object whose results should be printed.
    """
    results = search.cv_results_
    rows = zip(results['mean_test_score'], results['std_test_score'], results['params'])
    for mean_score, std_score, candidate in rows:
        print('mean: %.5f, std: %.5f, params: %r' % (mean_score, std_score, candidate))
    print('best params:', search.best_params_)
    print('best score:', search.best_score_)


# scikit-learn's `scoring` convention is "higher return values are better than
# lower return values". Squared error is a lower-is-better quantity, so
# scikit-learn provides 'neg_mean_squared_error', which returns the negated
# MSE: the score closest to zero (the largest value) is the best one.
# Note that this scorer is negated MSE, not RMSE.

# Coarse search first ...
params_test1 = {'max_depth': range(3, 8, 2),
                'num_leaves': range(50, 170, 30)}
gsearch1 = GridSearchCV(estimator=model_lgb, param_grid=params_test1,
                        scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
gsearch1.fit(df_train, y_train)
_report_search(gsearch1)

# ... then a finer search around the coarse optimum.
params_test2 = {'max_depth': [6, 7, 8],
                'num_leaves': [68, 74, 80, 86, 92]}
gsearch2 = GridSearchCV(estimator=model_lgb, param_grid=params_test2,
                        scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
gsearch2.fit(df_train, y_train)
_report_search(gsearch2)

In [None]:
# Fine-tune min_child_samples and min_child_weight, with the tree shape fixed
# at num_leaves=80 and max_depth=7.
params_test3 = {
    'min_child_samples': [18, 19, 20, 21, 22],
    'min_child_weight': [0.001, 0.002],
}
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=80,
    max_depth=7,
    learning_rate=0.1,
    n_estimators=43,
    metric='rmse',
    bagging_fraction=0.8,
    feature_fraction=0.8,
)
gsearch3 = GridSearchCV(
    model_lgb,
    params_test3,
    scoring='neg_mean_squared_error',
    n_jobs=4,
    cv=5,
    verbose=1,
)
gsearch3.fit(df_train, y_train)

In [None]:
# Tune the row-sampling (bagging_fraction) and feature-sampling
# (feature_fraction) parameters; bagging_freq is fixed at 5 for this search.
params_test4 = {
    'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
}
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=80,
    max_depth=7,
    learning_rate=0.1,
    n_estimators=43,
    min_child_samples=20,
    metric='rmse',
    bagging_freq=5,
)
gsearch4 = GridSearchCV(
    model_lgb,
    params_test4,
    scoring='neg_mean_squared_error',
    n_jobs=4,
    cv=5,
    verbose=1,
)
gsearch4.fit(df_train, y_train)

In [None]:
# Fine-tune the feature-sampling parameter (feature_fraction) on a denser grid.
params_test5 = {
    'feature_fraction': [0.62, 0.65, 0.68, 0.7, 0.72, 0.75, 0.78],
}
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=80,
    max_depth=7,
    learning_rate=0.1,
    n_estimators=43,
    min_child_samples=20,
    metric='rmse',
)
gsearch5 = GridSearchCV(
    model_lgb,
    params_test5,
    scoring='neg_mean_squared_error',
    n_jobs=4,
    cv=5,
    verbose=1,
)
gsearch5.fit(df_train, y_train)

In [None]:
# Tune the regularization coefficients reg_alpha and reg_lambda over the same
# candidate values, with the other hyperparameters held at the values used in
# the preceding cells.
params_test6 = {'reg_alpha': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5],
                'reg_lambda': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5]}
# learning_rate is 0.1, matching every other grid-search cell; the previous
# literal `0.b1` was a typo that made this cell a SyntaxError.
model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=80,
                              learning_rate=0.1, n_estimators=43, max_depth=7,
                              metric='rmse', min_child_samples=20, feature_fraction=0.7)
gsearch6 = GridSearchCV(estimator=model_lgb, param_grid=params_test6,
                        scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
gsearch6.fit(df_train, y_train)

In [None]:
# Final cross-validation run: a much smaller learning rate (0.005) with a large
# round budget, stopped early once the CV metric stops improving for 50 rounds.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'learning_rate': 0.005,
    'num_leaves': 80,
    'max_depth': 7,
    'min_data_in_leaf': 20,
    'subsample': 1,
    'colsample_bytree': 0.7,
}

data_train = lgb.Dataset(df_train, y_train, silent=True)
cv_results = lgb.cv(
    params,
    data_train,
    num_boost_round=10000,
    nfold=5,
    stratified=False,
    shuffle=True,
    metrics='rmse',
    early_stopping_rounds=50,
    verbose_eval=100,
    show_stdv=True,
)

# The length of the mean-RMSE history is reported as the chosen number of
# estimators, and its last entry as the corresponding CV score.
rmse_mean_history = cv_results['rmse-mean']
print('best n_estimators:', len(rmse_mean_history))
print('best cv score:', rmse_mean_history[-1])