## 调参
## 调参时注意过拟合
### 一、手动调参
1. adjustment_xgb_parameters(x_train=None, y_train=None, fig_ylim=3)
2. adjust_lgb_parameters(x_train=None, y_train=None, x_test=None, y_test=None, xlim=None)

### 二、网格搜索

### 三、调参顺序
1. n_estimators
2. eta
3. gamma
4. max_depth
5. 采样
6. 抽样参数（纵向抽样影响更大）
7. 正则化

In [1]:
# 学习曲线调参 一般的函数
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import KFold, cross_val_score
from lightgbm import LGBMClassifier

def adjust_lgbc_param(train_x=None, train_y=None, test_x=None, test_y=None, metric='accuracy'):
    """Plot a cross-validated learning curve over ``max_depth`` for LGBMClassifier.

    For every ``max_depth`` in ``range(1, 10)`` a binary LGBMClassifier is
    evaluated with 5-fold cross-validation on the training data using five
    scorers (accuracy, precision, recall, f1, roc_auc). For each scorer the
    best mean score and the ``max_depth`` that produced it are printed, and
    all five curves are drawn on a single figure.

    Args:
        train_x: Training features handed to scikit-learn cross-validation.
        train_y: Training labels (the model uses ``objective='binary'``).
        test_x: Unused; kept so existing callers keep working.
        test_y: Unused; kept so existing callers keep working.
        metric: Unused; kept so existing callers keep working.

    Returns:
        None. Results are printed and plotted.
    """
    # Local import: the notebook's import cell only brings in cross_val_score.
    from sklearn.model_selection import cross_validate

    # Search range for the parameter being tuned (max_depth).
    start, end, step = 1, 10, 1
    n_cv = 5
    depth_values = range(start, end, step)

    # (sklearn scorer name, print prefix, legend label, line colour)
    tracked = [
        ('accuracy', 'max(accuracy):  ', 'accuracy', 'red'),
        ('precision', 'max(precision): ', 'precision', 'green'),
        ('recall', 'max(recall):    ', 'recall', 'blue'),
        ('f1', 'max(f1):        ', 'f1', 'orange'),
        ('roc_auc', 'max(roc_auc):   ', 'auc', 'pink'),
    ]
    scorer_names = [entry[0] for entry in tracked]
    history = {name: [] for name in scorer_names}

    for depth in depth_values:
        lgbc = LGBMClassifier(
                             boosting_type='gbdt',
                             class_weight='balanced',
                             objective='binary',
                             colsample_bytree=1.0,
                             importance_type='split',  # used together with feature_importance
                             n_estimators=81,
                             learning_rate=0.1,
                             max_depth=depth,
                             min_child_samples=20,
                             min_child_weight=0.001,
                             min_split_gain=0.0,
                             num_leaves=31,
                             reg_alpha=0.0,
                             reg_lambda=0.0,
                             silent=True,
                             subsample=1.0,
                             subsample_for_bin=200000,
                             subsample_freq=0)
        # One multi-metric cross-validation per depth instead of refitting the
        # same folds once per scorer.
        cv_scores = cross_validate(lgbc, train_x, train_y, cv=n_cv, scoring=scorer_names)
        for name in scorer_names:
            history[name].append(cv_scores['test_' + name].mean())

    for name, prefix, _label, _colour in tracked:
        scores = history[name]
        best = max(scores)
        # Report the max_depth value that was actually evaluated at the best
        # position (start + position * step), not a value shifted by one.
        best_depth = depth_values[scores.index(best)]
        print('%s%f, index: %d' % (prefix, best, best_depth))

    fig = plt.figure(figsize=[20,20])
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('accuracy')
    for name, _prefix, label, colour in tracked:
        ax.plot(depth_values, history[name], c=colour, label=label)
    ax.legend(fontsize="xx-large")
    plt.show()
# NOTE: x_train, y_train, x_test and y_test are not defined in this cell; they
# must already exist in the session, otherwise this call raises NameError.
adjust_lgbc_param(x_train, y_train, x_test, y_test)

NameError: name 'x_train' is not defined

In [None]:
def adjustment_xgb_parameters(x_train=None, y_train=None, fig_ylim=3):
    """Compare 5-fold ``xgb.cv`` curves of three XGBoost parameter sets.

    Three parameter dictionaries are cross-validated for 200 boosting rounds
    and their train/test curves are overlaid on one figure:

    * ``param1`` ("original"): the explicit starting configuration.
    * ``param2`` ("last"): the best configuration found so far.
    * ``param3`` ("this"): the configuration currently being tried.

    Edit ``param2`` / ``param3`` in place while tuning; each set is now
    actually passed to ``xgb.cv`` (previously all three runs used ``param1``,
    so the curves were always identical).

    Args:
        x_train: Training features used to build the ``xgb.DMatrix``.
        y_train: Training labels used to build the ``xgb.DMatrix``.
        fig_ylim: Upper limit of the plot's y-axis.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Local import so the function does not depend on a later notebook cell
    # having executed `import xgboost as xgb` first.
    import xgboost as xgb

    fig, ax = plt.subplots(1, figsize=(15, 8))
    ax.set_ylim(top=fig_ylim)
    ax.grid()

    dfull = xgb.DMatrix(x_train, y_train)
    num_round = 200
    rounds = range(1, num_round + 1)

    # Init parameter
    param1 = {'verbosity': 1,  # -- global parameter
              'objective': 'binary:logistic',  # -- task parameter
              'eval_metric': 'auc',
              "subsample": 1,  # -- tree booster parameter
              "max_depth": 6,
              "eta": 0.3,
              "gamma": 0,
              "lambda": 1,
              "alpha": 0,
              "colsample_bytree": 1,
              "colsample_bylevel": 1,
              "colsample_bynode": 1,
              }

    # Usable parameter
    param2 = {'verbosity': 1,
              'objective': 'binary:logistic',
              'eval_metric': 'auc'
              }

    # Adjusting parameter
    param3 = {'verbosity': 1,
              'objective': 'binary:logistic',
              'eval_metric': 'auc'
              }

    # (parameter set, legend suffix, train colour, test colour)
    runs = [
        (param1, "original", "red", "orange"),
        (param2, "last", "green", "blue"),
        (param3, "this", "gray", "pink"),
    ]
    for params, tag, train_colour, test_colour in runs:
        cvresult = xgb.cv(params=params, dtrain=dfull, num_boost_round=num_round, nfold=5)
        # Columns 0 and 2 are presumably the train-mean and test-mean of the
        # eval metric in the xgb.cv result frame -- confirm against the
        # installed xgboost version.
        ax.plot(rounds, cvresult.iloc[:, 0], c=train_colour, label="train," + tag)
        ax.plot(rounds, cvresult.iloc[:, 2], c=test_colour, label="test," + tag)

    ax.legend(fontsize="xx-large")
    plt.show()
# NOTE: relies on x_train / y_train already existing in the session; they are
# not defined anywhere in the visible cells.
adjustment_xgb_parameters(x_train, y_train)

In [None]:
# Adjust the param of lightgbm   use eval_result = {} 
def adjust_lgb_parameters(x_train=None, y_train=None, x_test=None, y_test=None, xlim=None):
    """Overlay LightGBM training/validation metric curves for three parameter sets.

    Three boosters are trained with ``lgb.train`` on the same data, each
    recording its evaluation history on both the training and validation set,
    and each history is drawn on the same axes with ``lgb.plot_metric``:

    1. baseline parameters,
    2. baseline + lower learning rate, fewer leaves and L1/L2 regularisation,
    3. set 2 + ``subsample=0.5``.

    Candidate metrics noted by the author: auc / binary_logloss / binary_error.

    Args:
        x_train: Training features for ``lgb.Dataset``.
        y_train: Training labels for ``lgb.Dataset``.
        x_test: Validation features for ``lgb.Dataset``.
        y_test: Validation labels for ``lgb.Dataset``.
        xlim: Forwarded to ``lgb.plot_metric`` as the x-axis limits.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Local import: the notebook only imports LGBMClassifier from lightgbm,
    # so the `lgb` alias would otherwise be undefined here.
    import lightgbm as lgb

    train_data_l = lgb.Dataset(x_train, label=y_train)
    valid_data_l = lgb.Dataset(x_test, label=y_test)

    fig = plt.figure(figsize=(20,16))
    ax = fig.add_subplot(211)

    base_params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbose': -1,
        'metric': 'binary_logloss',
        'n_estimators': 100,
    }
    tuned_params = dict(
        base_params,
        learning_rate=0.03,
        num_leaves=21,
        reg_alpha=0.2,
        reg_lambda=0.2,
    )
    # NOTE(review): LightGBM documents that row subsampling only takes effect
    # when a bagging frequency (subsample_freq / bagging_freq) > 0 is also set;
    # as written this set may train identically to `tuned_params` -- confirm.
    trial_params = dict(tuned_params, subsample=0.5)

    for params in (base_params, tuned_params, trial_params):
        evals_result = {}
        # NOTE(review): the `verbose_eval` / `evals_result` keyword arguments
        # belong to the pre-4.0 LightGBM API; newer releases expect callbacks
        # instead -- confirm against the installed version.
        lgb.train(params, train_set=train_data_l, verbose_eval=100,
                  valid_sets=[train_data_l, valid_data_l], evals_result=evals_result)
        lgb.plot_metric(evals_result, metric=params['metric'], ax=ax, xlim=xlim)

    # Show the figure explicitly, matching the other tuning helpers.
    plt.show()

In [None]:
# Adjust catboost 

In [None]:
# Grid search
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score


# Candidate values for the search: 10 hyper-parameters with 5 values each.
# NOTE(review): that is 5**10 = 9,765,625 combinations, each fitted cv=3 times
# below (roughly 29 million fits, with up to 5000 trees per fit). Consider
# tuning a few parameters at a time or shrinking the grid before running this.
parameters = {
    'max_depth': [5, 10, 15, 20, 25],
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
    'n_estimators': [500, 1000, 2000, 3000, 5000],
    'min_child_weight': [0, 2, 5, 10, 20],
    'max_delta_step': [0, 0.2, 0.6, 1, 2],
    'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
    'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
    'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]
}

# Base estimator handed to GridSearchCV. Arguments that also appear as keys in
# `parameters` are replaced by grid values during the search; the remaining
# ones (objective, gamma, colsample_bylevel, seed, ...) stay fixed.
xlf = xgb.XGBClassifier(max_depth=10,
                        learning_rate=0.01,
                        n_estimators=2000,
                        silent=True,
                        objective='binary:logistic',
                        nthread=-1,
                        gamma=0,
                        min_child_weight=1,
                        max_delta_step=0,
                        subsample=0.85,
                        colsample_bytree=0.7,
                        colsample_bylevel=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        scale_pos_weight=1,
                        seed=1440,
                        missing=None
                       )

# Search over `parameters`, scored by f1 with 3-fold cross-validation.
# NOTE: x_train / y_train are not defined in this cell and must already exist
# in the session.
gsearch = GridSearchCV(xlf, param_grid=parameters, scoring='f1', cv=3, verbose=1)
gsearch.fit(x_train, y_train)

# Report the best cross-validated score and, for every searched parameter,
# the value held by the best estimator.
print("Best score: %0.3f" % gsearch.best_score_)
print("Best parameters set:")
best_parameters = gsearch.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))