说明：  
本文档使用的数据来自 Data clean 1 中已填充缺失值的数据。

<font face="微软雅黑" size=5> Contents： </font>  
1. 准备数据集  
2. 过拟合XGB模型  
3. GridSearchCV进行超参数交叉验证  
&ensp;&ensp;&ensp;&ensp;3.1 自定义网格搜索函数  
4. 交叉验证  
5. 训练最终XGB模型并预测未知样本集  
6. 提交结果


In [1]:
import numpy as np
import pandas as pd
import time
# from scipy import special
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials,rand
import xgboost
from xgboost import XGBClassifier
from xgboost import plot_importance
import time
from sklearn.externals import joblib
import pickle
import copy
import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
# Display the per-model AUC summary table (its columns appear in the output below).
auc_result

Unnamed: 0,best_iteration,model_num,train_auc,validation_auc
1,6,1,0.970433,0.727055
2,2,2,0.924242,0.736062
3,1,3,0.890393,0.756073
4,5,4,0.962529,0.742586
5,1,5,0.884069,0.7343


# 3. GridSearchCV 进行超参数交叉验证

## 3.1 自定义网格搜索函数

In [15]:
def Combination_enum(d):
    """Enumerate every combination of the candidate values in ``d``.

    Parameters
    ----------
    d : dict
        Maps each parameter name to an iterable of candidate values.

    Returns
    -------
    comb : list of list
        One inner list per combination. Values are ordered like
        ``param_names``; the first key of ``d`` varies fastest and the last
        key of ``d`` varies slowest.
    param_names : list
        The keys of ``d`` in reverse insertion order.
    df : pandas.DataFrame
        ``comb`` as a table whose columns are ``param_names``.
    """
    # Local import so the block does not depend on a new file-level import.
    # The Cartesian product replaces the earlier hand-written recursion,
    # which deep-copied the partial combination once per candidate value at
    # every step and then used only one of those copies.
    from itertools import product

    reversed_items = list(d.items())[::-1]
    param_names = [name for name, _ in reversed_items]
    value_lists = [values for _, values in reversed_items]

    # product() varies its last argument fastest, so passing the value lists
    # in reversed key order makes the first key of d the fastest-changing
    # one. deepcopy keeps every row independent of d and of the other rows.
    comb = [copy.deepcopy(list(row)) for row in product(*value_lists)]

    df = pd.DataFrame(comb, columns=param_names)
    return comb, param_names, df

def myGridSearchCV2(XGB_param,Combination_enum,param_names,X,
                    y,K,random_state):
    """Grid-search XGBClassifier hyper-parameters with stratified K-fold CV.

    For every candidate in ``Combination_enum`` one model per fold is fitted
    with early stopping (10 rounds, metrics "logloss" and "auc") and the
    model's ``best_score`` is collected.

    Parameters
    ----------
    XGB_param : dict
        Fixed XGBClassifier keyword arguments. The dict is not modified.
    Combination_enum : sequence of sequences
        Candidate value lists, one per combination (e.g. the ``comb`` result
        of the ``Combination_enum`` function; inside this body the parameter
        shadows that function's name).
    param_names : sequence
        Parameter name for each position of a candidate.
    X, y : array-like
        Features and labels. They are converted with ``np.asarray`` and the
        folds are taken from them.
    K : int
        Number of folds.
    random_state : int
        Seed of the shuffled ``StratifiedKFold`` split.

    Returns
    -------
    Info : pandas.DataFrame
        One row per (candidate, fold) with columns "paramcombdict",
        "model_round" (1-based fold number), "best_iteration", "train_auc"
        and "validation_auc" (both read at the best iteration).
    records : pandas.DataFrame
        One row per candidate with columns "paramcombdict" and "mean_auc"
        (mean of the folds' ``best_score``), sorted by "mean_auc" descending.
    """
    # Slice the folds from the arguments. The previous version indexed the
    # notebook globals X_tr_train / y_tr_train and ignored X and y.
    X = np.asarray(X)
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=K,shuffle=True,random_state=random_state)
    folds = []
    for train_index, validation_index in skf.split(X,y):
        folds.append((X[train_index], y[train_index],
                      X[validation_index], y[validation_index]))

    records = []
    Info = []
    for candidate in Combination_enum:
        paramcombdict = {}
        for i in range(len(param_names)):
            paramcombdict[param_names[i]] = candidate[i]

        # Work on a copy so the caller's fixed-parameter dict is not
        # overwritten with the tuned values of the last candidate.
        XGB_param_fix = dict(XGB_param)
        XGB_param_fix.update(paramcombdict)

        Scores = []
        for fold_no, (X_fit, y_fit, X_val, y_val) in enumerate(folds, start=1):
            model = XGBClassifier(**XGB_param_fix)
            model.fit(X_fit, y_fit,
                      eval_set = [(X_fit, y_fit), (X_val, y_val)],
                      eval_metric =["logloss","auc"],
                      early_stopping_rounds = 10,verbose = True)

            # Per-fold details, read at the model's best iteration.
            n = model.best_iteration
            r = model.evals_result()
            Info.append([
                paramcombdict,                 # candidate hyper-parameters
                fold_no,                       # 1-based fold number
                n,                             # best boosting round
                r["validation_0"]["auc"][n],   # AUC on the fold's training part
                r["validation_1"]["auc"][n],   # AUC on the fold's validation part
            ])

            Scores.append(model.best_score)
            print("--------------------------------")

        # The message says "five" regardless of K; the list holds K scores.
        print("When param is %s, five scores are :" % paramcombdict)
        print(Scores)
        print("--------------------------------")
        print("--------------------------------")
        print("--------------------------------")

        records.append([paramcombdict,np.mean(Scores)])

    Info = pd.DataFrame(Info,columns=["paramcombdict","model_round",
                                      "best_iteration",
                                      "train_auc","validation_auc"])
    records = pd.DataFrame(records,columns=["paramcombdict","mean_auc"]
                          ).sort_values(by="mean_auc",ascending=False)

    return Info, records

In [16]:
# Fixed (non-tuned) XGBoost parameters used for the grid search.
# "default" below repeats the original author's notes and was not re-checked.

XGB_param2 = dict(
    n_estimators=1000,       # number of trees: build the model with up to 1000 trees
    silent = False,          # print messages while boosting runs; they go to the backing cmd / PowerShell console
    objective='binary:logistic',        # loss function
    # nthread = -1,             # default; number of parallel threads
    scale_pos_weight = scale_pos_weight_value,   # weight of the positive class, to counter class imbalance
    base_score = 0.5,         # default
    seed = 6,                           # random seed (row sampling etc. is random)
    missing = np.nan,                    # value that marks a missing entry; the author notes this data has none
    max_delta_step = 0,        # default; maximum step allowed for each tree's weight change
    colsample_bylevel = 1,    # default; feature sampling ratio per tree level, left untuned
    reg_alpha = 0,                      # default; L1 regularisation coefficient
    reg_lambda = 1,                     # default; L2 regularisation coefficient
    gamma=0,            # loss reduction required to split a node
    min_child_weight = 1,               # default; minimum sum of instance weights in a leaf
    colsample_bytree=1,                 # default; feature sampling ratio per tree (key was misspelled "colsample_btree")
    )

In [17]:
# Hyper-parameters to tune: each key maps to the candidate values to try.

XGB_param_grid2 = {
    "learning_rate": [0.1, 0.2, 0.3, 0.5],   # learning rate of the boosted trees
    "max_depth": [1, 3, 5, 7, 9],            # maximum tree depth
    "subsample": [0.8, 1.0],                 # fraction of rows sampled when training each tree
}


In [18]:
# Expand the tuning grid into every value combination. param_grid_df holds the
# same combinations as a table (the output below presumably shows its first rows).
comb,param_names,param_grid_df = Combination_enum(XGB_param_grid2)

Unnamed: 0,subsample,max_depth,learning_rate
0,0.8,1,0.1
1,0.8,1,0.2
2,0.8,1,0.3
3,0.8,1,0.5
4,0.8,3,0.1
5,0.8,3,0.2
6,0.8,3,0.3
7,0.8,3,0.5
8,0.8,5,0.1
9,0.8,5,0.2


In [None]:
# Cross-validation: evaluate every hyper-parameter combination with 3 folds
# and report the elapsed wall-clock time in hours.

timestart=time.time()
Info, records = myGridSearchCV2(XGB_param2,comb,param_names,
                X_tr_train.values,y_tr_train.values,K=3,random_state=4)
print("Time: {:.3f} hours".format((time.time() - timestart)/3600))

In [24]:
# Fixed parameters plus the tuned hyper-parameters for the final models.
# "default" below repeats the original author's notes and was not re-checked.
XGB_param4 = dict(

    n_estimators=1000,                  # number of trees: build the model with up to 1000 trees
    # silent = True,                    # default; do not print messages while boosting runs
    objective='binary:logistic',        # loss function
    # nthread = -1,                     # default; number of parallel threads
    gamma=0,                            # default; loss reduction required to split a node
    min_child_weight = 1,               # default; minimum sum of instance weights in a leaf
    max_delta_step = 0,                 # default; maximum step allowed for each tree's weight change
    colsample_bytree=1,                 # default; feature sampling ratio per tree (key was misspelled "colsample_btree")
    colsample_bylevel = 1,              # default; feature sampling ratio per tree level
    reg_alpha = 0,                      # default; L1 regularisation coefficient
    reg_lambda = 1,                     # default; L2 regularisation coefficient
    scale_pos_weight = scale_pos_weight_value,   # weight of the positive class, to counter class imbalance
    base_score = 0.5,                   # default
    seed = 1,                           # random seed (row sampling etc. is random)
    missing = np.nan,                   # value that marks a missing entry in the data

    learning_rate = 0.1,         # best values found by tuning; the author notes they coincide with the defaults
    max_depth = 3,
    subsample = 1.0
    )

In [None]:
# Fit on the training split (X_tr_train, y_tr_train) and monitor logloss / AUC
# on both the training split and the hold-out split (X_tr_test, y_tr_test),
# stopping early after 10 rounds without improvement.

modelXGB2 = XGBClassifier(**XGB_param4)
timestart = time.time()
modelXGB2.fit(X_tr_train,y_tr_train,eval_set = [(X_tr_train,y_tr_train),(X_tr_test, y_tr_test)],
              eval_metric =["logloss","auc"] ,early_stopping_rounds = 10,verbose = True)
print("Time: {:.2f} seconds".format(time.time() - timestart))

In [26]:
# Boosting round selected by early stopping (166 in the recorded output).
modelXGB2.best_iteration

166

In [27]:
# Tree limit matching the best round (167 in the recorded output, i.e. best_iteration + 1).
modelXGB2.best_ntree_limit

167

In [28]:
# Evaluation score at the best round (presumably the hold-out AUC — confirm against the fit log).
modelXGB2.best_score

0.798225

In [29]:
def ks_statistic(y_true,y_predicted_proba):
    """Return the K-S statistic of a set of predicted scores.

    The value is the largest absolute difference between the true-positive
    rate and the false-positive rate over all thresholds returned by
    ``metrics.roc_curve``, with label 1 as the positive class.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        y_true, y_predicted_proba, pos_label=1)
    rate_gap = abs(true_pos_rate - false_pos_rate)
    return rate_gap.max()


def ks_curve(y_true,y_predicted_proba):
    """Plot the K-S curve of a set of predicted scores.

    Against the ROC thresholds (label 1 is the positive class) it draws
    ``1 - TPR`` ("overdue") and ``TPR - FPR`` ("K-S") on the left axis and
    ``1 - FPR`` ("normal") on a twin right axis, then shows the figure.
    Returns None.
    """
    fpr,tpr,thresholds = metrics.roc_curve(y_true,y_predicted_proba,pos_label=1)

    font1 = {'family': 'Calibri','weight': 'normal','size': 18} # axis-label font
    font2 = {'family': 'Calibri','weight': 'normal','size': 23} # title font

    fig = plt.figure(figsize = (6,8))
    ax = fig.add_subplot(111)
    ax.plot(thresholds,1.0-tpr,label="overdue",color="navy")
    ax_t = ax.twinx()
    ax_t.plot(thresholds,1.0-fpr,label="normal",color="g")
    ax.plot(thresholds,tpr-fpr,label="K-S",color="darkorange")
    ax.legend(loc=2)
    ax_t.legend(loc=9)
    ax_t.set_xlim(0.0, 1.0)
    ax_t.set_ylim(0.0, 1.0)
    ax.set_title('K-S curve',fontdict=font2 )
    ax.set_xlabel('threshold',fontdict=font1,labelpad= 2)
    ax.set_ylabel('True Positive Rate',fontdict=font1,labelpad= 6)
    # Label the twin axis. Both labels used to be set on `ax`, so the second
    # call overwrote the first and the right-hand axis had no label.
    ax_t.set_ylabel('False Positive Rate',fontdict=font1,labelpad= 6)
    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.0, 1.0)

    plt.show()

In [30]:
# Score the hold-out split with only the trees up to the best early-stopping
# round. The limit is read from the fitted model (167 in the recorded run)
# rather than hard-coded, so the cell stays correct when the model is refitted.
y_predict_proba2 = modelXGB2.predict_proba(X_tr_test, ntree_limit=modelXGB2.best_ntree_limit)
ks_value2 = ks_statistic(np.array(y_tr_test),y_predict_proba2[:,1])
print("ks_value2 is: ",ks_value2)

ks_value2 is:  0.45967897552846243



# 4. 交叉验证  
&ensp;&ensp;以上过程建立在一次性二八划分的基础上：用八份数据通过 GridSearchCV + hyperopt 得到最佳超参数，再在该超参数下用这八份数据训练得到最佳模型，最后用该模型对剩下的两份数据进行预测并计算 AUC 和 KS。  
&ensp;&ensp;一次性二八划分是否过于单一？我认为可以在该超参数下使用交叉验证，多计算几个验证分数：把整个训练集 X_tr、y_tr 做五折划分，在该超参数下训练 5 个模型，每个模型对各自的验证集进行预测，得到 5 组 AUC 和 KS 评分，然后查看这 5 组评分并计算均值。

In [35]:
# Split the whole training set (X_tr, y_tr) into 5 shuffled stratified folds
# and keep each fold's training / validation arrays in parallel lists.
skf = StratifiedKFold (n_splits=5,shuffle=True,random_state=9)  
Xta = []   # training features per fold
Yta = []   # training labels per fold
Xva = []   # validation features per fold
Yva = []   # validation labels per fold
for train_index, validation_index in skf.split(X_tr.values,y_tr.values):  
    Xta.append(X_tr.values[train_index])
    Yta.append(y_tr.values[train_index])
    Xva.append(X_tr.values[validation_index])
    Yva.append(y_tr.values[validation_index])

In [36]:
# Train one model per fold with early stopping (10 rounds) and collect each
# model's best_score, then print the scores, their mean and the elapsed time.
timestart = time.time()
Scores2=[]
Models2=[XGBClassifier(**XGB_param4) for i in range(len(Xta))]  # one model per fold (5 here)
for i in range(len(Xta)):
    Models2[i].fit(Xta[i],Yta[i], eval_set = [(Xta[i],Yta[i]),( Xva[i],
                                                               Yva[i])],
                   eval_metric =["logloss","auc"] ,
                   early_stopping_rounds = 10,verbose = True)
    Scores2.append(Models2[i].best_score)
    print("-------------------------------")
print("Time: {:.2f} seconds".format(time.time() - timestart))
# NOTE(review): the label says "Scores1" but the values printed are Scores2.
print("Scores1 : ", Scores2)
print("The mean auc is :", np.mean(Scores2))


In [39]:
# Summarise the cross-validation models: for each one record its 1-based
# number, its best iteration and the train / validation AUC at that iteration.
roundXGB2=[]
train_auc2=[]
validation_auc2=[]
best_iteration2=[]

for i in range(len(Scores2)):
    model = Models2[i]
    best_round = model.best_iteration
    history = model.evals_result()   # fetched once per model instead of once per metric

    roundXGB2.append(i+1)
    train_auc2.append(history["validation_0"]["auc"][best_round])
    validation_auc2.append(history["validation_1"]["auc"][best_round])
    best_iteration2.append(best_round)

d = {"model_round":roundXGB2, "train_auc": train_auc2 ,
     "validation_auc": validation_auc2 , "best_iteration": best_iteration2 }
# Index by the model numbers (1..n) rather than a literal [1,2,3,4,5], so the
# table still builds when the number of folds is not 5.
auc_result2 = pd.DataFrame(d,index=roundXGB2)


# 5. 训练最终XGB模型并预测未知样本集

In [42]:
# Train the final XGB model on the full training set and predict class
# probabilities for the unlabeled set X_te.
# NOTE(review): this fit passes no eval_set / early stopping, so the model uses
# n_estimators from XGB_param4 as-is, while the validation runs above stopped
# early — confirm this is intended.

modelXGB4 = XGBClassifier(**XGB_param4)  # XGB_param4 = tuned hyper-parameters + fixed parameters
modelXGB4.fit(X_tr, y_tr)
proba_prediction4 = modelXGB4.predict_proba(X_te)

In [43]:
# Collect the predictions into a table: take the second probability column
# (presumably the positive class), keep X_te's index and turn it into a column.

XGBpre = pd.DataFrame(proba_prediction4[:,1],index=X_te.index,
                      columns=["positive_proba"]).reset_index()
XGBpre.columns = ["userid","probability"]  # rename columns to match the official submission example

In [45]:
# Write the submission table to CSV without the DataFrame index.
XGBpre.to_csv(r'F:\RiskPre2\result\XGBpre.csv',index=False)

In [46]:
# Persist the fitted final model with joblib.
# NOTE(review): the variable is modelXGB4 but the file is named modelXGB5.pkl — confirm the intended name.
joblib.dump(modelXGB4, r'F:\RiskPre2\result\modelXGB5.pkl')

['F:\\RiskPre2\\result\\modelXGB5.pkl']