In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
warnings.filterwarnings('ignore')

#### 时间序列处理

In [None]:
# Parse the two date columns from 'YYYY-MM-DD' strings into pandas datetimes.
# NOTE(review): `data1_train` is not created in this cell — it must already exist.
data1_train['d1'] = pd.to_datetime(data1_train['d1'], format='%Y-%m-%d')
data1_train['d2'] = pd.to_datetime(data1_train['d2'],format='%Y-%m-%d')
# 'gap' is the signed number of whole days from d1 to d2 (negative if d2 < d1).
data1_train['gap'] = (data1_train['d2']-data1_train['d1']).dt.days

In [None]:
# Label every bar with its share of `total`, formatted as a percentage.
# NOTE(review): `g` (presumably a matplotlib Axes holding a bar/count plot),
# `sizes` (a list) and `total` are defined outside this cell — confirm.
for p in g.patches:  # annotate each bar of the bar chart
    height = p.get_height()
    sizes.append(height)  # collect the raw bar heights as a side effect
    # Centre the text horizontally on the bar and place it 3 data units above it.
    g.text(p.get_x()+p.get_width()/2.,height + 3,'{:1.2f}%'.format(
        height/total*100),ha="center", fontsize=14)

### 缺失值处理
#### 填补

In [None]:
# 用随机森林对缺失值预测填充函数
def set_missing(df, fill_column=None, rfr=None):
    """Fill missing values of one column with a regressor trained on the others.

    Rows where ``fill_column`` is present are used to train the model; every
    other column of ``df`` serves as a feature. The model then predicts the
    missing entries, which are rounded to zero decimals and written back.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to repair. It is modified in place.
    fill_column : str
        Name of the column whose missing values should be predicted. Required
        in practice even though it defaults to ``None``.
    rfr : estimator, optional
        Any object with ``fit(X, y)`` and ``predict(X)``. Defaults to a
        ``RandomForestRegressor(random_state=0, n_estimators=200,
        max_depth=3, n_jobs=-1)``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``fill_column`` filled.
    """
    # `is None` instead of truthiness: scikit-learn ensembles implement
    # __len__ via `estimators_`, so `not rfr` raises AttributeError on an
    # unfitted model supplied by the caller.
    if rfr is None:
        rfr = RandomForestRegressor(random_state=0,
                                    n_estimators=200, max_depth=3, n_jobs=-1)

    missing_mask = df[fill_column].isnull()
    if not missing_mask.any():
        # Nothing to impute; predicting on an empty matrix would fail.
        return df

    # Features are all columns except the one being filled; the filled column
    # is the regression target (the original code had these two swapped).
    features = df.drop(columns=[fill_column])
    X_train = features.loc[~missing_mask].values
    y_train = df.loc[~missing_mask, fill_column].values
    X_missing = features.loc[missing_mask].values

    rfr.fit(X_train, y_train)
    predicted = rfr.predict(X_missing).round(0)
    df.loc[missing_mask, fill_column] = predicted
    return df

### 类别特征编码

In [None]:
def factor_encode(data, target='SalePrice'):
    """Build ordinal encodings for categorical columns, ordered by mean target.

    For every column other than ``target``, the distinct values are ranked by
    the mean of ``target`` within each value (rank 1 = lowest mean).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the categorical columns and the ``target`` column.
    target : str, default 'SalePrice'
        Name of the numeric column used to order the categories. It may sit
        at any position in ``data``.

    Returns
    -------
    dict
        ``{column_name: {category_value: rank}}``.
    """
    map_dict = {}
    # Select feature columns by name rather than assuming the target is last.
    feature_cols = [col for col in data.columns if col != target]
    for each in feature_cols:
        piv = pd.pivot_table(data, values=target, index=each, aggfunc='mean')
        piv = piv.sort_values(by=target)
        piv['rank'] = np.arange(1, piv.shape[0] + 1)
        map_dict[each] = piv['rank'].to_dict()
    return map_dict
'''对于编码之后的类别特征还可以采用数值特征的处理方式再进行一次处理。如对其进行
相关性分析和共线性检验，以及共线性处理。'''

In [None]:
# Encoding Date features
def date_cyc_enc(df, col, max_vals):
    """Add sine and cosine encodings of a periodic column to ``df``.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written into ``df``
    itself; they hold sin and cos of ``2*pi*df[col]/max_vals``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend (modified in place).
    col : str
        Name of the column to encode.
    max_vals : number
        Period of the cycle used as the divisor.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two new columns.
    """
    angle = 2 * np.pi * df[col] / max_vals
    for suffix, transform in (('_sin', np.sin), ('_cos', np.cos)):
        df[col + suffix] = transform(angle)
    return df

# Add sin/cos encodings for 'day' (period 7) and 'month' (period 12).
# NOTE(review): a period of 7 suggests 'day' holds day-of-week values — confirm.
# `df_train` must already exist; date_cyc_enc also modifies it in place.
df_train = date_cyc_enc(df_train, 'day', 7)
df_train = date_cyc_enc(df_train, 'month', 12)

### 特征选择
#### 树模型进行特征选择

In [None]:
def feature(X, y, N=9, clf=None):
    """Fit a tree model and return N features ranked by importance.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; ``X.columns`` supplies the feature names.
    y : array-like
        Target labels passed to ``clf.fit``.
    N : int, default 9
        Number of (importance, name) pairs to return.
    clf : estimator, optional
        Model exposing ``fit`` and ``feature_importances_``. Defaults to
        ``RandomForestClassifier(max_depth=5, random_state=0)``.

    Returns
    -------
    list of tuple
        ``(importance, column_name)`` pairs sorted by ascending importance,
        truncated to the first ``N``.
    """
    # `is None` instead of truthiness: scikit-learn ensembles implement
    # __len__ via `estimators_`, so `not clf` raises AttributeError when the
    # caller passes an unfitted model.
    if clf is None:
        clf = RandomForestClassifier(max_depth=5, random_state=0)
    clf.fit(X, y)
    ranked = sorted(zip(clf.feature_importances_, X.columns),
                    key=lambda pair: pair[0])
    # NOTE(review): the sort is ascending, so this returns the N *least*
    # important features. Kept as-is for existing callers; if the intent is
    # to select the strongest features, sort with reverse=True.
    return ranked[:N]

### 模型调参

In [None]:
class SelectHyperParameter():
    """Tune hyper-parameters one at a time with repeated grid searches.

    Each pass runs one ``GridSearchCV`` per entry of ``params`` (the other
    parameters stay at their current best values) and adopts the best
    estimator found. Passes repeat until the estimator's parameters stop
    changing or ``n`` passes have been made.

    Parameters
    ----------
    estimator : estimator
        Model to tune; must provide ``get_params()``.
    params : dict
        ``{parameter_name: list_of_candidate_values}``.
    n : int, default 20
        Maximum number of passes over ``params``.
    score : str or callable, optional
        Forwarded to ``GridSearchCV(scoring=...)``.
    cv : int or CV splitter, default 5
        Forwarded to ``GridSearchCV(cv=...)``.
    """

    def __init__(self, estimator, params, n=20, score=None, cv=5):
        self.estimator = estimator  # model being tuned
        self.params = params        # candidate values per parameter
        self.n = n                  # maximum number of passes
        self.score = score          # scoring function / name
        self.cv = cv                # cross-validation strategy
        self.res = {}               # parameters at the start of the current pass
        self.temp = self.estimator.get_params()  # latest best parameters

    def isequal(self):
        """Return True when ``res`` and ``temp`` hold identical parameters."""
        if not (isinstance(self.res, dict) and isinstance(self.temp, dict)):
            return False
        # Compare key sets explicitly: a default-value sentinel such as -1
        # would treat a missing key as equal to a real value of -1.
        if self.res.keys() != self.temp.keys():
            return False
        for key in self.res:
            if self.res[key] != self.temp[key]:
                return False
        return True

    def search(self, X, y):
        """Run the search and return ``(estimator, params, best_score)``.

        ``best_score`` is the ``best_score_`` of the last grid search that
        ran, or ``None`` when no search was executed (``n <= 0``, empty
        ``params``, or parameters already converged).
        """
        best_score = None
        i = 0
        # `<` (not `<=`) so that at most `n` passes are made.
        while i < self.n and not self.isequal():
            self.res = self.temp
            for each in self.params:
                # `iid` is intentionally not passed: the argument was removed
                # from GridSearchCV in later scikit-learn releases.
                grid = GridSearchCV(self.estimator,
                                    param_grid={each: self.params[each]},
                                    scoring=self.score, cv=self.cv)
                grid.fit(X, y)
                self.temp = grid.best_estimator_.get_params()
                self.estimator = grid.best_estimator_
                best_score = grid.best_score_
            i += 1
        return self.estimator, self.temp, best_score

    def __call__(self, X, y):
        """Shorthand for :meth:`search`."""
        return self.search(X, y)

### 模型评估

In [106]:
# 绘制AUC和KS图
from sklearn.metrics import roc_curve, auc
def model_validation(y_test, y_pred):
    """Plot the ROC curve and a KS chart, and return ``(auc, ks)``.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred : array-like
        Predicted scores for the positive class. The KS panel's x-axis is
        fixed to [0, 1], so scores outside that range are not shown.

    Returns
    -------
    tuple
        ``(roc_auc, ks)`` where ``ks`` is the maximum of ``tpr - fpr``.
    """
    # Compute true/false positive rates at every threshold.
    fpr, tpr, threshold = roc_curve(y_test, y_pred, drop_intermediate=False)
    roc_auc = auc(fpr, tpr)
    ks_curve = tpr - fpr
    index = ks_curve.argmax()  # position of the KS statistic

    fig, (ax1, ax2) = plt.subplots(2, 1)
    fig.set_figheight(10)
    fig.set_figwidth(10)

    # Top panel: ROC curve with the chance diagonal.
    ax1.plot(fpr, tpr, color='darkorange', lw=2,
             label='ROC Curve(area=%0.2f)' % roc_auc)
    ax1.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax1.set_xlim(0, 1)
    ax1.set_ylim(0, 1.05)
    # Single labelling call; the x-label previously ended up misspelled.
    ax1.set(xlabel='False Positive Rate', ylabel='True Positive Rate',
            title='ROC')
    ax1.legend(loc='lower right')

    # Bottom panel: 1 - tpr and 1 - fpr against the score threshold; the
    # legend names what is actually drawn.
    ax2.plot(threshold, 1 - tpr, label='1 - tpr')
    ax2.plot(threshold, 1 - fpr, label='1 - fpr')
    ax2.plot(threshold, ks_curve, label='KS %0.2f' % ks_curve[index],
             linestyle='--')
    # Vertical segment marking the largest gap between the two curves.
    ax2.vlines(threshold[index], 1 - fpr[index], 1 - tpr[index])

    ax2.set(xlabel='score')
    ax2.set_xlim(0, 1)
    ax2.set_title('KS Curve')
    ax2.legend(loc='upper left', shadow=True, fontsize='x-large')
    return roc_auc, ks_curve.max()

In [None]:
# --- LightGBM: train, find the best validation round, predict with it ---
# NOTE(review): `param`, `trn_data`, `val_data`, `evals_result`, `valid` and
# `features` are defined outside this cell.
model = lgb.train(param, trn_data, 750, valid_sets = [trn_data,val_data],
    verbose_eval=False, evals_result=evals_result)
x = evals_result['valid_1']['auc']
best = x.index(max(x))  # 0-based position of the highest validation AUC
# Predict while `model` is still the LightGBM booster (`num_iteration` is a
# LightGBM argument). It is a count of boosting rounds, hence `best + 1`.
model.predict(valid[features], num_iteration=best + 1)

# --- XGBoost: train and read back the validation AUC history ---
# NOTE(review): `params`, `tdata` and `vdata` are defined outside this cell.
model = xgb.train(params, tdata,num_boost_round=50,
            evals = [(tdata,'train'), (vdata,'valid')],
        verbose_eval=False, evals_result=evals_result)
x=evals_result['valid']['auc']

def model_fit(estimator, X, y, n_splits=5, random_state=8,
              early_stopping_rounds=10):
    """Cross-validate an early-stopping booster and tabulate AUC per fold.

    A fresh clone of ``estimator`` is fitted on every stratified fold with
    the training and validation parts as evaluation sets, and the AUC at the
    model's ``best_iteration`` is recorded for both.

    Parameters
    ----------
    estimator : estimator
        Model whose ``fit`` accepts ``eval_set``, ``eval_metric``,
        ``early_stopping_rounds`` and ``verbose``, and which exposes
        ``evals_result()`` and ``best_iteration`` afterwards.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary target.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 8
        Seed for the fold shuffling.
    early_stopping_rounds : int, default 10
        Forwarded to ``estimator.fit``.

    Returns
    -------
    pandas.DataFrame
        One row per fold (index 1..n_splits) with columns ``model_num``,
        ``train_auc``, ``validation_auc`` and ``best_iteration``.
    """
    roundXGB, train_auc, validation_auc, best_iteration = [], [], [], []
    X_arr, y_arr = X.values, y.values
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True,
                          random_state=random_state)
    for fold, (train_index, validation_index) in enumerate(
            skf.split(X_arr, y_arr), start=1):
        Xt, yt = X_arr[train_index], y_arr[train_index]
        Xv, yv = X_arr[validation_index], y_arr[validation_index]
        model = clone(estimator)
        # eval_set order matters: index 0 is train, index 1 is validation,
        # which is what the 'validation_0' / 'validation_1' keys refer to.
        model.fit(Xt, yt, eval_set=[(Xt, yt), (Xv, yv)],
                  eval_metric=["auc"],
                  early_stopping_rounds=early_stopping_rounds, verbose=False)
        history = model.evals_result()
        best = model.best_iteration
        roundXGB.append(fold)
        train_auc.append(history['validation_0']['auc'][best])
        validation_auc.append(history["validation_1"]["auc"][best])
        best_iteration.append(best)
    d = {"model_num": roundXGB, "train_auc": train_auc,
         "validation_auc": validation_auc,
         "best_iteration": best_iteration}
    # Index follows the fold numbers instead of a hard-coded [1..5].
    auc_result = pd.DataFrame(d, index=roundXGB)
    return auc_result