# xgb模型

In [18]:
import pandas as pd
import numpy as np
import math
import logging
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [19]:
logger = logging.getLogger('ai')
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s')

In [20]:
IS_PRED = False

## 模型

### 数据预处理

In [21]:
dataset_alpha = pd.read_csv('../features/dataset_alpha.csv')

In [22]:
dataset_beta = pd.read_csv('../features/dataset_beta.csv')

In [23]:
dataset_pred = pd.read_csv('../features/dataset_pred.csv')

In [24]:
if IS_PRED:
    dataset_beta = pd.concat([dataset_alpha, dataset_beta])
    dataset_alpha = dataset_pred

In [25]:
continous = [
    'Coupon_id', 'Distance',
    'Month_of_received', 'Day_of_received',
    'Weekday_of_received', 'Base_consume', 'Discount',
    'Discount_money', 'Coupon_type', 'Coupon_category',
    'Previous_duration', 'Next_duration', 'o1',
    'o2', 'o3', 'o4', 'o5', 'o6', 'o8', 'o7', 'o9', 'o10', 'o12',
    'o14', 'o11', 'o13', 'o16', 'o15', 'o18', 'o19', 'o20', 'o21',
    'o22', 'o23', 'o17', 'o24', 'o25', 'o26', 'o27', 'o28', 'o29',
    'o30', 'o38', 'o31', 'o39', 'o40', 'o41', 'o42', 'o43', 'o32',
    'o33', 'o34', 'o35', 'o36', 'o37', 'o44', 'u0', 'u1', 'u2', 'u3',
    'u4', 'u5', 'u6', 'u7', 'u8', 'u9', 'u10', 'u11', 'u12', 'u13',
    'u14', 'u15', 'u16', 'u17', 'u18', 'u19', 'u20', 'u21', 'u22',
    'u23', 'u24', 'u25', 'ucc0', 'ucc1', 'ucc2', 'ucc3', 'ucc4',
    'ucc5', 'ucc6', 'ucc7', 'ucc8', 'ucc9', 'ucc10', 'ucc11', 'ucc12',
    'uc1', 'uc2', 'uc3', 'uc4', 'uc5', 'uc6', 'uc7', 'uc8', 'uc9',
    'uc10', 'uc11', 'uc12', 'ud0', 'ud1', 'ud2', 'ud3', 'ud4', 'ud5',
    'ud6', 'ud7', 'ud8', 'ud9', 'ud10', 'ud11', 'ud12', 'um0', 'um1',
    'um2', 'um3', 'um4', 'um5', 'um6', 'um7', 'um8', 'um9', 'um10',
    'um16', 'um15', 'um17', 'um11', 'um12', 'um13', 'um14', 'm0', 'm1',
    'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11',
    'm12', 'm13', 'm14', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7',
    'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'cd1', 'cd2', 'cd3',
    'cd4', 'cd5', 'cd6', 'cd7', 'dr1', 'dr2', 'dr3', 'dr4', 'dr5',
    'dr6', 'dr7', 'ou1', 'ou2', 'ou3', 'ou4']

label = ['Label']

In [26]:
features_pipeline = Pipeline([
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnSelector(continous)),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
            ('normalize', Normalizer())
        ])),
#         ('fields', Pipeline([
#             ('extract', ColumnSelector(fields)),
#             ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
#             ('one_hot', OneHotEncoder(categories='auto')),
#             ('to_dense', DenseTransformer())
#         ])),
    ])),
])

features_pipeline.fit(dataset_beta, dataset_beta.Label.values.ravel())

train_dataset_x = features_pipeline.transform(dataset_beta)
train_dataset_y = dataset_beta.Label.values.ravel()

valid_dataset_x = features_pipeline.transform(dataset_alpha)

if not IS_PRED:
    valid_dataset_y = dataset_alpha.Label.values.ravel()

In [27]:
selector_model = xgb.sklearn.XGBClassifier(max_depth=3, n_estimators=100, random_state=0)
selector_model.fit(train_dataset_x, train_dataset_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [28]:
thresh = 0.001
selection = SelectFromModel(selector_model, threshold=thresh, prefit=True)

train_dataset_x = selection.transform(train_dataset_x)
valid_dataset_x = selection.transform(valid_dataset_x)

In [29]:
train_dataset_x.shape[1] / len(continous)

0.4972972972972973

### 模型训练

#### 调参数

In [None]:
parameters = {
    'xgb__learn_rate': [0.01, ],
    'xgb__max_depth': [6],
    'xgb__min_child_weight': [1],
    'xgb__subsample': [0.7,],
    'xgb__colsample_bytree': [0.7,],
    'xgb__colsample_bylevel': [0.7,],
    'xgb__objective': ['rank:pairwise'],
    'xgb__n_estimators': range(100, 401, 100), # 使用1-3都可以被接受
    'xgb__gamma': [0.1,],
    'xgb__reg_alpha': [1,],
    'xgb__reg_lambda': [1,],
    'xgb__max_delta_step': [0,],
    'xgb__scale_pos_weight': [1,],
    'xgb__silent': [True],
    'xgb__eval_metric': ['auc']
}

cv = GridSearchCV(model_pipeline, parameters, scoring = 'roc_auc', n_jobs= 4)
cv.fit(train_dataset_x, train_dataset_y)

In [None]:
# 查看每组评估的具体数据
cv.cv_results_['param_xgb__n_estimators'].data

# 结果训练
cv.cv_results_['mean_train_score']

#### 最优参数训练

In [35]:
model_pipeline = Pipeline([
    ('xgb', xgb.sklearn.XGBClassifier())
])

model_pipeline.set_params(
    xgb__learn_rate=0.01,
    xgb__max_depth=12,
    xgb__min_child_weight=1.1,
    xgb__subsample=0.7,
    xgb__colsample_bytree=0.7,
    xgb__colsample_bylevel=0.7,
#     xgb__objective='rank:pairwise',
    xgb__objective='binary:logistic',
    xgb__n_estimators=200,
#     xgb__gamma=0.1,
#     xgb__reg_alpha=1,
    xgb__reg_lambda=10,
#     xgb__max_delta_step=0,
#     xgb__scale_pos_weight=1,
    xgb__silent=True,
    xgb__eval_metric='auc'
).fit(train_dataset_x, train_dataset_y)

Pipeline(memory=None,
     steps=[('xgb', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
       colsample_bytree=0.7, eval_metric='auc', gamma=0, learn_rate=0.01,
       learning_rate=0.1, max_delta_step=0, max_depth=12,
       min_child_weight=1.1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=10, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7))])

In [36]:
class Evaluator():
    def __init__(self, df, pipe):
        self.df = df
        self.pipe = pipe
        
    def transfer_result(self, result):
        return MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(result.reshape(-1, 1))
    
    def cal(self, dataset):
        pred = self.pipe.predict_proba(dataset)[:,1]
        logging.info(pred)
        
        self.df['Probability'] = self.transfer_result(pred)
        return self.evaluate(self.df[['Probability', 'Coupon_id', 'Label']])
    
    def describe(self):
        return self.df[['User_id', 'Coupon_id', 'Probability', 'Label']].describe()
    
    def predict(self, dataset):
        pred = self.pipe.predict_proba(dataset)[:,1]
        logging.info(pred)
        
        self.df['Probability'] = self.transfer_result(pred)
        return self.df[['User_id', 'Coupon_id', 'Date_received', 'Probability']]
    
    def evaluate(self, result_df):
        group = result_df.groupby(['Coupon_id'])
        aucs = []
        logging.info('coupon size is %d' % (len(group)))

        counter = 0
        for i in group:
            tmpdf = i[1]        
            if len(tmpdf['Label'].unique()) != 2:
                continue

            fpr, tpr, thresholds = roc_curve(tmpdf['Label'], tmpdf['Probability'], pos_label=1)
            auc_score = auc(fpr,tpr)
            aucs.append(auc_score)
            counter = counter + 1

        logging.info('coupon in cal is %d' % (counter))

        return np.average(aucs)

In [32]:
evaluator = Evaluator(dataset_alpha, model_pipeline)

if IS_PRED:
    final_result_df = evaluator.predict(valid_dataset_x)
    final_result_df.to_csv('/Users/leewind/Desktop/submission_20190208.csv', index=False, header=False)
    final_result_df.describe()
else:
    logger.info(evaluator.cal(valid_dataset_x))

2019-02-08 17:11:55,393  <ipython-input-31-88d5387b257b> : INFO  [0.02572334 0.01788806 0.0523413  ... 0.14622833 0.00774132 0.04822461]
2019-02-08 17:11:55,631  <ipython-input-31-88d5387b257b> : INFO  coupon size is 4800
2019-02-08 17:11:58,391  <ipython-input-31-88d5387b257b> : INFO  coupon in cal is 2048


In [37]:
evaluator = Evaluator(dataset_beta, model_pipeline)
evaluator.cal(train_dataset_x)

2019-02-08 17:28:07,298  <ipython-input-36-88d5387b257b> : INFO  [0.02719907 0.02647771 0.06181328 ... 0.13776243 0.01090022 0.0495923 ]
2019-02-08 17:28:07,514  <ipython-input-36-88d5387b257b> : INFO  coupon size is 4800
2019-02-08 17:28:10,827  <ipython-input-36-88d5387b257b> : INFO  coupon in cal is 2048


0.7099751430695382