# stacking模型

In [1]:
import pandas as pd
import numpy as np
import math
import logging
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import cross_val_score

In [2]:
# Set up root logging (DEBUG level, timestamped format), then fetch the
# notebook's named logger.
logging.basicConfig(
    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger('ai')

In [3]:
# Run mode switch. False: fit on dataset_beta and validate on dataset_alpha.
# True: fit on dataset_alpha + dataset_beta and score dataset_pred instead.
IS_PRED = False

## 数据预处理

In [4]:
# Feature table that is transformed and scored later (swapped out when IS_PRED is set).
dataset_alpha = pd.read_csv(filepath_or_buffer='../features/dataset_alpha.csv')

In [5]:
# Feature table the pipeline and the models are fitted on.
dataset_beta = pd.read_csv(filepath_or_buffer='../features/dataset_beta.csv')

In [6]:
# Feature table that replaces dataset_alpha when IS_PRED is True.
dataset_pred = pd.read_csv(filepath_or_buffer='../features/dataset_pred.csv')

In [7]:
# Prediction mode: stack both loaded tables into the training frame and make
# dataset_pred the frame that gets scored. The right-hand side is evaluated
# before either name is rebound, so the concat still sees the original alpha.
if IS_PRED:
    dataset_alpha, dataset_beta = dataset_pred, pd.concat([dataset_alpha, dataset_beta])

In [8]:
# Column names handed to ColumnSelector in the feature pipeline.
# NOTE: the variable name is a misspelling of "continuous"; it is referenced
# by later cells, so it is kept as-is.
# The order of this list matters: a later cell maps feature-importance indices
# back to names by position in this list.
# NOTE(review): identifier-like columns such as 'Coupon_id' are included here
# as numeric inputs - confirm this is intended.
continous = [
    'Coupon_id', 'Distance',
    'Month_of_received', 'Day_of_received',
    'Weekday_of_received', 'Base_consume', 'Discount',
    'Discount_money', 'Coupon_type', 'Coupon_category',
    'Previous_duration', 'Next_duration', 'o1',
    'o2', 'o3', 'o4', 'o5', 'o6', 'o8', 'o7', 'o9', 'o10', 'o12',
    'o14', 'o11', 'o13', 'o16', 'o15', 'o18', 'o19', 'o20', 'o21',
    'o22', 'o23', 'o17', 'o24', 'o25', 'o26', 'o27', 'o28', 'o29',
    'o30', 'o38', 'o31', 'o39', 'o40', 'o41', 'o42', 'o43', 'o32',
    'o33', 'o34', 'o35', 'o36', 'o37', 'o44', 'u0', 'u1', 'u2', 'u3',
    'u4', 'u5', 'u6', 'u7', 'u8', 'u9', 'u10', 'u11', 'u12', 'u13',
    'u14', 'u15', 'u16', 'u17', 'u18', 'u19', 'u20', 'u21', 'u22',
    'u23', 'u24', 'u25', 'ucc0', 'ucc1', 'ucc2', 'ucc3', 'ucc4',
    'ucc5', 'ucc6', 'ucc7', 'ucc8', 'ucc9', 'ucc10', 'ucc11', 'ucc12',
    'uc1', 'uc2', 'uc3', 'uc4', 'uc5', 'uc6', 'uc7', 'uc8', 'uc9',
    'uc10', 'uc11', 'uc12', 'ud0', 'ud1', 'ud2', 'ud3', 'ud4', 'ud5',
    'ud6', 'ud7', 'ud8', 'ud9', 'ud10', 'ud11', 'ud12', 'um0', 'um1',
    'um2', 'um3', 'um4', 'um5', 'um6', 'um7', 'um8', 'um9', 'um10',
    'um16', 'um15', 'um17', 'um11', 'um12', 'um13', 'um14', 'm0', 'm1',
    'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11',
    'm12', 'm13', 'm14', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7',
    'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'cd1', 'cd2', 'cd3',
    'cd4', 'cd5', 'cd6', 'cd7', 'dr1', 'dr2', 'dr3', 'dr4', 'dr5',
    'dr6', 'dr7', 'ou1', 'ou2', 'ou3', 'ou4']

# Name of the target column.
# NOTE(review): not referenced in the visible cells, which read `.Label` directly.
label = ['Label']

In [9]:
# Feature pipeline: pick the listed columns, fill missing values with each
# column's most frequent value, then pass the result through Normalizer.
features_pipeline = Pipeline(steps=[
    ('features', FeatureUnion(transformer_list=[
        ('continuous', Pipeline(steps=[
            ('extract', ColumnSelector(cols=continous)),
            ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
            ('normalize', Normalizer()),
        ])),
        # Disabled branch for one-hot encoded categorical fields, kept for reference:
        # ('fields', Pipeline([
        #     ('extract', ColumnSelector(fields)),
        #     ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
        #     ('one_hot', OneHotEncoder(categories='auto')),
        #     ('to_dense', DenseTransformer())
        # ])),
    ])),
])

# Targets and fitted transformers both come from dataset_beta.
train_dataset_y = dataset_beta['Label'].values.ravel()
features_pipeline.fit(dataset_beta, train_dataset_y)
train_dataset_x = features_pipeline.transform(dataset_beta)

# dataset_alpha only goes through the already-fitted pipeline.
valid_dataset_x = features_pipeline.transform(dataset_alpha)

# The Label column of dataset_alpha is read only outside prediction mode.
if not IS_PRED:
    valid_dataset_y = dataset_alpha['Label'].values.ravel()

In [10]:
# Boosted-tree classifier whose feature importances are consumed by
# SelectFromModel in the following cell.
selector_model = xgb.XGBClassifier(n_estimators=100, max_depth=3, random_state=0)
selector_model.fit(X=train_dataset_x, y=train_dataset_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [11]:
# Wrap the already-fitted selector model so that only columns whose importance
# reaches `thresh` are kept.
thresh = 0.001
selection = SelectFromModel(estimator=selector_model, prefit=True, threshold=thresh)

# NOTE: both matrices are overwritten with their reduced versions, so running
# this cell a second time without re-running the pipeline cell would apply the
# selection to already-reduced data.
train_dataset_x, valid_dataset_x = (
    selection.transform(matrix) for matrix in (train_dataset_x, valid_dataset_x)
)

In [12]:
# (name, importance) pairs for exactly the columns kept by `selection`.
# The previous `importance > 0` filter used a different cut-off than the
# SelectFromModel threshold, so the report could drift from the columns that
# are really present in train_dataset_x / valid_dataset_x. Using the selector's
# own support mask keeps the two in lock-step.
feature_selector = [
    (name, importance)
    for name, importance, kept in zip(
        continous,
        selector_model.feature_importances_,
        selection.get_support(),
    )
    if kept
]

feature_selector

[('Coupon_id', 0.0071530757),
 ('Distance', 0.037195995),
 ('Day_of_received', 0.010014306),
 ('Weekday_of_received', 0.0028612304),
 ('Base_consume', 0.03862661),
 ('Discount', 0.0014306152),
 ('Discount_money', 0.012875536),
 ('Coupon_type', 0.0042918455),
 ('Coupon_category', 0.010014306),
 ('Previous_duration', 0.0042918455),
 ('Next_duration', 0.055793993),
 ('o1', 0.0014306152),
 ('o3', 0.011444922),
 ('o4', 0.0028612304),
 ('o5', 0.0014306152),
 ('o6', 0.040057223),
 ('o8', 0.014306151),
 ('o9', 0.011444922),
 ('o10', 0.0028612304),
 ('o14', 0.020028612),
 ('o11', 0.04434907),
 ('o13', 0.008583691),
 ('o16', 0.005722461),
 ('o15', 0.07439199),
 ('o18', 0.030042918),
 ('o21', 0.0028612304),
 ('o22', 0.005722461),
 ('o23', 0.0042918455),
 ('o17', 0.0014306152),
 ('o24', 0.0014306152),
 ('o25', 0.0028612304),
 ('o26', 0.055793993),
 ('o27', 0.0028612304),
 ('o28', 0.0042918455),
 ('o29', 0.0071530757),
 ('o30', 0.012875536),
 ('o38', 0.005722461),
 ('o31', 0.0014306152),
 ('o39', 0

In [13]:
# Share of the original input columns still present after feature selection.
train_dataset_x.shape[1] / len(continous)

0.4972972972972973

In [17]:
# Shape of the training matrix after feature selection.
train_dataset_x.shape

(252586, 92)

## stacking模型训练

In [27]:
# Base learners and the logistic-regression meta-learner of the stacked model.
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression(solver='lbfgs')
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

# One constant drives both the log line and GridSearchCV, so the announced
# fold count can no longer disagree with the one actually used
# (the message used to say "3-fold" while cv was 5).
cv_folds = 5
logger.info('%d-fold cross validation:\n', cv_folds)

# Parameter names follow the StackingClassifier convention: lower-cased class
# name of the base learner, and a "meta-" prefix for the meta-classifier.
params = {'kneighborsclassifier__n_neighbors': [1, 5],
          'randomforestclassifier__n_estimators': [10, 200],
          'meta-logisticregression__C': [0.1, 10.0]}

grid = GridSearchCV(estimator=sclf,
                    param_grid=params,
                    cv=cv_folds,
                    refit=True,
                    scoring='roc_auc')
grid.fit(train_dataset_x, train_dataset_y)

cv_keys = ('mean_test_score', 'std_test_score', 'params')

# One line per candidate: mean score, spread, parameter set.
# NOTE(review): the printed "+/-" value is half the fold standard deviation,
# as in the original cell - confirm that this is the intended spread.
for mean_score, std_score, candidate in zip(*(grid.cv_results_[key] for key in cv_keys)):
    print("%0.3f +/- %0.2f %r" % (mean_score, std_score / 2.0, candidate))

print('Best parameters: %s' % grid.best_params_)
# The search is scored with ROC AUC, so label it as such (it was printed as
# "Accuracy"); three decimals match the per-candidate lines above.
print('ROC AUC: %.3f' % grid.best_score_)

2019-02-09 16:42:24,467  <ipython-input-27-2557adcfd4fe> : INFO  3-fold cross validation:



0.820 +/- 0.00 {'kneighborsclassifier__n_neighbors': 1, 'meta-logisticregression__C': 0.1, 'randomforestclassifier__n_estimators': 10}
0.868 +/- 0.00 {'kneighborsclassifier__n_neighbors': 1, 'meta-logisticregression__C': 0.1, 'randomforestclassifier__n_estimators': 200}
0.820 +/- 0.00 {'kneighborsclassifier__n_neighbors': 1, 'meta-logisticregression__C': 10.0, 'randomforestclassifier__n_estimators': 10}
0.868 +/- 0.00 {'kneighborsclassifier__n_neighbors': 1, 'meta-logisticregression__C': 10.0, 'randomforestclassifier__n_estimators': 200}
0.837 +/- 0.00 {'kneighborsclassifier__n_neighbors': 5, 'meta-logisticregression__C': 0.1, 'randomforestclassifier__n_estimators': 10}
0.871 +/- 0.00 {'kneighborsclassifier__n_neighbors': 5, 'meta-logisticregression__C': 0.1, 'randomforestclassifier__n_estimators': 200}
0.835 +/- 0.00 {'kneighborsclassifier__n_neighbors': 5, 'meta-logisticregression__C': 10.0, 'randomforestclassifier__n_estimators': 10}
0.868 +/- 0.00 {'kneighborsclassifier__n_neighbor

In [28]:
# Final stacked model with fixed hyper-parameters, fitted on the whole
# training matrix.
lr = LogisticRegression(C=0.1, class_weight='balanced', multi_class='multinomial', solver='lbfgs')
clf3 = GaussianNB()
clf2 = RandomForestClassifier(n_estimators=200, random_state=0)
clf1 = KNeighborsClassifier(n_neighbors=5)

# Base-learner probabilities are passed to the meta-classifier un-averaged.
sclf = StackingClassifier(
    meta_classifier=lr,
    classifiers=[clf1, clf2, clf3],
    average_probas=False,
    use_probas=True,
)

sclf.fit(train_dataset_x, train_dataset_y)

StackingClassifier(average_probas=False,
          classifiers=[KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'), RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='a...e=False, random_state=0, verbose=0, warm_start=False), GaussianNB(priors=None, var_smoothing=1e-09)],
          meta_classifier=LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=None, penalty='l2',
          random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
          warm_start=False),
          store_train_meta_features=False, use_clones=True,
          use_features_in_secondary=False, use_probas=True, verbose=0)

In [29]:
from sklearn.metrics import roc_curve, auc
import numpy as np
import math

def evaluate(result_df, group_col='Coupon_id', label_col='Label', prob_col='Prob'):
    """Return the mean per-group ROC AUC of the predictions in ``result_df``.

    Rows are grouped by ``group_col``. For every group whose ``label_col``
    holds exactly two distinct values, the ROC AUC of ``prob_col`` against
    ``label_col`` is computed (positive label ``1``). Groups with any other
    number of distinct labels are skipped, because a ROC curve is undefined
    for them.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Frame containing the grouping, label and score columns.
    group_col : str, default 'Coupon_id'
        Column to group by.
    label_col : str, default 'Label'
        Column holding the true labels.
    prob_col : str, default 'Prob'
        Column holding the predicted score of the positive class.

    Returns
    -------
    float
        Unweighted average of the per-group AUC values, or NaN when no group
        contains both classes (including an empty frame).
    """
    aucs = []
    for _, group_df in result_df.groupby(group_col):
        # Skip groups that do not contain both classes.
        if len(group_df[label_col].unique()) != 2:
            continue

        fpr, tpr, _ = roc_curve(group_df[label_col], group_df[prob_col], pos_label=1)
        aucs.append(auc(fpr, tpr))

    # Averaging an empty list would emit a RuntimeWarning on the way to NaN;
    # return the NaN explicitly instead.
    if not aucs:
        return np.nan

    return np.average(aucs)

In [30]:
# Score dataset_alpha with the fitted stack and keep the second probability
# column (assumes the labels are 0/1 so that column is the positive class -
# TODO confirm).
predict_test_prob_y = sclf.predict_proba(valid_dataset_x)
dataset_alpha['Prob'] = predict_test_prob_y[:, 1]

# evaluate() reads the Label column. In prediction mode dataset_alpha is
# dataset_pred, whose labels are never read elsewhere in this notebook, so the
# metric is only computed in validation mode. A None result displays nothing.
validation_auc = None if IS_PRED else evaluate(dataset_alpha)
validation_auc

0.6987197678112163