In [None]:
import pandas as pd
import numpy as np
import math
import logging
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc
from sklearn.svm import LinearSVC

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
logger = logging.getLogger('ai')
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s')

## 模型

+ [sklearn.preprocessing.StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
+ [Feature transformations with ensembles of trees](https://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html)
+ [sklearn.preprocessing.OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)
+ [机器学习之 sklearn中的pipeline](http://frankchen.xyz/2018/04/08/pipeline-in-machine-learning/)
    - 使用pipeline做cross validation
    - 自定义transformer
    - FeatureUnion
+ [Concatenating multiple feature extraction methods](https://scikit-learn.org/stable/auto_examples/compose/plot_feature_union.html#sphx-glr-auto-examples-compose-plot-feature-union-py)
+ [sklearn.pipeline.Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)
+ [gbdt+lr demo](https://github.com/princewen/tensorflow_practice/blob/master/recommendation/GBDT%2BLR-Demo/GBDT_LR.py)
+ [推荐系统遇上深度学习(十)--GBDT+LR融合方案实战](https://zhuanlan.zhihu.com/p/37522339)
+ [python︱sklearn一些小技巧的记录（训练集划分/pipelline/交叉验证等）](https://blog.csdn.net/sinat_26917383/article/details/77917881)
+ [16.【进阶】特征提升之特征筛选----feature_selection](https://blog.csdn.net/jh1137921986/article/details/79822512)
+ [使用sklearn优雅地进行数据挖掘](https://www.cnblogs.com/jasonfreak/p/5448462.html)
+ [Kaggle机器学习之模型融合（stacking）心得](https://zhuanlan.zhihu.com/p/26890738)
+ [model_library_config](https://github.com/ChenglongChen/Kaggle_CrowdFlower/blob/master/Code/Model/model_library_config.py)

In [77]:
df = pd.read_csv('../features/lcm_base_features.csv')
user_features_df = pd.read_csv('../features/lcm_user_features.csv')
merchant_features_df = pd.read_csv('../features/lcm_merchant_features.csv')
coupon_features_df = pd.read_csv('../features/lcm_coupon_features.csv')

In [78]:
ipipe = Pipeline([
    ('pca', PCA(n_components=2)),
    ('scale', MinMaxScaler()),
])

def get_factor(df, key, prefix):
    id_df = df[[key]]
    output_df = df.drop([key], axis=1)

    ipipe.fit(output_df)
    factors = ipipe.transform(output_df)
    factors_df = pd.DataFrame(data=factors, columns=[prefix + '_factor_alpha', prefix + '_factor_beta'])
    factors_df[key] = id_df[key]
    return factors_df

df = pd.merge(df, get_factor(user_features_df, 'User_id', 'User'), on=['User_id'], how='left')
df = pd.merge(df, get_factor(merchant_features_df, 'Merchant_id', 'Merchant'), on=['Merchant_id'], how='left')
df = pd.merge(df, get_factor(coupon_features_df, 'Coupon_id', 'Coupon'), on=['Coupon_id'], how='left')

In [79]:
df.columns.values

array(['User_id', 'Merchant_id', 'Coupon_id', 'Distance', 'Date_received',
       'Is_in_day_consume', 'Discount', 'Base_consume', 'Discount_money',
       'Day_in_month', 'Day_in_week', 'Coupon_type', 'Offline_consume',
       'Duration', 'User_factor_alpha', 'User_factor_beta',
       'Merchant_factor_alpha', 'Merchant_factor_beta',
       'Coupon_factor_alpha', 'Coupon_factor_beta'], dtype=object)

In [80]:
continous = [
    'Discount', 
    'Base_consume', 
    'Discount_money',
    'User_factor_alpha',
    'User_factor_beta',
    'Merchant_factor_alpha',
    'Merchant_factor_beta',
    'Coupon_factor_alpha', 
    'Coupon_factor_beta'
]

fields = [
    'Distance',
    'Day_in_month',
    'Day_in_week',
    'Coupon_type'
]

label = ['Is_in_day_consume']

In [82]:
# model_train_df = pd.read_csv('../features/lcm_base_features.csv')
# model_train_df = model_train_df[model_train_df['Coupon_id']>0]
model_train_df = df[df['Date_received']<20160501]

In [83]:
# model_test_df = pd.read_csv('../features/lcm_train_test_features.csv')
# model_test_df = model_test_df[model_test_df['Coupon_id']>0]
model_test_df = model_train_df = df[df['Date_received']>=20160501]

In [84]:
class ExtractFeature(TransformerMixin):
    def fit(self, *args, **kwargs):
        return self
    
    def transform(self, X, **transform_params):
        return pd.DataFrame(X[:,0] * X[:,1])

In [85]:
fp = Pipeline([
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnSelector(continous)),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
            ('scale', Normalizer())
        ])),
        ('fields', Pipeline([
            ('extract', ColumnSelector(fields)),
            ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
            ('one_hot', OneHotEncoder(categories='auto')),
            ('to_dense', DenseTransformer())
        ])),
    ])),
#     ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
#     ('skb', SelectKBest(chi2, k=64)),
#     ('sc4gbdt', StandardScaler())
])

fp.fit(model_train_df[fields+continous], model_train_df[label].values.ravel())

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('continuous', Pipeline(memory=None,
     steps=[('extract', ColumnSelector(cols=['Discount', 'Base_consume', 'Discount_money', 'User_factor_alpha', 'User_factor_beta', 'Merchant_factor_alpha', 'Merchant_factor_beta', 'Coupon_factor_alpha', 'Coupon_factor_beta'],
        drop_axis=...n='error',
       n_values=None, sparse=True)), ('to_dense', DenseTransformer(return_copy=True))]))],
       transformer_weights=None))])

In [86]:
train_dataset_x = fp.transform(model_train_df[fields+continous])
train_dataset_y = model_train_df[label].values.ravel()

valid_dataset_x = fp.transform(model_test_df[fields+continous])
valid_dataset_y = model_test_df[label].values.ravel()

xgbtrain = xgb.DMatrix(train_dataset_x, label=train_dataset_y)
xgbvalid = xgb.DMatrix(valid_dataset_x, label=valid_dataset_y)

In [87]:
def evaluate(result_df):
    group = result_df.groupby(['Coupon_id'])
    aucs = []
    for i in group:
        tmpdf = i[1]        
        if len(tmpdf['Is_in_day_consume'].unique()) != 2:
            continue
            
        fpr, tpr, thresholds = roc_curve(tmpdf['Is_in_day_consume'], tmpdf['Probability'], pos_label=1)
        auc_score = auc(fpr,tpr)
        aucs.append(auc_score)
            
    return np.average(aucs)

In [92]:
# model = xgb.sklearn.XGBClassifier(
#     nthread=4,
#     learn_rate=0.17,
#     max_depth=18,
#     min_child_weight=1.1,
#     subsample=0.7,
#     colsample_bytree=0.7,
#     colsample_bylevel=0.7,
#     objective='rank:pairwise',
#     n_estimators=500,
#     gamma=0.1,
#     reg_alpha=0,
#     reg_lambda=1,
#     max_delta_step=0,
#     scale_pos_weight=1,
#     silent=True
# )

params = {
    'booster': 'gbtree',
    'objective': 'rank:pairwise',
    'eval_metric': 'auc',
    'gamma': 0.1,
    'min_child_weight': 1.1,
    'max_depth': 18,
    'lambda': 1,
    'alpha': 0,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'eta': 0.17,
    'tree_method': 'exact',
    'seed': 2018,
    'nthread': 20
}

watchlist = [(xgbtrain, 'train'), (xgbvalid, 'valid')]

logging.info('train begin')
model = xgb.train(params, xgbtrain, num_boost_round=500, evals=watchlist, early_stopping_rounds=10)
logging.info('train finish')

model.save_model('../model/xgb.model')

[18:00:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1986 extra nodes, 378 pruned nodes, max_depth=18
[454]	train-auc:0.980382	valid-auc:0.980382
[18:00:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1808 extra nodes, 330 pruned nodes, max_depth=18
[455]	train-auc:0.980384	valid-auc:0.980384
[18:00:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2830 extra nodes, 530 pruned nodes, max_depth=18
[456]	train-auc:0.980396	valid-auc:0.980396
[18:00:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2068 extra nodes, 304 pruned nodes, max_depth=18
[457]	train-auc:0.980408	valid-auc:0.980408
[18:01:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2852 extra nodes, 498 pruned nodes, max_depth=18
[458]	train-auc:0.980426	valid-auc:0.980426
[18:01:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2786 extra nodes, 526 pruned nodes, max_depth=18
[459]	train-auc:0.980429	valid-auc:0.980429
[18:01:09] src/tree/updater_prune.

2019-01-26 18:03:43,849  <ipython-input-92-a6d83375ceda> : INFO  train finish


In [89]:
def transfer_result(result):
    return MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(result.reshape(-1, 1))

In [93]:
predict_test_prob_raw = model.predict(xgbvalid)
predict_test_prob = transfer_result(predict_test_prob_raw)
model_test_df['Probability'] = predict_test_prob
evaluate(model_test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


0.9405666423255918

In [94]:
model_test_df['Probability'].describe()

count    306313.000000
mean          0.418945
std           0.142300
min           0.000000
25%           0.323448
50%           0.419343
75%           0.514778
max           1.000000
Name: Probability, dtype: float64

## 预测

In [95]:
model_pred_df = pd.read_csv('../features/lcm_submit_features.csv')
model_pred_df = pd.merge(model_pred_df, get_factor(user_features_df, 'User_id', 'User'), on=['User_id'], how='left')
model_pred_df = pd.merge(model_pred_df, get_factor(merchant_features_df, 'Merchant_id', 'Merchant'), on=['Merchant_id'], how='left')
model_pred_df = pd.merge(model_pred_df, get_factor(coupon_features_df, 'Coupon_id', 'Coupon'), on=['Coupon_id'], how='left')

In [96]:
predict_dataset_x = fp.transform(model_pred_df[fields+continous])
xgbPredict = xgb.DMatrix(predict_dataset_x)
predict_prob_raw = model.predict(xgbPredict)
model_pred_df['Probability'] = transfer_result(predict_prob_raw)
model_pred_df.sort_values(['Probability'], ascending=False).head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,Date_received,Discount,Base_consume,Discount_money,Day_in_month,Day_in_week,Coupon_type,User_factor_alpha,User_factor_beta,Merchant_factor_alpha,Merchant_factor_beta,Coupon_factor_alpha,Coupon_factor_beta,Probability
43671,2751537,7910.0,2637.0,0.0,20160702.0,0.833333,30.0,5.0,2.0,6.0,1.0,0.395742,0.057898,0.00635,0.329149,,,1.0
43669,2751537,7910.0,2637.0,0.0,20160702.0,0.833333,30.0,5.0,2.0,6.0,1.0,0.395742,0.057898,0.00635,0.329149,,,1.0
42251,6139850,6135.0,8182.0,0.0,20160715.0,0.9,10.0,1.0,15.0,5.0,1.0,0.141453,0.080237,0.006191,0.330748,,,0.900718
45934,3977895,4808.0,1226.0,0.0,20160709.0,0.95,20.0,1.0,9.0,6.0,1.0,0.050597,0.090049,7.1e-05,0.314645,,,0.895087
78008,3229547,6189.0,12807.0,0.0,20160712.0,0.966667,30.0,1.0,12.0,2.0,1.0,0.050586,0.088519,0.001129,0.317547,,,0.882158


In [97]:
final_result_df = model_pred_df[['User_id', 'Coupon_id', 'Date_received', 'Probability']]
final_result_df.to_csv('/Users/leewind/Desktop/submission_20190127_2.csv', index=False, header=False)
final_result_df.shape

(113640, 4)

In [98]:
final_result_df.describe()

Unnamed: 0,User_id,Coupon_id,Date_received,Probability
count,113640.0,113640.0,113640.0,113640.0
mean,3684858.0,9053.810929,20160720.0,0.421957
std,2126259.0,4145.873088,9.019508,0.115079
min,209.0,3.0,20160700.0,0.0
25%,1844191.0,5023.0,20160710.0,0.345981
50%,3683266.0,9983.0,20160720.0,0.423869
75%,5525845.0,13602.0,20160720.0,0.504204
max,7361024.0,14045.0,20160730.0,1.0
