In [None]:
import pandas as pd
import numpy as np
import math
import logging
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc
from sklearn.svm import LinearSVC

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
logger = logging.getLogger('ai')
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s')

## 模型

+ [sklearn.preprocessing.StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
+ [Feature transformations with ensembles of trees](https://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html)
+ [sklearn.preprocessing.OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)
+ [机器学习之 sklearn中的pipeline](http://frankchen.xyz/2018/04/08/pipeline-in-machine-learning/)
    - 使用pipeline做cross validation
    - 自定义transformer
    - FeatureUnion
+ [Concatenating multiple feature extraction methods](https://scikit-learn.org/stable/auto_examples/compose/plot_feature_union.html#sphx-glr-auto-examples-compose-plot-feature-union-py)
+ [sklearn.pipeline.Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)
+ [gbdt+lr demo](https://github.com/princewen/tensorflow_practice/blob/master/recommendation/GBDT%2BLR-Demo/GBDT_LR.py)
+ [推荐系统遇上深度学习(十)--GBDT+LR融合方案实战](https://zhuanlan.zhihu.com/p/37522339)
+ [python︱sklearn一些小技巧的记录（训练集划分/pipelline/交叉验证等）](https://blog.csdn.net/sinat_26917383/article/details/77917881)
+ [16.【进阶】特征提升之特征筛选----feature_selection](https://blog.csdn.net/jh1137921986/article/details/79822512)
+ [使用sklearn优雅地进行数据挖掘](https://www.cnblogs.com/jasonfreak/p/5448462.html)
+ [Kaggle机器学习之模型融合（stacking）心得](https://zhuanlan.zhihu.com/p/26890738)
+ [model_library_config](https://github.com/ChenglongChen/Kaggle_CrowdFlower/blob/master/Code/Model/model_library_config.py)

In [37]:
df = pd.read_csv('../features/lcm_base_features.csv')
user_features_df = pd.read_csv('../features/lcm_user_features.csv')
merchant_features_df = pd.read_csv('../features/lcm_merchant_features.csv')

In [38]:
ipipe = Pipeline([
    ('pca', PCA(n_components=2)),
    ('scale', MinMaxScaler()),
])

def get_factor(df, key, prefix):
    id_df = df[[key]]
    output_df = df.drop([key], axis=1)

    ipipe.fit(output_df)
    factors = ipipe.transform(output_df)
    factors_df = pd.DataFrame(data=factors, columns=[prefix + '_factor_alpha', prefix + '_factor_beta'])
    factors_df[key] = id_df[key]
    return factors_df

df = pd.merge(df, get_factor(user_features_df, 'User_id', 'User'), on=['User_id'], how='left')
df = pd.merge(df, get_factor(merchant_features_df, 'Merchant_id', 'Merchant'), on=['Merchant_id'], how='left')

In [39]:
df.columns.values

array(['User_id', 'Merchant_id', 'Coupon_id', 'Distance', 'Date_received',
       'Is_in_day_consume', 'Discount', 'Base_consume', 'Discount_money',
       'Day_in_month', 'Day_in_week', 'Coupon_type', 'Offline_consume',
       'Duration', 'User_factor_alpha', 'User_factor_beta',
       'Merchant_factor_alpha', 'Merchant_factor_beta'], dtype=object)

In [45]:
continous = [
    'Discount', 
    'Base_consume', 
    'Discount_money',
    'User_factor_alpha',
    'User_factor_beta',
    'Merchant_factor_alpha',
    'Merchant_factor_beta'
]

fields = [
    'Distance',
    'Day_in_month',
    'Day_in_week',
    'Coupon_type'
]

label = ['Is_in_day_consume']

In [41]:
# model_train_df = pd.read_csv('../features/lcm_base_features.csv')
# model_train_df = model_train_df[model_train_df['Coupon_id']>0]
model_train_df = df[df['Date_received']<20160501]

In [42]:
# model_test_df = pd.read_csv('../features/lcm_train_test_features.csv')
# model_test_df = model_test_df[model_test_df['Coupon_id']>0]
model_test_df = model_train_df = df[df['Date_received']>=20160501]

In [43]:
class ExtractFeature(TransformerMixin):
    def fit(self, *args, **kwargs):
        return self
    
    def transform(self, X, **transform_params):
        return pd.DataFrame(X[:,0] * X[:,1])

In [46]:
fp = Pipeline([
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnSelector(continous)),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
            ('scale', Normalizer())
        ])),
        ('fields', Pipeline([
            ('extract', ColumnSelector(fields)),
            ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
            ('one_hot', OneHotEncoder(categories='auto')),
            ('to_dense', DenseTransformer())
        ])),
    ])),
#     ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
#     ('skb', SelectKBest(chi2, k=64)),
#     ('sc4gbdt', StandardScaler())
])

fp.fit(model_train_df[fields+continous], model_train_df[label].values.ravel())

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('continuous', Pipeline(memory=None,
     steps=[('extract', ColumnSelector(cols=['Discount', 'Base_consume', 'Discount_money', 'User_factor_alpha', 'User_factor_beta', 'Merchant_factor_alpha', 'Merchant_factor_beta'],
        drop_axis=False)), ('imputer', SimpleImputer(copy=True,...n='error',
       n_values=None, sparse=True)), ('to_dense', DenseTransformer(return_copy=True))]))],
       transformer_weights=None))])

In [47]:
train_dataset_x = fp.transform(model_train_df[fields+continous])
train_dataset_y = model_train_df[label].values.ravel()

valid_dataset_x = fp.transform(model_test_df[fields+continous])
valid_dataset_y = model_test_df[label].values.ravel()

xgbtrain = xgb.DMatrix(train_dataset_x, label=train_dataset_y)
xgbvalid = xgb.DMatrix(valid_dataset_x, label=valid_dataset_y)

In [48]:
def evaluate(result_df):
    group = result_df.groupby(['Coupon_id'])
    aucs = []
    for i in group:
        tmpdf = i[1]        
        if len(tmpdf['Is_in_day_consume'].unique()) != 2:
            continue
            
        fpr, tpr, thresholds = roc_curve(tmpdf['Is_in_day_consume'], tmpdf['Probability'], pos_label=1)
        auc_score = auc(fpr,tpr)
        aucs.append(auc_score)
            
    return np.average(aucs)

In [None]:
xgb_min_num_round = 10
xgb_max_num_round = 500
xgb_num_round_step = 10

xgb_random_seed = 2018
xgb_nthread = 4
xgb_dmatrix_silent = True

space = {
    'booster': 'gblinear',
    'objective': 'rank:pairwise',
    'nthread': xgb_nthread,
    'silent' : True,
    'seed': xgb_random_seed,
    "max_evals": 200,
    'eval_metric': 'auc',
    'max_depth': hp.quniform('max_depth', 6, 18, 1),
    'eta' : hp.quniform('eta', 0.01, 1, 0.01),
#     'lambda' : hp.quniform('lambda', 0, 5, 0.05),
#     'alpha' : hp.quniform('alpha', 0, 0.5, 0.005),
#     'lambda_bias' : hp.quniform('lambda_bias', 0, 3, 0.1),
#     'num_round' : hp.quniform('num_round', xgb_min_num_round, xgb_max_num_round, xgb_num_round_step),
    'n_estimators': hp.quniform('n_estimators', 100, 500, 50),
}

watchlist = [(train_dataset_x, train_dataset_y), (valid_dataset_x, valid_dataset_y)]

def objective(params):
    """Objective function for Gradient Boosting Machine Hyperparameter Tuning"""
    
    logger.info(params)    
    bst = xgb.sklearn.XGBClassifier(
        nthread=params['nthread'],
        learn_rate=params['eta'],
        max_depth=int(params['max_depth']),
        min_child_weight=1.1,
        subsample=0.7,
        colsample_bytree=0.7,
        colsample_bylevel=0.7,
        objective=params['objective'],
        n_estimators=int(params['n_estimators']),
        gamma=0.1,
        reg_alpha=0,
        reg_lambda=1,
        max_delta_step=0,
        scale_pos_weight=1,
        silent=params['silent']
    )
    bst.fit(train_dataset_x, train_dataset_y, eval_set=watchlist, eval_metric=params['eval_metric'], early_stopping_rounds=10)
    
    predict_test_prob_y = bst.predict_proba(valid_dataset_x)
    model_test_df['Probability'] = predict_test_prob_y[:, 1]
    score = evaluate(model_test_df)
    logging.info('Socre is %f' % score)
    
    # Loss must be minimized
    loss = 1 - score
    
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

MAX_EVALS = 200

# Optimize
best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = MAX_EVALS, trials = Trials())
best

In [49]:
model = xgb.sklearn.XGBClassifier(
    nthread=4,
    learn_rate=0.17,
    max_depth=18,
    min_child_weight=1.1,
    subsample=0.7,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    objective='rank:pairwise',
    n_estimators=500,
    gamma=0.1,
    reg_alpha=0,
    reg_lambda=1,
    max_delta_step=0,
    scale_pos_weight=1,
    silent=True
)
watchlist = [(train_dataset_x, train_dataset_y), (valid_dataset_x, valid_dataset_y)]

logging.info('train begin')
model.fit(train_dataset_x, train_dataset_y, eval_set=watchlist, eval_metric='auc', early_stopping_rounds=10)
logging.info('train finish')

model.save_model('../model/xgb.model')

[22:54:47] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2272 extra nodes, 290 pruned nodes, max_depth=18
[414]	validation_0-auc:0.974762	validation_1-auc:0.974762
[22:54:50] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 3488 extra nodes, 382 pruned nodes, max_depth=18
[415]	validation_0-auc:0.974804	validation_1-auc:0.974804
[22:54:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2840 extra nodes, 352 pruned nodes, max_depth=18
[416]	validation_0-auc:0.974825	validation_1-auc:0.974825
[22:54:57] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 3500 extra nodes, 470 pruned nodes, max_depth=18
[417]	validation_0-auc:0.974864	validation_1-auc:0.974864
[22:55:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2768 extra nodes, 260 pruned nodes, max_depth=18
[418]	validation_0-auc:0.974886	validation_1-auc:0.974886
[22:55:03] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 3000 extra nodes, 312 pruned nodes, max_depth=18
[419]	va

2019-01-25 22:59:36,209  <ipython-input-49-af38504e9331> : INFO  train finish


In [50]:
predict_test_prob_y = model.predict_proba(valid_dataset_x)
model_test_df['Probability'] = predict_test_prob_y[:, 1]
evaluate(model_test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.9381759179762218

In [None]:
params = {
    # gbtree and dart use tree based models while gblinear uses linear functions.
    'booster': 'gbtree',
    #  Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized
    'objective': 'rank:pairwise',
    # auc: Area under the curve
    'eval_metric': 'auc',
    # Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.
    'gamma': 0.1,
    'min_child_weight': 1.1,
    
    'max_depth':4,
#     'max_depth': 12,
    # Maximum number of nodes to be added
    'max_leaves': 128,
    # L2 regularization term on weights. Increasing this value will make model more conservative.
    'lambda': 3,
    
    'alpha': 2,
    # Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees. and this will prevent overfitting. 
    'subsample': 0.7,
    # This is a family of parameters for subsampling of columns.
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    # learning_rate
    'eta': 0.01,
    # Exact greedy algorithm
    'tree_method': 'exact',
    # Random number seed.
    'seed': 0,
    'nthread': 4,
    # Verbosity of printing messages. Valid values are 0 (silent),
    'verbosity': 0,
    'metric_freq': 100,
}

watchlist = [(xgbtrain, 'train'), (xgbvalid, 'validate')]

logging.info('train begin')
model = xgb.train(params, xgbtrain, num_boost_round=200, evals=watchlist)
logging.info('train end')
model.save_model('../model/xgb.model')

### XGB调参

### GBDT 调参

In [None]:
skl_min_n_estimators = 10
skl_max_n_estimators = 500
skl_n_estimators_step = 10
skl_n_jobs = 2
skl_random_seed = 2018

## random forest tree classifier
space = {
    'n_estimators': hp.quniform("n_estimators", skl_min_n_estimators, skl_max_n_estimators, skl_n_estimators_step),
    'learning_rate': hp.quniform("learning_rate", 0.01, 0.5, 0.01),
    'max_features': hp.quniform("max_features", 0.05, 1.0, 0.05),
    'max_depth': hp.quniform('max_depth', 1, 15, 1),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.1),
    'random_state': skl_random_seed
}

def objective(params):
    """Objective function for Gradient Boosting Machine Hyperparameter Tuning"""
    
    logger.info(params)
    
    gbcf = GradientBoostingClassifier(
        n_estimators=int(params['n_estimators']), 
        max_features=params['max_features'], 
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'], 
        subsample=params['subsample'],
        random_state=params['random_state']
    )
    
    gbcf.fit(train_dataset_x, train_dataset_y)
    
    predict_test_prob_y = gbcf.predict_proba(valid_dataset_x)
    model_test_df['Probability'] = predict_test_prob_y[:, 1]
    
    score = evaluate(model_test_df)

#     gbdt = GradientBoostingRegressor(
#         n_estimators=int(params['n_estimators']), 
#         max_features=params['max_features'], 
#         learning_rate=params['learning_rate'],
#         max_depth=params['max_depth'], 
#         subsample=params['subsample'],
#         random_state=params['random_state'],
#         verbose=1
#     )
    
#     gbdt.fit(train_dataset_x, train_dataset_y)
#     score = gbdt.score(valid_dataset_x, valid_dataset_y)
    logging.info('Socre is %f' % score)
    
    # Loss must be minimized
    loss = 1 - score
    
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

def evaluate(result_df):
    group = result_df.groupby(['Coupon_id'])
    aucs = []
    for i in group:
        tmpdf = i[1]        
        if len(tmpdf['Is_in_day_consume'].unique()) != 2:
            continue
            
        fpr, tpr, thresholds = roc_curve(tmpdf['Is_in_day_consume'], tmpdf['Probability'], pos_label=1)
        auc_score = auc(fpr,tpr)
        aucs.append(auc_score)
            
    return np.average(aucs)

MAX_EVALS = 500

# Optimize
best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = MAX_EVALS, trials = Trials())
best

## 预测

In [None]:
model_pred_df = pd.read_csv('../features/lcm_submit_features.csv')

model_pred_df = pd.merge(model_pred_df, get_factor(user_features_df, 'User_id', 'User'), on=['User_id'], how='left')
model_pred_df = pd.merge(model_pred_df, get_factor(merchant_features_df, 'Merchant_id', 'Merchant'), on=['Merchant_id'], how='left')

In [52]:
predict_dataset_x = fp.transform(model_pred_df[fields+continous])
predict_prob_y = model.predict_proba(predict_dataset_x)
model_pred_df['Probability'] = predict_prob_y[:, 1]
model_pred_df.sort_values(['Probability'], ascending=False).head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,Date_received,Discount,Base_consume,Discount_money,Day_in_month,Day_in_week,Coupon_type,User_factor_alpha,User_factor_beta,Merchant_factor_alpha,Merchant_factor_beta,Probability
43671,2751537,7910.0,2637.0,0.0,20160702.0,0.833333,30.0,5.0,2.0,6.0,1.0,0.395742,0.057898,0.00635,0.329149,6.973665
43669,2751537,7910.0,2637.0,0.0,20160702.0,0.833333,30.0,5.0,2.0,6.0,1.0,0.395742,0.057898,0.00635,0.329149,6.973665
47968,7294555,6135.0,8182.0,0.0,20160712.0,0.9,10.0,1.0,12.0,2.0,1.0,0.117911,0.081931,0.006191,0.330748,6.083786
104469,1535039,6135.0,8182.0,0.0,20160712.0,0.9,10.0,1.0,12.0,2.0,1.0,0.117266,0.08196,0.006191,0.330748,6.053985
45934,3977895,4808.0,1226.0,0.0,20160709.0,0.95,20.0,1.0,9.0,6.0,1.0,0.050597,0.090049,7.1e-05,0.314645,5.846001


In [53]:
final_result_df = model_pred_df[['User_id', 'Coupon_id', 'Date_received', 'Probability']]
final_result_df.to_csv('/Users/leewind/Desktop/submission_20190126.csv', index=False, header=False)
final_result_df.shape

(113640, 4)

In [54]:
final_result_df.describe()

Unnamed: 0,User_id,Coupon_id,Date_received,Probability
count,113640.0,113640.0,113640.0,113640.0
mean,3684858.0,9053.810929,20160720.0,-2.091856
std,2126259.0,4145.873088,9.019508,1.865192
min,209.0,3.0,20160700.0,-9.081948
25%,1844191.0,5023.0,20160710.0,-3.309978
50%,3683266.0,9983.0,20160720.0,-2.050517
75%,5525845.0,13602.0,20160720.0,-0.762426
max,7361024.0,14045.0,20160730.0,6.973665
