In [1]:
import pandas as pd
import numpy as np
import math
import logging
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [14]:
logger = logging.getLogger('ai')
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s')

## 模型

+ [sklearn.preprocessing.StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
+ [Feature transformations with ensembles of trees](https://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html)
+ [sklearn.preprocessing.OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)
+ [机器学习之 sklearn中的pipeline](http://frankchen.xyz/2018/04/08/pipeline-in-machine-learning/)
    - 使用pipeline做cross validation
    - 自定义transformer
    - FeatureUnion
+ [Concatenating multiple feature extraction methods](https://scikit-learn.org/stable/auto_examples/compose/plot_feature_union.html#sphx-glr-auto-examples-compose-plot-feature-union-py)
+ [sklearn.pipeline.Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)
+ [gbdt+lr demo](https://github.com/princewen/tensorflow_practice/blob/master/recommendation/GBDT%2BLR-Demo/GBDT_LR.py)
+ [推荐系统遇上深度学习(十)--GBDT+LR融合方案实战](https://zhuanlan.zhihu.com/p/37522339)
+ [python︱sklearn一些小技巧的记录（训练集划分/pipelline/交叉验证等）](https://blog.csdn.net/sinat_26917383/article/details/77917881)
+ [16.【进阶】特征提升之特征筛选----feature_selection](https://blog.csdn.net/jh1137921986/article/details/79822512)
+ [使用sklearn优雅地进行数据挖掘](https://www.cnblogs.com/jasonfreak/p/5448462.html)
+ [Kaggle机器学习之模型融合（stacking）心得](https://zhuanlan.zhihu.com/p/26890738)
+ [model_library_config](https://github.com/ChenglongChen/Kaggle_CrowdFlower/blob/master/Code/Model/model_library_config.py)

In [2]:
continous = [
    'Discount',
    'Previous_duration',
    'Next_duration',
    'Base_consume',
    'User_receive_count',
    'User_consume_count',
    'User_used_count',
    'User_not_used_count',
    'User_used_coupon_rate',
    'User_used_coupon_rate_max',
    'User_used_coupon_rate_min',
    'User_used_coupon_rate_mean',
    'User_receive_coupon_merchant_count',
    'User_consume_merchant_count',
    'User_used_coupon_merchant_count',
    'User_used_coupon_merchant_occ',
    'User_receive_different_coupon_count',
    'User_used_different_coupon_count',
    'User_receive_different_coupon_occ',
    'User_used_different_coupon_occ',
    'User_receive_coupon_mean',
    'User_used_coupon_mean',
    'User_distance_used_mean',
    'User_distance_used_max',
    'User_distance_used_min',
    'User_duration_used_mean',
    'User_duration_used_max',
    'User_duration_used_min',
    'User_previous_duration_used_mean',
    'User_previous_duration_used_max',
    'User_previous_duration_used_min',
    'User_next_duration_used_mean',
    'User_next_duration_used_max',
    'User_next_duration_used_min',
    'Merchant_receive_count',
    'Merchant_consume_count',
    'Merchant_used_count',
    'Merchant_not_used_count',
    'Merchant_used_coupon_rate',
    'Merchant_used_coupon_rate_max',
    'Merchant_used_coupon_rate_min',
    'Merchant_used_coupon_rate_mean',
    'Merchant_receive_coupon_user_count',
    'Merchant_consume_user_count',
    'Merchant_used_coupon_user_count',
    'Merchant_receive_coupon_user_occ',
    'Merchant_consume_user_occ',
    'Merchant_used_coupon_user_occ',
    'Merchant_receive_different_coupon_count',
    'Merchant_used_different_coupon_count',
    'Merchant_receive_different_coupon_occ',
    'Merchant_used_different_coupon_occ',
    'Merchant_receive_coupon_mean',
    'Merchant_used_coupon_mean',
    'Merchant_receive_different_coupon_avg',
    'Merchant_used_different_coupon_avg',
    'Merchant_distance_used_mean',
    'Merchant_distance_used_max',
    'Merchant_distance_used_min',
    'Merchant_duration_used_mean',
    'Merchant_duration_used_max',
    'Merchant_duration_used_min',
    'Merchant_previous_duration_used_mean',
    'Merchant_previous_duration_used_max',
    'Merchant_previous_duration_used_min',
    'Merchant_next_duration_used_mean',
    'Merchant_next_duration_used_max',
    'Merchant_next_duration_used_min',
    'Coupon_received_count',
    'Coupon_used_count',
    'Coupon_used_rate',
    'Coupon_duration_used_mean',
    'Coupon_duration_used_max',
    'Coupon_duration_used_min',
    'Coupon_distance_used_mean',
    'Coupon_distance_used_max',
    'Coupon_distance_used_min',
    'User_merchant_receive_count',
    'User_merchant_consume_count',
    'User_merchant_used_count',
    'User_merchant_not_used_count',
    'User_merchant_used_coupon_rate',
    'User_merchant_not_used_coupon_rate',
    'User_merchant_used_coupon_rate_4_merchant',
    'User_merchant_not_used_coupon_rate_4_merchant',
    'User_merchant_duration_used_mean',
    'User_merchant_duration_used_max',
    'User_merchant_duration_used_min',
    'Online_user_receive_count',
    'Online_user_consume_count',
    'Online_user_used_count',
    'Online_user_not_used_count',
    'Online_user_used_coupon_rate',
    'User_offline_consume_rate',
    'User_offline_used_rate',
    'User_offline_no_consume_coupon_rate',
    'User_distance_receive_count',
    'User_distance_consume_count',
    'User_distance_used_count',
    'User_distance_receive_rate',
    'User_distance_consume_rate',
    'User_distance_used_rate',
    'User_coupon_type_receive_count',
    'User_coupon_type_used_count',
    'User_coupon_type_receive_rate',
    'User_coupon_type_used_rate',
    'User_coupon_receive_count',
    'User_coupon_used_count',
    'User_coupon_receive_rate',
    'User_coupon_used_rate',
    'Merchant_distance_receive_count',
    'Merchant_distance_consume_count',
    'Merchant_distance_used_count',
    'Merchant_distance_receive_rate',
    'Merchant_distance_used_rate',
    'User_coupon_duration_used_mean',
    'User_coupon_duration_used_max',
    'User_coupon_duration_used_min',
    'User_received_date_count'
]


fields = [
    'Distance',
    'Day_in_month_received',
    'Day_in_week_received',
    'Coupon_type'
]

label = ['Is_in_day_consume']

In [3]:
model_train_df = pd.read_csv('lcm_train_features.csv')

In [4]:
model_test_df = pd.read_csv('lcm_train_test_features.csv')
model_test_df = model_test_df[model_test_df['Coupon_id']>0]

In [5]:
class ExtractFeature(TransformerMixin):
    def fit(self, *args, **kwargs):
        return self
    
    def transform(self, X, **transform_params):
        return pd.DataFrame(X[:,0] * X[:,1])

In [19]:
fp = Pipeline([
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnSelector(continous)),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
            ('scale', Normalizer())
        ])),
        ('fields', Pipeline([
            ('extract', ColumnSelector(fields)),
            ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
            ('one_hot', OneHotEncoder(categories='auto')),
            ('to_dense', DenseTransformer())
        ])),
        ('rate', Pipeline([
            ('extract', ColumnSelector(['User_coupon_used_rate', 'User_used_coupon_rate'])),
            ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
            ('new', ExtractFeature()),
            ('scale', Normalizer())
        ]))
    ])),
    ('skb', SelectKBest(chi2, k=80)),
    ('sc4gbdt', StandardScaler())
])

fp.fit(model_train_df[fields+continous], model_train_df[label].values.ravel())

train_dataset_x = fp.transform(model_train_df[fields+continous])
train_dataset_y = model_train_df[label].values.ravel()

valid_dataset_x = fp.transform(model_test_df[fields+continous])

In [None]:
skl_min_n_estimators = 10
skl_max_n_estimators = 500
skl_n_estimators_step = 10
skl_n_jobs = 2
skl_random_seed = 2018

## random forest tree classifier
space = {
    'n_estimators': hp.quniform("n_estimators", skl_min_n_estimators, skl_max_n_estimators, skl_n_estimators_step),
    'learning_rate': hp.quniform("learning_rate", 0.01, 0.5, 0.01),
    'max_features': hp.quniform("max_features", 0.05, 1.0, 0.05),
    'max_depth': hp.quniform('max_depth', 1, 15, 1),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.1),
    'random_state': skl_random_seed
}

def objective(params):
    """Objective function for Gradient Boosting Machine Hyperparameter Tuning"""
    
    logger.info(params)
    
    gbcf = GradientBoostingClassifier(
        n_estimators=int(params['n_estimators']), 
        max_features=params['max_features'], 
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'], 
        subsample=params['subsample'],
        random_state=params['random_state']
    )
    
    gbcf.fit(train_dataset_x, train_dataset_y)
    
    predict_test_prob_y = gbcf.predict_proba(valid_dataset_x)
    model_test_df['Probability'] = predict_test_prob_y[:, 1]
    
    score = evaluate(model_test_df)
    logging.info('Socre is %f' % score)
    
    # Loss must be minimized
    loss = 1 - score
    
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

def evaluate(result_df):
    group = result_df.groupby(['Coupon_id'])
    aucs = []
    for i in group:
        tmpdf = i[1]        
        if len(tmpdf['Is_in_day_consume'].unique()) != 2:
            continue
            
        fpr, tpr, thresholds = roc_curve(tmpdf['Is_in_day_consume'], tmpdf['Probability'], pos_label=1)
        auc_score = auc(fpr,tpr)
        aucs.append(auc_score)
            
    return np.average(aucs)

MAX_EVALS = 500

# Optimize
best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = MAX_EVALS, trials = Trials())

2019-01-19 19:58:17,046  tpe.py : INFO  tpe_transform took 0.006516 seconds
2019-01-19 19:58:17,048  tpe.py : INFO  TPE using 0 trials
2019-01-19 19:58:17,054  <ipython-input-20-ed20a3a385c0> : INFO  {'learning_rate': 0.21, 'max_depth': 12.0, 'max_features': 0.2, 'n_estimators': 210.0, 'random_state': 2018, 'subsample': 0.8}
2019-01-19 21:43:04,931  <ipython-input-20-ed20a3a385c0> : INFO  Socre is 0.422708
2019-01-19 21:43:04,956  tpe.py : INFO  tpe_transform took 0.006679 seconds
2019-01-19 21:43:04,957  tpe.py : INFO  TPE using 1/1 trials with best loss 0.577292
2019-01-19 21:43:04,961  <ipython-input-20-ed20a3a385c0> : INFO  {'learning_rate': 0.46, 'max_depth': 2.0, 'max_features': 0.6000000000000001, 'n_estimators': 150.0, 'random_state': 2018, 'subsample': 0.7000000000000001}
2019-01-19 21:47:15,392  <ipython-input-20-ed20a3a385c0> : INFO  Socre is 0.493653
2019-01-19 21:47:15,404  tpe.py : INFO  tpe_transform took 0.002821 seconds
2019-01-19 21:47:15,407  tpe.py : INFO  TPE using

## 预测

In [25]:
model_pred_df = pd.read_csv('lcm_test_features.csv')
predict_prob_y = pipe_lr.predict_proba(model_pred_df[fields+continous])
model_pred_df['Probability'] = predict_prob_y[:, 1]
model_pred_df.sort_values(['Probability'], ascending=False).head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Previous_date_received,Next_date_received,Previous_duration,Next_duration,...,Merchant_distance_receive_count,Merchant_distance_consume_count,Merchant_distance_used_count,Merchant_distance_receive_rate,Merchant_distance_used_rate,User_coupon_duration_used_mean,User_coupon_duration_used_max,User_coupon_duration_used_min,User_received_date_count,Probability
92750,6013165,7963,5548,20:1,3.0,20160723,20160722.0,20160728.0,2,6,...,6.0,10.0,3.0,0.315789,0.157895,4.333333,7.0,1.0,7,1.0
92751,6013165,7963,5548,20:1,3.0,20160728,20160723.0,20160729.0,6,2,...,6.0,10.0,3.0,0.315789,0.157895,4.333333,7.0,1.0,7,1.0
92748,6013165,7963,5548,20:1,3.0,20160721,20160718.0,20160722.0,4,2,...,6.0,10.0,3.0,0.315789,0.157895,4.333333,7.0,1.0,7,1.0
92752,6013165,7963,5548,20:1,3.0,20160729,20160728.0,20160731.0,2,3,...,6.0,10.0,3.0,0.315789,0.157895,4.333333,7.0,1.0,7,1.0
92749,6013165,7963,5548,20:1,3.0,20160722,20160721.0,20160723.0,2,2,...,6.0,10.0,3.0,0.315789,0.157895,4.333333,7.0,1.0,7,1.0


In [26]:
final_result_df = model_pred_df[['User_id', 'Coupon_id', 'Date_received', 'Probability']]
final_result_df.to_csv('/Users/leewind/Desktop/submission_20190118.csv', index=False, header=False)
final_result_df.shape

(113640, 4)