In [1]:
import pandas as pd
import numpy as np
import math
import logging
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [2]:
logger = logging.getLogger('ai')
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s')

## 模型

+ [sklearn.preprocessing.StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
+ [Feature transformations with ensembles of trees](https://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html)
+ [sklearn.preprocessing.OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)
+ [机器学习之 sklearn中的pipeline](http://frankchen.xyz/2018/04/08/pipeline-in-machine-learning/)
    - 使用pipeline做cross validation
    - 自定义transformer
    - FeatureUnion
+ [Concatenating multiple feature extraction methods](https://scikit-learn.org/stable/auto_examples/compose/plot_feature_union.html#sphx-glr-auto-examples-compose-plot-feature-union-py)
+ [sklearn.pipeline.Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)
+ [gbdt+lr demo](https://github.com/princewen/tensorflow_practice/blob/master/recommendation/GBDT%2BLR-Demo/GBDT_LR.py)
+ [推荐系统遇上深度学习(十)--GBDT+LR融合方案实战](https://zhuanlan.zhihu.com/p/37522339)
+ [python︱sklearn一些小技巧的记录（训练集划分/pipelline/交叉验证等）](https://blog.csdn.net/sinat_26917383/article/details/77917881)
+ [16.【进阶】特征提升之特征筛选----feature_selection](https://blog.csdn.net/jh1137921986/article/details/79822512)
+ [使用sklearn优雅地进行数据挖掘](https://www.cnblogs.com/jasonfreak/p/5448462.html)
+ [Kaggle机器学习之模型融合（stacking）心得](https://zhuanlan.zhihu.com/p/26890738)
+ [model_library_config](https://github.com/ChenglongChen/Kaggle_CrowdFlower/blob/master/Code/Model/model_library_config.py)

In [3]:
continous = [
    'Discount',
    'Previous_duration',
    'Next_duration',
    'Base_consume',
    'User_receive_count',
    'User_consume_count',
    'User_used_count',
    'User_not_used_count',
    'User_used_coupon_rate',
    'User_used_coupon_rate_max',
    'User_used_coupon_rate_min',
    'User_used_coupon_rate_mean',
    'User_receive_coupon_merchant_count',
    'User_consume_merchant_count',
    'User_used_coupon_merchant_count',
    'User_used_coupon_merchant_occ',
    'User_receive_different_coupon_count',
    'User_used_different_coupon_count',
    'User_receive_different_coupon_occ',
    'User_used_different_coupon_occ',
    'User_receive_coupon_mean',
    'User_used_coupon_mean',
    'User_distance_used_mean',
    'User_distance_used_max',
    'User_distance_used_min',
    'User_duration_used_mean',
    'User_duration_used_max',
    'User_duration_used_min',
    'User_previous_duration_used_mean',
    'User_previous_duration_used_max',
    'User_previous_duration_used_min',
    'User_next_duration_used_mean',
    'User_next_duration_used_max',
    'User_next_duration_used_min',
    'Merchant_receive_count',
    'Merchant_consume_count',
    'Merchant_used_count',
    'Merchant_not_used_count',
    'Merchant_used_coupon_rate',
    'Merchant_used_coupon_rate_max',
    'Merchant_used_coupon_rate_min',
    'Merchant_used_coupon_rate_mean',
    'Merchant_receive_coupon_user_count',
    'Merchant_consume_user_count',
    'Merchant_used_coupon_user_count',
    'Merchant_receive_coupon_user_occ',
    'Merchant_consume_user_occ',
    'Merchant_used_coupon_user_occ',
    'Merchant_receive_different_coupon_count',
    'Merchant_used_different_coupon_count',
    'Merchant_receive_different_coupon_occ',
    'Merchant_used_different_coupon_occ',
    'Merchant_receive_coupon_mean',
    'Merchant_used_coupon_mean',
    'Merchant_receive_different_coupon_avg',
    'Merchant_used_different_coupon_avg',
    'Merchant_distance_used_mean',
    'Merchant_distance_used_max',
    'Merchant_distance_used_min',
    'Merchant_duration_used_mean',
    'Merchant_duration_used_max',
    'Merchant_duration_used_min',
    'Merchant_previous_duration_used_mean',
    'Merchant_previous_duration_used_max',
    'Merchant_previous_duration_used_min',
    'Merchant_next_duration_used_mean',
    'Merchant_next_duration_used_max',
    'Merchant_next_duration_used_min',
    'Coupon_received_count',
    'Coupon_used_count',
    'Coupon_used_rate',
    'Coupon_duration_used_mean',
    'Coupon_duration_used_max',
    'Coupon_duration_used_min',
    'Coupon_distance_used_mean',
    'Coupon_distance_used_max',
    'Coupon_distance_used_min',
    'User_merchant_receive_count',
    'User_merchant_consume_count',
    'User_merchant_used_count',
    'User_merchant_not_used_count',
    'User_merchant_used_coupon_rate',
    'User_merchant_not_used_coupon_rate',
    'User_merchant_used_coupon_rate_4_merchant',
    'User_merchant_not_used_coupon_rate_4_merchant',
    'User_merchant_duration_used_mean',
    'User_merchant_duration_used_max',
    'User_merchant_duration_used_min',
    'Online_user_receive_count',
    'Online_user_consume_count',
    'Online_user_used_count',
    'Online_user_not_used_count',
    'Online_user_used_coupon_rate',
    'User_offline_consume_rate',
    'User_offline_used_rate',
    'User_offline_no_consume_coupon_rate',
    'User_distance_receive_count',
    'User_distance_consume_count',
    'User_distance_used_count',
    'User_distance_receive_rate',
    'User_distance_consume_rate',
    'User_distance_used_rate',
    'User_coupon_type_receive_count',
    'User_coupon_type_used_count',
    'User_coupon_type_receive_rate',
    'User_coupon_type_used_rate',
    'User_coupon_receive_count',
    'User_coupon_used_count',
    'User_coupon_receive_rate',
    'User_coupon_used_rate',
    'Merchant_distance_receive_count',
    'Merchant_distance_consume_count',
    'Merchant_distance_used_count',
    'Merchant_distance_receive_rate',
    'Merchant_distance_used_rate',
    'User_coupon_duration_used_mean',
    'User_coupon_duration_used_max',
    'User_coupon_duration_used_min',
    'User_received_date_count'
]


fields = [
    'Distance',
    'Day_in_month_received',
    'Day_in_week_received',
    'Coupon_type'
]

label = ['Duration']

In [4]:
model_train_df = pd.read_csv('../features/lcm_train_features.csv')

In [5]:
model_test_df = pd.read_csv('../features/lcm_train_test_features.csv')
model_test_df = model_test_df[model_test_df['Coupon_id']>0]

In [6]:
class ExtractFeature(TransformerMixin):
    def fit(self, *args, **kwargs):
        return self
    
    def transform(self, X, **transform_params):
        return pd.DataFrame(X[:,0] * X[:,1])

In [7]:
fp = Pipeline([
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnSelector(continous)),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
            ('scale', Normalizer())
        ])),
        ('fields', Pipeline([
            ('extract', ColumnSelector(fields)),
            ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
            ('one_hot', OneHotEncoder(categories='auto')),
            ('to_dense', DenseTransformer())
        ])),
        ('rate', Pipeline([
            ('extract', ColumnSelector(['User_coupon_used_rate', 'User_used_coupon_rate'])),
            ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
            ('new', ExtractFeature()),
            ('scale', Normalizer())
        ]))
    ])),
    ('skb', SelectKBest(chi2, k=80)),
    ('sc4gbdt', StandardScaler())
])

fp.fit(model_train_df[fields+continous], model_train_df[label].values.ravel())

train_dataset_x = fp.transform(model_train_df[fields+continous])
train_dataset_y = model_train_df[label].values.ravel()

valid_dataset_x = fp.transform(model_test_df[fields+continous])
valid_dataset_y = model_test_df[label].values.ravel()

In [None]:
skl_min_n_estimators = 10
skl_max_n_estimators = 500
skl_n_estimators_step = 10
skl_n_jobs = 2
skl_random_seed = 2018

## random forest tree classifier
space = {
    'n_estimators': hp.quniform("n_estimators", skl_min_n_estimators, skl_max_n_estimators, skl_n_estimators_step),
    'learning_rate': hp.quniform("learning_rate", 0.01, 0.5, 0.01),
    'max_features': hp.quniform("max_features", 0.05, 1.0, 0.05),
    'max_depth': hp.quniform('max_depth', 1, 15, 1),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.1),
    'random_state': skl_random_seed
}

def objective(params):
    """Objective function for Gradient Boosting Machine Hyperparameter Tuning"""
    
    logger.info(params)
    
#     gbcf = GradientBoostingClassifier(
#         n_estimators=int(params['n_estimators']), 
#         max_features=params['max_features'], 
#         learning_rate=params['learning_rate'],
#         max_depth=params['max_depth'], 
#         subsample=params['subsample'],
#         random_state=params['random_state']
#     )
    
#     gbcf.fit(train_dataset_x, train_dataset_y)
    
#     predict_test_prob_y = gbcf.predict_proba(valid_dataset_x)
#     model_test_df['Probability'] = predict_test_prob_y[:, 1]
    
#     score = evaluate(model_test_df)

    gbdt = GradientBoostingRegressor(
        n_estimators=int(params['n_estimators']), 
        max_features=params['max_features'], 
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'], 
        subsample=params['subsample'],
        random_state=params['random_state'],
        verbose=1
    )
    
    gbdt.fit(train_dataset_x, train_dataset_y)
    score = gbdt.score(valid_dataset_x, valid_dataset_y)
    logging.info('Socre is %f' % score)
    
    # Loss must be minimized
    loss = 1 - score
    
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

def evaluate(result_df):
    group = result_df.groupby(['Coupon_id'])
    aucs = []
    for i in group:
        tmpdf = i[1]        
        if len(tmpdf['Is_in_day_consume'].unique()) != 2:
            continue
            
        fpr, tpr, thresholds = roc_curve(tmpdf['Is_in_day_consume'], tmpdf['Probability'], pos_label=1)
        auc_score = auc(fpr,tpr)
        aucs.append(auc_score)
            
    return np.average(aucs)

MAX_EVALS = 500

# Optimize
best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = MAX_EVALS, trials = Trials())
best

2019-01-22 00:23:22,690  tpe.py : INFO  tpe_transform took 0.010583 seconds
2019-01-22 00:23:22,694  tpe.py : INFO  TPE using 0 trials
2019-01-22 00:23:22,699  <ipython-input-8-e876bef2e5bb> : INFO  {'learning_rate': 0.42, 'max_depth': 5.0, 'max_features': 0.30000000000000004, 'n_estimators': 300.0, 'random_state': 2018, 'subsample': 0.7000000000000001}


      Iter       Train Loss      OOB Improve   Remaining Time 
         1           3.6740           0.6724           19.18m
         2           3.4188           0.2548           21.11m
         3           3.2408           0.1060           21.87m
         4           3.2237           0.0516           22.44m
         5           3.1645           0.0380           22.32m
         6           3.1819           0.0203           21.80m
         7           3.1373           0.0227           21.58m
         8           3.0643           0.0103           21.31m
         9           3.1024           0.0057           21.16m
        10           3.0486           0.0030           20.90m
        20           2.9597           0.0026           19.63m
        30           3.0018           0.0001           18.53m
        40           2.9063          -0.0006           17.70m
        50           2.9252          -0.0005           16.98m
        60           2.8647          -0.0009           15.98m
       

2019-01-22 00:43:10,824  <ipython-input-8-e876bef2e5bb> : INFO  Socre is -16.502297
2019-01-22 00:43:10,832  tpe.py : INFO  tpe_transform took 0.003673 seconds
2019-01-22 00:43:10,833  tpe.py : INFO  TPE using 1/1 trials with best loss 17.502297
2019-01-22 00:43:10,836  <ipython-input-8-e876bef2e5bb> : INFO  {'learning_rate': 0.18, 'max_depth': 5.0, 'max_features': 0.1, 'n_estimators': 330.0, 'random_state': 2018, 'subsample': 1.0}


      Iter       Train Loss   Remaining Time 
         1           3.9984            9.67m
         2           3.7716            9.59m
         3           3.6323            9.32m
         4           3.5219            9.05m
         5           3.4452            8.86m
         6           3.3906            8.66m
         7           3.3475            8.54m
         8           3.3136            8.47m
         9           3.2951            8.38m
        10           3.2779            8.37m
        20           3.1637            7.76m
        30           3.1027            7.28m
        40           3.0670            6.99m
        50           3.0317            6.66m
        60           3.0077            6.36m
        70           2.9895            6.12m
        80           2.9707            5.81m
        90           2.9538            5.53m
       100           2.9426            5.26m
       200           2.8384            2.89m
       300           2.7650           39.53s


2019-01-22 00:50:38,257  <ipython-input-8-e876bef2e5bb> : INFO  Socre is -5.679980
2019-01-22 00:50:38,264  tpe.py : INFO  tpe_transform took 0.003530 seconds
2019-01-22 00:50:38,265  tpe.py : INFO  TPE using 2/2 trials with best loss 6.679980
2019-01-22 00:50:38,268  <ipython-input-8-e876bef2e5bb> : INFO  {'learning_rate': 0.21, 'max_depth': 14.0, 'max_features': 0.4, 'n_estimators': 210.0, 'random_state': 2018, 'subsample': 0.5}


      Iter       Train Loss      OOB Improve   Remaining Time 
         1           3.5608           0.4783          562.39m
         2           3.2760           0.2905          602.21m
         3           3.0018           0.1779          613.22m
         4           2.8090           0.1025          681.43m
         5           2.6674           0.0682          682.30m
         6           2.5981           0.0275          664.37m
         7           2.5795           0.0170          655.05m
         8           2.4577           0.0100          640.52m
         9           2.4562           0.0014          623.92m
        10           2.3348          -0.0041          608.85m
        20           2.1716          -0.0061          480.18m
        30           2.1255          -0.0051          379.15m
        40           1.9988          -0.0058          309.36m
        50           1.9577          -0.0044          273.26m
        60           1.8330          -0.0097          249.22m
       

2019-01-22 07:40:26,191  <ipython-input-8-e876bef2e5bb> : INFO  Socre is -10.068071
2019-01-22 07:40:26,216  tpe.py : INFO  tpe_transform took 0.003987 seconds
2019-01-22 07:40:26,217  tpe.py : INFO  TPE using 3/3 trials with best loss 6.679980
2019-01-22 07:40:26,220  <ipython-input-8-e876bef2e5bb> : INFO  {'learning_rate': 0.33, 'max_depth': 10.0, 'max_features': 0.35000000000000003, 'n_estimators': 400.0, 'random_state': 2018, 'subsample': 0.6000000000000001}


      Iter       Train Loss      OOB Improve   Remaining Time 
         1           3.5790           0.6630          293.93m
         2           3.2355           0.3057          299.06m
         3           3.0491           0.1370          307.87m
         4           2.9722           0.0682          293.08m
         5           2.8269           0.0316          296.05m
         6           2.8336           0.0060          288.40m
         7           2.8113           0.0119          284.72m
         8           2.7177          -0.0014          282.87m
         9           2.7125          -0.0098          285.69m
        10           2.6634          -0.0048          274.82m
        20           2.4969          -0.0042          224.59m
        30           2.4852          -0.0075          202.12m
        40           2.3439          -0.0060          177.99m
        50           2.3478          -0.0020          166.07m
        60           2.2297          -0.0020          162.14m
       

## 预测

In [25]:
model_pred_df = pd.read_csv('lcm_test_features.csv')
predict_prob_y = pipe_lr.predict_proba(model_pred_df[fields+continous])
model_pred_df['Probability'] = predict_prob_y[:, 1]
model_pred_df.sort_values(['Probability'], ascending=False).head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Previous_date_received,Next_date_received,Previous_duration,Next_duration,...,Merchant_distance_receive_count,Merchant_distance_consume_count,Merchant_distance_used_count,Merchant_distance_receive_rate,Merchant_distance_used_rate,User_coupon_duration_used_mean,User_coupon_duration_used_max,User_coupon_duration_used_min,User_received_date_count,Probability
92750,6013165,7963,5548,20:1,3.0,20160723,20160722.0,20160728.0,2,6,...,6.0,10.0,3.0,0.315789,0.157895,4.333333,7.0,1.0,7,1.0
92751,6013165,7963,5548,20:1,3.0,20160728,20160723.0,20160729.0,6,2,...,6.0,10.0,3.0,0.315789,0.157895,4.333333,7.0,1.0,7,1.0
92748,6013165,7963,5548,20:1,3.0,20160721,20160718.0,20160722.0,4,2,...,6.0,10.0,3.0,0.315789,0.157895,4.333333,7.0,1.0,7,1.0
92752,6013165,7963,5548,20:1,3.0,20160729,20160728.0,20160731.0,2,3,...,6.0,10.0,3.0,0.315789,0.157895,4.333333,7.0,1.0,7,1.0
92749,6013165,7963,5548,20:1,3.0,20160722,20160721.0,20160723.0,2,2,...,6.0,10.0,3.0,0.315789,0.157895,4.333333,7.0,1.0,7,1.0


In [26]:
final_result_df = model_pred_df[['User_id', 'Coupon_id', 'Date_received', 'Probability']]
final_result_df.to_csv('/Users/leewind/Desktop/submission_20190118.csv', index=False, header=False)
final_result_df.shape

(113640, 4)