In [1]:
import pandas as pd
import numpy as np
import math
import logging
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [2]:
logger = logging.getLogger('ai')
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s')

## 模型

In [3]:
df = pd.read_csv('../features/lcm_base_features.csv')
user_features_df = pd.read_csv('../features/lcm_user_features.csv')
merchant_features_df = pd.read_csv('../features/lcm_merchant_features.csv')
coupon_features_df = pd.read_csv('../features/lcm_coupon_features.csv')

model_train_df = df[df['Date_received'] < 20160501]
model_test_df = df[df['Date_received'] >= 20160501]

In [4]:
continous = [
    'Discount', 
    'Base_consume', 
    'Discount_money',
    'User_factor_alpha',
    'User_factor_beta',
    'Merchant_factor_alpha',
    'Merchant_factor_beta',
    'Coupon_factor_alpha', 
    'Coupon_factor_beta'
]

fields = [
    'Distance',
    'Day_in_month',
    'Day_in_week',
    'Coupon_type'
]

label = ['Is_in_day_consume']

In [5]:
class MergeFeature(TransformerMixin):
    def __init__(self, df, key, prefix, pipe):
        self.df = df
        self.key = key
        self.prefix = prefix
        self.pipe = pipe
    
    def get_factor(self, df, key, prefix):
        id_df = df[[key]]
        output_df = df.drop([key], axis=1)

        self.pipe.fit(output_df)
        factors = self.pipe.transform(output_df)
        factors_df = pd.DataFrame(data=factors, columns=[prefix + '_factor_alpha', prefix + '_factor_beta'])
        factors_df[key] = id_df[key]
        return factors_df
    
    def fit(self, *args, **kwargs):
        return self
    
    def transform(self, X, **transform_params):
        return pd.merge(X, self.get_factor(self.df, self.key, self.prefix), on=[self.key], how='left')

In [6]:
factor_pipeline = Pipeline([
    ('pca', PCA(n_components=2)),
    ('scale', MinMaxScaler()),
])

features_pipeline = Pipeline([
    ('user', MergeFeature(user_features_df, 'User_id', 'User', factor_pipeline)),
    ('merchant', MergeFeature(merchant_features_df, 'Merchant_id', 'Merchant', factor_pipeline)),
    ('coupon', MergeFeature(coupon_features_df, 'Coupon_id', 'Coupon', factor_pipeline)),
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnSelector(continous)),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
            ('scale', Normalizer())
        ])),
        ('fields', Pipeline([
            ('extract', ColumnSelector(fields)),
            ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
            ('one_hot', OneHotEncoder(categories='auto')),
            ('to_dense', DenseTransformer())
        ])),
    ])),
])

features_pipeline.fit(model_train_df, model_train_df[label].values.ravel())

train_dataset_x = features_pipeline.transform(model_train_df)
train_dataset_y = model_train_df[label].values.ravel()

valid_dataset_x = features_pipeline.transform(model_test_df)
valid_dataset_y = model_test_df[label].values.ravel()

In [7]:
model_pipeline = Pipeline([
    ('xgb', xgb.sklearn.XGBClassifier())
])

model_pipeline.set_params(
    xgb__learn_rate=0.01,
    xgb__max_depth=2,
    xgb__min_child_weight=1,
    xgb__subsample=0.7,
    xgb__colsample_bytree=0.7,
    xgb__colsample_bylevel=0.7,
    xgb__objective='rank:pairwise',
    xgb__n_estimators=50,
    xgb__gamma=0.1,
    xgb__reg_alpha=1,
    xgb__reg_lambda=1,
    xgb__max_delta_step=0,
    xgb__scale_pos_weight=1,
    xgb__silent=True,
    xgb__eval_metric='auc'
).fit(train_dataset_x, train_dataset_y)

Pipeline(memory=None,
     steps=[('xgb', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
       colsample_bytree=0.7, eval_metric='auc', gamma=0.1, learn_rate=0.01,
       learning_rate=0.1, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=50, n_jobs=1,
       nthread=None, objective='rank:pairwise', random_state=0,
       reg_alpha=1, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7))])

In [8]:
class Evaluator():
    def __init__(self, df, pipe):
        self.df = df
        self.pipe = pipe
        
    def transfer_result(self, result):
        return MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(result.reshape(-1, 1))
    
    def cal(self, dataset):
        prob_raw = self.pipe.predict_proba(dataset)[:,1]
        predict_prob = self.transfer_result(prob_raw)
        self.df['Probability'] = predict_prob
        return self.evaluate(self.df[['Probability', 'Coupon_id', 'Is_in_day_consume']])
    
    def describe(self):
        return self.df[['User_id', 'Coupon_id', 'Probability', 'Is_in_day_consume']].describe()
    
    def predict(self, dataset):
        prob_raw = self.pipe.predict_proba(dataset)[:,1]
        predict_prob = self.transfer_result(prob_raw)
        self.df['Probability'] = predict_prob
        return self.df[['User_id', 'Coupon_id', 'Date_received', 'Probability']]
    
    def evaluate(self, result_df):
        group = result_df.groupby(['Coupon_id'])
        aucs = []
        logging.info('coupon size is %d' % (len(group)))

        counter = 0
        for i in group:
            tmpdf = i[1]        
            if len(tmpdf['Is_in_day_consume'].unique()) != 2:
                continue

            fpr, tpr, thresholds = roc_curve(tmpdf['Is_in_day_consume'], tmpdf['Probability'], pos_label=1)
            auc_score = auc(fpr,tpr)
            aucs.append(auc_score)
            counter = counter + 1

        logging.info('coupon in cal is %d' % (counter))

        return np.average(aucs)

In [9]:
evaluator = Evaluator(model_test_df, model_pipeline)
evaluator.cal(valid_dataset_x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
2019-01-28 10:36:15,978  <ipython-input-8-68255e7438c7> : INFO  coupon size is 7160
2019-01-28 10:36:23,193  <ipython-input-8-68255e7438c7> : INFO  coupon in cal is 4004


0.568953425169264

In [10]:
evaluator.describe()

Unnamed: 0,User_id,Coupon_id,Probability,Is_in_day_consume
count,306313.0,306313.0,306313.0,306313.0
mean,3681999.0,6145.500612,0.232891,0.091707
std,2122008.0,4135.385856,0.21697,0.288612
min,4.0,1.0,0.0,0.0
25%,1842402.0,2418.0,0.125907,0.0
50%,3684421.0,4958.0,0.181925,0.0
75%,5518531.0,9746.0,0.22831,0.0
max,7360961.0,14045.0,1.0,1.0


## 预测

In [11]:
model_pred_df = pd.read_csv('../features/lcm_submit_features.csv')
predict_dataset_x = features_pipeline.transform(model_pred_df)
predictor = Evaluator(model_pred_df, model_pipeline)
final_result_df = predictor.predict(predict_dataset_x)
final_result_df.to_csv('/Users/leewind/Desktop/submission_20190127.csv', index=False, header=False)
final_result_df.shape

(113640, 4)