In [None]:
import logging
import math
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
# Configure the root logger (DEBUG and above, timestamp / file / level / message),
# then create the notebook's named logger 'ai', which propagates to the root handler.
logging.basicConfig(
    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger('ai')

## 模型

In [None]:
# Base table: one row per sample, with the label and the join keys used below.
df = pd.read_csv('../features/lcm_base_features.csv')

# Pre-computed feature tables, later joined on User_id / Merchant_id / Coupon_id.
user_features_df = pd.read_csv('../features/lcm_user_features.csv')
merchant_features_df = pd.read_csv('../features/lcm_merchant_features.csv')
coupon_features_df = pd.read_csv('../features/lcm_coupon_features.csv')

# Time-based hold-out on Date_received: values below 20160501 train the model,
# values at or above 20160501 validate it. Rows with a missing Date_received
# satisfy neither comparison and are left out of both splits.
model_train_df = df.loc[df['Date_received'] < 20160501]
model_test_df = df.loc[df['Date_received'] >= 20160501]

In [None]:
# Numeric feature columns fed to the model (selected by name after the feature
# tables have been merged in). The variable name keeps its original spelling
# because other cells reference it.
#
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python, which would merge two column names into one bogus name.
continous = [
    # --- base features ---
    'Discount', 
    'Base_consume', 
    'Discount_money',
#     'User_factor_alpha',
#     'User_factor_beta',
#     'Merchant_factor_alpha',
#     'Merchant_factor_beta',
#     'Coupon_factor_alpha', 
#     'Coupon_factor_beta'

    # --- user features ---
    'User_receive_count', 'User_receive_count_occ',
    'User_used_count', 'User_used_count_occ',
    'User_receive_diff_merchant_count',
    'User_receive_diff_merchant_count_occ',
    'User_used_diff_merchant_count',
    'User_used_diff_merchant_count_occ',
    'User_receive_diff_coupon_count',
    'User_receive_diff_coupon_count_occ',
    'User_used_diff_coupon_count', 'User_used_diff_coupon_count_occ',
    'User_receive_coupon_type_0_count',
    'User_receive_coupon_type_0_count_occ',
    'User_used_coupon_type_0_count',
    'User_used_coupon_type_0_count_occ',
    'User_used_coupon_type_0_rate', 'User_receive_coupon_type_1_count',
    'User_receive_coupon_type_1_count_occ',
    'User_used_coupon_type_1_count',
    'User_used_coupon_type_1_count_occ',
    'User_used_coupon_type_1_rate', 'User_receive_distance_0_count',
    'User_receive_distance_0_count_occ', 'User_used_distance_0_count',
    'User_used_distance_0_count_occ', 'User_used_distance_0_rate',
    'User_receive_distance_1_count',
    'User_receive_distance_1_count_occ', 'User_used_distance_1_count',
    'User_used_distance_1_count_occ', 'User_used_distance_1_rate',
    'User_receive_distance_2_count',
    'User_receive_distance_2_count_occ', 'User_used_distance_2_count',
    'User_used_distance_2_count_occ', 'User_used_distance_2_rate',
    'User_receive_distance_3_count',
    'User_receive_distance_3_count_occ', 'User_used_distance_3_count',
    'User_used_distance_3_count_occ', 'User_used_distance_3_rate',
    'User_receive_distance_4_count',
    'User_receive_distance_4_count_occ', 'User_used_distance_4_count',
    'User_used_distance_4_count_occ', 'User_used_distance_4_rate',
    'User_receive_distance_5_count',
    'User_receive_distance_5_count_occ', 'User_used_distance_5_count',
    'User_used_distance_5_count_occ', 'User_used_distance_5_rate',
    'User_receive_distance_6_count',
    'User_receive_distance_6_count_occ', 'User_used_distance_6_count',
    'User_used_distance_6_count_occ', 'User_used_distance_6_rate',
    'User_receive_distance_7_count',
    'User_receive_distance_7_count_occ', 'User_used_distance_7_count',
    'User_used_distance_7_count_occ', 'User_used_distance_7_rate',
    'User_receive_distance_8_count',
    'User_receive_distance_8_count_occ', 'User_used_distance_8_count',
    'User_used_distance_8_count_occ', 'User_used_distance_8_rate',
    'User_receive_distance_9_count',
    'User_receive_distance_9_count_occ', 'User_used_distance_9_count',
    'User_used_distance_9_count_occ', 'User_used_distance_9_rate',
    'User_receive_distance_10_count',
    'User_receive_distance_10_count_occ',
    'User_used_distance_10_count', 'User_used_distance_10_count_occ',
    'User_used_distance_10_rate', 'User_used_rate',
    'User_receive_diff_merchant_rate', 'User_used_diff_merchant_rate',
    'User_receive_diff_coupon_rate', 'User_used_diff_coupon_rate',
    'User_used_mean_4_merchant', 'User_used_rate_occ',
    'Discount_User_mean', 'Discount_User_max', 'Discount_User_min',
    'Distance_User_mean', 'Distance_User_max', 'Distance_User_min',

    # --- coupon features ---
    'Coupon_receive_count', 'Coupon_receive_count_occ',
    'Coupon_used_count', 'Coupon_used_count_occ',
    'Coupon_diff_user_receive_count',
    'Coupon_diff_user_receive_count_occ',
    'Coupon_diff_user_used_count', 'Coupon_diff_user_used_count_occ',
    'Coupon_receive_distance_0_count',
    'Coupon_receive_distance_0_count_occ',
    'Coupon_used_distance_0_count', 'Coupon_used_distance_0_count_occ',
    'Coupon_used_distance_0_rate', 'Coupon_receive_distance_1_count',
    'Coupon_receive_distance_1_count_occ',
    'Coupon_used_distance_1_count', 'Coupon_used_distance_1_count_occ',
    'Coupon_used_distance_1_rate', 'Coupon_receive_distance_2_count',
    'Coupon_receive_distance_2_count_occ',
    'Coupon_used_distance_2_count', 'Coupon_used_distance_2_count_occ',
    'Coupon_used_distance_2_rate', 'Coupon_receive_distance_3_count',
    'Coupon_receive_distance_3_count_occ',
    'Coupon_used_distance_3_count', 'Coupon_used_distance_3_count_occ',
    'Coupon_used_distance_3_rate', 'Coupon_receive_distance_4_count',
    'Coupon_receive_distance_4_count_occ',
    'Coupon_used_distance_4_count', 'Coupon_used_distance_4_count_occ',
    'Coupon_used_distance_4_rate', 'Coupon_receive_distance_5_count',
    'Coupon_receive_distance_5_count_occ',
    'Coupon_used_distance_5_count', 'Coupon_used_distance_5_count_occ',
    'Coupon_used_distance_5_rate', 'Coupon_receive_distance_6_count',
    'Coupon_receive_distance_6_count_occ',
    'Coupon_used_distance_6_count', 'Coupon_used_distance_6_count_occ',
    'Coupon_used_distance_6_rate', 'Coupon_receive_distance_7_count',
    'Coupon_receive_distance_7_count_occ',
    'Coupon_used_distance_7_count', 'Coupon_used_distance_7_count_occ',
    'Coupon_used_distance_7_rate', 'Coupon_receive_distance_8_count',
    'Coupon_receive_distance_8_count_occ',
    'Coupon_used_distance_8_count', 'Coupon_used_distance_8_count_occ',
    'Coupon_used_distance_8_rate', 'Coupon_receive_distance_9_count',
    'Coupon_receive_distance_9_count_occ',
    'Coupon_used_distance_9_count', 'Coupon_used_distance_9_count_occ',
    'Coupon_used_distance_9_rate', 'Coupon_receive_distance_10_count',
    'Coupon_receive_distance_10_count_occ',
    'Coupon_used_distance_10_count',
    'Coupon_used_distance_10_count_occ',
    'Coupon_used_distance_10_rate', 'Coupon_used_rate',
    'Coupon_diff_user_receive_rate', 'Coupon_diff_user_used_rate',
    'Coupon_used_rate_occ', 'Distance_Coupon_mean',
    'Distance_Coupon_max', 'Distance_Coupon_min',

    # --- merchant features ---
    'Merchant_coupon_receive_count',
    'Merchant_coupon_receive_count_occ', 'Merchant_coupon_used_count',
    'Merchant_coupon_used_count_occ', 'Merchant_user_receive_count',
    'Merchant_user_receive_count_occ', 'Merchant_user_used_count',
    'Merchant_user_used_count_occ',
    'Merchant_receive_different_coupon_count',
    'Merchant_receive_different_coupon_count_occ',
    'Merchant_used_different_coupon_count',
    'Merchant_used_different_coupon_count_occ',
    'Merchant_receive_coupon_type_0_count',
    'Merchant_receive_coupon_type_0_count_occ',
    'Merchant_used_coupon_type_0_count',
    'Merchant_used_coupon_type_0_count_occ',
    'Merchant_used_coupon_type_0_rate',
    'Merchant_receive_coupon_type_1_count',
    'Merchant_receive_coupon_type_1_count_occ',
    'Merchant_used_coupon_type_1_count',
    'Merchant_used_coupon_type_1_count_occ',
    'Merchant_used_coupon_type_1_rate',
    'Merchant_receive_distance_0_count',
    'Merchant_receive_distance_0_count_occ',
    'Merchant_used_distance_0_count',
    'Merchant_used_distance_0_count_occ',
    'Merchant_used_distance_0_rate',
    'Merchant_receive_distance_1_count',
    'Merchant_receive_distance_1_count_occ',
    'Merchant_used_distance_1_count',
    'Merchant_used_distance_1_count_occ',
    'Merchant_used_distance_1_rate',
    'Merchant_receive_distance_2_count',
    'Merchant_receive_distance_2_count_occ',
    'Merchant_used_distance_2_count',
    'Merchant_used_distance_2_count_occ',
    'Merchant_used_distance_2_rate',
    'Merchant_receive_distance_3_count',
    'Merchant_receive_distance_3_count_occ',
    'Merchant_used_distance_3_count',
    'Merchant_used_distance_3_count_occ',
    'Merchant_used_distance_3_rate',
    'Merchant_receive_distance_4_count',
    'Merchant_receive_distance_4_count_occ',
    'Merchant_used_distance_4_count',
    'Merchant_used_distance_4_count_occ',
    'Merchant_used_distance_4_rate',
    'Merchant_receive_distance_5_count',
    'Merchant_receive_distance_5_count_occ',
    'Merchant_used_distance_5_count',
    'Merchant_used_distance_5_count_occ',
    'Merchant_used_distance_5_rate',
    'Merchant_receive_distance_6_count',
    'Merchant_receive_distance_6_count_occ',
    'Merchant_used_distance_6_count',
    'Merchant_used_distance_6_count_occ',
    'Merchant_used_distance_6_rate',
    'Merchant_receive_distance_7_count',
    'Merchant_receive_distance_7_count_occ',
    'Merchant_used_distance_7_count',
    'Merchant_used_distance_7_count_occ',
    'Merchant_used_distance_7_rate',
    'Merchant_receive_distance_8_count',
    'Merchant_receive_distance_8_count_occ',
    'Merchant_used_distance_8_count',
    'Merchant_used_distance_8_count_occ',
    'Merchant_used_distance_8_rate',
    'Merchant_receive_distance_9_count',
    'Merchant_receive_distance_9_count_occ',
    'Merchant_used_distance_9_count',
    'Merchant_used_distance_9_count_occ',
    'Merchant_used_distance_9_rate',
    'Merchant_receive_distance_10_count',
    'Merchant_receive_distance_10_count_occ',
    'Merchant_used_distance_10_count',
    'Merchant_used_distance_10_count_occ',
    'Merchant_used_distance_10_rate', 'Merchant_coupon_used_rate',
    'Merchant_user_receive_rate', 'Merchant_user_used_rate',
    'Merchant_receive_different_coupon_rate',
    'Merchant_used_different_coupon_rate',
    'Merchant_coupon_used_rate_occ', 'Discount_Merchant_mean',
    'Discount_Merchant_max', 'Discount_Merchant_min',
    'Distance_Merchant_mean', 'Distance_Merchant_max',
    'Distance_Merchant_min',
]

# Extra columns selected together with `continous` for the model input
# (the disabled pipeline variant one-hot encodes them instead).
fields = ['Distance', 'Day_in_month', 'Day_in_week', 'Coupon_type']

# Target column; kept as a one-element list so df[label] yields a single-column frame.
label = ['Is_in_day_consume']

In [None]:
class MergeFeature(TransformerMixin, BaseEstimator):
    """Pipeline step that left-joins a pre-computed feature table onto the input frame.

    Inheriting from ``BaseEstimator`` (listed after the mixin, as scikit-learn
    recommends) provides ``get_params`` / ``set_params``, so a pipeline containing
    this step can be cloned or introspected like any other estimator.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to merge in; must contain the ``key`` column.
    key : str
        Name of the join column present in both the input frame and ``df``.
    prefix : str
        Prefix for the factor column names produced by ``get_factor``.
    pipe : transformer
        Object with ``fit`` / ``transform`` used by ``get_factor`` to reduce the
        feature table to two columns.
    """

    def __init__(self, df, key, prefix, pipe):
        self.df = df
        self.key = key
        self.prefix = prefix
        self.pipe = pipe
    
    def get_factor(self, df, key, prefix):
        """Compress the non-key columns of ``df`` into two factor columns.

        Fits ``self.pipe`` on every column except ``key`` and returns a new frame
        with ``<prefix>_factor_alpha``, ``<prefix>_factor_beta`` and ``key``.
        ``self.pipe`` must therefore output exactly two columns.
        Currently only referenced by the disabled variant of ``transform``.
        """
        id_df = df[[key]]
        output_df = df.drop([key], axis=1)

        self.pipe.fit(output_df)
        factors = self.pipe.transform(output_df)
        factors_df = pd.DataFrame(data=factors, columns=[prefix + '_factor_alpha', prefix + '_factor_beta'])
        # factors_df has a fresh 0..n-1 index, so assign the keys by position;
        # assigning the Series itself would align on df's index and yield NaN or
        # mismatched keys whenever df is not indexed 0..n-1.
        factors_df[key] = id_df[key].values
        return factors_df
    
    def fit(self, *args, **kwargs):
        """No-op: the step has nothing to learn. Returns ``self``."""
        return self
    
    def transform(self, X, **transform_params):
        """Return ``X`` left-joined with the stored feature table on ``self.key``."""
        return pd.merge(X, self.df, on=[self.key], how='left')
#         return pd.merge(X, self.get_factor(self.df, self.key, self.prefix), on=[self.key], how='left')

In [None]:
# Two-component PCA followed by min-max scaling. It is passed to every
# MergeFeature step as its `pipe` argument.
factor_pipeline = Pipeline(steps=[
    ('pca', PCA(n_components=2)),
    ('scale', MinMaxScaler()),
])

# Feature engineering: join the three feature tables onto the base rows, then
# select the model columns, impute missing values with the most frequent value
# and standardise.
features_pipeline = Pipeline(steps=[
    (
        'user',
        MergeFeature(user_features_df, 'User_id', 'User', factor_pipeline),
    ),
    (
        'merchant',
        MergeFeature(merchant_features_df, 'Merchant_id', 'Merchant', factor_pipeline),
    ),
    (
        'coupon',
        MergeFeature(coupon_features_df, 'Coupon_id', 'Coupon', factor_pipeline),
    ),
    (
        'features',
        FeatureUnion(transformer_list=[
            (
                'continuous',
                Pipeline(steps=[
                    ('extract', ColumnSelector(continous + fields)),
                    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                    ('scale', StandardScaler()),
                ]),
            ),
        ]),
    ),
    # Disabled alternative: PCA down to 128 components, or separate handling of
    # the numeric columns (Normalizer) and of `fields` (one-hot, densified).
#     ('pca', PCA(n_components=128)),
#     ('features', FeatureUnion([
#         ('continuous', Pipeline([
#             ('extract', ColumnSelector(continous)),
#             ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
#             ('scale', Normalizer())
#         ])),
#         ('fields', Pipeline([
#             ('extract', ColumnSelector(fields)),
#             ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
#             ('one_hot', OneHotEncoder(categories='auto')),
#             ('to_dense', DenseTransformer())
#         ])),
#     ])),
])

# Labels as flat 1-D arrays for the training and validation splits.
train_dataset_y = model_train_df[label].values.ravel()
valid_dataset_y = model_test_df[label].values.ravel()

# Fit the feature pipeline on the training rows only, then apply the fitted
# imputer/scaler to both splits.
features_pipeline.fit(model_train_df, train_dataset_y)
train_dataset_x = features_pipeline.transform(model_train_df)
valid_dataset_x = features_pipeline.transform(model_test_df)

In [None]:
# Single-step pipeline wrapping the XGBoost classifier so hyper-parameters can be
# addressed with the `xgb__<param>` naming used by set_params and GridSearchCV.
model_pipeline = Pipeline([
    ('xgb', xgb.sklearn.XGBClassifier())
])

# Configure and train on the transformed training split.
model_pipeline.set_params(
    # The XGBoost parameter is `learning_rate`; the previous spelling `learn_rate`
    # is not a recognised parameter and was ignored, leaving the default step size.
    xgb__learning_rate=0.01,
    xgb__max_depth=6,
    xgb__min_child_weight=1,
    xgb__subsample=0.7,
    xgb__colsample_bytree=0.7,
    xgb__colsample_bylevel=0.7,
    # Ranking objective: the model outputs are ranking scores rather than
    # calibrated probabilities.
    xgb__objective='rank:pairwise',
    xgb__n_estimators=10000,
    xgb__gamma=0.1,
    xgb__reg_alpha=1,
    xgb__reg_lambda=1,
    xgb__max_delta_step=0,
    xgb__scale_pos_weight=1,
    # NOTE(review): `silent` was replaced by `verbosity` in later XGBoost releases;
    # kept as-is for the version this notebook was written against - confirm.
    xgb__silent=True,
    xgb__eval_metric='auc'
).fit(train_dataset_x, train_dataset_y)

In [None]:
# Hyper-parameter grid for the `xgb` step (currently a single combination).
parameters = {
    # `learning_rate` is the XGBoost parameter name; `learn_rate` is not recognised
    # and would leave the default step size in effect.
    'xgb__learning_rate': [0.001, ],
    'xgb__max_depth': [6],
    'xgb__min_child_weight': [1],
    'xgb__subsample': [0.7,],
    'xgb__colsample_bytree': [0.7,],
    'xgb__colsample_bylevel': [0.7,],
    'xgb__objective': ['rank:pairwise'],
    'xgb__n_estimators': [2], # any value from 1 to 3 is acceptable
    'xgb__gamma': [0.1,],
    'xgb__reg_alpha': [1,],
    'xgb__reg_lambda': [1,],
    'xgb__max_delta_step': [0,],
    'xgb__scale_pos_weight': [1,],
    'xgb__silent': [True],
    'xgb__eval_metric': ['auc']
}

# Cross-validated search scored by ROC AUC on 4 workers.
# return_train_score=True is required for cv_results_['mean_train_score'] to exist;
# recent scikit-learn versions do not compute training scores by default.
cv = GridSearchCV(
    model_pipeline,
    parameters,
    scoring='roc_auc',
    n_jobs=4,
    return_train_score=True,
)
cv.fit(train_dataset_x, train_dataset_y)

In [None]:
# Mean ROC AUC on the training folds for each parameter combination in the grid.
# NOTE: this key is only present when the search computed training scores
# (GridSearchCV(..., return_train_score=True)); otherwise the lookup raises KeyError.
# cv.cv_results_['param_xgb__n_estimators'].data
cv.cv_results_['mean_train_score']

In [None]:
class Evaluator():
    """Score a data frame with a fitted model and compute the average per-coupon AUC.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows being scored. Must contain ``User_id`` and ``Coupon_id``; ``cal`` and
        ``describe`` also need ``Is_in_day_consume`` and ``predict`` needs
        ``Date_received``. Row order must match the dataset passed to
        ``cal`` / ``predict``.
    pipe : estimator
        Fitted model exposing ``predict_proba``.

    Notes
    -----
    Scores are stored in a ``Probability`` column of ``self.df``. The frame passed
    to the constructor is not modified: ``self.df`` is rebound to a new frame that
    carries the extra column, which also avoids pandas' SettingWithCopyWarning
    when ``df`` is a slice of another frame.
    """

    def __init__(self, df, pipe):
        self.df = df
        self.pipe = pipe
        
    def transfer_result(self, result):
        """Min-max scale a 1-D score array to [0, 1]; returns an (n, 1) array."""
        return MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(result.reshape(-1, 1))

    def _attach_probability(self, dataset):
        """Score ``dataset`` and store the rescaled result as ``self.df['Probability']``."""
        prob_raw = self.pipe.predict_proba(dataset)[:, 1]
        predict_prob = self.transfer_result(prob_raw)
        # Flatten the (n, 1) scaler output so the column is assigned from a 1-D array.
        self.df = self.df.assign(Probability=np.ravel(predict_prob))
    
    def cal(self, dataset):
        """Score ``dataset`` and return the average per-coupon AUC (see ``evaluate``)."""
        self._attach_probability(dataset)
        return self.evaluate(self.df[['Probability', 'Coupon_id', 'Is_in_day_consume']])
    
    def describe(self):
        """Summary statistics of ids, scores and labels; requires a prior ``cal``/``predict``."""
        return self.df[['User_id', 'Coupon_id', 'Probability', 'Is_in_day_consume']].describe()
    
    def predict(self, dataset):
        """Score ``dataset`` and return the id/date columns together with ``Probability``."""
        self._attach_probability(dataset)
        return self.df[['User_id', 'Coupon_id', 'Date_received', 'Probability']]
    
    def evaluate(self, result_df):
        """Average the ROC AUC computed separately for every coupon.

        Coupons whose rows do not contain exactly two distinct label values are
        skipped, since an ROC curve is undefined for them. Returns ``nan`` when no
        coupon qualifies.
        """
        group = result_df.groupby(['Coupon_id'])
        logging.info('coupon size is %d', len(group))

        aucs = []
        for _, coupon_df in group:
            if len(coupon_df['Is_in_day_consume'].unique()) != 2:
                continue

            fpr, tpr, _thresholds = roc_curve(
                coupon_df['Is_in_day_consume'], coupon_df['Probability'], pos_label=1)
            aucs.append(auc(fpr, tpr))

        logging.info('coupon in cal is %d', len(aucs))

        if not aucs:
            # Nothing to average: report it explicitly instead of relying on
            # numpy's empty-mean RuntimeWarning.
            logging.warning('no coupon has both label values; average AUC is undefined')
            return float('nan')

        return np.average(aucs)

In [None]:
# Score the hold-out split (Date_received >= 20160501) with the fitted model;
# cal() returns the average of the per-coupon AUC values.
# NOTE(review): assumes valid_dataset_x has one row per model_test_df row, in the
# same order - verify the merged feature tables have unique keys.
evaluator = Evaluator(model_test_df, model_pipeline)
evaluator.cal(valid_dataset_x)

In [None]:
# Summary statistics of ids, rescaled scores and labels on the hold-out split.
evaluator.describe()

## 预测

In [None]:
# Load the rows to be submitted, run them through the already-fitted feature
# pipeline, and score them with the trained model.
model_pred_df = pd.read_csv('../features/lcm_submit_features.csv')
predict_dataset_x = features_pipeline.transform(model_pred_df)
# Evaluator is reused here only for predict(), which returns
# User_id, Coupon_id, Date_received and the rescaled Probability.
predictor = Evaluator(model_pred_df, model_pipeline)
final_result_df = predictor.predict(predict_dataset_x)
final_result_df.shape

In [None]:
# Number of submission rows whose rescaled score exceeds 0.5.
int((final_result_df['Probability'] > 0.5).sum())

In [None]:
# Summary statistics of the submission frame (ids, date and rescaled score).
final_result_df.describe()

In [None]:
# Write the submission file without header or index.
# The target is resolved relative to the current user's home directory instead of
# a hard-coded /Users/<name> path, so the cell also runs on other machines.
final_result_df.to_csv(
    str(Path.home() / 'Desktop' / 'submission_20190129.csv'),
    index=False,
    header=False,
)