In [1]:
import pandas as pd
import numpy as np
import math
import logging
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [2]:
logger = logging.getLogger('ai')
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s')

## 模型

### 数据预处理

In [27]:
dataset_1 = pd.read_csv('/Users/leewind/Projects/leewind/tianchi_O2O_predict/data_preprocessed_2/ProcessDataSet1.csv')
dataset_1.drop_duplicates(inplace=True)
dataset_1.fillna(0, inplace=True)
dataset_1_x = dataset_1.drop(
        columns=['User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
                 'Date', 'Coupon_id', 'label'], axis=1)

In [28]:
dataset_2 = pd.read_csv('/Users/leewind/Projects/leewind/tianchi_O2O_predict/data_preprocessed_2/ProcessDataSet2.csv')
dataset_2.drop_duplicates(inplace=True)
dataset_2.fillna(0, inplace=True)
dataset_2_x = dataset_2.drop(
        columns=['User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
                 'Date', 'Coupon_id', 'label'], axis=1)

In [29]:
dataset_2_x.columns.values

array(['Distance', 'discount_rate', 'weekday', 'day', 'u2', 'u3', 'u19',
       'u1', 'u4', 'u5', 'u25', 'u20', 'u6', 'u7', 'u8', 'u9', 'u10',
       'u11', 'u21', 'u22', 'u23', 'u24', 'u45', 'u27', 'u28', 'u32',
       'u47', 'u33', 'u34', 'u35', 'u36', 'u37', 'discount_type', 'u41',
       'u42', 'u43', 'u44', 'u48', 'u49', 'm0', 'm1', 'm2', 'm3', 'm4',
       'm7', 'm5', 'm6', 'm8', 'm9', 'm10', 'm11', 'm12', 'm13', 'm14',
       'm15', 'm18', 'm19', 'm20', 'm21', 'm22', 'm23', 'c1', 'c2', 'c3',
       'c4', 'c5', 'c6', 'c8', 'c9', 'c10', 'c11', 'c12', 'um1', 'um2',
       'um3', 'um4', 'um5', 'um6', 'um7', 'um8', 'um9', 'um10', 'um11',
       'um12', 'o1', 'o2', 'o17', 'o18', 'o3', 'o4', 'o5', 'o6', 'o7',
       'o8', 'o9', 'o10', 'o11', 'o12', 'o13', 'o14', 'o15', 'o16',
       'on_u1', 'on_u2', 'on_u3', 'on_u4', 'on_u5', 'on_u6', 'on_u7',
       'on_u8', 'on_u9', 'on_u10', 'on_u11', 'on_u12', 'on_u13'],
      dtype=object)

In [30]:
continous = [
    'discount_rate', 'weekday', 'day', 'u2', 'u3', 'u19',
       'u1', 'u4', 'u5', 'u25', 'u20', 'u6', 'u7', 'u8', 'u9', 'u10',
       'u11', 'u21', 'u22', 'u23', 'u24', 'u45', 'u27', 'u28', 'u32',
       'u47', 'u33', 'u34', 'u35', 'u36', 'u37', 'discount_type', 'u41',
       'u42', 'u43', 'u44', 'u48', 'u49', 'm0', 'm1', 'm2', 'm3', 'm4',
       'm7', 'm5', 'm6', 'm8', 'm9', 'm10', 'm11', 'm12', 'm13', 'm14',
       'm15', 'm18', 'm19', 'm20', 'm21', 'm22', 'm23', 'c1', 'c2', 'c3',
       'c4', 'c5', 'c6', 'c8', 'c9', 'c10', 'c11', 'c12', 'um1', 'um2',
       'um3', 'um4', 'um5', 'um6', 'um7', 'um8', 'um9', 'um10', 'um11',
       'um12', 'o1', 'o2', 'o17', 'o18', 'o3', 'o4', 'o5', 'o6', 'o7',
       'o8', 'o9', 'o10', 'o11', 'o12', 'o13', 'o14', 'o15', 'o16',
       'on_u1', 'on_u2', 'on_u3', 'on_u4', 'on_u5', 'on_u6', 'on_u7',
       'on_u8', 'on_u9', 'on_u10', 'on_u11', 'on_u12', 'on_u13'
]

fields = [
    'Distance', 
]

label = ['label']

In [31]:
class MergeFeature(TransformerMixin):
    def __init__(self, df, key, prefix, pipe):
        self.df = df
        self.key = key
        self.prefix = prefix
        self.pipe = pipe
    
    def get_factor(self, df, key, prefix):
        id_df = df[[key]]
        output_df = df.drop([key], axis=1)

        self.pipe.fit(output_df)
        factors = self.pipe.transform(output_df)
        factors_df = pd.DataFrame(data=factors, columns=[prefix + '_factor_alpha', prefix + '_factor_beta'])
        factors_df[key] = id_df[key]
        return factors_df
    
    def fit(self, *args, **kwargs):
        return self
    
    def transform(self, X, **transform_params):
        return pd.merge(X, self.df, on=self.key, how='left')
#         return pd.merge(X, self.get_factor(self.df, self.key, self.prefix), on=[self.key], how='left')

In [32]:
factor_pipeline = Pipeline([
    ('pca', PCA(n_components=2)),
    ('scale', MinMaxScaler()),
])

features_pipeline = Pipeline([
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnSelector(continous)),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
            ('scale', MinMaxScaler()),
            ('normalize', Normalizer())
        ])),
        ('fields', Pipeline([
            ('extract', ColumnSelector(fields)),
            ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
            ('one_hot', OneHotEncoder(categories='auto')),
            ('to_dense', DenseTransformer())
        ])),
    ])),
])

features_pipeline.fit(dataset_1_x, dataset_1.label.values.ravel())

train_dataset_x = features_pipeline.transform(dataset_1_x)
train_dataset_y = dataset_1.label.values.ravel()

valid_dataset_x = features_pipeline.transform(dataset_2_x)
valid_dataset_y = dataset_2.label.values.ravel()

ValueError: Found unknown categories [31] in column 1 during transform

### 模型训练

#### 调参数

In [None]:
parameters = {
    'xgb__learn_rate': [0.001, ],
    'xgb__max_depth': [6],
    'xgb__min_child_weight': [1],
    'xgb__subsample': [0.7,],
    'xgb__colsample_bytree': [0.7,],
    'xgb__colsample_bylevel': [0.7,],
    'xgb__objective': ['rank:pairwise'],
    'xgb__n_estimators': [2], # 使用1-3都可以被接受
    'xgb__gamma': [0.1,],
    'xgb__reg_alpha': [1,],
    'xgb__reg_lambda': [1,],
    'xgb__max_delta_step': [0,],
    'xgb__scale_pos_weight': [1,],
    'xgb__silent': [True],
    'xgb__eval_metric': ['auc']
}

cv = GridSearchCV(model_pipeline, parameters, scoring = 'roc_auc', n_jobs= 4)
cv.fit(train_dataset_x, train_dataset_y)

In [None]:
# 查看每组评估的具体数据
cv.cv_results_['param_xgb__n_estimators'].data

# 结果训练
cv.cv_results_['mean_train_score']

#### 最优参数训练

In [None]:
model_pipeline = Pipeline([
    ('xgb', xgb.sklearn.XGBClassifier())
])

model_pipeline.set_params(
    xgb__learn_rate=0.01,
    xgb__max_depth=12,
    xgb__min_child_weight=1.1,
    xgb__subsample=0.7,
    xgb__colsample_bytree=0.7,
    xgb__colsample_bylevel=0.7,
#     xgb__objective='rank:pairwise',
    xgb__objective='binary:logistic',
    xgb__n_estimators=100,
    xgb__gamma=0.1,
    xgb__reg_alpha=1,
    xgb__reg_lambda=1,
    xgb__max_delta_step=0,
    xgb__scale_pos_weight=1,
    xgb__silent=True,
    xgb__eval_metric='auc'
).fit(train_dataset_x, train_dataset_y)

In [10]:
class Evaluator():
    def __init__(self, df, pipe):
        self.df = df
        self.pipe = pipe
        
    def transfer_result(self, result):
        return MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(result.reshape(-1, 1))
    
    def cal(self, dataset):
        pred = self.pipe.predict_proba(dataset)[:,1]
        logging.info(pred)
        
        self.df['Probability'] = self.transfer_result(pred)
        return self.evaluate(self.df[['Probability', 'Coupon_id', 'label']])
    
    def describe(self):
        return self.df[['User_id', 'Coupon_id', 'Probability', 'label']].describe()
    
    def predict(self, dataset):
        pred = self.pipe.predict_proba(dataset)[:,1]
        logging.info(pred)
        
        self.df['Probability'] = self.transfer_result(pred)
        return self.df[['User_id', 'Coupon_id', 'Date_received', 'Probability']]
    
    def evaluate(self, result_df):
        group = result_df.groupby(['Coupon_id'])
        aucs = []
        logging.info('coupon size is %d' % (len(group)))

        counter = 0
        for i in group:
            tmpdf = i[1]        
            if len(tmpdf['label'].unique()) != 2:
                continue

            fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['Probability'], pos_label=1)
            auc_score = auc(fpr,tpr)
            aucs.append(auc_score)
            counter = counter + 1

        logging.info('coupon in cal is %d' % (counter))

        return np.average(aucs)

In [11]:
evaluator = Evaluator(dataset_2, model_pipeline)
evaluator.cal(valid_dataset_x)

2019-02-02 14:16:13,506  <ipython-input-10-dd5979b0d9de> : INFO  [0.01924821 0.01720051 0.14385155 ... 0.0140827  0.00565731 0.01052577]
2019-02-02 14:16:13,745  <ipython-input-10-dd5979b0d9de> : INFO  coupon size is 6192
2019-02-02 14:16:17,373  <ipython-input-10-dd5979b0d9de> : INFO  coupon in cal is 3229


0.7500371690557894

## 预测

In [14]:
model_pred_df = pd.read_csv('../features/lcm_submit_features.csv')

predict_dataset_x = features_pipeline.transform(model_pred_df)
predictor = Evaluator(model_pred_df, model_pipeline)
final_result_df = predictor.predict(predict_dataset_x)
final_result_df.shape

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
2019-02-02 12:42:23,332  <ipython-input-12-3fa321e3310d> : INFO  [0.04794511 0.02325486 0.0034588  ... 0.00014111 0.03200784 0.04099259]


(113640, 4)

In [15]:
final_result_df.describe()

Unnamed: 0,User_id,Coupon_id,Date_received,Probability
count,113640.0,113640.0,113640.0,113640.0
mean,3684858.0,9053.810929,20160720.0,0.045039
std,2126259.0,4145.873088,9.019508,0.083413
min,209.0,3.0,20160700.0,0.0
25%,1844191.0,5023.0,20160710.0,0.017546
50%,3683266.0,9983.0,20160720.0,0.027178
75%,5525845.0,13602.0,20160720.0,0.043449
max,7361024.0,14045.0,20160730.0,1.0


In [16]:
final_result_df.head(20)

Unnamed: 0,User_id,Coupon_id,Date_received,Probability
0,4129537,9983.0,20160712.0,0.048136
1,6949378,3429.0,20160706.0,0.023293
2,2166529,6928.0,20160727.0,0.003373
3,2166529,1808.0,20160727.0,0.011802
4,6172162,6500.0,20160708.0,0.023979
5,4005121,9983.0,20160706.0,0.021361
6,4347394,9983.0,20160716.0,0.018635
7,3094273,13602.0,20160727.0,0.035659
8,5139970,9983.0,20160729.0,0.096365
9,3237121,13602.0,20160703.0,0.022057


In [27]:
final_result_df.to_csv('/Users/leewind/Desktop/submission_20190201_1.csv', index=False, header=False)