In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import math
import logging

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler, Normalizer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector

In [2]:
# Configure the root logger once for the whole notebook: DEBUG threshold and a
# timestamped "<time>  <file> : <level>  <message>" layout.
logging.basicConfig(
    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s',
    level=logging.DEBUG,
)
# Named logger used by later cells for progress messages.
logger = logging.getLogger('ai')

## 模型

In [3]:
# Feature table that the fitted pipeline is scored and evaluated on further down.
dataset_alpha = pd.read_csv(filepath_or_buffer='../features/dataset_alpha.csv')

In [4]:
# Feature table that the pipeline is fitted on further down (target column: 'Label').
dataset_beta = pd.read_csv(filepath_or_buffer='../features/dataset_beta.csv')

In [5]:
# Columns selected by the 'continuous' branch of the modelling pipeline.
# The order below is the feature order seen by the model.
# NOTE(review): the identifier is misspelled ("continous") but later cells
# reference it under this exact name, so it is kept.
continous = [
    'Distance', 'Base_consume', 'Discount_money', 'Next_duration',
    'Receive_coupon_category', 'Receive_coupon_category_rate',
    'Receive_coupon_id',
    'Avg_user_per_coupon_count', 'Avg_user_per_merchant_count',
    'Receive_merchant_id', 'Receive_base_consume',
    'Coupon_id_receive', 'Merchant_id_receive',
    'User_discount_mean',
    'Receive_coupon_type_rate', 'Receive_weekday_of_received_rate',
    'Collect_in_day_rate',
    'Avg_user_receive_coupon', 'Avg_user_receive_merchant',
    'H_Use_duration_mean_U', 'H_Use_UCC', 'H_Use_UC_rate',
    'H_Receive_M', 'H_Consume_M', 'H_Use_M', 'H_No_use_M', 'H_Use_occ_M',
    'H_Consume_duration_mean_M', 'H_Use_duration_mean_M',
    'H_Receive_C', 'H_Consume_C', 'H_Use_C',
    'H_O_Receive_U', 'H_Consume_duration_mean_UCC',
    'User_discount_max', 'User_discount_min',
]

# Target column name, wrapped in a list.
label = ['Label']

In [26]:
# Load preprocessed data set 1, drop exact duplicate rows and replace every
# missing value with 0. Chained calls are used instead of `inplace=True` so the
# cell builds its result in one expression and is safe to re-run.
# NOTE(review): absolute, user-specific path - this cell only runs on the
# original author's machine; consider a configurable data directory.
dataset_1 = (
    pd.read_csv('/Users/leewind/Projects/leewind/tianchi_O2O_predict/data_preprocessed_2/ProcessDataSet1.csv')
    .drop_duplicates()
    .fillna(0)
)

# Feature matrix: everything except the listed columns (which include `label`).
# `columns=` already names the axis, so the former extra `axis=1` was redundant.
dataset_1_x = dataset_1.drop(
    columns=['User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
             'Date', 'Coupon_id', 'label'])

In [23]:
# Load preprocessed data set 2, drop exact duplicate rows and replace every
# missing value with 0. Chained calls are used instead of `inplace=True` so the
# cell builds its result in one expression and is safe to re-run.
# NOTE(review): absolute, user-specific path - this cell only runs on the
# original author's machine; consider a configurable data directory.
dataset_2 = (
    pd.read_csv('/Users/leewind/Projects/leewind/tianchi_O2O_predict/data_preprocessed_2/ProcessDataSet2.csv')
    .drop_duplicates()
    .fillna(0)
)

# Feature matrix: everything except the listed columns (which include `label`).
# `columns=` already names the axis, so the former extra `axis=1` was redundant.
dataset_2_x = dataset_2.drop(
    columns=['User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
             'Date', 'Coupon_id', 'label'])

In [6]:
# NOTE(review): this cell REBINDS `continous` and `label`, replacing the
# capitalised column list / 'Label' defined in an earlier cell. The recorded
# output of the training cell still shows the earlier column names, so on a
# fresh "Restart & Run All" the pipeline would be built from this list instead.
# Confirm which feature set is intended before re-running.
#
# Order is significant (it is the model's feature order) and is unchanged.
continous = [
    'Distance', 'discount_rate', 'weekday', 'day',
    # u* columns ('discount_type' keeps its original slot between 'u37' and 'u41')
    'u2', 'u3', 'u19', 'u1', 'u4', 'u5', 'u25', 'u20',
    'u6', 'u7', 'u8', 'u9', 'u10', 'u11',
    'u21', 'u22', 'u23', 'u24', 'u45', 'u27', 'u28', 'u32', 'u47',
    'u33', 'u34', 'u35', 'u36', 'u37',
    'discount_type',
    'u41', 'u42', 'u43', 'u44', 'u48', 'u49',
    # m* columns
    'm0', 'm1', 'm2', 'm3', 'm4', 'm7', 'm5', 'm6', 'm8', 'm9',
    'm10', 'm11', 'm12', 'm13', 'm14', 'm15',
    'm18', 'm19', 'm20', 'm21', 'm22', 'm23',
    # c* columns
    'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c8', 'c9', 'c10', 'c11', 'c12',
    # um* columns
    'um1', 'um2', 'um3', 'um4', 'um5', 'um6',
    'um7', 'um8', 'um9', 'um10', 'um11', 'um12',
    # o* columns
    'o1', 'o2', 'o17', 'o18',
    'o3', 'o4', 'o5', 'o6', 'o7', 'o8', 'o9',
    'o10', 'o11', 'o12', 'o13', 'o14', 'o15', 'o16',
    # on_u* columns
    'on_u1', 'on_u2', 'on_u3', 'on_u4', 'on_u5', 'on_u6', 'on_u7',
    'on_u8', 'on_u9', 'on_u10', 'on_u11', 'on_u12', 'on_u13',
]

# No extra columns are configured; the prediction cell concatenates this list
# with `continous`.
fields = []

# Target column name, wrapped in a list (lower-case in this feature set).
label = ['label']

In [6]:
class MergeFeature(TransformerMixin):
    """Pipeline step that left-joins a side table onto the incoming frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Side table to merge in; must contain the ``key`` column.
    key : column label
        Join column shared by ``df`` and the frame passed to ``transform``.
    prefix : str
        Prefix for the two factor column names built by ``get_factor``.
    pipe : object with ``fit`` / ``transform``
        Reducer used by ``get_factor``. It must emit exactly two columns,
        because exactly two column names are assigned to its output.
    """

    def __init__(self, df, key, prefix, pipe):
        self.df = df
        self.key = key
        self.prefix = prefix
        self.pipe = pipe

    def get_factor(self, df, key, prefix):
        """Compress every non-key column of ``df`` into two factor columns.

        ``self.pipe`` is fitted on ``df`` without ``key`` and then applied to
        the same data. The result has a fresh 0..n-1 index, the columns
        ``<prefix>_factor_alpha`` and ``<prefix>_factor_beta``, and the ``key``
        column copied over row by row.

        Raises
        ------
        ValueError
            Raised by pandas if ``self.pipe`` does not return two columns.
        """
        output_df = df.drop(columns=[key])

        self.pipe.fit(output_df)
        factors = self.pipe.transform(output_df)
        factors_df = pd.DataFrame(
            data=factors,
            columns=[prefix + '_factor_alpha', prefix + '_factor_beta'],
        )
        # Copy the keys by POSITION. Assigning the Series itself would align on
        # index labels, so a `df` whose index is not 0..n-1 (for example after
        # drop_duplicates or filtering) would get NaN or shifted keys.
        factors_df[key] = df[key].values
        return factors_df

    def fit(self, *args, **kwargs):
        """Nothing to learn; present so the object can sit in a Pipeline."""
        return self

    def transform(self, X, **transform_params):
        """Left-join ``self.df`` onto ``X`` using ``self.key``.

        The raw side table is merged as-is; ``get_factor`` is not applied
        here. To merge the two-factor summary instead, merge
        ``self.get_factor(self.df, self.key, self.prefix)`` on ``self.key``.
        """
        return pd.merge(X, self.df, on=self.key, how='left')

In [7]:
class GBDTTransformer(TransformerMixin):
    """Supervised transformer that replaces each sample by its GBDT leaf indices.

    A ``GradientBoostingClassifier`` is fitted on ``(X, y)``; ``transform``
    then reports, for every sample, which leaf it reaches in each boosted tree.

    Parameters
    ----------
    n_estimator : int, default=256
        Number of boosting stages (trees); also the width of the output.
    max_depth : int, default=3
        Maximum depth of each tree.
    random_state : int or None, default=0
        Seed passed to the classifier.

    The defaults reproduce the previously hard-coded configuration, so
    ``GBDTTransformer()`` behaves exactly as before.
    """

    def __init__(self, n_estimator=256, max_depth=3, random_state=0):
        self.n_estimator = n_estimator
        self.max_depth = max_depth
        self.random_state = random_state
        self.model = GradientBoostingClassifier(
            max_depth=self.max_depth,
            n_estimators=self.n_estimator,
            random_state=self.random_state,
        )

    def fit(self, *args, **kwargs):
        """Fit the wrapped classifier; arguments are forwarded unchanged."""
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        """Return the leaf index reached in every tree for each row of ``X``.

        ``apply`` yields a 3-D array; slot 0 of the last axis is kept, giving a
        2-D ``(n_samples, n_estimators)`` result. Per the scikit-learn docs the
        last axis is the class axis and has size 1 for binary problems
        (assumes a binary target - TODO confirm for other uses).
        """
        return self.model.apply(X)[:, :, 0]
    
class ExtractFeature(TransformerMixin):
    """Stateless transformer producing the product of the first two columns."""

    def fit(self, *args, **kwargs):
        """No fitting required; returns the instance for chaining."""
        return self

    def transform(self, X, **transform_params):
        """Multiply column 0 by column 1 element-wise.

        ``X`` is indexed as ``X[:, i]``, i.e. it must support NumPy-style 2-D
        slicing. The result is wrapped in a single-column DataFrame.
        """
        first_col = X[:, 0]
        second_col = X[:, 1]
        return pd.DataFrame(first_col * second_col)

In [8]:
# Two-component PCA followed by min-max scaling. It is not part of `pipe_lr`;
# its two output columns match the two factor names MergeFeature.get_factor assigns.
factor_pipeline = Pipeline(steps=[
    ('pca', PCA(n_components=2)),
    ('scale', MinMaxScaler()),
])

# Numeric branch: select the `continous` columns, fill NaNs with each column's
# most frequent value, then apply Normalizer with its default settings.
continuous_branch = Pipeline(steps=[
    ('extract', ColumnSelector(continous)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('normalize', Normalizer()),
])

# Experiments that are currently switched off (kept here for reference):
#   * ('scale', MinMaxScaler(copy=True, feature_range=(0, 1))) between the
#     imputer and the normalizer of the numeric branch
#   * a second FeatureUnion branch 'fields':
#       ColumnSelector(fields) -> SimpleImputer(most_frequent)
#       -> OneHotEncoder(categories='auto') -> DenseTransformer()
#   * ('skb', SelectKBest(chi2)) with skb__k=32 right after 'features'

# Full model: features -> GBDT leaf indices -> PCA -> logistic regression.
pipe_lr = Pipeline(steps=[
    ('features', FeatureUnion(transformer_list=[
        ('continuous', continuous_branch),
    ])),
    ('gbdt', GBDTTransformer()),
    ('pca', PCA()),
    ('clf', LogisticRegression(C=0.1, random_state=2, solver='lbfgs', class_weight='balanced', multi_class='multinomial', max_iter=5000, n_jobs=4))
])

logger.info('Start training')
# The PCA width is set after construction through the step-name prefix.
pipe_lr.set_params(pca__n_components=8)
train_target = dataset_beta['Label'].values.ravel()
# The whole frame is passed in; ColumnSelector picks the model inputs.
# Kept as the last expression so the fitted pipeline is echoed as cell output.
pipe_lr.fit(dataset_beta, train_target)

2019-02-05 11:05:21,499  <ipython-input-8-098a8269e384> : INFO  Start training


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('continuous', Pipeline(memory=None,
     steps=[('extract', ColumnSelector(cols=['Distance', 'Base_consume', 'Discount_money', 'Next_duration', 'Receive_coupon_category', 'Receive_coupon_category_rate', 'Receive_coupon_id', 'Avg...'l2',
          random_state=2, solver='lbfgs', tol=0.0001, verbose=0,
          warm_start=False))])

In [9]:
from sklearn.metrics import roc_curve, auc
import numpy as np
import math

def evaluate(result_df, group_col='Coupon_id', label_col='Label', prob_col='Prob'):
    """Return the mean per-group ROC AUC of predicted scores.

    Rows are grouped by ``group_col``. For every group whose ``label_col``
    holds exactly two distinct values, the ROC AUC of ``prob_col`` against
    ``label_col`` (positive class ``1``) is computed with ``roc_curve`` and
    ``auc`` (imported from ``sklearn.metrics`` in this cell). Groups with any
    other number of distinct labels are skipped because their AUC is undefined.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Frame containing the three columns named below.
    group_col : column label, default 'Coupon_id'
        Column to group by.
    label_col : column label, default 'Label'
        Ground-truth column.
    prob_col : column label, default 'Prob'
        Predicted score column.

    Returns
    -------
    float
        Unweighted mean of the per-group AUCs, or NaN when no group could be
        scored (returned explicitly instead of averaging an empty list, which
        emits a RuntimeWarning).
    """
    aucs = []
    for _, group_df in result_df.groupby(group_col):
        # dropna=False counts NaN as a value, matching len(Series.unique()).
        if group_df[label_col].nunique(dropna=False) != 2:
            continue

        fpr, tpr, _ = roc_curve(group_df[label_col], group_df[prob_col], pos_label=1)
        aucs.append(auc(fpr, tpr))

    if not aucs:
        return float('nan')
    return np.average(aucs)

In [10]:
# Score dataset_alpha with the fitted pipeline; the whole frame is passed and
# the pipeline's ColumnSelector picks the model inputs.
predict_test_prob_y = pipe_lr.predict_proba(dataset_alpha)
# Column 1 is stored as the score used by evaluate()
# (presumably P(Label == 1) for 0/1 labels - verify against pipe_lr.classes_).
positive_class_prob = predict_test_prob_y[:, 1]
dataset_alpha['Prob'] = positive_class_prob

In [11]:
# Mean per-coupon AUC of the 'Prob' scores just written into dataset_alpha.
evaluate(result_df=dataset_alpha)

0.6551828585411936

In [None]:
# Score the submission feature file with the fitted pipeline.
model_pred_df = pd.read_csv('lcm_test_features.csv')
# Restrict the frame to the configured columns before handing it to the pipeline.
feature_columns = fields + continous
predict_prob_y = pipe_lr.predict_proba(model_pred_df[feature_columns])
model_pred_df['Probability'] = predict_prob_y[:, 1]
# Preview the five highest-scoring rows.
model_pred_df.sort_values(by=['Probability'], ascending=False).head()

In [None]:
# Keep only the columns written to the submission file, in this order.
submission_columns = ['User_id', 'Coupon_id', 'Date_received', 'Probability']
final_result_df = model_pred_df[submission_columns]
# Written without index and without a header row.
# NOTE(review): absolute, user-specific output path - adjust before running elsewhere.
final_result_df.to_csv('/Users/leewind/Desktop/submission_20190118.csv', index=False, header=False)
final_result_df.shape