In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import math
import logging

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler, Normalizer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector

In [2]:
# Set up root logging (DEBUG and above; timestamp, source file, level, message),
# then fetch the named logger that the later cells write to.
logging.basicConfig(
    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger('ai')

## 模型

In [3]:
# Frame that the fitted pipeline scores (predict_proba) and that is then passed
# to evaluate(); path is relative to the notebook's working directory.
dataset_alpha = pd.read_csv(filepath_or_buffer='../features/dataset_alpha.csv')

In [4]:
# Frame the pipeline is fitted on; its 'Label' column is used as the target.
# Path is relative to the notebook's working directory.
dataset_beta = pd.read_csv(filepath_or_buffer='../features/dataset_beta.csv')

In [5]:
# Names of the dataset columns selected (via ColumnSelector) by the pipeline's
# 'continuous' branch before imputation and normalisation. The list order is the
# column order handed to the downstream steps, so do not re-sort it casually.
# NOTE: the identifier is spelled `continous` (sic); the pipeline cell refers to
# it by this exact name, so the spelling is kept.
# NOTE(review): 'Coupon_id', 'Coupon_type' and 'Coupon_category' look like
# identifiers/categories rather than continuous measurements — confirm that
# they are meant to be treated as numeric features.
continous = [
    'Coupon_id', 'Distance',
    'Month_of_received', 'Day_of_received',
    'Weekday_of_received', 'Base_consume', 'Discount',
    'Discount_money', 'Coupon_type', 'Coupon_category',
    'Previous_duration', 'Next_duration', 'o1',
    'o2', 'o3', 'o4', 'o5', 'o6', 'o8', 'o7', 'o9', 'o10', 'o12',
    'o14', 'o11', 'o13', 'o16', 'o15', 'o18', 'o19', 'o20', 'o21',
    'o22', 'o23', 'o17', 'o24', 'o25', 'o26', 'o27', 'o28', 'o29',
    'o30', 'o38', 'o31', 'o39', 'o40', 'o41', 'o42', 'o43', 'o32',
    'o33', 'o34', 'o35', 'o36', 'o37', 'o44', 'u0', 'u1', 'u2', 'u3',
    'u4', 'u5', 'u6', 'u7', 'u8', 'u9', 'u10', 'u11', 'u12', 'u13',
    'u14', 'u15', 'u16', 'u17', 'u18', 'u19', 'u20', 'u21', 'u22',
    'u23', 'u24', 'u25', 'ucc0', 'ucc1', 'ucc2', 'ucc3', 'ucc4',
    'ucc5', 'ucc6', 'ucc7', 'ucc8', 'ucc9', 'ucc10', 'ucc11', 'ucc12',
    'uc1', 'uc2', 'uc3', 'uc4', 'uc5', 'uc6', 'uc7', 'uc8', 'uc9',
    'uc10', 'uc11', 'uc12', 'ud0', 'ud1', 'ud2', 'ud3', 'ud4', 'ud5',
    'ud6', 'ud7', 'ud8', 'ud9', 'ud10', 'ud11', 'ud12', 'um0', 'um1',
    'um2', 'um3', 'um4', 'um5', 'um6', 'um7', 'um8', 'um9', 'um10',
    'um16', 'um15', 'um17', 'um11', 'um12', 'um13', 'um14', 'm0', 'm1',
    'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11',
    'm12', 'm13', 'm14', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7',
    'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'cd1', 'cd2', 'cd3',
    'cd4', 'cd5', 'cd6', 'cd7', 'dr1', 'dr2', 'dr3', 'dr4', 'dr5',
    'dr6', 'dr7', 'ou1', 'ou2', 'ou3', 'ou4']

# Name of the target column, wrapped in a list.
# NOTE(review): not referenced by the visible cells, which index 'Label' directly.
label = ['Label']

In [6]:
class GBDTTransformer(TransformerMixin, BaseEstimator):
    """Map samples to the leaf indices of a gradient-boosted classifier.

    ``fit`` trains an internal ``GradientBoostingClassifier``; ``transform``
    returns, for every sample, the index of the leaf it reaches in each
    boosting stage (one column per stage).

    Deriving from ``BaseEstimator`` provides ``get_params``/``set_params`` so the
    step can be cloned and tuned (e.g. ``gbdt__n_estimators`` in a grid search).

    Parameters
    ----------
    n_estimators : int, default=256
        Number of boosting stages of the internal classifier.
    max_depth : int, default=3
        Maximum depth of each tree.
    random_state : int or None, default=0
        Seed forwarded to the internal classifier.

    Attributes
    ----------
    model : GradientBoostingClassifier
        The wrapped classifier (created in ``__init__``, trained in ``fit``).
    """

    def __init__(self, n_estimators=256, max_depth=3, random_state=0):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.model = GradientBoostingClassifier(
            max_depth=max_depth,
            n_estimators=n_estimators,
            random_state=random_state,
        )

    @property
    def n_estimator(self):
        """Backward-compatible alias for ``n_estimators`` (the original attribute name)."""
        return self.n_estimators

    @n_estimator.setter
    def n_estimator(self, value):
        self.n_estimators = value

    def fit(self, *args, **kwargs):
        """Fit the wrapped classifier; all arguments are forwarded to ``model.fit``.

        Returns
        -------
        self
        """
        # Push the current hyper-parameters into the wrapped model so that values
        # changed through set_params() after construction actually take effect.
        self.model.set_params(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            random_state=self.random_state,
        )
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        """Return the leaf index reached by each sample in each boosting stage.

        ``transform_params`` is accepted for interface compatibility and ignored.
        """
        # apply() yields a 3-D array; only index 0 of the last axis is kept.
        # NOTE(review): per the scikit-learn docs the last axis has length 1 for
        # binary targets, so nothing is lost there; with more classes this keeps
        # only the first class's trees — confirm the target is binary.
        return self.model.apply(X)[:, :, 0]
    
class ExtractFeature(TransformerMixin, BaseEstimator):
    """Stateless transformer producing the element-wise product of the first two columns.

    Deriving from ``BaseEstimator`` provides ``get_params``/``set_params`` so the
    step can be cloned inside pipelines and parameter searches.
    """

    def fit(self, *args, **kwargs):
        """Nothing is learned; arguments are ignored and ``self`` is returned."""
        return self

    def transform(self, X, **transform_params):
        """Return a one-column DataFrame holding ``X[:, 0] * X[:, 1]``.

        ``X`` may be any 2-D array-like with at least two columns (ndarray,
        DataFrame, nested list). ``transform_params`` is ignored.
        """
        # Coerce to ndarray first: positional `[:, i]` indexing is not valid on a
        # pandas DataFrame, which made the original fail for DataFrame input.
        values = np.asarray(X)
        return pd.DataFrame(values[:, 0] * values[:, 1])

In [11]:
# Feature branch: select the `continous` columns by name, fill missing values
# with each column's most frequent value, then apply Normalizer().
# (A MinMaxScaler step and a one-hot encoded `fields` branch are not part of
# this pipeline; they were only ever present as commented-out code.)
continuous_branch = Pipeline([
    ('extract', ColumnSelector(continous)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('normalize', Normalizer()),
])

# Final estimator of the pipeline.
final_classifier = LogisticRegression(
    C=0.1,
    random_state=2,
    solver='lbfgs',
    class_weight='balanced',
    multi_class='multinomial',
    max_iter=5000,
    n_jobs=4,
)

# Full model: features -> chi2 top-80 selection -> GBDT leaf indices
# -> 8-component PCA -> logistic regression.
pipe_lr = Pipeline([
    ('features', FeatureUnion([
        ('continuous', continuous_branch),
    ])),
    ('skb', SelectKBest(chi2, k=80)),
    ('gbdt', GBDTTransformer()),
    ('pca', PCA(n_components=8)),
    ('clf', final_classifier),
])

logger.info('Start training')
# The whole frame is passed as X; ColumnSelector keeps only the listed columns.
# Left as the cell's last expression so the fitted pipeline is displayed.
pipe_lr.fit(dataset_beta, dataset_beta['Label'].values.ravel())

2019-02-09 16:20:50,724  <ipython-input-11-964b07fd7bec> : INFO  Start training


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('continuous', Pipeline(memory=None,
     steps=[('extract', ColumnSelector(cols=['Coupon_id', 'Distance', 'Month_of_received', 'Day_of_received', 'Weekday_of_received', 'Base_consume', 'Discount', 'Discount_money', 'Coupon_type'...'l2',
          random_state=2, solver='lbfgs', tol=0.0001, verbose=0,
          warm_start=False))])

In [12]:
from sklearn.metrics import roc_curve, auc
import numpy as np
import math

def evaluate(result_df):
    """Return the mean per-coupon ROC AUC of the predictions in ``result_df``.

    Rows are grouped by ``Coupon_id``. For every group whose ``Label`` column
    holds exactly two distinct values, the ROC AUC of ``Prob`` against ``Label``
    is computed with label ``1`` as the positive class; all other groups are
    skipped because an AUC is undefined without both outcomes.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Must contain the columns ``Coupon_id``, ``Label`` and ``Prob``.

    Returns
    -------
    float
        Unweighted mean of the per-group AUCs, or NaN when no group qualifies.
    """
    aucs = []
    # Group by a scalar key: the group key itself is not needed, and this avoids
    # the length-1-list key whose iteration behaviour changed across pandas versions.
    for _, coupon_df in result_df.groupby('Coupon_id'):
        if len(coupon_df['Label'].unique()) != 2:
            continue

        fpr, tpr, _ = roc_curve(coupon_df['Label'], coupon_df['Prob'], pos_label=1)
        aucs.append(auc(fpr, tpr))

    if not aucs:
        # Explicit NaN instead of averaging an empty list, which yields the same
        # value but only via a cryptic "Mean of empty slice" RuntimeWarning.
        return np.float64('nan')

    return np.average(aucs)

In [13]:
# Score every row of dataset_alpha with the fitted pipeline and store the
# probability from column 1 of predict_proba as a new 'Prob' column.
# NOTE(review): column 1 corresponds to the second entry of the classifier's
# classes_; this assumes that entry is label 1 (the pos_label used by
# evaluate) — confirm the training labels are exactly {0, 1}.
predict_test_prob_y = pipe_lr.predict_proba(dataset_alpha)
second_class_proba = predict_test_prob_y[:, 1]
dataset_alpha['Prob'] = second_class_proba

In [14]:
# Mean per-coupon AUC of the 'Prob' column added to dataset_alpha above;
# left as the last expression so the score is displayed as the cell output.
evaluate(result_df=dataset_alpha)

0.6447818061175449