In [47]:
import math
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

from mlxtend.feature_selection import ColumnSelector
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Imputer 


In [2]:
# Load the offline stage-1 training data; the path is relative to the notebook's directory.
offline_df = pd.read_csv('../source/ccf_offline_stage1_train.csv')

In [35]:
# Show the column names of the offline frame.
offline_df.columns.values

array(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date'], dtype=object)

In [3]:
# Load the online stage-1 training data; the path is relative to the notebook's directory.
online_df = pd.read_csv('../source/ccf_online_stage1_train.csv')

In [62]:
class ContentExtractor(TransformerMixin):
    """Stateless transformer that derives one feature from a single column.

    Parameters
    ----------
    col : column label looked up on the input frame in ``transform``.
    fn : callable applied to each value of that column.
    """

    def __init__(self, col, fn):
        self.fn = fn
        self.col = col

    def fit(self, *args, **kwargs):
        """Nothing is learned; all arguments are ignored and ``self`` is returned."""
        return self

    def transform(self, X, **transform_params):
        """Apply ``fn`` element-wise to ``X[col]`` and wrap the result in a DataFrame."""
        def _call_fn(value):
            return self.fn(value)

        derived = X[self.col].apply(_call_fn)
        return pd.DataFrame(derived)

In [71]:
class ContentExtractorByRow(TransformerMixin):
    """Stateless transformer that derives one feature from each whole row.

    Parameters
    ----------
    fn : callable that receives one row of the input frame and returns a value.
    """

    def __init__(self, fn):
        self.fn = fn

    def fit(self, *args, **kwargs):
        """Nothing is learned; all arguments are ignored and ``self`` is returned."""
        return self

    def transform(self, X, **transform_params):
        """Apply ``fn`` to every row of ``X`` (``axis=1``) and wrap the result in a DataFrame."""
        def _call_fn(record):
            return self.fn(record)

        derived = X.apply(_call_fn, axis=1)
        return pd.DataFrame(derived)

In [239]:
class NewDataFrame(TransformerMixin):
    """Wrap the incoming data in a DataFrame with named, optionally re-typed columns.

    Parameters
    ----------
    cols : column labels passed to the ``pd.DataFrame`` constructor.
    dtypes : iterable of ``(column, dtype)`` entries; each listed column is
        cast with ``astype`` after the frame is built.
    """

    def __init__(self, cols, dtypes):
        self.cols, self.dtypes = cols, dtypes

    def fit(self, *args, **kwargs):
        """Nothing is learned; all arguments are ignored and ``self`` is returned."""
        return self

    def transform(self, X, **transform_params):
        """Build the labelled frame from ``X`` and apply the requested casts."""
        frame = pd.DataFrame(data=X, columns=self.cols)
        for spec in self.dtypes:
            column, dtype = spec[0], spec[1]
            frame[column] = frame[column].astype(dtype)

        return frame

In [240]:
class GBDTTransformer(TransformerMixin):
    """Encode samples by the leaf each tree of a gradient-boosting model sends them to.

    Parameters
    ----------
    n_estimators : int, default 256
        Number of boosting stages of the wrapped ``GradientBoostingClassifier``.
    max_depth : int, default 3
        Maximum depth of each tree.
    random_state : int, default 0
        Seed handed to the classifier.

    The defaults reproduce the previously hard-coded configuration, so
    ``GBDTTransformer()`` behaves as before.
    """

    def __init__(self, n_estimators=256, max_depth=3, random_state=0):
        # The attribute keeps its original (singular) name for existing readers.
        self.n_estimator = n_estimators
        self.model = GradientBoostingClassifier(
            max_depth=max_depth,
            n_estimators=self.n_estimator,
            random_state=random_state,
        )

    def fit(self, *args, **kwargs):
        """Fit the wrapped classifier with the given arguments and return ``self``."""
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        """Return the leaf indices produced by ``model.apply(X)``.

        Only index 0 of the last axis is kept.
        NOTE(review): this presumably targets binary classification, where that
        axis has length 1 -- confirm before using it with multi-class labels.
        """
        return self.model.apply(X)[:, :, 0]

In [241]:
def join(df, col, series, key):
    """Left-join ``series`` onto ``df`` using one key; delegates to ``multi_join``."""
    return multi_join(df, col, series, keys=[key])

def multi_join(df, col, series, keys):
    """Left-join ``series`` onto ``df`` as a new column named ``col``.

    ``series`` is turned into a one-column frame and merged with ``df`` on
    ``keys``; rows of ``df`` without a match get NaN in ``col``.
    """
    right = series.to_frame(name=col)
    return pd.merge(df, right, on=keys, how='left')

## 基础特征

In [273]:
def cal_duration(row):
    """Return the inclusive day count between receiving and using a coupon.

    ``row`` must expose ``Coupon_id``, ``Date_received`` and ``Date``; the two
    dates are numbers in ``YYYYMMDD`` form. When any of the three values is
    not greater than 0 the result is 0.
    """
    is_redeemed = row.Coupon_id > 0 and row.Date_received > 0 and row.Date > 0
    if not is_redeemed:
        return 0

    received_on = datetime.strptime(str(int(row.Date_received)), '%Y%m%d')
    consumed_on = datetime.strptime(str(int(row.Date)), '%Y%m%d')
    # +1 so that same-day use counts as a duration of 1.
    return (consumed_on - received_on).days + 1

# Coupon info - compute the discount rate
def cal_discount(discount_rate):
    """Convert a raw ``Discount_rate`` value into a float rate.

    * int/float input: NaN becomes 0.0, anything else is returned as a float.
    * ``'fixed'``: 0.0.
    * ``'a:b'``: ``(a - b) / a``.
    * any other string: parsed directly with ``float``.
    """
    if isinstance(discount_rate, (int, float)):
        return 0.0 if math.isnan(discount_rate) else float(discount_rate)

    if discount_rate == 'fixed':
        return 0.0

    parts = discount_rate.split(':')
    if len(parts) != 2:
        return float(discount_rate)

    threshold, reduction = float(parts[0]), float(parts[1])
    return (threshold - reduction) / threshold

def base_consume(discount_rate):
    """Return the first number of an ``'a:b'`` discount string as a float.

    Numeric input, ``'fixed'`` and strings that do not split into exactly two
    ``':'``-separated parts all yield 0.0.
    """
    if isinstance(discount_rate, (int, float)) or discount_rate == 'fixed':
        return 0.0

    parts = discount_rate.split(':')
    return float(parts[0]) if len(parts) == 2 else 0.0

def get_day_in_month_4_received_day(date_received):
    """Return the day of month of a numeric ``YYYYMMDD`` date.

    NaN and values not greater than 0 yield 0.0.
    """
    has_date = (not math.isnan(date_received)) and date_received > 0
    if not has_date:
        return 0.0

    return datetime.strptime(str(int(date_received)), '%Y%m%d').day

def get_day_in_week_4_received_day(date_received):
    """Return the weekday of a numeric ``YYYYMMDD`` date, 1 (Monday) to 7 (Sunday).

    NaN and values not greater than 0 yield 0.0.
    """
    has_date = (not math.isnan(date_received)) and date_received > 0
    if not has_date:
        return 0.0

    received_on = datetime.strptime(str(int(date_received)), '%Y%m%d')
    # datetime.weekday() is 0-based starting at Monday; shift to 1-based.
    return 1 + received_on.weekday()

def set_coupon_type(discount_rate):
    """Classify a raw ``Discount_rate`` value.

    Returns 2 for ``'fixed'``, 1 for an ``'a:b'`` string, and 0 for numeric
    input or any other string.
    """
    if isinstance(discount_rate, (int, float)):
        return 0

    if discount_rate == 'fixed':
        return 2

    return 1 if len(discount_rate.split(':')) == 2 else 0
    
def check_is_in_day_consume(row):
    """Return 1 if the coupon was used fewer than 16 days after receipt, else 0.

    ``row`` must expose ``Coupon_id``, ``Date_received`` and ``Date``. A
    ``Coupon_id`` of ``'fixed'``, or any of the three values not being greater
    than 0, yields 0.
    """
    if row.Coupon_id == 'fixed':
        return 0

    all_present = (
        float(row.Coupon_id) > 0
        and float(row.Date_received) > 0
        and float(row.Date) > 0
    )
    if not all_present:
        return 0

    received_on = datetime.strptime(str(int(row.Date_received)), '%Y%m%d')
    consumed_on = datetime.strptime(str(int(row.Date)), '%Y%m%d')
    return 1 if (consumed_on - received_on).days < 16 else 0

def offline_consume(date):
    """Return 1 when ``date`` is a number greater than 0, otherwise 0 (NaN included)."""
    if math.isnan(date):
        return 0

    return 1 if date > 0 else 0

In [274]:
# Base-feature pipeline.
# Step 'features': a FeatureUnion that runs every extractor below on the same
# input frame (five columns passed through by ColumnSelector plus eight derived values).
# Step 'combine': NewDataFrame labels the combined result and casts User_id to int64.
# NOTE: the names given to NewDataFrame are positional, so their order must stay
# in sync with the order of the FeatureUnion entries.
features_extract_processing = Pipeline([
    ('features', FeatureUnion([
        ('extract', ColumnSelector(['User_id', 'Merchant_id', 'Coupon_id', 'Distance', 'Date_received'])),
        ('extract_label', ContentExtractorByRow(check_is_in_day_consume)),
        ('discount', ContentExtractor('Discount_rate', cal_discount)),
        ('base_consume', ContentExtractor('Discount_rate', base_consume)),
        ('day_in_month', ContentExtractor('Date_received', get_day_in_month_4_received_day)),
        ('day_in_week', ContentExtractor('Date_received', get_day_in_week_4_received_day)),
        ('coupon_type', ContentExtractor('Discount_rate', set_coupon_type)),
        ('offline_consume', ContentExtractor('Date', offline_consume)),
        ('duration', ContentExtractorByRow(cal_duration)),
    ])),
    ('combine', NewDataFrame([
        'User_id', 
        'Merchant_id', 
        'Coupon_id', 
        'Distance', 
        'Date_received', 
        'Is_in_day_consume', 
        'Discount', 
        'Base_consume', 
        'Day_in_month', 
        'Day_in_week', 
        'Coupon_type', 
        'Offline_consume',
        'Duration'
    ], [('User_id', 'int64')]))
])

In [275]:
# Replace missing values with 0 so the extractor functions receive 0 instead of NaN.
# NOTE(review): this rebinds offline_df, so every later cell sees the filled frame.
offline_df = offline_df.fillna(0)
# The extractor classes defined in this notebook have a no-op fit(); the call
# is kept so the pipeline follows the usual fit-then-transform sequence.
features_extract_processing.fit(offline_df)
base_features_df = features_extract_processing.transform(offline_df)
base_features_df.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,Date_received,Is_in_day_consume,Discount,Base_consume,Day_in_month,Day_in_week,Coupon_type,Offline_consume,Duration
0,1439408,2632.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1439408,4663.0,11002.0,1.0,20160528.0,0.0,0.866667,150.0,28.0,6.0,1.0,0.0,0.0
2,1439408,2632.0,8591.0,0.0,20160217.0,0.0,0.95,20.0,17.0,3.0,1.0,0.0,0.0
3,1439408,2632.0,1078.0,0.0,20160319.0,0.0,0.95,20.0,19.0,6.0,1.0,0.0,0.0
4,1439408,2632.0,8591.0,0.0,20160613.0,0.0,0.95,20.0,13.0,1.0,1.0,0.0,0.0
5,1439408,2632.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,1439408,2632.0,8591.0,0.0,20160516.0,0.0,0.95,20.0,16.0,1.0,1.0,1.0,29.0
7,1832624,3381.0,7610.0,0.0,20160429.0,0.0,0.9,200.0,29.0,5.0,1.0,0.0,0.0
8,2029232,3381.0,11951.0,1.0,20160129.0,0.0,0.9,200.0,29.0,5.0,1.0,0.0,0.0
9,2029232,450.0,1532.0,0.0,20160530.0,0.0,0.833333,30.0,30.0,1.0,1.0,0.0,0.0


In [276]:
# Column dtypes and memory footprint of the source frame.
offline_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 7 columns):
User_id          int64
Merchant_id      int64
Coupon_id        float64
Discount_rate    object
Distance         float64
Date_received    float64
Date             float64
dtypes: float64(4), int64(2), object(1)
memory usage: 93.7+ MB


In [277]:
# Column dtypes and memory footprint of the derived base-feature frame.
base_features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 13 columns):
User_id              int64
Merchant_id          float64
Coupon_id            float64
Distance             float64
Date_received        float64
Is_in_day_consume    float64
Discount             float64
Base_consume         float64
Day_in_month         float64
Day_in_week          float64
Coupon_type          float64
Offline_consume      float64
Duration             float64
dtypes: float64(12), int64(1)
memory usage: 174.1 MB


## 用户特征

In [267]:
def receive_count(df, target, groupby):
    """Count rows per ``groupby`` key among rows with ``Date_received > 0``.

    Only the ``target`` columns are kept before grouping, so ``groupby`` must
    be contained in ``target``. Returns the ``size()`` Series of the groups.
    """
    received_rows = df[df['Date_received'] > 0]
    return received_rows[target].groupby(groupby).size()

def cousume_count(df, target, groupby):
    """Count rows per ``groupby`` key among rows with ``Offline_consume > 0``.

    Only the ``target`` columns are kept before grouping, so ``groupby`` must
    be contained in ``target``. Returns the ``size()`` Series of the groups.
    """
    consumed_rows = df[df['Offline_consume'] > 0]
    return consumed_rows[target].groupby(groupby).size()

def used_count(df, target, groupby):
    """Count rows per ``groupby`` key among rows with ``Is_in_day_consume > 0``.

    Only the ``target`` columns are kept before grouping, so ``groupby`` must
    be contained in ``target``. Returns the ``size()`` Series of the groups.
    """
    used_rows = df[df['Is_in_day_consume'] > 0]
    return used_rows[target].groupby(groupby).size()

def extract_info(df, target, type, groupby, column_name):
    """Build a per-key count feature and its mean-normalised companion.

    Parameters
    ----------
    df : frame holding the ``groupby`` columns and the column the selected
        counter filters on.
    target : columns kept by the counter before grouping.
    type : one of ``'receive_type'``, ``'cousume_type'``, ``'used_type'``;
        selects the counting function. Any other value raises ``KeyError``.
    groupby : key columns; the result has one row per distinct key combination.
    column_name : name of the count column; the normalised column is named
        ``column_name + '_occ'``.

    Returns
    -------
    DataFrame with the two columns ``column_name`` and ``column_name + '_occ'``.
    The key columns are not included.
    """
    factory = {
        'receive_type': receive_count,
        'cousume_type': cousume_count,
        'used_type': used_count
    }

    target_df = df[groupby].drop_duplicates()
    series = factory[type](df, target, groupby)

    # Keys without any matching row get NaN from the left join; treat them as 0.
    tdf = multi_join(target_df, column_name, series, groupby)
    tdf = tdf.fillna(0)

    occ_column = column_name + '_occ'
    mean = tdf[column_name].mean()
    if mean == 0:
        # Every count is 0, so dividing would give 0/0 = NaN for every row;
        # emit a clean 0.0 instead.
        tdf[occ_column] = 0.0
    else:
        tdf[occ_column] = tdf[column_name] / mean
    return tdf[[column_name, occ_column]]

In [268]:
class GroupbyExtractor(TransformerMixin):
    """Stateless transformer that forwards the input frame to a grouping function.

    ``transform`` calls ``fn(X, target, type, groupby, column_name)`` with the
    values stored at construction time and returns whatever ``fn`` returns.
    """

    def __init__(self, fn, target, type, groupby, column_name):
        self.fn, self.target, self.type = fn, target, type
        self.groupby, self.column_name = groupby, column_name

    def fit(self, *args, **kwargs):
        """Nothing is learned; all arguments are ignored and ``self`` is returned."""
        return self

    def transform(self, X, **transform_params):
        """Delegate to ``fn`` with the stored configuration."""
        call_args = (X, self.target, self.type, self.groupby, self.column_name)
        return self.fn(*call_args)

In [271]:
# User-level feature pipeline: every entry calls extract_info grouped by User_id
# and contributes two columns (the count and its '_occ' mean-normalised value).
# NOTE(review): the count helpers use groupby(...).size(), which counts rows, so
# the '*_merchant_count' and '*_different_coupon_count' entries appear to produce
# the same numbers as the plain receive/used counts -- confirm whether distinct
# merchants/coupons were intended.
user_features_extract_processing = Pipeline([
    ('features', FeatureUnion([
        ('user_receive_count', GroupbyExtractor(extract_info, ['User_id'], 'receive_type', ['User_id'], 'User_receive_count')),
        ('User_consume_count', GroupbyExtractor(extract_info, ['User_id'], 'cousume_type', ['User_id'], 'User_consume_count')),
        ('User_used_count', GroupbyExtractor(extract_info, ['User_id'], 'used_type', ['User_id'], 'User_used_count')),
        
        ('User_receive_merchant_count', GroupbyExtractor(extract_info, ['User_id', 'Merchant_id'], 'receive_type', ['User_id'], 'User_receive_merchant_count')),
        ('User_used_merchant_count', GroupbyExtractor(extract_info, ['User_id', 'Merchant_id'], 'used_type', ['User_id'], 'User_used_merchant_count')),
        
        ('User_receive_different_coupon_count', GroupbyExtractor(extract_info, ['User_id', 'Coupon_id'], 'receive_type', ['User_id'], 'User_receive_different_coupon_count')),
        ('User_used_different_coupon_count', GroupbyExtractor(extract_info, ['User_id', 'Coupon_id'], 'used_type', ['User_id'], 'User_used_different_coupon_count')),
        
    ]))
])

In [272]:
# Run the user-level pipeline on the base features. The pipeline has no final
# labelling step, so the displayed result is the raw array from the FeatureUnion.
# NOTE(review): the result is only displayed, not assigned to a variable.
user_features_extract_processing.fit(base_features_df)
user_features_extract_processing.transform(base_features_df)

array([[ 5.        ,  2.56074821,  3.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.        ,  0.51214964,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 3.        ,  1.53644893,  2.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.        ,  0.51214964,  4.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 5.        ,  2.56074821,  9.        , ..., 25.13105055,
         3.        , 25.13105055],
       [ 2.        ,  1.02429929,  0.        , ...,  0.        ,
         0.        ,  0.        ]])