In [2]:
import math
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

from mlxtend.feature_selection import ColumnSelector
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler, Normalizer

In [20]:
# Load the offline stage-1 training CSV; the path is relative to the
# notebook's working directory.
offline_df = pd.read_csv('../source/ccf_offline_stage1_train.csv')

In [4]:
# Load the online stage-1 training CSV; the path is relative to the
# notebook's working directory.
online_df = pd.read_csv('../source/ccf_online_stage1_train.csv')

In [5]:
class ContentExtractor(TransformerMixin):
    """Stateless transformer that maps a function over one column.

    Parameters
    ----------
    col : label used to look the column up with ``X[col]``.
    fn : callable invoked once per value of that column.
    """

    def __init__(self, col, fn):
        self.fn = fn
        self.col = col

    def fit(self, *args, **kwargs):
        """Nothing is learned; return ``self`` so the object can be chained."""
        return self

    def transform(self, X, **transform_params):
        """Return a DataFrame holding ``fn`` applied to every value of ``X[col]``."""
        def _call_fn(value):
            return self.fn(value)

        mapped = X[self.col].apply(_call_fn)
        return pd.DataFrame(mapped)

In [6]:
class ContentExtractorByRow(TransformerMixin):
    """Stateless transformer that maps a function over whole rows.

    Parameters
    ----------
    fn : callable invoked once per row of the input (``axis=1``).
    """

    def __init__(self, fn):
        self.fn = fn

    def fit(self, *args, **kwargs):
        """Nothing is learned; return ``self`` so the object can be chained."""
        return self

    def transform(self, X, **transform_params):
        """Return a DataFrame holding ``fn`` applied to each row of ``X``."""
        def _call_fn(record):
            return self.fn(record)

        per_row = X.apply(_call_fn, axis=1)
        return pd.DataFrame(per_row)

In [7]:
class NewDataFrame(TransformerMixin):
    """Wrap array-like data in a DataFrame with fixed column names and dtypes.

    Parameters
    ----------
    cols : column labels handed to ``pd.DataFrame(columns=...)``.
    dtypes : iterable of ``(column, dtype)`` pairs; each listed column is
        cast with ``astype`` after the frame is built.
    """

    def __init__(self, cols, dtypes):
        self.dtypes = dtypes
        self.cols = cols

    def fit(self, *args, **kwargs):
        """Nothing is learned; return ``self`` so the object can be chained."""
        return self

    def transform(self, X, **transform_params):
        """Build the labelled frame from ``X`` and apply the requested casts."""
        frame = pd.DataFrame(data=X, columns=self.cols)
        for spec in self.dtypes:
            name = spec[0]
            frame[name] = frame[name].astype(spec[1])

        return frame

In [8]:
class GBDTTransformer(TransformerMixin):
    """Encode samples by the leaf each boosted tree routes them to.

    Wraps a ``GradientBoostingClassifier`` (imported from ``sklearn.ensemble``
    in the notebook's import cell). The previously hard-coded hyper-parameters
    are now constructor arguments whose defaults keep the old behaviour.

    Parameters
    ----------
    n_estimator : number of boosting stages (default 256).
    max_depth : depth of each tree (default 3).
    random_state : seed forwarded to the classifier (default 0).
    """

    def __init__(self, n_estimator=256, max_depth=3, random_state=0):
        self.n_estimator = n_estimator
        self.model = GradientBoostingClassifier(
            max_depth=max_depth,
            n_estimators=self.n_estimator,
            random_state=random_state,
        )

    def fit(self, *args, **kwargs):
        """Fit the wrapped classifier with the given arguments; return ``self``."""
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        """Return the leaf indices from ``model.apply(X)``, last axis sliced at 0.

        NOTE(review): scikit-learn documents ``apply`` as returning an array of
        shape (n_samples, n_estimators, n_classes) with n_classes == 1 for
        binary problems, so index 0 keeps everything in that case. For a
        multi-class target only the first class's trees would be kept - confirm
        that the target is binary.
        """
        return self.model.apply(X)[:, :, 0]

In [9]:
def join(df, col, series, key):
    """Left-join ``series`` onto ``df`` as column ``col`` using one key.

    Convenience wrapper that forwards to ``multi_join`` with ``key`` wrapped
    in a single-element list.
    """
    single_key = [key]
    return multi_join(df, col, series, single_key)

def multi_join(df, col, series, keys):
    """Left-join ``series`` onto ``df`` as a new column named ``col``.

    Parameters
    ----------
    df : left DataFrame; every one of its rows is kept.
    col : name given to the series' values in the result.
    series : Series whose index level name(s) match ``keys``.
    keys : list of key names passed to ``pd.merge(on=...)``.

    Returns
    -------
    DataFrame: ``df`` plus the ``col`` column (NaN where no key matches).
    """
    right = series.to_frame(name=col)
    return pd.merge(df, right, how='left', on=keys)

## 基础特征

In [17]:
def cal_duration(row):
    """Days from receiving a coupon to consuming it, counted inclusively.

    Reads ``row.Coupon_id``, ``row.Date_received`` and ``row.Date`` (the dates
    are numbers in ``YYYYMMDD`` form). Returns ``delta.days + 1`` when all
    three are positive, so a same-day consumption yields 1; otherwise 0.
    """
    has_all_fields = row.Coupon_id > 0 and row.Date_received > 0 and row.Date > 0
    if not has_all_fields:
        return 0

    received_on = datetime.strptime(str(int(row.Date_received)), '%Y%m%d')
    consumed_on = datetime.strptime(str(int(row.Date)), '%Y%m%d')
    return (consumed_on - received_on).days + 1

# Coupon info - compute the discount rate
def cal_discount(discount_rate):
    """Convert a raw ``Discount_rate`` value into a float rate.

    * int/float input: NaN becomes 0.0, anything else is returned as a float.
    * ``'fixed'``: 0.0.
    * ``'a:b'``: ``(a - b) / a``.
    * any other string: parsed directly with ``float``.
    """
    if isinstance(discount_rate, (int, float)):
        return 0.0 if math.isnan(discount_rate) else float(discount_rate)

    if discount_rate == 'fixed':
        return 0.0

    parts = discount_rate.split(':')
    if len(parts) != 2:
        return float(discount_rate)

    threshold, reduction = float(parts[0]), float(parts[1])
    return (threshold - reduction) / threshold

def base_consume(discount_rate):
    """Return the ``a`` part of an ``'a:b'`` discount string as a float.

    Numeric inputs, ``'fixed'`` and strings that do not split into exactly two
    ``':'``-separated parts all yield 0.0.
    """
    if isinstance(discount_rate, (int, float)) or discount_rate == 'fixed':
        return 0.0

    parts = discount_rate.split(':')
    return float(parts[0]) if len(parts) == 2 else 0.0

def discount_money(discount_rate):
    """Return the ``b`` part of an ``'a:b'`` discount string as an int.

    Numeric inputs, ``'fixed'`` and strings that do not split into exactly two
    ``':'``-separated parts all yield -1.
    """
    if isinstance(discount_rate, (int, float)) or discount_rate == 'fixed':
        return -1

    parts = discount_rate.split(':')
    return int(parts[1]) if len(parts) == 2 else -1

def get_day_in_month_4_received_day(date_received):
    """Day of the month (1-31) of a numeric ``YYYYMMDD`` receive date.

    Returns 0.0 when the value is NaN or not positive.
    """
    is_missing = math.isnan(date_received) or date_received <= 0
    if is_missing:
        return 0.0

    parsed = datetime.strptime(str(int(date_received)), '%Y%m%d')
    return parsed.day

def get_day_in_week_4_received_day(date_received):
    """Weekday (Monday=1 ... Sunday=7) of a numeric ``YYYYMMDD`` receive date.

    Returns 0.0 when the value is NaN or not positive.
    """
    is_missing = math.isnan(date_received) or date_received <= 0
    if is_missing:
        return 0.0

    parsed = datetime.strptime(str(int(date_received)), '%Y%m%d')
    # isoweekday() is weekday() + 1, i.e. Monday maps to 1.
    return parsed.isoweekday()

def set_coupon_type(discount_rate):
    """Classify a raw ``Discount_rate`` value into a coupon type code.

    Returns 2 for ``'fixed'``, 1 for an ``'a:b'`` string, and 0 for numeric
    inputs or any other string.
    """
    if isinstance(discount_rate, (int, float)):
        return 0

    if discount_rate == 'fixed':
        return 2

    part_count = len(discount_rate.split(':'))
    return 1 if part_count == 2 else 0
    
def check_is_in_day_consume(row):
    """Label a row 1 when its coupon was consumed less than 16 days after receipt.

    Reads ``row.Coupon_id``, ``row.Date_received`` and ``row.Date``. Returns 0
    when ``Coupon_id`` equals ``'fixed'``, when any of the three values is not
    positive after conversion to float, or when the gap is 16 days or more.
    """
    if row.Coupon_id == 'fixed':
        return 0

    was_redeemed = (
        float(row.Coupon_id) > 0
        and float(row.Date_received) > 0
        and float(row.Date) > 0
    )
    if not was_redeemed:
        return 0

    received_on = datetime.strptime(str(int(row.Date_received)), '%Y%m%d')
    consumed_on = datetime.strptime(str(int(row.Date)), '%Y%m%d')
    gap = consumed_on - received_on
    return 1 if gap.days < 16 else 0

def offline_consume(date):
    """Return 1 when ``date`` is a positive number, 0 for NaN or non-positive."""
    if math.isnan(date):
        return 0

    return 1 if date > 0 else 0

In [22]:
# Base feature pipeline:
#   1. 'features' - pass five raw columns through and append nine derived
#      single-column features (order here must match the names in 'combine').
#   2. 'imputer'  - replace any remaining NaN with the constant 0.
#   3. 'combine'  - re-label the resulting array and cast User_id to int64.
features_extract_processing = Pipeline(steps=[
    ('features', FeatureUnion(transformer_list=[
        ('extract', ColumnSelector(cols=['User_id', 'Merchant_id', 'Coupon_id', 'Distance', 'Date_received'])),
        ('extract_label', ContentExtractorByRow(fn=check_is_in_day_consume)),
        ('discount', ContentExtractor(col='Discount_rate', fn=cal_discount)),
        ('base_consume', ContentExtractor(col='Discount_rate', fn=base_consume)),
        ('discount_money', ContentExtractor(col='Discount_rate', fn=discount_money)),
        ('day_in_month', ContentExtractor(col='Date_received', fn=get_day_in_month_4_received_day)),
        ('day_in_week', ContentExtractor(col='Date_received', fn=get_day_in_week_4_received_day)),
        ('coupon_type', ContentExtractor(col='Discount_rate', fn=set_coupon_type)),
        ('offline_consume', ContentExtractor(col='Date', fn=offline_consume)),
        ('duration', ContentExtractorByRow(fn=cal_duration)),
    ])),
    ('imputer', SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan)),
    ('combine', NewDataFrame(
        cols=[
            # raw columns, in ColumnSelector order
            'User_id', 'Merchant_id', 'Coupon_id', 'Distance', 'Date_received',
            # derived columns, in FeatureUnion order
            'Is_in_day_consume',
            'Discount', 'Base_consume', 'Discount_money',
            'Day_in_month', 'Day_in_week',
            'Coupon_type',
            'Offline_consume',
            'Duration',
        ],
        dtypes=[('User_id', 'int64')],
    )),
])

In [23]:
# Distance: null means the information is unavailable, so mark it with -1.
# Date (consumption date): null together with a non-null Coupon_id means the
# coupon was received but never used, so mark it with -1 as well.
# Date != null with Coupon_id == null is an ordinary purchase; dropna() then
# removes every row that still has a null in any column.
offline_df = (
    offline_df
    .assign(
        Distance=lambda frame: frame['Distance'].fillna(-1),
        Date=lambda frame: frame['Date'].fillna(-1),
    )
    .dropna()
)

# Extract the base features.
base_features_df = features_extract_processing.fit(offline_df).transform(offline_df)
base_features_df.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,Date_received,Is_in_day_consume,Discount,Base_consume,Discount_money,Day_in_month,Day_in_week,Coupon_type,Offline_consume,Duration
0,1439408,4663.0,11002.0,1.0,20160528.0,0.0,0.866667,150.0,20.0,28.0,6.0,1.0,0.0,0.0
1,1439408,2632.0,8591.0,0.0,20160217.0,0.0,0.95,20.0,1.0,17.0,3.0,1.0,0.0,0.0
2,1439408,2632.0,1078.0,0.0,20160319.0,0.0,0.95,20.0,1.0,19.0,6.0,1.0,0.0,0.0
3,1439408,2632.0,8591.0,0.0,20160613.0,0.0,0.95,20.0,1.0,13.0,1.0,1.0,0.0,0.0
4,1439408,2632.0,8591.0,0.0,20160516.0,0.0,0.95,20.0,1.0,16.0,1.0,1.0,1.0,29.0
5,1832624,3381.0,7610.0,0.0,20160429.0,0.0,0.9,200.0,20.0,29.0,5.0,1.0,0.0,0.0
6,2029232,3381.0,11951.0,1.0,20160129.0,0.0,0.9,200.0,20.0,29.0,5.0,1.0,0.0,0.0
7,2029232,450.0,1532.0,0.0,20160530.0,0.0,0.833333,30.0,5.0,30.0,1.0,1.0,0.0,0.0
8,2029232,6459.0,12737.0,0.0,20160519.0,0.0,0.95,20.0,1.0,19.0,4.0,1.0,0.0,0.0
9,2747744,6901.0,1097.0,-1.0,20160606.0,0.0,0.8,50.0,10.0,6.0,1.0,1.0,0.0,0.0


## 用户特征

In [13]:
def _count_by(df, mask, target, groupby):
    """Count the rows of ``df[mask]`` per ``groupby`` key.

    When ``target`` lists columns beyond the grouping keys, rows are first
    de-duplicated on ``target`` so the result is the number of *distinct*
    combinations per key (e.g. distinct merchants per user). Without this the
    extra columns had no effect, because ``size()`` only counts rows.
    """
    selected = df.loc[mask, target]
    if any(column not in groupby for column in target):
        selected = selected.drop_duplicates()
    return selected.groupby(groupby).size()


def receive_count(df, target, groupby):
    """Per-key count over rows with ``Date_received > 0``.

    Returns a Series indexed by ``groupby``; keys with no matching row are
    absent. See ``_count_by`` for how extra ``target`` columns are handled.
    """
    return _count_by(df, df['Date_received'] > 0, target, groupby)


def cousume_count(df, target, groupby):
    """Per-key count over rows with ``Offline_consume > 0``.

    Returns a Series indexed by ``groupby``; keys with no matching row are
    absent. See ``_count_by`` for how extra ``target`` columns are handled.
    """
    return _count_by(df, df['Offline_consume'] > 0, target, groupby)


def used_count(df, target, groupby):
    """Per-key count over rows with ``Is_in_day_consume > 0``.

    Returns a Series indexed by ``groupby``; keys with no matching row are
    absent. See ``_count_by`` for how extra ``target`` columns are handled.
    """
    return _count_by(df, df['Is_in_day_consume'] > 0, target, groupby)

def cal_user_used_coupon_rate(row):
    """Fraction of received coupons that were used.

    Reads ``row.User_used_count`` and ``row.User_receive_count``. Returns 0.0
    when nothing was received, instead of dividing by zero (which raises for
    Python numbers and yields NaN/inf with a RuntimeWarning for NumPy floats).
    """
    received = row.User_receive_count
    if received == 0:
        return 0.0
    return row.User_used_count / received

def cal_user_not_used_conpon_count(row):
    """Number of received coupons that were not used.

    Computed as ``row.User_receive_count - row.User_used_count``.
    """
    received, used = row.User_receive_count, row.User_used_count
    return received - used

def extract_info(df, target, type, groupby, column_name):
    """Build a per-key count feature and its mean-normalised companion.

    Parameters
    ----------
    df : feature frame to aggregate.
    target : columns handed to the counting function.
    type : ``'receive_type'``, ``'cousume_type'`` or ``'used_type'``; selects
        the counting function (any other value raises ``KeyError``).
    groupby : list of key columns; the result has one row per distinct key
        combination, in order of first appearance in ``df``.
    column_name : name of the count column; the normalised column is named
        ``column_name + '_occ'``.

    Returns
    -------
    DataFrame with the two columns ``column_name`` and ``column_name + '_occ'``.
    """
    counters = {
        'receive_type': receive_count,
        'cousume_type': cousume_count,
        'used_type': used_count
    }

    key_frame = df[groupby].drop_duplicates()
    counts = counters[type](df, target, groupby)

    # Keys without any matching row come back as NaN from the left join.
    merged = multi_join(key_frame, column_name, counts, groupby).fillna(0)

    occ_name = column_name + '_occ'
    mean = merged[column_name].mean()
    if mean == 0:
        # Every count is zero: dividing would turn the whole column into NaN.
        merged[occ_name] = 0.0
    else:
        merged[occ_name] = merged[column_name] / mean
    return merged[[column_name, occ_name]]

In [14]:
class GroupbyExtractor(TransformerMixin):
    """Stateless transformer that delegates to an aggregation function.

    ``transform(X)`` returns ``fn(X, target, type, groupby, column_name)``
    using the values stored at construction time.
    """

    def __init__(self, fn, target, type, groupby, column_name):
        self.fn, self.target, self.type = fn, target, type
        self.groupby, self.column_name = groupby, column_name

    def fit(self, *args, **kwargs):
        """Nothing is learned; return ``self`` so the object can be chained."""
        return self

    def transform(self, X, **transform_params):
        """Call the stored function on ``X`` with the stored arguments."""
        call_args = (X, self.target, self.type, self.groupby, self.column_name)
        return self.fn(*call_args)

In [15]:
# Each user-level count feature is followed by its '_occ' companion, which is
# the two-column layout every feature extractor in the user pipeline emits.
user_features_columns = [
    base_name + suffix
    for base_name in (
        'User_receive_count',
        'User_used_count',
        'User_receive_diff_merchant_count',
        'User_used_diff_merchant_count',
        'User_receive_diff_coupon_count',
        'User_used_diff_coupon_count',
        'User_receive_diff_coupon_type_count',
        'User_used_diff_coupon_type_count',
        'User_receive_diff_distance_count',
        'User_used_diff_distance_count',
    )
    for suffix in ('', '_occ')
]

# User feature pipeline:
#   1. 'features' - ten grouped counts per User_id, each emitting the count
#      and its '_occ' column (20 columns, ordered as user_features_columns).
#   2. 'agg'      - label those 20 columns.
#   3. 'extract'  - keep the 20 columns and append two row-wise features.
#   4. 'scale'    - min-max scale all 22 columns.
#   5. 'combine'  - re-label the scaled array; the name list must contain
#      one entry per column produced by 'extract'.
user_features_extract_processing = Pipeline([
    ('features', FeatureUnion([
        ('User_receive_count', GroupbyExtractor(extract_info, ['User_id'], 'receive_type', ['User_id'], 'User_receive_count')),
        ('User_used_count', GroupbyExtractor(extract_info, ['User_id'], 'used_type', ['User_id'], 'User_used_count')),

        ('User_receive_diff_merchant_count', GroupbyExtractor(extract_info, ['User_id', 'Merchant_id'], 'receive_type', ['User_id'], 'User_receive_diff_merchant_count')),
        ('User_used_diff_merchant_count', GroupbyExtractor(extract_info, ['User_id', 'Merchant_id'], 'used_type', ['User_id'], 'User_used_diff_merchant_count')),

        ('User_receive_diff_coupon_count', GroupbyExtractor(extract_info, ['User_id', 'Coupon_id'], 'receive_type', ['User_id'], 'User_receive_diff_coupon_count')),
        ('User_used_diff_coupon_count', GroupbyExtractor(extract_info, ['User_id', 'Coupon_id'], 'used_type', ['User_id'], 'User_used_diff_coupon_count')),

        ('User_receive_diff_coupon_type_count', GroupbyExtractor(extract_info, ['User_id', 'Coupon_type'], 'receive_type', ['User_id'], 'User_receive_diff_coupon_type_count')),
        ('User_used_diff_coupon_type_count', GroupbyExtractor(extract_info, ['User_id', 'Coupon_type'], 'used_type', ['User_id'], 'User_used_diff_coupon_type_count')),

        ('User_receive_diff_distance_count', GroupbyExtractor(extract_info, ['User_id', 'Distance'], 'receive_type', ['User_id'], 'User_receive_diff_distance_count')),
        ('User_used_diff_distance_count', GroupbyExtractor(extract_info, ['User_id', 'Distance'], 'used_type', ['User_id'], 'User_used_diff_distance_count')),
    ])),
    ('agg', NewDataFrame(user_features_columns, [])),
    ('extract', FeatureUnion([
        # Fixed: a stray ']' after user_features_columns made this cell a SyntaxError.
        ('extract', ColumnSelector(user_features_columns)),
        ('User_used_coupon_rate', ContentExtractorByRow(cal_user_used_coupon_rate)),
        ('User_not_used_coupon_count', ContentExtractorByRow(cal_user_not_used_conpon_count)),
    ])),
    ('scale', MinMaxScaler()),
    ('combine', NewDataFrame(user_features_columns + [
        'User_used_coupon_rate',
        # Fixed: this name was missing, leaving 21 labels for 22 columns.
        'User_not_used_coupon_count',
    ], []))
])

In [16]:
# Fit the user feature pipeline on the base features, then transform the same
# frame to obtain the user-level feature table.
user_features_df = (
    user_features_extract_processing
    .fit(base_features_df)
    .transform(base_features_df)
)

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
