In [47]:
import math
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.feature_selection import ColumnSelector
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Imputer


In [2]:
# Read the offline stage-1 training CSV (path is relative to the notebook) into a DataFrame.
offline_df = pd.read_csv('../source/ccf_offline_stage1_train.csv')

In [35]:
# Display the column names of the offline frame as an array.
offline_df.columns.values

array(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date'], dtype=object)

In [3]:
# Read the online stage-1 training CSV (path is relative to the notebook) into a DataFrame.
online_df = pd.read_csv('../source/ccf_online_stage1_train.csv')

In [37]:
# Coupon information - compute the discount rate
def cal_discount(discount_rate):
    """Convert a raw ``Discount_rate`` value into a numeric rate.

    Parameters
    ----------
    discount_rate : int, float, NumPy scalar or str
        * numeric value: returned as ``float``; NaN is mapped to ``0.0``.
        * ``'fixed'``: mapped to ``0.0``.
        * ``'x:y'``: converted to ``(x - y) / x``
          (presumably "spend x, get y off" -- confirm against the data
          description).
        * any other string: parsed with ``float()``.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If a string (or one of its ``':'``-separated parts) is not a number.
    ZeroDivisionError
        If the first part of an ``'x:y'`` value is zero.
    """
    # NumPy scalars such as np.int64 / np.float32 are not subclasses of the
    # built-in int / float, so they are listed explicitly; otherwise they would
    # fall through to the string handling below and fail on ``.split``.
    if isinstance(discount_rate, (int, float, np.integer, np.floating)):
        if math.isnan(discount_rate):
            return 0.0
        return float(discount_rate)

    if discount_rate == 'fixed':
        return 0.0

    parts = discount_rate.split(':')
    if len(parts) == 2:
        base_amount, reduction = float(parts[0]), float(parts[1])
        return (base_amount - reduction) / base_amount
    return float(discount_rate)

def base_consume(discount_rate):
    """Return the first number of an ``'x:y'`` ``Discount_rate`` value.

    Parameters
    ----------
    discount_rate : int, float, NumPy scalar or str
        Raw ``Discount_rate`` value.

    Returns
    -------
    float
        ``float(x)`` for a string of the form ``'x:y'``; ``0.0`` for numeric
        input (including NaN), for ``'fixed'`` and for any other string.

    Raises
    ------
    ValueError
        If ``x`` in ``'x:y'`` is not a number.
    """
    # NumPy scalars (np.int64, np.float32, ...) are checked explicitly because
    # they are not all subclasses of int / float and have no ``.split``.
    if isinstance(discount_rate, (int, float, np.integer, np.floating)):
        return 0.0

    if discount_rate == 'fixed':
        return 0.0

    parts = discount_rate.split(':')
    return float(parts[0]) if len(parts) == 2 else 0.0

def get_day_in_month_4_received_day(date_received):
    """Day of the month on which a coupon was received.

    ``date_received`` is a number whose integer part reads as ``YYYYMMDD``
    (for example ``20160528.0``).  NaN and non-positive values yield ``0.0``;
    otherwise the day of the month is returned as an ``int``.  ``ValueError``
    propagates from ``strptime`` when the digits are not a valid date.
    """
    if math.isnan(date_received):
        return 0.0
    if date_received <= 0:
        return 0.0

    digits = '%d' % int(date_received)
    return datetime.strptime(digits, '%Y%m%d').day

def get_day_in_week_4_received_day(date_received):
    """Day of the week (Monday=1 ... Sunday=7) on which a coupon was received.

    ``date_received`` is a number whose integer part reads as ``YYYYMMDD``.
    NaN and non-positive values yield ``0.0``; otherwise an ``int`` in 1..7 is
    returned.  ``ValueError`` propagates from ``strptime`` when the digits are
    not a valid date.
    """
    if math.isnan(date_received):
        return 0.0
    if date_received <= 0:
        return 0.0

    digits = '%d' % int(date_received)
    zero_based_weekday = datetime.strptime(digits, '%Y%m%d').weekday()
    # weekday() counts from Monday=0; shift so Monday is 1.
    return zero_based_weekday + 1

def set_coupon_type(discount_rate):
    """Classify a raw ``Discount_rate`` value into a small integer code.

    Parameters
    ----------
    discount_rate : int, float, NumPy scalar or str
        Raw ``Discount_rate`` value.

    Returns
    -------
    int
        ``2`` for the literal ``'fixed'``, ``1`` for a string of the form
        ``'x:y'``, and ``0`` for everything else (numeric input, including
        NaN, and strings without exactly one ``':'``).
    """
    # NumPy scalars (np.int64, np.float32, ...) are checked explicitly because
    # they are not all subclasses of int / float and have no ``.split``.
    if isinstance(discount_rate, (int, float, np.integer, np.floating)):
        return 0

    if discount_rate == 'fixed':
        return 2

    return 1 if len(discount_rate.split(':')) == 2 else 0

In [38]:
class ContentExtractor(TransformerMixin, BaseEstimator):
    """Stateless transformer that maps one column of ``X`` through a function.

    Deriving from ``BaseEstimator`` (listed after the mixin, as scikit-learn
    recommends) provides ``get_params`` / ``set_params``, so the transformer
    can be cloned and inspected like any other estimator.

    Parameters
    ----------
    col : hashable
        Key used to select the column, i.e. ``X[col]``.
    fn : callable
        Function applied to every value of that column.
    """

    def __init__(self, col, fn):
        self.col = col
        self.fn = fn

    def fit(self, *args, **kwargs):
        """Nothing is learned; return ``self`` so calls can be chained."""
        return self

    def transform(self, X, **transform_params):
        """Return a one-column DataFrame holding ``fn`` applied to ``X[col]``."""
        return pd.DataFrame(X[self.col].apply(self.fn))

In [52]:
class NewDataFrame(TransformerMixin, BaseEstimator):
    """Stateless transformer that wraps its input in a labelled DataFrame.

    Deriving from ``BaseEstimator`` (listed after the mixin, as scikit-learn
    recommends) provides ``get_params`` / ``set_params``, so the transformer
    can be cloned and inspected like any other estimator.

    Parameters
    ----------
    cols : list
        Column labels handed to ``pd.DataFrame(..., columns=cols)``.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, *args, **kwargs):
        """Nothing is learned; return ``self`` so calls can be chained."""
        return self

    def transform(self, X, **transform_params):
        """Return ``X`` as a DataFrame whose columns are ``self.cols``."""
        return pd.DataFrame(data=X, columns=self.cols)

In [53]:
class GBDTTransformer(TransformerMixin):
    """Turn samples into leaf indices of a fitted gradient-boosting model.

    Requires ``GradientBoostingClassifier`` from ``sklearn.ensemble`` to be
    imported in the notebook's import cell.

    Parameters
    ----------
    n_estimators : int, default 256
        Number of boosting stages of the wrapped classifier.
    max_depth : int, default 3
        Maximum depth of each tree.
    random_state : int or None, default 0
        Seed forwarded to the classifier.

    Attributes
    ----------
    n_estimator : int
        The number of boosting stages (singular attribute name kept for
        backward compatibility).
    model : GradientBoostingClassifier
        The wrapped classifier, created eagerly in ``__init__``.
    """

    def __init__(self, n_estimators=256, max_depth=3, random_state=0):
        self.n_estimator = n_estimators
        self.model = GradientBoostingClassifier(
            max_depth=max_depth,
            n_estimators=self.n_estimator,
            random_state=random_state,
        )

    def fit(self, *args, **kwargs):
        """Fit the wrapped classifier with the given arguments; return ``self``."""
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        """Return the leaf index reached by each sample in each tree.

        ``model.apply`` yields a 3-D array; only index 0 of the last axis is
        kept, giving one column per boosting stage.
        NOTE(review): for a multi-class target this drops the trees of the
        other classes -- confirm the target is binary.
        """
        return self.model.apply(X)[:, :, 0]

In [56]:
# Raw columns copied through unchanged by the 'extract' step.
# 'Date' is not selected: every column produced by the union needs a matching
# label in NewDataFrame, and selecting it without a label made the stacked
# array one column wider than the list of names.
passthrough_columns = ['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance', 'Date_received']

# One entry per derived feature:
# (union step name, output column label, source column, function applied per value).
derived_features = [
    ('discount', 'Discount', 'Discount_rate', cal_discount),
    ('base_consume', 'Base_consume', 'Discount_rate', base_consume),
    ('day_in_month', 'Day_in_month', 'Date_received', get_day_in_month_4_received_day),
    ('day_in_week', 'Day_in_week', 'Date_received', get_day_in_week_4_received_day),
    ('coupon_type', 'Coupon_type', 'Discount_rate', set_coupon_type),
]

# Both the union steps and the final column labels are built from the two
# lists above, so their number and order cannot drift apart.
features_extract_processing = Pipeline([
    ('features', FeatureUnion(
        [('extract', ColumnSelector(passthrough_columns))]
        + [(step, ContentExtractor(source, fn)) for step, label, source, fn in derived_features]
    )),
    ('combine', NewDataFrame(
        passthrough_columns + [label for step, label, source, fn in derived_features]
    )),
])

# Replace missing values with 0 before extracting features.
offline_df = offline_df.fillna(0)
features_extract_processing.fit(offline_df)
features_extract_processing.transform(offline_df)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Discount,Base_consume,Day_in_month,Day_in_week,Coupon_type
0,1439408,2632,0,0,0,0,0,0,0,0,0
1,1439408,4663,11002,150:20,1,2.01605e+07,0.866667,150,28,6,1
2,1439408,2632,8591,20:1,0,2.01602e+07,0.95,20,17,3,1
3,1439408,2632,1078,20:1,0,2.01603e+07,0.95,20,19,6,1
4,1439408,2632,8591,20:1,0,2.01606e+07,0.95,20,13,1,1
5,1439408,2632,0,0,0,0,0,0,0,0,0
6,1439408,2632,8591,20:1,0,2.01605e+07,0.95,20,16,1,1
7,1832624,3381,7610,200:20,0,2.01604e+07,0.9,200,29,5,1
8,2029232,3381,11951,200:20,1,2.01601e+07,0.9,200,29,5,1
9,2029232,450,1532,30:5,0,2.01605e+07,0.833333,30,30,1,1
