In [47]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math

from datetime import datetime
from scipy import stats

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from mlxtend.feature_selection import ColumnSelector
from sklearn.preprocessing import Imputer 


In [2]:
offline_df = pd.read_csv('../source/ccf_offline_stage1_train.csv')

In [35]:
offline_df.columns.values

array(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date'], dtype=object)

In [3]:
online_df = pd.read_csv('../source/ccf_online_stage1_train.csv')

In [62]:
class ContentExtractor(TransformerMixin):
    def __init__(self, col, fn):
        self.col = col
        self.fn = fn
    
    def fit(self, *args, **kwargs):
        return self

    def transform(self, X, **transform_params):
        return pd.DataFrame(X[self.col].apply(lambda row: self.fn(row)))

In [71]:
class ContentExtractorByRow(TransformerMixin):
    def __init__(self, fn):
        self.fn = fn
    
    def fit(self, *args, **kwargs):
        return self

    def transform(self, X, **transform_params):
        return pd.DataFrame(X.apply(lambda row: self.fn(row), axis=1))

In [239]:
class NewDataFrame(TransformerMixin):
    def __init__(self, cols, dtypes):
        self.cols = cols
        self.dtypes = dtypes
    def fit(self, *args, **kwargs):
        return self
    def transform(self, X, **transform_params):
        df = pd.DataFrame(data=X, columns=self.cols)
        for item in self.dtypes:
            df[item[0]] = df[item[0]].astype(item[1])
            
        return df

In [240]:
class GBDTTransformer(TransformerMixin):
    def __init__(self):
        self.n_estimator = 256
        self.model = GradientBoostingClassifier(max_depth=3, n_estimators=self.n_estimator, random_state=0)
        
    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self
    
    def transform(self, X, **transform_params):
        return self.model.apply(X)[:, :, 0]

In [241]:
def join(df, col, series, key):
    return multi_join(df, col, series, [key])

def multi_join(df, col, series, keys):
    t = series.to_frame()
    t.columns = [col]
    
    return pd.merge(df, t, on=keys, how='left')

## 基础特征

In [242]:
# 优惠券信息 - 计算折扣率
def cal_discount(discount_rate):
    if isinstance(discount_rate, int) or isinstance(discount_rate, float):
        if math.isnan(discount_rate):
            return 0.0
        return float(discount_rate)

    if discount_rate == 'fixed':
        return 0.0

    arr = discount_rate.split(':')
    if len(arr) == 2:
        return (float(arr[0]) - float(arr[1])) / float(arr[0])
    else:
        return float(discount_rate)

def base_consume(discount_rate):
    if isinstance(discount_rate, int) or isinstance(discount_rate, float):
        return 0.0
    
    if discount_rate == 'fixed':
        return 0.0
    
    arr = discount_rate.split(':')
    if len(arr) == 2:
        return float(arr[0])
    else:
        return 0.0

def get_day_in_month_4_received_day(date_received):
    if math.isnan(date_received) or date_received <= 0:
        return 0.0
    
    date_received_dt = datetime.strptime(str(int(date_received)), '%Y%m%d')
    return date_received_dt.day

def get_day_in_week_4_received_day(date_received):
    if math.isnan(date_received) or date_received <= 0:
        return 0.0
    
    date_received_dt = datetime.strptime(str(int(date_received)), '%Y%m%d')
    return (date_received_dt.weekday() + 1)

def set_coupon_type(discount_rate):
    if isinstance(discount_rate, int) or isinstance(discount_rate, float):
        return 0
    
    if discount_rate == 'fixed':
        return 2
    
    arr = discount_rate.split(':')
    if len(arr) == 2:
        return 1
    else:
        return 0
    
def check_is_in_day_consume(row):
    
    if row.Coupon_id == 'fixed':
        return 0
    
    if float(row.Coupon_id) > 0 and float(row.Date_received) > 0 and float(row.Date) > 0:
        date_received = datetime.strptime(str(int(row.Date_received)), '%Y%m%d')
        date_consumed = datetime.strptime(str(int(row.Date)), '%Y%m%d')
        delta = date_consumed - date_received
        if delta.days < 16:
            return 1
        else:
            return 0
    
    return 0

def offline_consume(date):
    if math.isnan(date):
        return 0
    
    if date > 0:
        return 1
    else:
        return 0

In [247]:
features_extract_processing = Pipeline([
    ('features', FeatureUnion([
        ('extract', ColumnSelector(['User_id', 'Merchant_id', 'Coupon_id', 'Distance', 'Date_received'])),
        ('extract_label', ContentExtractorByRow(check_is_in_day_consume)),
        ('discount', ContentExtractor('Discount_rate', cal_discount)),
        ('base_consume', ContentExtractor('Discount_rate', base_consume)),
        ('day_in_month', ContentExtractor('Date_received', get_day_in_month_4_received_day)),
        ('day_in_week', ContentExtractor('Date_received', get_day_in_week_4_received_day)),
        ('coupon_type', ContentExtractor('Discount_rate', set_coupon_type)),
        ('offline_consume', ContentExtractor('Date', offline_consume)),
    ])),
    ('combine', NewDataFrame([
        'User_id', 
        'Merchant_id', 
        'Coupon_id', 
        'Distance', 
        'Date_received', 
        'Is_in_day_consume', 
        'Discount', 
        'Base_consume', 
        'Day_in_month', 
        'Day_in_week', 
        'Coupon_type', 
        'Offline_consume'
    ], []))
])

In [None]:
offline_df = offline_df.fillna(0)
features_extract_processing.fit(offline_df)
base_features_df = features_extract_processing.transform(offline_df)
base_features_df.head(20)

In [None]:
offline_df.info()

In [None]:
base_features_df.info()

## 用户特征

In [143]:
def receive_count(df, groupby):
    return df[df['Date_received']>0].groupby(groupby).size()

def cousume_count(df, groupby):
    return df[df['Offline_consume']>0].groupby(groupby).size()

def used_count(df, groupby):
    return df[df['Is_in_day_consume']>0].groupby(groupby).size()

def extract_info(df, type, groupby, column_name):
    factory = {
        'receive_type': receive_count,
        'cousume_type': cousume_count,
        'used_type': used_count
    }
    
    target_df = df[groupby].drop_duplicates()
    target_df['hash'] = 1
    
    series = factory[type](df, groupby)
    
    tdf = multi_join(target_df, column_name, series, groupby)
    tdf = tdf.fillna(0)
    
    mean = tdf[column_name].mean()
    tdf[column_name+'_occ'] = tdf[column_name] / mean
    return tdf[[column_name, column_name+'_occ']]

In [144]:
class GroupbyExtractor(TransformerMixin):
    def __init__(self, fn, type, groupby, column_name):
        self.fn = fn
        self.type = type
        self.groupby = groupby
        self.column_name = column_name
    
    def fit(self, *args, **kwargs):
        return self

    def transform(self, X, **transform_params):
        return self.fn(X, self.type, self.groupby, self.column_name)

In [194]:
# extract_info(base_features, 'receive_type', ['User_id'], 'User_receive_count')



df = base_features
type = 'receive_type' 
groupby =['User_id'] 
column_name = 'User_receive_count'

factory = {
    'receive_type': receive_count,
    'cousume_type': cousume_count,
    'used_type': used_count
}

target_df = df[groupby].drop_duplicates()
target_df = target_df.reset_index()

series = factory[type](df, groupby)

t = series.to_frame()
t.columns = [column_name]

# print(isinstance(t, pd.DataFrame))

tdf = pd.merge(target_df, t, on=groupby, how='left')
# tdf

# # tdf = multi_join(target_df, column_name, series, groupby)
# tdf = tdf.fillna(0)

# mean = tdf[column_name].mean()
# tdf[column_name+'_occ'] = tdf[column_name] / mean
# tdf[[column_name, column_name+'_occ']]

In [145]:
user_features_extract_processing = Pipeline([
    ('features', FeatureUnion([
        ('user_receive_count', GroupbyExtractor(extract_info, 'receive_type', ['User_id'], 'User_receive_count')),
        ('User_consume_count', GroupbyExtractor(extract_info, 'cousume_type', ['User_id'], 'User_consume_count')),
        ('User_used_count', GroupbyExtractor(extract_info, 'used_type', ['User_id'], 'User_used_count')),
    ]))
])

In [146]:
user_features_extract_processing.fit(base_features)
user_features_extract_processing.transform(base_features)

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat