# 特征工程

In [1]:
import featuretools as ft
import pandas as pd
import datetime
import logging
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler, Normalizer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector

In [2]:
def groupby2df(t, name):
    t = t.to_frame()
    t.columns = [name]
    return t

In [3]:
logger = logging.getLogger('ai')
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s')

In [4]:
offline_df = pd.read_csv('../source/ccf_offline_stage1_train.csv', parse_dates = ['Date_received', 'Date'])
online_df = pd.read_csv('../source/ccf_online_stage1_train.csv', parse_dates = ['Date_received', 'Date'])

In [5]:
pred_df = pd.read_csv('../source/ccf_offline_stage1_test_revised.csv', parse_dates = ['Date_received'])

## 数据集划分

In [6]:
time_range = [datetime.datetime(2016, 5, 16), datetime.datetime(2016, 6, 15)]

time_range_date_received = [datetime.datetime(2016, 2, 1), datetime.datetime(2016, 4, 30)]
time_range_date = [datetime.datetime(2016, 2, 1), datetime.datetime(2016, 5, 15)]

FILENAME = 'dataset_beta'

IS_PRED = False

In [7]:
# time_range = [datetime.datetime(2016, 4, 16), datetime.datetime(2016, 5, 15)]

# time_range_date_received = [datetime.datetime(2016, 1, 1), datetime.datetime(2016, 3, 31)]
# time_range_date = [datetime.datetime(2016, 1, 1), datetime.datetime(2016, 4, 15)]

# FILENAME = 'dataset_alpha'

# IS_PRED = False

In [8]:
def _split(row, time_range_date_received, time_range_date):
    if ((row.Date >= time_range_date[0]) & (row.Date <= time_range_date[1])) | ((row.Coupon_id == 0) & (row.Date_received >= time_range_date_received[0]) & (row.Date_received <= time_range_date_received[1])):
        return row
    
def dataset_fetch(time_range):
    dataset = offline_df[(offline_df.Date_received >= time_range[0]) & (offline_df.Date_received <= time_range[1])].copy()
    return dataset
    
def dataset_split(time_range_date_received, time_range_date):
    feature_offline = offline_df.loc[
        ((offline_df.Date >= time_range_date[0]) & (offline_df.Date <= time_range_date[1])) | 
        ((offline_df.Coupon_id == 0) & (offline_df.Date_received >= time_range_date_received[0]) & (offline_df.Date_received <= time_range_date_received[1]))]
    feature_online = online_df.loc[
        ((online_df.Date >= time_range_date[0]) & (online_df.Date <= time_range_date[1])) | 
        ((online_df.Coupon_id == 0) & (online_df.Date_received >= time_range_date_received[0]) & (online_df.Date_received <= time_range_date_received[1]))]
    
    return feature_offline, feature_online

if IS_PRED:
    dataset = pred_df
else:
    dataset = dataset_fetch(time_range)

feature_alpha_offline, feature_alpha_online = dataset_split(time_range_date_received, time_range_date)

## 基础数据特征

### 基础特征抽取

In [9]:
discount_cat = offline_df['Discount_rate'].unique()

In [10]:
def cal_coupon_feature(row):
    if isinstance(row.Discount_rate, str) and row.Discount_rate == 'fixed':
        row.Coupon_type = 2
        return row
    
    if isinstance(row.Discount_rate, float):
        row.Discount = row.Discount_rate
        i, = np.where(discount_cat == row.Discount_rate)
        if len(i)>0:
            row.Coupon_category = i[0]
        return row
    
    arr = row.Discount_rate.split(':')
    if len(arr) == 2:
        row.Discount =  (float(arr[0]) - float(arr[1])) / float(arr[0])
        row.Coupon_type = 1
        row.Base_consume = float(arr[0])
        row.Discount_money = float(arr[1])
    else:
        row.Discount = float(row.Discount_rate)
        
    i, = np.where(discount_cat == row.Discount_rate)
    if len(i)>0:
        row.Coupon_category = i[0]
    
    return row

In [11]:
def cal_previous_duration(row):
    if row['User_id'] == row['Previous_user_id'] and row['Date_received'] is not None and row['Previous_date_received'] is not None:
        return (row.Date_received - row.Previous_date_received).days
    
    return 0

def cal_next_duration(row):
    if row['User_id'] == row['Next_user_id'] and row['Date_received'] is not None and row['Next_date_received'] is not None:
        return (row.Next_date_received - row.Date_received).days
    
    return 0

In [12]:
def extract_basic_info(dataset):
    dataset['Distance'] = dataset['Distance'].fillna(-1)
    dataset['Distance'] = dataset['Distance'] + 1

    dataset['Month_of_received'] = dataset.apply(lambda row: row.Date_received.month, axis=1)
    dataset['Day_of_received'] = dataset.apply(lambda row: row.Date_received.day, axis=1)
    dataset['Weekday_of_received'] = dataset.apply(lambda row: row.Date_received.weekday() + 1, axis=1)

    dataset['Base_consume'] = 0.0
    dataset['Discount'] = 0.0
    dataset['Discount_money'] = 0.0
    dataset['Coupon_type'] = 0
    dataset['Coupon_category'] = 0

    dataset = dataset.apply(lambda row: cal_coupon_feature(row), axis=1)

    dataset = dataset.sort_values(by=['User_id', 'Date_received'], ascending=True)

    dataset['Previous_user_id'] = dataset['User_id'].shift(1)
    dataset['Previous_date_received'] = dataset['Date_received'].shift(1)

    dataset['Next_user_id'] = dataset['User_id'].shift(-1)
    dataset['Next_date_received'] = dataset['Date_received'].shift(-1)

    dataset['Previous_duration'] = dataset.apply(lambda row: cal_previous_duration(row), axis=1)
    dataset['Next_duration'] = dataset.apply(lambda row: cal_next_duration(row), axis=1)
    return dataset

In [13]:
dataset = extract_basic_info(dataset)

In [14]:
dataset.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,Month_of_received,Day_of_received,Weekday_of_received,...,Discount,Discount_money,Coupon_type,Coupon_category,Previous_user_id,Previous_date_received,Next_user_id,Next_date_received,Previous_duration,Next_duration
679793,4,1469,2902.0,0.95,11.0,2016-06-07,NaT,6,7,2,...,0.95,0.0,0,21,,NaT,165.0,2016-05-25,0,0
95903,165,4195,7571.0,30:5,1.0,2016-05-25,NaT,5,25,3,...,0.833333,5.0,1,4,4.0,2016-06-07,166.0,2016-05-25,0,0
679003,166,484,9261.0,20:1,0.0,2016-05-25,NaT,5,25,3,...,0.95,1.0,1,2,165.0,2016-05-25,215.0,2016-05-24,0,0
1264361,215,129,8944.0,30:5,2.0,2016-05-24,NaT,5,24,2,...,0.833333,5.0,1,4,166.0,2016-05-25,236.0,2016-05-28,0,0
1263171,236,4663,11002.0,150:20,3.0,2016-05-28,NaT,5,28,6,...,0.866667,20.0,1,1,215.0,2016-05-24,238.0,2016-06-08,0,0


### 预测区间特征提取

In [15]:
d = dataset.copy()

+ **特征o1**: 用户在预测区获取的优惠券数量

In [16]:
t = d.groupby(['User_id']).size()
d = pd.merge(d, groupby2df(t, 'o1'), on=['User_id'], how='left')

+ **特征o2**: 用户平均15天领取的优惠券数量

In [17]:
d['o2'] = d['o1'] / 15

+ **特征o3**: 用户平均每天领取多少张优惠券

In [18]:
u = d[['User_id', 'o1']].drop_duplicates()

t = d.groupby('User_id')['Date_received'].max()
u = pd.merge(u, groupby2df(t, 'r_max'), on=['User_id'], how='left')

t = d.groupby('User_id')['Date_received'].min()
u = pd.merge(u, groupby2df(t, 'r_min'), on=['User_id'], how='left')

u['r_day_duration'] = u.apply(lambda row: (row['r_max'] - row['r_min']).days, axis=1)
u['o3'] = u['o1'] / u['r_day_duration']
d = pd.merge(d, u[['User_id', 'o3']], on=['User_id'], how='left')

+ **特征o4**:预测区用户每种类型优惠券领取的数量
+ **特征o5**:预测区用户每种类型优惠券领取的数量在所有领取的优惠券中的比率

In [19]:
t = d.groupby(['User_id', 'Coupon_category']).size()
d = pd.merge(d, groupby2df(t, 'o4'), on=['User_id', 'Coupon_category'], how='left')

In [20]:
d['o5'] = d['o4'] / d['o1']

+ **特征o6**:预测区用户领取优惠券Coupon_id领取的数量
+ **特征o8**:预测区用户领取优惠券Coupon_id领取的数量在所有领取的优惠券中的比率

In [21]:
t = d.groupby(['User_id', 'Coupon_id']).size()
d = pd.merge(d, groupby2df(t, 'o6'), on=['User_id', 'Coupon_id'], how='left')

In [22]:
d['o8'] = d['o6'] / d['o1']

+ **特征o7**:预测区用户领取优惠券Coupon_id在领取日领取的数量
+ **特征o9**:预测区用户领取优惠券Coupon_id在领取日领取的数量在所有领取的优惠券中的比率

In [23]:
t = d.groupby(['User_id', 'Coupon_id', 'Date_received']).size()
d = pd.merge(d, groupby2df(t, 'o7'), on=['User_id', 'Coupon_id', 'Date_received'], how='left')

In [24]:
d['o9'] = d['o7'] / d['o1']

+ **特征o10**:预测区用户领取多少种不同的优惠券
+ **特征o14**:预测区用户平均每个领取的优惠券领取了多少张
+ **特征o12**:预测区用户领取的不同的优惠券占所有优惠券的比率

In [25]:
t = d[['User_id', 'Coupon_id']].drop_duplicates()
t = t.groupby(['User_id']).size()
d = pd.merge(d, groupby2df(t, 'o10'), on=['User_id'], how='left')

In [26]:
d['o12'] = d['o10'] / d['Coupon_id'].unique().size

In [27]:
d['o14'] = d['o1'] / d['o10']

+ **特征o11**:预测区每种优惠券被领取的张数

In [28]:
t = dataset.groupby(['Coupon_id']).size()
d = pd.merge(d, groupby2df(t, 'o11'), on=['Coupon_id'], how='left')

+ **特征o13**:预测区用户领取的不同的商户数
+ **特征o16**:预测区用户领取的不同的商户数占所有商户数的比例

In [29]:
t = d[['User_id', 'Merchant_id']].drop_duplicates()
t = t.groupby(['User_id']).size()
d = pd.merge(d, groupby2df(t, 'o13'), on=['User_id'], how='left')

In [30]:
d['o16'] = d['o13'] / d['Merchant_id'].unique().size

+ **特征o15**:预测区用户在每个消费的商户领取的优惠券数
+ **特征o18**:预测区用户在每个消费的商户领取的优惠券数在所有领取的优惠券中的比率

In [31]:
t = dataset.groupby(['User_id', 'Merchant_id']).size()
d = pd.merge(d, groupby2df(t, 'o15'), on=['User_id', 'Merchant_id'], how='left')

In [32]:
d['o18'] = d['o15'] / d['o1']

+ **特征o19**:预测区用户在每个距离上领取的优惠券数量
+ **特征o20**:预测区用户在每个距离上领取的优惠券数量在所有领取的优惠券中的比率

In [33]:
t = dataset.groupby(['User_id', 'Distance']).size()
d = pd.merge(d, groupby2df(t, 'o19'), on=['User_id', 'Distance'], how='left')

In [34]:
d['o20'] = d['o19'] / d['o1']

+ **特征o21 - o23** 用户领取的优惠券距离最大、最小、平均

In [35]:
t = dataset.groupby(['User_id'])['Distance'].max()
d = pd.merge(d, groupby2df(t, 'o21'), on=['User_id'], how='left')

In [36]:
t = dataset.groupby(['User_id'])['Distance'].min()
d = pd.merge(d, groupby2df(t, 'o22'), on=['User_id'], how='left')

In [37]:
t = dataset.groupby(['User_id'])['Distance'].mean()
d = pd.merge(d, groupby2df(t, 'o23'), on=['User_id'], how='left')

+ **特征o17**:预测区用户领取的不同的优惠券分类的优惠券数量
+ **特征o24**:预测区用户领取的不同的优惠券分类的优惠券数量在所有领取的优惠券中的比率

In [38]:
t = dataset.groupby(['User_id', 'Coupon_type']).size()
d = pd.merge(d, groupby2df(t, 'o17'), on=['User_id', 'Coupon_type'], how='left')

In [39]:
d['o24'] = d['o17'] / d['o1']

+ **特征o25**:预测区用户在领取日领取的优惠券数量
+ **特征o26**:预测区用户在领取日领取的优惠券数量在所有领取的优惠券中的比率

In [40]:
t = d.groupby(['User_id', 'Date_received']).size()
d = pd.merge(d, groupby2df(t, 'o25'), on=['User_id', 'Date_received'], how='left')

In [41]:
d['o26'] = d['o25'] / d['o1']

+ **特征o27 - o29**用户优惠券折扣的最大、最小、平均值

In [42]:
t = d.groupby(['User_id'])['Discount'].mean()
d = pd.merge(d, groupby2df(t, 'o27'), on=['User_id'], how='left')

In [43]:
t = d.groupby(['User_id'])['Discount'].max()
d = pd.merge(d, groupby2df(t, 'o28'), on=['User_id'], how='left')

In [44]:
t = d.groupby(['User_id'])['Discount'].min()
d = pd.merge(d, groupby2df(t, 'o29'), on=['User_id'], how='left')

+ **特征o30**:预测区每个商户被领用的优惠券数量
+ **特征o38**:预测区每个商户平均被每个用户领取的数量

In [45]:
t = d.groupby(['Merchant_id']).size()
d = pd.merge(d, groupby2df(t, 'o30'), on=['Merchant_id'], how='left')

In [46]:
d['o38'] = d['o30'] / d['User_id'].unique().size

+ **特征o31**:预测区用户在每周不同的weekday领取优惠券的数量
+ **特征o39**:预测区用户在每周不同的weekday领取优惠券的数量在所有领取的优惠券中的比率

In [47]:
t = d.groupby(['User_id', 'Weekday_of_received']).size()
d = pd.merge(d, groupby2df(t, 'o31'), on=['User_id', 'Weekday_of_received'], how='left')

In [48]:
d['o39'] = d['o31'] / d['o1']

+ **特征o32**:预测区商户被多少不同用户领取
+ **特征o33**:预测区商户被不同用户平均领取优惠券数量

In [49]:
t = d[['Merchant_id', 'User_id']].drop_duplicates()
t = t.groupby(['Merchant_id']).size()
d = pd.merge(d, groupby2df(t, 'o32'), on=['Merchant_id'], how='left')

In [50]:
d['o33'] = d['o30'] / d['o32']

+ **特征o34**:每家商户有多少张不同的优惠券

In [51]:
t = d[['Merchant_id', 'Coupon_id']].drop_duplicates()
t = t.groupby(['Merchant_id']).size()
d = pd.merge(d, groupby2df(t, 'o34'), on=['Merchant_id'], how='left')

+ **特征o35**:每张优惠券被多少不同的人领取了
+ **特征o36**:每张优惠券平均被每个领用的用户领取了多少张
+ **特征o37**:每张优惠券平均被每个用户领取了多少张

In [52]:
t = d[['Coupon_id', 'User_id']].drop_duplicates()
t = t.groupby(['Coupon_id']).size()
d = pd.merge(d, groupby2df(t, 'o35'), on=['Coupon_id'], how='left')

In [53]:
d['o36'] = d['o11'] / d['o35']

In [54]:
d['o37'] = d['o11'] / d['User_id'].unique().size

In [55]:
d = d.replace([np.inf, -np.inf], np.nan)

In [56]:
dataset = d.copy()

### Label抽取

In [57]:
if not IS_PRED:
    dataset['Duration'] = dataset.apply(lambda row: (row['Date'] - row['Date_received']).days, axis=1)
    dataset['Label'] = dataset.apply(lambda row: 1 if row['Duration'] < 16 else 0, axis=1)

In [58]:
dataset.head(10)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,Month_of_received,Day_of_received,Weekday_of_received,...,o31,o39,o32,o33,o34,o35,o36,o37,Duration,Label
0,4,1469,2902.0,0.95,11.0,2016-06-07,NaT,6,7,2,...,1,1.0,10308,1.175786,8,492,1.034553,0.002959,,0
1,165,4195,7571.0,30:5,1.0,2016-05-25,NaT,5,25,3,...,1,1.0,269,1.200743,1,269,1.200743,0.001878,,0
2,166,484,9261.0,20:1,0.0,2016-05-25,NaT,5,25,3,...,1,1.0,4,1.0,1,4,1.0,2.3e-05,,0
3,215,129,8944.0,30:5,2.0,2016-05-24,NaT,5,24,2,...,1,1.0,553,1.099458,1,553,1.099458,0.003535,,0
4,236,4663,11002.0,150:20,3.0,2016-05-28,NaT,5,28,6,...,1,1.0,10771,1.005571,2,7730,1.0,0.044939,,0
5,238,760,2418.0,30:5,5.0,2016-06-08,NaT,6,8,3,...,1,1.0,30614,1.023976,7,29269,1.000512,0.170246,,0
6,239,3465,9762.0,10:1,11.0,2016-06-14,NaT,6,14,2,...,2,1.0,5,1.6,2,4,1.0,2.3e-05,,0
7,239,3465,1255.0,10:1,11.0,2016-06-14,NaT,6,14,2,...,2,1.0,5,1.6,2,4,1.0,2.3e-05,,0
8,315,3621,1141.0,20:5,2.0,2016-05-18,NaT,5,18,3,...,1,0.5,13923,1.246211,9,1283,1.0,0.007459,,0
9,315,3621,4033.0,20:5,2.0,2016-05-19,NaT,5,19,4,...,1,0.5,13923,1.246211,9,2666,1.036759,0.016069,,0


## 用户特征抽取

In [59]:
def get_count(df, keys, suffix):
    t = receive.groupby(keys).size()
    df = pd.merge(df, groupby2df(t, suffix+'1'), on=keys, how='left')

    t = consume.groupby(keys).size()
    df = pd.merge(df, groupby2df(t, suffix+'2'), on=keys, how='left')

    t = use.groupby(keys).size()
    df = pd.merge(df, groupby2df(t, suffix+'3'), on=keys, how='left')

    df[suffix+'4'] = df[suffix+'1'] - df[suffix+'3']
    df[suffix+'5'] = df[suffix+'1'] - df[suffix+'2']
    
    df[suffix+'6'] = df[suffix+'3'] / df[suffix+'4']
    df[suffix+'7'] = df[suffix+'2'] / df[suffix+'5']
    return df

In [60]:
def process_mean_max_min(d, df, keys, column_name, prefix, count):
    t = d.groupby(keys)[column_name].mean()
    df = pd.merge(df, groupby2df(t, prefix+str(count)), on=keys, how='left')

    count = count + 1
    t = d.groupby(keys)[column_name].max()
    df = pd.merge(df, groupby2df(t, prefix+str(count)), on=keys, how='left')

    count = count + 1
    t = d.groupby(keys)[column_name].min()
    df = pd.merge(df, groupby2df(t, prefix+str(count)), on=keys, how='left')
    return df

### 拆分特征数据集合

In [61]:
feature_alpha_offline = extract_basic_info(feature_alpha_offline)

feature_alpha_offline['Duration'] = feature_alpha_offline.apply(lambda row: (row['Date'] - row['Date_received']).days, axis=1)
feature_alpha_offline['Label'] = feature_alpha_offline.apply(lambda row: 1 if row['Duration'] < 16 else 0, axis=1)

In [62]:
receive = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0]
consume = receive[receive['Duration'] >= 0]
use = receive[receive['Label']==1]

### 用户特征

In [63]:
u = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['User_id']].drop_duplicates()

+ **特征u1**:用户领取优惠券的数量
+ **特征u2**:用户消费优惠券的数量
+ **特征u3**:用户15天内消费优惠券的数量
+ **特征u4**:用户15天内没有消费优惠券的数量
+ **特征u5**:用户用户没有消费优惠券的数量
+ **特征u6**:用户15天内消费优惠券的数量 比 用户15天内没有消费优惠券的数量
+ **特征u7**:用户消费优惠券的数量 比 用户用户没有消费优惠券的数量

In [64]:
u = get_count(u, ['User_id'], 'u')

+ **特征8**:用户消费优惠券消费天数的平均值
+ **特征9**:用户消费优惠券消费天数的最大值
+ **特征10**:用户消费优惠券消费天数的最小值

In [65]:
u = process_mean_max_min(consume, u, ['User_id'], 'Duration', 'u', 8)

In [66]:
u = process_mean_max_min(consume, u, ['User_id'], 'Discount', 'u', 11)

In [67]:
t = consume[['User_id', 'Coupon_id']].drop_duplicates()
t = t.groupby(['User_id']).size()
u = pd.merge(u, groupby2df(t, 'u14'), on=['User_id'], how='left')

t = use[['User_id', 'Coupon_id']].drop_duplicates()
t = t.groupby(['User_id']).size()
u = pd.merge(u, groupby2df(t, 'u15'), on=['User_id'], how='left')

In [68]:
t = consume[['User_id', 'Merchant_id']].drop_duplicates()
t = t.groupby(['User_id']).size()
u = pd.merge(u, groupby2df(t, 'u16'), on=['User_id'], how='left')

t = use[['User_id', 'Merchant_id']].drop_duplicates()
t = t.groupby(['User_id']).size()
u = pd.merge(u, groupby2df(t, 'u17'), on=['User_id'], how='left')

In [69]:
u['u18'] = u['u3'] / u['u1']
u['u19'] = u['u2'] / u['u1']

u['u20'] = u['u3'] / 15
u['u21'] = u['u2'] / 15
u['u22'] = u['u1'] / 15

In [70]:
u = process_mean_max_min(use, u, ['User_id'], 'Duration', 'u', 23)

In [71]:
dataset = pd.merge(dataset, u, on=['User_id'], how='left')

### 用户-优惠券分类特征

In [72]:
ucc = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['User_id', 'Coupon_category']].drop_duplicates()

In [73]:
ucc = get_count(ucc, ['User_id', 'Coupon_category'], 'ucc')

In [74]:
ucc = process_mean_max_min(consume, ucc, ['User_id', 'Coupon_category'], 'Duration', 'ucc', 8)

In [75]:
dataset = pd.merge(dataset, ucc, on=['User_id', 'Coupon_category'], how='left')

In [76]:
dataset['ucc11'] = dataset['ucc3'] / dataset['u1']
dataset['ucc12'] = dataset['ucc2'] / dataset['u1']

### 用户 - 优惠券特征

In [77]:
uc = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['User_id', 'Coupon_id']].drop_duplicates()

In [78]:
uc = get_count(uc, ['User_id', 'Coupon_id'], 'uc')

In [79]:
uc = process_mean_max_min(consume, uc, ['User_id', 'Coupon_id'], 'Duration', 'uc', 8)

In [80]:
dataset = pd.merge(dataset, uc, on=['User_id', 'Coupon_id'], how='left')

In [81]:
dataset['uc11'] = dataset['uc3'] / dataset['u1']
dataset['uc12'] = dataset['uc2'] / dataset['u1']

### 用户 - 距离特征

In [82]:
ud = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['User_id', 'Distance']].drop_duplicates()

In [83]:
ud = get_count(ud, ['User_id', 'Distance'], 'ud')

In [84]:
ud = process_mean_max_min(consume, ud, ['User_id', 'Distance'], 'Duration', 'ud', 8)

In [85]:
dataset = pd.merge(dataset, ud, on=['User_id', 'Distance'], how='left')

In [86]:
dataset['ud11'] = dataset['ud3'] / dataset['u1']
dataset['ud12'] = dataset['ud2'] / dataset['u1']

### 商户特征

In [87]:
m = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['Merchant_id']].drop_duplicates()

In [88]:
m = get_count(m, ['Merchant_id'], 'm')

In [89]:
m = process_mean_max_min(consume, m, ['Merchant_id'], 'Duration', 'm', 8)

In [90]:
dataset = pd.merge(dataset, m, on=['Merchant_id'], how='left')

### 优惠券特征

In [91]:
c = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['Coupon_id']].drop_duplicates()

In [92]:
c = get_count(c, ['Coupon_id'], 'c')

In [93]:
c = process_mean_max_min(consume, c, ['Coupon_id'], 'Duration', 'c', 8)

In [94]:
dataset = pd.merge(dataset, c, on=['Coupon_id'], how='left')

### 线上用户特征

In [95]:
ou = feature_alpha_online[['User_id']].drop_duplicates()

t = feature_alpha_online.groupby(['User_id']).size()
ou = pd.merge(ou, groupby2df(t, 'ou1'), on=['User_id'], how='left')

t = feature_alpha_online[feature_alpha_online.Action == 0].groupby(['User_id']).size()
ou = pd.merge(ou, groupby2df(t, 'ou2'), on=['User_id'], how='left')

t = feature_alpha_online[feature_alpha_online.Action == 1].groupby(['User_id']).size()
ou = pd.merge(ou, groupby2df(t, 'ou3'), on=['User_id'], how='left')

t = feature_alpha_online[feature_alpha_online.Action == 2].groupby(['User_id']).size()
ou = pd.merge(ou, groupby2df(t, 'ou4'), on=['User_id'], how='left')

In [96]:
dataset = pd.merge(dataset, ou, on=['User_id'], how='left')

## 构造特征选择器

In [97]:
len(dataset.columns.values)

147

In [98]:
dataset.columns.values

array(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'Month_of_received', 'Day_of_received',
       'Weekday_of_received', 'Base_consume', 'Discount',
       'Discount_money', 'Coupon_type', 'Coupon_category',
       'Previous_user_id', 'Previous_date_received', 'Next_user_id',
       'Next_date_received', 'Previous_duration', 'Next_duration', 'o1',
       'o2', 'o3', 'o4', 'o5', 'o6', 'o8', 'o7', 'o9', 'o10', 'o12',
       'o14', 'o11', 'o13', 'o16', 'o15', 'o18', 'o19', 'o20', 'o21',
       'o22', 'o23', 'o17', 'o24', 'o25', 'o26', 'o27', 'o28', 'o29',
       'o30', 'o38', 'o31', 'o39', 'o32', 'o33', 'o34', 'o35', 'o36',
       'o37', 'Duration', 'Label', 'u1', 'u2', 'u3', 'u4', 'u5', 'u6',
       'u7', 'u8', 'u9', 'u10', 'u11', 'u12', 'u13', 'u14', 'u15', 'u16',
       'u17', 'u18', 'u19', 'u20', 'u21', 'u22', 'u23', 'u24', 'u25',
       'ucc1', 'ucc2', 'ucc3', 'ucc4', 'ucc5', 'ucc6', 'ucc7', 'ucc8',
       'ucc9', 'ucc10', 'ucc1

In [99]:
dataset = dataset.replace([np.inf, -np.inf], np.nan)

In [100]:
continous = [
    'Distance',
    'Month_of_received', 'Day_of_received',
       'Weekday_of_received', 'Base_consume', 'Discount',
       'Discount_money', 'Coupon_type', 'Coupon_category',
    'Previous_duration', 'Next_duration', 'o1',
       'o2', 'o3', 'o4', 'o5', 'o6', 'o8', 'o7', 'o9', 'o10', 'o12',
       'o14', 'o11', 'o13', 'o16', 'o15', 'o18', 'o19', 'o20', 'o21',
       'o22', 'o23', 'o17', 'o24', 'o25', 'o26', 'o27', 'o28', 'o29',
       'o30', 'o38', 'o31', 'o39', 'o32', 'o33', 'o34', 'o35', 'o36',
       'o37', 'u1', 'u2', 'u3', 'u4', 'u5', 'u6',
       'u7', 'u8', 'u9', 'u10', 'u11', 'u12', 'u13', 'u14', 'u15', 'u16',
       'u17', 'u18', 'u19', 'u20', 'u21', 'u22', 'u23', 'u24', 'u25',
       'ucc1', 'ucc2', 'ucc3', 'ucc4', 'ucc5', 'ucc6', 'ucc7', 'ucc8',
       'ucc9', 'ucc10', 'ucc11', 'ucc12', 'uc1', 'uc2', 'uc3', 'uc4',
       'uc5', 'uc6', 'uc7', 'uc8', 'uc9', 'uc10', 'uc11', 'uc12', 'ud1',
       'ud2', 'ud3', 'ud4', 'ud5', 'ud6', 'ud7', 'ud8', 'ud9', 'ud10',
       'ud11', 'ud12', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8',
       'm9', 'm10', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9',
       'c10', 'ou1', 'ou2', 'ou3', 'ou4']

label = ['Label']

In [101]:
feature_processor = Pipeline([
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnSelector(continous)),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
#             ('scale', MinMaxScaler(copy=True, feature_range=(0, 1))),
            ('normalize', Normalizer())
        ])),
    ])),
#     ('sc4gbdt', StandardScaler()),
])

In [102]:
feature_processor.fit(dataset, dataset['Label'].values.ravel())
clf = GradientBoostingClassifier(max_depth=3, n_estimators=100, random_state=0)
clf.fit(feature_processor.transform(dataset), dataset['Label'].values.ravel())

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=0,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [103]:
clf.feature_importances_

array([3.48438950e-03, 0.00000000e+00, 6.86578135e-04, 0.00000000e+00,
       3.11658088e-02, 4.51838101e-03, 9.50587334e-03, 0.00000000e+00,
       7.32703232e-03, 2.14992184e-03, 2.18598857e-01, 0.00000000e+00,
       0.00000000e+00, 1.58826350e-03, 9.57583859e-02, 7.90730932e-04,
       1.29728839e-01, 3.74979374e-03, 0.00000000e+00, 8.82799870e-03,
       9.16659781e-03, 0.00000000e+00, 1.55920334e-02, 2.25009897e-02,
       7.02043921e-03, 0.00000000e+00, 9.92007051e-02, 6.89961590e-03,
       3.41012217e-04, 1.14017600e-03, 2.95846577e-03, 4.56959341e-04,
       0.00000000e+00, 1.03818447e-04, 2.07862929e-03, 4.39676927e-04,
       2.88753564e-02, 4.63810851e-04, 1.19674656e-03, 7.56511335e-04,
       4.90870175e-03, 0.00000000e+00, 0.00000000e+00, 6.82630570e-03,
       3.34132402e-02, 4.18031449e-02, 1.56154383e-02, 1.07991843e-02,
       5.13550028e-03, 0.00000000e+00, 0.00000000e+00, 4.73155898e-04,
       6.39129925e-04, 0.00000000e+00, 0.00000000e+00, 7.11998559e-04,
      

In [104]:
feature_selector = []
for index, value in enumerate(clf.feature_importances_):
    if value > 0:
        feature_selector.append(continous[index])

logger.info(len(feature_selector))
feature_selector

2019-02-06 01:02:16,077  <ipython-input-104-ac8315c57481> : INFO  83


['Distance',
 'Day_of_received',
 'Base_consume',
 'Discount',
 'Discount_money',
 'Coupon_category',
 'Previous_duration',
 'Next_duration',
 'o3',
 'o4',
 'o5',
 'o6',
 'o8',
 'o9',
 'o10',
 'o14',
 'o11',
 'o13',
 'o15',
 'o18',
 'o19',
 'o20',
 'o21',
 'o22',
 'o17',
 'o24',
 'o25',
 'o26',
 'o27',
 'o28',
 'o29',
 'o30',
 'o39',
 'o32',
 'o33',
 'o34',
 'o35',
 'o36',
 'u2',
 'u3',
 'u6',
 'u7',
 'u8',
 'u9',
 'u10',
 'u12',
 'u13',
 'u14',
 'u16',
 'u21',
 'u24',
 'ucc2',
 'ucc6',
 'ucc7',
 'ucc8',
 'ucc9',
 'ucc11',
 'uc1',
 'uc5',
 'uc7',
 'uc10',
 'uc11',
 'uc12',
 'ud5',
 'ud6',
 'ud9',
 'ud10',
 'ud11',
 'ud12',
 'm2',
 'm3',
 'm4',
 'm5',
 'm6',
 'm7',
 'm8',
 'm9',
 'c1',
 'c2',
 'c3',
 'c4',
 'c6',
 'c7']

## 保存数据

In [105]:
dataset = dataset.replace([np.inf, -np.inf], np.nan)

In [106]:
dataset.to_csv('../features/' + FILENAME + '.csv')