# 特征工程

In [1]:
import pandas as pd
import datetime
import logging
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler, Normalizer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector

In [2]:
def groupby2df(t, name):
    t = t.to_frame()
    t.columns = [name]
    return t

In [3]:
logger = logging.getLogger('ai')
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s')

In [4]:
offline_df = pd.read_csv('../source/ccf_offline_stage1_train.csv', parse_dates = ['Date_received', 'Date'])
online_df = pd.read_csv('../source/ccf_online_stage1_train.csv', parse_dates = ['Date_received', 'Date'])

In [5]:
pred_df = pd.read_csv('../source/ccf_offline_stage1_test_revised.csv', parse_dates = ['Date_received'])

## 数据集划分

In [6]:
# time_range = [datetime.datetime(2016, 5, 16), datetime.datetime(2016, 6, 15)]

# time_range_date_received = [datetime.datetime(2016, 2, 1), datetime.datetime(2016, 4, 30)]
# time_range_date = [datetime.datetime(2016, 2, 1), datetime.datetime(2016, 5, 15)]

# FILENAME = 'dataset_beta'

# IS_PRED = False

In [7]:
time_range = [datetime.datetime(2016, 4, 16), datetime.datetime(2016, 5, 15)]

time_range_date_received = [datetime.datetime(2016, 1, 1), datetime.datetime(2016, 3, 31)]
time_range_date = [datetime.datetime(2016, 1, 1), datetime.datetime(2016, 4, 15)]

FILENAME = 'dataset_alpha'

IS_PRED = False

In [8]:
def _split(row, time_range_date_received, time_range_date):
    if ((row.Date >= time_range_date[0]) & (row.Date <= time_range_date[1])) | ((row.Coupon_id == 0) & (row.Date_received >= time_range_date_received[0]) & (row.Date_received <= time_range_date_received[1])):
        return row
    
def dataset_fetch(time_range):
    dataset = offline_df[(offline_df.Date_received >= time_range[0]) & (offline_df.Date_received <= time_range[1])].copy()
    return dataset
    
def dataset_split(time_range_date_received, time_range_date):
    feature_offline = offline_df.loc[
        ((offline_df.Date >= time_range_date[0]) & (offline_df.Date <= time_range_date[1])) | 
        ((offline_df.Coupon_id == 0) & (offline_df.Date_received >= time_range_date_received[0]) & (offline_df.Date_received <= time_range_date_received[1]))]
    feature_online = online_df.loc[
        ((online_df.Date >= time_range_date[0]) & (online_df.Date <= time_range_date[1])) | 
        ((online_df.Coupon_id == 0) & (online_df.Date_received >= time_range_date_received[0]) & (online_df.Date_received <= time_range_date_received[1]))]
    
    return feature_offline, feature_online

if IS_PRED:
    dataset = pred_df
else:
    dataset = dataset_fetch(time_range)

feature_alpha_offline, feature_alpha_online = dataset_split(time_range_date_received, time_range_date)

## 基础数据特征

### 基础特征抽取

In [9]:
discount_cat = offline_df['Discount_rate'].unique()

In [10]:
def cal_coupon_feature(row):
    if isinstance(row.Discount_rate, str) and row.Discount_rate == 'fixed':
        row.Coupon_type = 2
        return row
    
    if isinstance(row.Discount_rate, float):
        row.Discount = row.Discount_rate
        i, = np.where(discount_cat == row.Discount_rate)
        if len(i)>0:
            row.Coupon_category = i[0]
        return row
    
    arr = row.Discount_rate.split(':')
    if len(arr) == 2:
        row.Discount =  (float(arr[0]) - float(arr[1])) / float(arr[0])
        row.Coupon_type = 1
        row.Base_consume = float(arr[0])
        row.Discount_money = float(arr[1])
    else:
        row.Discount = float(row.Discount_rate)
        
    i, = np.where(discount_cat == row.Discount_rate)
    if len(i)>0:
        row.Coupon_category = i[0]
    
    return row

In [11]:
def cal_previous_duration(row):
    if row['User_id'] == row['Previous_user_id'] and row['Date_received'] is not None and row['Previous_date_received'] is not None:
        return (row.Date_received - row.Previous_date_received).days
    
    return 0

def cal_next_duration(row):
    if row['User_id'] == row['Next_user_id'] and row['Date_received'] is not None and row['Next_date_received'] is not None:
        return (row.Next_date_received - row.Date_received).days
    
    return 0

In [12]:
def extract_basic_info(dataset):
    dataset['Distance'] = dataset['Distance'].fillna(-1)
    dataset['Distance'] = dataset['Distance'] + 1

    dataset['Month_of_received'] = dataset.apply(lambda row: row.Date_received.month, axis=1)
    dataset['Day_of_received'] = dataset.apply(lambda row: row.Date_received.day, axis=1)
    dataset['Weekday_of_received'] = dataset.apply(lambda row: row.Date_received.weekday() + 1, axis=1)

    dataset['Base_consume'] = 0.0
    dataset['Discount'] = 0.0
    dataset['Discount_money'] = 0.0
    dataset['Coupon_type'] = 0
    dataset['Coupon_category'] = 0

    dataset = dataset.apply(lambda row: cal_coupon_feature(row), axis=1)

    dataset = dataset.sort_values(by=['User_id', 'Date_received'], ascending=True)

    dataset['Previous_user_id'] = dataset['User_id'].shift(1)
    dataset['Previous_date_received'] = dataset['Date_received'].shift(1)

    dataset['Next_user_id'] = dataset['User_id'].shift(-1)
    dataset['Next_date_received'] = dataset['Date_received'].shift(-1)

    dataset['Previous_duration'] = dataset.apply(lambda row: cal_previous_duration(row), axis=1)
    dataset['Next_duration'] = dataset.apply(lambda row: cal_next_duration(row), axis=1)
    return dataset

In [13]:
dataset = extract_basic_info(dataset)

In [14]:
dataset.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,Month_of_received,Day_of_received,Weekday_of_received,...,Discount,Discount_money,Coupon_type,Coupon_category,Previous_user_id,Previous_date_received,Next_user_id,Next_date_received,Previous_duration,Next_duration
1264170,285,450,1532.0,30:5,3.0,2016-05-01,NaT,5,1,7,...,0.833333,5.0,1,4,,NaT,316.0,2016-04-30,0,0
679006,316,7974,8952.0,50:10,1.0,2016-04-30,NaT,4,30,6,...,0.8,10.0,1,5,285.0,2016-05-01,377.0,2016-05-12,0,0
95577,377,4906,2857.0,30:5,0.0,2016-05-12,NaT,5,12,4,...,0.833333,5.0,1,4,316.0,2016-04-30,387.0,2016-04-21,0,0
1265626,387,3381,7610.0,200:20,1.0,2016-04-21,NaT,4,21,4,...,0.9,20.0,1,3,377.0,2016-05-12,430.0,2016-04-19,0,0
679360,430,7555,9871.0,30:5,0.0,2016-04-19,NaT,4,19,2,...,0.833333,5.0,1,4,387.0,2016-04-21,467.0,2016-04-17,0,0


### 预测区间特征提取

In [15]:
d = dataset.copy()

+ **特征o1**: 用户在预测区获取的优惠券数量

In [16]:
t = d.groupby(['User_id']).size()
d = pd.merge(d, groupby2df(t, 'o1'), on=['User_id'], how='left')

+ **特征o2**: 用户平均15天领取的优惠券数量

In [17]:
d['o2'] = d['o1'] / 15

+ **特征o3**: 用户平均每天领取多少张优惠券

In [18]:
u = d[['User_id', 'o1']].drop_duplicates()

t = d.groupby('User_id')['Date_received'].max()
u = pd.merge(u, groupby2df(t, 'r_max'), on=['User_id'], how='left')

t = d.groupby('User_id')['Date_received'].min()
u = pd.merge(u, groupby2df(t, 'r_min'), on=['User_id'], how='left')

u['r_day_duration'] = u.apply(lambda row: (row['r_max'] - row['r_min']).days, axis=1)
u['o3'] = u['o1'] / u['r_day_duration']
d = pd.merge(d, u[['User_id', 'o3']], on=['User_id'], how='left')

+ **特征o4**:预测区用户每种类型优惠券领取的数量
+ **特征o5**:预测区用户每种类型优惠券领取的数量在所有领取的优惠券中的比率

In [19]:
t = d.groupby(['User_id', 'Coupon_category']).size()
d = pd.merge(d, groupby2df(t, 'o4'), on=['User_id', 'Coupon_category'], how='left')

In [20]:
d['o5'] = d['o4'] / d['o1']

+ **特征o6**:预测区用户领取优惠券Coupon_id领取的数量
+ **特征o8**:预测区用户领取优惠券Coupon_id领取的数量在所有领取的优惠券中的比率

In [21]:
t = d.groupby(['User_id', 'Coupon_id']).size()
d = pd.merge(d, groupby2df(t, 'o6'), on=['User_id', 'Coupon_id'], how='left')

In [22]:
d['o8'] = d['o6'] / d['o1']

+ **特征o7**:预测区用户领取优惠券Coupon_id在领取日领取的数量
+ **特征o9**:预测区用户领取优惠券Coupon_id在领取日领取的数量在所有领取的优惠券中的比率

In [23]:
t = d.groupby(['User_id', 'Coupon_id', 'Date_received']).size()
d = pd.merge(d, groupby2df(t, 'o7'), on=['User_id', 'Coupon_id', 'Date_received'], how='left')

In [24]:
d['o9'] = d['o7'] / d['o1']

+ **特征o10**:预测区用户领取多少种不同的优惠券
+ **特征o14**:预测区用户平均每个领取的优惠券领取了多少张
+ **特征o12**:预测区用户领取的不同的优惠券占所有优惠券的比率

In [25]:
t = d[['User_id', 'Coupon_id']].drop_duplicates()
t = t.groupby(['User_id']).size()
d = pd.merge(d, groupby2df(t, 'o10'), on=['User_id'], how='left')

In [26]:
d['o12'] = d['o10'] / d['Coupon_id'].unique().size

In [27]:
d['o14'] = d['o1'] / d['o10']

+ **特征o11**:预测区每种优惠券被领取的张数

In [28]:
t = dataset.groupby(['Coupon_id']).size()
d = pd.merge(d, groupby2df(t, 'o11'), on=['Coupon_id'], how='left')

+ **特征o13**:预测区用户领取的不同的商户数
+ **特征o16**:预测区用户领取的不同的商户数占所有商户数的比例

In [29]:
t = d[['User_id', 'Merchant_id']].drop_duplicates()
t = t.groupby(['User_id']).size()
d = pd.merge(d, groupby2df(t, 'o13'), on=['User_id'], how='left')

In [30]:
d['o16'] = d['o13'] / d['Merchant_id'].unique().size

+ **特征o15**:预测区用户在每个消费的商户领取的优惠券数
+ **特征o18**:预测区用户在每个消费的商户领取的优惠券数在所有领取的优惠券中的比率

In [31]:
t = dataset.groupby(['User_id', 'Merchant_id']).size()
d = pd.merge(d, groupby2df(t, 'o15'), on=['User_id', 'Merchant_id'], how='left')

In [32]:
d['o18'] = d['o15'] / d['o1']

+ **特征o19**:预测区用户在每个距离上领取的优惠券数量
+ **特征o20**:预测区用户在每个距离上领取的优惠券数量在所有领取的优惠券中的比率

In [33]:
t = dataset.groupby(['User_id', 'Distance']).size()
d = pd.merge(d, groupby2df(t, 'o19'), on=['User_id', 'Distance'], how='left')

In [34]:
d['o20'] = d['o19'] / d['o1']

+ **特征o21 - o23** 用户领取的优惠券距离最大、最小、平均

In [35]:
t = dataset.groupby(['User_id'])['Distance'].max()
d = pd.merge(d, groupby2df(t, 'o21'), on=['User_id'], how='left')

In [36]:
t = dataset.groupby(['User_id'])['Distance'].min()
d = pd.merge(d, groupby2df(t, 'o22'), on=['User_id'], how='left')

In [37]:
t = dataset.groupby(['User_id'])['Distance'].mean()
d = pd.merge(d, groupby2df(t, 'o23'), on=['User_id'], how='left')

+ **特征o17**:预测区用户领取的不同的优惠券分类的优惠券数量
+ **特征o24**:预测区用户领取的不同的优惠券分类的优惠券数量在所有领取的优惠券中的比率

In [38]:
t = dataset.groupby(['User_id', 'Coupon_type']).size()
d = pd.merge(d, groupby2df(t, 'o17'), on=['User_id', 'Coupon_type'], how='left')

In [39]:
d['o24'] = d['o17'] / d['o1']

+ **特征o25**:预测区用户在领取日领取的优惠券数量
+ **特征o26**:预测区用户在领取日领取的优惠券数量在所有领取的优惠券中的比率

In [40]:
t = d.groupby(['User_id', 'Date_received']).size()
d = pd.merge(d, groupby2df(t, 'o25'), on=['User_id', 'Date_received'], how='left')

In [41]:
d['o26'] = d['o25'] / d['o1']

+ **特征o27 - o29**用户优惠券折扣的最大、最小、平均值

In [42]:
t = d.groupby(['User_id'])['Discount'].mean()
d = pd.merge(d, groupby2df(t, 'o27'), on=['User_id'], how='left')

In [43]:
t = d.groupby(['User_id'])['Discount'].max()
d = pd.merge(d, groupby2df(t, 'o28'), on=['User_id'], how='left')

In [44]:
t = d.groupby(['User_id'])['Discount'].min()
d = pd.merge(d, groupby2df(t, 'o29'), on=['User_id'], how='left')

+ **特征o30**:预测区每个商户被领用的优惠券数量
+ **特征o38**:预测区每个商户平均被每个用户领取的数量

In [45]:
t = d.groupby(['Merchant_id']).size()
d = pd.merge(d, groupby2df(t, 'o30'), on=['Merchant_id'], how='left')

In [46]:
d['o38'] = d['o30'] / d['User_id'].unique().size

+ **特征o31**:预测区用户在每周不同的weekday领取优惠券的数量
+ **特征o39**:预测区用户在每周不同的weekday领取优惠券的数量在所有领取的优惠券中的比率
+ **特征o40**:预测区用户在每周不同的month领取优惠券的数量
+ **特征o41**:预测区用户在每周不同的month领取优惠券的数量在所有领取的优惠券中的比率
+ **特征o42**:预测区用户在每周不同的day领取优惠券的数量
+ **特征o43**:预测区用户在每周不同的day领取优惠券的数量在所有领取的优惠券中的比率

In [47]:
t = d.groupby(['User_id', 'Weekday_of_received']).size()
d = pd.merge(d, groupby2df(t, 'o31'), on=['User_id', 'Weekday_of_received'], how='left')

In [48]:
d['o39'] = d['o31'] / d['o1']

In [49]:
t = d.groupby(['User_id', 'Month_of_received']).size()
d = pd.merge(d, groupby2df(t, 'o40'), on=['User_id', 'Month_of_received'], how='left')

d['o41'] = d['o40'] / d['o1']

In [50]:
t = d.groupby(['User_id', 'Day_of_received']).size()
d = pd.merge(d, groupby2df(t, 'o42'), on=['User_id', 'Day_of_received'], how='left')

d['o43'] = d['o42'] / d['o1']

+ **特征o32**:预测区商户被多少不同用户领取
+ **特征o33**:预测区商户被不同用户平均领取优惠券数量

In [51]:
t = d[['Merchant_id', 'User_id']].drop_duplicates()
t = t.groupby(['Merchant_id']).size()
d = pd.merge(d, groupby2df(t, 'o32'), on=['Merchant_id'], how='left')

In [52]:
d['o33'] = d['o30'] / d['o32']

+ **特征o34**:每家商户有多少张不同的优惠券

In [53]:
t = d[['Merchant_id', 'Coupon_id']].drop_duplicates()
t = t.groupby(['Merchant_id']).size()
d = pd.merge(d, groupby2df(t, 'o34'), on=['Merchant_id'], how='left')

+ **特征o35**:每张优惠券被多少不同的人领取了
+ **特征o36**:每张优惠券平均被每个领用的用户领取了多少张
+ **特征o37**:每张优惠券平均被每个用户领取了多少张

In [54]:
t = d[['Coupon_id', 'User_id']].drop_duplicates()
t = t.groupby(['Coupon_id']).size()
d = pd.merge(d, groupby2df(t, 'o35'), on=['Coupon_id'], how='left')

In [55]:
d['o36'] = d['o11'] / d['o35']

In [56]:
d['o37'] = d['o11'] / d['User_id'].unique().size

+ **特征o44**:预测区用户next duration小于16天的次数

In [133]:
t = d[d['Next_duration']<16]
t = t.groupby(['User_id'])['Next_duration'].size()
d = pd.merge(d, groupby2df(t, 'o44'), on=['User_id'], how='left')

In [57]:
d = d.replace([np.inf, -np.inf], np.nan)

In [58]:
dataset = d.copy()

### Label抽取

In [59]:
if not IS_PRED:
    dataset['Duration'] = dataset.apply(lambda row: (row['Date'] - row['Date_received']).days, axis=1)
    dataset['Label'] = dataset.apply(lambda row: 1 if row['Duration'] < 16 else 0, axis=1)

In [60]:
dataset.head(10)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,Month_of_received,Day_of_received,Weekday_of_received,...,o42,o43,o32,o33,o34,o35,o36,o37,Duration,Label
0,285,450,1532.0,30:5,3.0,2016-05-01,NaT,5,1,7,...,1,1.0,9926,1.037981,5,1871,1.000534,0.018191,,0
1,316,7974,8952.0,50:10,1.0,2016-04-30,NaT,4,30,6,...,1,1.0,360,1.0,1,360,1.0,0.003498,,0
2,377,4906,2857.0,30:5,0.0,2016-05-12,NaT,5,12,4,...,1,1.0,28,1.0,3,11,1.0,0.000107,,0
3,387,3381,7610.0,200:20,1.0,2016-04-21,NaT,4,21,4,...,1,1.0,19613,1.00051,1,19613,1.00051,0.190683,,0
4,430,7555,9871.0,30:5,0.0,2016-04-19,NaT,4,19,2,...,1,1.0,4235,1.006612,3,4132,1.000242,0.040162,,0
5,467,7555,9871.0,30:5,5.0,2016-04-17,NaT,4,17,7,...,1,1.0,4235,1.006612,3,4132,1.000242,0.040162,,0
6,470,7717,9614.0,20:1,3.0,2016-04-24,NaT,4,24,7,...,1,1.0,2648,1.069486,3,2288,1.000437,0.022243,,0
7,472,2436,3992.0,30:5,0.0,2016-05-04,NaT,5,4,3,...,1,1.0,4859,1.022433,4,4487,1.0,0.043602,,0
8,489,6568,4723.0,30:1,0.0,2016-04-25,NaT,4,25,1,...,1,0.5,53,1.075472,3,38,1.052632,0.000389,,0
9,489,4057,5112.0,20:1,3.0,2016-04-29,NaT,4,29,5,...,1,0.5,12,1.0,2,6,1.0,5.8e-05,,0


## 用户特征抽取

In [61]:
def get_count(df, keys, suffix):
    t = receive.groupby(keys).size()
    df = pd.merge(df, groupby2df(t, suffix+'1'), on=keys, how='left')

    t = consume.groupby(keys).size()
    df = pd.merge(df, groupby2df(t, suffix+'2'), on=keys, how='left')

    t = use.groupby(keys).size()
    df = pd.merge(df, groupby2df(t, suffix+'3'), on=keys, how='left')

    df[suffix+'4'] = df[suffix+'1'] - df[suffix+'3']
    df[suffix+'5'] = df[suffix+'1'] - df[suffix+'2']
    
    df[suffix+'6'] = df[suffix+'3'] / df[suffix+'4']
    df[suffix+'7'] = df[suffix+'2'] / df[suffix+'5']
    return df

In [62]:
def process_mean_max_min(d, df, keys, column_name, prefix, count):
    t = d.groupby(keys)[column_name].mean()
    df = pd.merge(df, groupby2df(t, prefix+str(count)), on=keys, how='left')

    count = count + 1
    t = d.groupby(keys)[column_name].max()
    df = pd.merge(df, groupby2df(t, prefix+str(count)), on=keys, how='left')

    count = count + 1
    t = d.groupby(keys)[column_name].min()
    df = pd.merge(df, groupby2df(t, prefix+str(count)), on=keys, how='left')
    return df

In [63]:
def get_unique_size(d, df, groups, keys, column_name):
    t = d[groups].drop_duplicates()
    t = t.groupby(keys).size()
    df = pd.merge(df, groupby2df(t, column_name), on=keys, how='left')
    return df

### 拆分特征数据集合

In [64]:
feature_alpha_offline = extract_basic_info(feature_alpha_offline)

feature_alpha_offline['Duration'] = feature_alpha_offline.apply(lambda row: (row['Date'] - row['Date_received']).days, axis=1)
feature_alpha_offline['Label'] = feature_alpha_offline.apply(lambda row: 1 if row['Duration'] < 16 else 0, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pa

In [65]:
receive = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0]
consume = receive[receive['Duration'] >= 0]
use = receive[receive['Label']==1]

### 用户特征

In [66]:
u = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['User_id']].drop_duplicates()

+ **特征u1**:用户领取优惠券的数量
+ **特征u2**:用户消费优惠券的数量
+ **特征u3**:用户15天内消费优惠券的数量
+ **特征u4**:用户15天内没有消费优惠券的数量
+ **特征u5**:用户用户没有消费优惠券的数量
+ **特征u6**:用户15天内消费优惠券的数量 比 用户15天内没有消费优惠券的数量
+ **特征u7**:用户消费优惠券的数量 比 用户用户没有消费优惠券的数量

In [67]:
u = get_count(u, ['User_id'], 'u')

+ **特征u8**:用户消费优惠券消费天数的平均值
+ **特征u9**:用户消费优惠券消费天数的最大值
+ **特征u10**:用户消费优惠券消费天数的最小值

In [68]:
u = process_mean_max_min(consume, u, ['User_id'], 'Duration', 'u', 8)

+ **特征u11**:用户消费优惠券折扣率的平均值
+ **特征u12**:用户消费优惠券折扣率的最大值
+ **特征u13**:用户消费优惠券折扣率的最小值

In [69]:
u = process_mean_max_min(consume, u, ['User_id'], 'Discount', 'u', 11)

+ **特征u14**:用户消费多少种不同优惠券
+ **特征u15**:用户15天内消费多少种不同优惠券

In [70]:
u = get_unique_size(consume, u, ['User_id', 'Coupon_id'], ['User_id'], 'u14')
u = get_unique_size(use, u, ['User_id', 'Coupon_id'], ['User_id'], 'u15')

+ **特征u16**:用户消费多少个不同商家的优惠券
+ **特征u17**:用户15天内消费多少个不同商家的优惠券

In [71]:
u = get_unique_size(consume, u, ['User_id', 'Merchant_id'], ['User_id'], 'u16')
u = get_unique_size(use, u, ['User_id', 'Merchant_id'], ['User_id'], 'u17')

+ **特征u18**:用户15天内消费优惠券的核销率
+ **特征u19**:用户消费优惠券的核销率
+ **特征u20**:用户15天内平均消费领取天数在15天内的优惠券的数量
+ **特征u21**:用户15天内平均消费优惠券的数量
+ **特征u22**:用户15天内平均领取优惠券的数量

In [72]:
u['u18'] = u['u3'] / u['u1']
u['u19'] = u['u2'] / u['u1']

u['u20'] = u['u3'] / 15
u['u21'] = u['u2'] / 15
u['u22'] = u['u1'] / 15

In [73]:
u = process_mean_max_min(use, u, ['User_id'], 'Duration', 'u', 23)

In [74]:
dataset = pd.merge(dataset, u, on=['User_id'], how='left')

### 用户-优惠券分类特征

In [75]:
ucc = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['User_id', 'Coupon_category']].drop_duplicates()

In [76]:
ucc = get_count(ucc, ['User_id', 'Coupon_category'], 'ucc')

In [77]:
ucc = process_mean_max_min(consume, ucc, ['User_id', 'Coupon_category'], 'Duration', 'ucc', 8)

In [78]:
dataset = pd.merge(dataset, ucc, on=['User_id', 'Coupon_category'], how='left')

In [79]:
dataset['ucc11'] = dataset['ucc3'] / dataset['u1']
dataset['ucc12'] = dataset['ucc2'] / dataset['u1']

### 用户 - 优惠券特征

In [80]:
uc = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['User_id', 'Coupon_id']].drop_duplicates()

+ **特征uc1**:不同coupon_id,用户领取优惠券的数量
+ **特征uc2**:不同coupon_id,用户消费优惠券的数量
+ **特征uc3**:不同coupon_id,用户15天内消费优惠券的数量
+ **特征uc4**:不同coupon_id,用户15天内没有消费优惠券的数量
+ **特征uc5**:不同coupon_id,用户用户没有消费优惠券的数量
+ **特征uc6**:不同coupon_id,用户15天内消费优惠券的数量 比 用户15天内没有消费优惠券的数量
+ **特征uc7**:不同coupon_id,用户消费优惠券的数量 比 用户用户没有消费优惠券的数量

In [81]:
uc = get_count(uc, ['User_id', 'Coupon_id'], 'uc')

+ **特征uc8**:不同coupon_id,用户消费优惠券消费天数的平均值
+ **特征uc9**:不同coupon_id,用户消费优惠券消费天数的最大值
+ **特征uc10**:不同coupon_id,用户消费优惠券消费天数的最小值

In [82]:
uc = process_mean_max_min(consume, uc, ['User_id', 'Coupon_id'], 'Duration', 'uc', 8)

In [83]:
dataset = pd.merge(dataset, uc, on=['User_id', 'Coupon_id'], how='left')

+ **特征uc11**:不同coupon_id,用户15天内消费优惠券的核销率
+ **特征uc12**:不同coupon_id,用户消费优惠券的核销率

In [84]:
dataset['uc11'] = dataset['uc3'] / dataset['u1']
dataset['uc12'] = dataset['uc2'] / dataset['u1']

### 用户 - 距离特征

In [85]:
ud = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['User_id', 'Distance']].drop_duplicates()

In [86]:
ud = get_count(ud, ['User_id', 'Distance'], 'ud')

In [87]:
ud = process_mean_max_min(consume, ud, ['User_id', 'Distance'], 'Duration', 'ud', 8)

In [88]:
dataset = pd.merge(dataset, ud, on=['User_id', 'Distance'], how='left')

In [89]:
dataset['ud11'] = dataset['ud3'] / dataset['u1']
dataset['ud12'] = dataset['ud2'] / dataset['u1']

### 用户 - 商户特征

In [90]:
um = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['User_id', 'Merchant_id']].drop_duplicates()

+ **特征um1**:不同merchant_id,用户领取优惠券的数量
+ **特征um2**:不同merchant_id,用户消费优惠券的数量
+ **特征um3**:不同merchant_id,用户15天内消费优惠券的数量
+ **特征um4**:不同merchant_id,用户15天内没有消费优惠券的数量
+ **特征um5**:不同merchant_id,用户用户没有消费优惠券的数量
+ **特征um6**:不同merchant_id,用户15天内消费优惠券的数量 比 用户15天内没有消费优惠券的数量
+ **特征um7**:不同merchant_id,用户消费优惠券的数量 比 用户用户没有消费优惠券的数量

In [91]:
um = get_count(um, ['User_id', 'Merchant_id'], 'um')

+ **特征um8**:不同merchant_id,用户消费优惠券消费天数的平均值
+ **特征um9**:不同merchant_id,用户消费优惠券消费天数的最大值
+ **特征um10**:不同merchant_id,用户消费优惠券消费天数的最小值

In [92]:
um = process_mean_max_min(consume, um, ['User_id', 'Merchant_id'], 'Duration', 'um', 8)

In [93]:
dataset = pd.merge(dataset, um, on=['User_id', 'Merchant_id'], how='left')

+ **特征um11**:不同merchant_id,用户15天内消费优惠券的核销率
+ **特征um12**:不同merchant_id,用户消费优惠券的核销率

In [94]:
dataset['um11'] = dataset['um3'] / dataset['u1']
dataset['um12'] = dataset['um2'] / dataset['u1']

+ **特征um13**:不同merchant_id,用户15天内消费优惠券在15天内平均核销数
+ **特征um14**:不同merchant_id,用户消费优惠券在15天内平均核销数

In [95]:
dataset['um13'] = dataset['um3'] / 15
dataset['um14'] = dataset['um2'] / 15

### 商户特征

In [96]:
m = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['Merchant_id']].drop_duplicates()

In [97]:
m = get_count(m, ['Merchant_id'], 'm')

In [98]:
m = process_mean_max_min(consume, m, ['Merchant_id'], 'Duration', 'm', 8)

In [99]:
m = get_unique_size(consume, m, ['Merchant_id', 'User_id'], ['Merchant_id'], 'm11')
m = get_unique_size(use, m, ['Merchant_id', 'User_id'], ['Merchant_id'], 'm12')

In [100]:
m['m13'] = m['m2'] / m['m11']
m['m14'] = m['m3'] / m['m12']

In [101]:
dataset = pd.merge(dataset, m, on=['Merchant_id'], how='left')

### 优惠券特征

In [102]:
c = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['Coupon_id']].drop_duplicates()

In [103]:
c = get_count(c, ['Coupon_id'], 'c')

In [104]:
c = process_mean_max_min(consume, c, ['Coupon_id'], 'Duration', 'c', 8)

In [105]:
c = get_unique_size(consume, c, ['Coupon_id', 'User_id'], ['Coupon_id'], 'c11')
c = get_unique_size(use, c, ['Coupon_id', 'User_id'], ['Coupon_id'], 'c12')

In [106]:
c['c13'] = c['c2'] / c['c11']
c['c14'] = c['c3'] / c['c12']

In [107]:
dataset = pd.merge(dataset, c, on=['Coupon_id'], how='left')

In [108]:
cd = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['Coupon_id', 'Date_received']].drop_duplicates()

In [109]:
cd = get_count(cd, ['Coupon_id', 'Date_received'], 'cd')

In [110]:
dataset = pd.merge(dataset, cd, on=['Coupon_id', 'Date_received'], how='left')

### 优惠券分类特征

In [111]:
dr = feature_alpha_offline[feature_alpha_offline['Coupon_id']>0][['Discount_rate']].drop_duplicates()

In [112]:
dr = get_count(dr, ['Discount_rate'], 'dr')

In [113]:
dataset = pd.merge(dataset, dr, on=['Discount_rate'], how='left')

### 线上用户特征

In [114]:
ou = feature_alpha_online[['User_id']].drop_duplicates()

t = feature_alpha_online.groupby(['User_id']).size()
ou = pd.merge(ou, groupby2df(t, 'ou1'), on=['User_id'], how='left')

t = feature_alpha_online[feature_alpha_online.Action == 0].groupby(['User_id']).size()
ou = pd.merge(ou, groupby2df(t, 'ou2'), on=['User_id'], how='left')

t = feature_alpha_online[feature_alpha_online.Action == 1].groupby(['User_id']).size()
ou = pd.merge(ou, groupby2df(t, 'ou3'), on=['User_id'], how='left')

t = feature_alpha_online[feature_alpha_online.Action == 2].groupby(['User_id']).size()
ou = pd.merge(ou, groupby2df(t, 'ou4'), on=['User_id'], how='left')

In [115]:
dataset = pd.merge(dataset, ou, on=['User_id'], how='left')

## 构造特征选择器

In [116]:
len(dataset.columns.values)

187

In [117]:
dataset.columns.values

array(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'Month_of_received', 'Day_of_received',
       'Weekday_of_received', 'Base_consume', 'Discount',
       'Discount_money', 'Coupon_type', 'Coupon_category',
       'Previous_user_id', 'Previous_date_received', 'Next_user_id',
       'Next_date_received', 'Previous_duration', 'Next_duration', 'o1',
       'o2', 'o3', 'o4', 'o5', 'o6', 'o8', 'o7', 'o9', 'o10', 'o12',
       'o14', 'o11', 'o13', 'o16', 'o15', 'o18', 'o19', 'o20', 'o21',
       'o22', 'o23', 'o17', 'o24', 'o25', 'o26', 'o27', 'o28', 'o29',
       'o30', 'o38', 'o31', 'o39', 'o40', 'o41', 'o42', 'o43', 'o32',
       'o33', 'o34', 'o35', 'o36', 'o37', 'Duration', 'Label', 'u1', 'u2',
       'u3', 'u4', 'u5', 'u6', 'u7', 'u8', 'u9', 'u10', 'u11', 'u12',
       'u13', 'u14', 'u15', 'u16', 'u17', 'u18', 'u19', 'u20', 'u21',
       'u22', 'u23', 'u24', 'u25', 'ucc1', 'ucc2', 'ucc3', 'ucc4', 'ucc5',
       'ucc6', 'ucc7', 'u

In [134]:
dataset = dataset.replace([np.inf, -np.inf], np.nan)

In [135]:
continous = [
    'Distance',
    'Base_consume', 'Discount',
    'Discount_money', 
    'Previous_duration', 'Next_duration', 'o1',
    'o2', 'o3', 'o4', 'o5', 'o6', 'o8', 'o7', 'o9', 'o10', 'o12',
    'o14', 'o11', 'o13', 'o16', 'o15', 'o18', 'o19', 'o20', 'o21',
    'o22', 'o23', 'o17', 'o24', 'o25', 'o26', 'o27', 'o28', 'o29',
    'o30', 'o38', 'o31', 'o39', 'o32', 'o33', 'o34', 'o35', 'o36','o37', 'o40', 'o41', 'o42', 'o43','o44',
    'u1', 'u2', 'u3', 'u4', 'u5', 'u6',
    'u7', 'u8', 'u9', 'u10', 'u11', 'u12', 'u13', 'u14', 'u15', 'u16',
    'u17', 'u18', 'u19', 'u20', 'u21', 'u22', 'u23', 'u24', 'u25',
    'ucc1', 'ucc2', 'ucc3', 'ucc4', 'ucc5', 'ucc6', 'ucc7', 'ucc8',
    'ucc9', 'ucc10', 'ucc11', 'ucc12', 'uc1', 'uc2', 'uc3', 'uc4',
    'uc5', 'uc6', 'uc7', 'uc8', 'uc9', 'uc10', 'uc11', 'uc12', 'ud1',
    'ud2', 'ud3', 'ud4', 'ud5', 'ud6', 'ud7', 'ud8', 'ud9', 'ud10',
    'ud11', 'ud12', 'um1', 'um2', 'um3', 'um4', 'um5', 'um6', 'um7',
    'um8', 'um9', 'um10', 'um11', 'um12', 'um13', 'um14', 'm1', 'm2',
    'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11', 'm12',
    'm13', 'm14', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9',
    'c10', 'c11', 'c12', 'c13', 'c14', 'cd1', 'cd2', 'cd3', 'cd4',
    'cd5', 'cd6', 'cd7', 'dr1', 'dr2', 'dr3', 'dr4', 'dr5', 'dr6',
    'dr7', 'ou1', 'ou2', 'ou3', 'ou4']

label = ['Label']

In [136]:
feature_processor = Pipeline([
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnSelector(continous)),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
#             ('scale', MinMaxScaler(copy=True, feature_range=(0, 1))),
            ('normalize', Normalizer())
        ])),
    ])),
#     ('sc4gbdt', StandardScaler()),
])

In [140]:
feature_processor.fit(dataset, dataset['Label'].values.ravel())
clf = GradientBoostingClassifier(max_depth=3, n_estimators=100, random_state=0)
clf.fit(feature_processor.transform(dataset), dataset['Label'].values.ravel())

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=0,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [141]:
clf.feature_importances_

array([2.33192260e-03, 4.97006059e-02, 0.00000000e+00, 6.38757399e-03,
       1.55144286e-03, 1.71183875e-01, 0.00000000e+00, 0.00000000e+00,
       1.58561036e-03, 1.16848177e-02, 1.39405200e-03, 5.91746859e-02,
       1.64860565e-03, 2.54103807e-04, 1.41835410e-02, 6.88131961e-03,
       0.00000000e+00, 4.80479239e-02, 1.46260171e-02, 8.35451202e-03,
       0.00000000e+00, 7.29123268e-02, 3.03559375e-03, 1.89617898e-03,
       0.00000000e+00, 2.92839046e-03, 9.09964166e-04, 1.03594760e-03,
       1.46025713e-03, 1.40402772e-04, 1.43743247e-03, 1.93601220e-02,
       6.89934023e-04, 0.00000000e+00, 1.13487940e-02, 8.10629903e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.57067518e-02,
       5.52732823e-02, 3.88899367e-03, 2.08524597e-01, 1.75556981e-02,
       0.00000000e+00, 2.78645589e-03, 8.97523002e-04, 3.62538671e-03,
       2.60227727e-02, 0.00000000e+00, 1.56777467e-05, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.48462346e-03,
      

In [142]:
feature_selector = []
for index, value in enumerate(clf.feature_importances_):
    if value > 0:
        feature_selector.append(continous[index])

logger.info(len(feature_selector))
feature_selector

2019-02-07 17:13:02,329  <ipython-input-142-ac8315c57481> : INFO  96


['Distance',
 'Base_consume',
 'Discount_money',
 'Previous_duration',
 'Next_duration',
 'o3',
 'o4',
 'o5',
 'o6',
 'o8',
 'o7',
 'o9',
 'o10',
 'o14',
 'o11',
 'o13',
 'o15',
 'o18',
 'o19',
 'o21',
 'o22',
 'o23',
 'o17',
 'o24',
 'o25',
 'o26',
 'o27',
 'o29',
 'o30',
 'o32',
 'o33',
 'o34',
 'o35',
 'o36',
 'o40',
 'o41',
 'o42',
 'o43',
 'u1',
 'u6',
 'u8',
 'u19',
 'u20',
 'u21',
 'u23',
 'u24',
 'ucc5',
 'ucc6',
 'ucc7',
 'ucc10',
 'uc1',
 'uc5',
 'uc6',
 'uc7',
 'uc8',
 'uc9',
 'uc10',
 'uc11',
 'uc12',
 'ud5',
 'ud6',
 'ud8',
 'ud9',
 'ud10',
 'um2',
 'um3',
 'um4',
 'um7',
 'um8',
 'um9',
 'um10',
 'um11',
 'um12',
 'um14',
 'm1',
 'm2',
 'm3',
 'm4',
 'm5',
 'm6',
 'm7',
 'm8',
 'm9',
 'm10',
 'm14',
 'c1',
 'c2',
 'c3',
 'c4',
 'c6',
 'c7',
 'c8',
 'c9',
 'c10',
 'c12',
 'c13']

## 保存数据

In [124]:
dataset = dataset.replace([np.inf, -np.inf], np.nan)

In [125]:
dataset.to_csv('../features/' + FILENAME + '.csv')

In [131]:
dataset[['dr1','dr2','dr3','dr4','dr5','dr6','dr7']].describe()

Unnamed: 0,dr1,dr2,dr3,dr4,dr5,dr6,dr7
count,132889.0,132889.0,132878.0,132878.0,132889.0,128485.0,0.0
mean,4031.576316,4031.576316,3498.627959,533.282018,0.0,6.590542,
std,4887.161253,4887.161253,4328.805707,596.598613,0.0,6.299269,
min,1.0,1.0,1.0,0.0,0.0,1.0,
25%,744.0,744.0,620.0,123.0,0.0,2.678454,
50%,1132.0,1132.0,800.0,248.0,0.0,5.0,
75%,6608.0,6608.0,6360.0,1216.0,0.0,7.114152,
max,12155.0,12155.0,10657.0,1498.0,0.0,31.666667,
