In [104]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [140]:
def split_off_dataset(off: pd.DataFrame, space=pd.to_timedelta(0), time_ranges=None):
    """
    滑动窗口分割
    :param space: 时间间隔，特征区间之后时间间隔后的一个月作为标签区间，默认为 0
    :param off: 线下数据集
    :param time_ranges: kv，表示特征区间
    :return: 两个列表，特征区间和标签区间，标签区间为特征区间结束间隔space后的一个月
    """
    print("====== off train set info ======")
    print(off.info())
    print()
    label_len = pd.to_timedelta(30, unit='D') # 三十天
    _feats = []
    _labels = []
    for k, v in time_ranges.items():
        time_range = pd.date_range(k, k+v)
        _feats.append(off[off['Date_received'].isin(time_range) |  # 领券日期在 time_range 内
                          ((off['Coupon_id'] == 0) & off['Date'].isin(time_range))]) # 在 time_range 内直接消费
        print(f'features time range {k} : {k+v}, size {len(_feats[len(_feats)-1])}')
        time_range = pd.date_range(k+v+space+pd.to_timedelta(1, 'D'), k+v+space+label_len)
        _labels.append(off[off['Date_received'].isin(time_range)]) # 标签集合，标签集合不需要采样直接消费的
        print(f'labels time range {k+v+space+pd.to_timedelta(1, "D")} : {k+v+space+label_len},'
              f' size {len(_labels[len(_labels)-1])}')
    del time_range
    print(f'total features size {sum(map(lambda x: len(x), _feats))}')
    print(f'total labels size {sum(map(lambda x: len(x), _labels))}')
    return _feats, _labels

In [4]:
off_train = pd.read_csv('./dataset_cleaned/ccf_off_train.csv', parse_dates=['Date_received', 'Date'])

In [141]:
feats, labels = split_off_dataset(off_train,
                                  time_ranges={
                                      pd.to_datetime('20160101'): pd.to_timedelta(75, unit='D'), # 4月
                                      pd.to_datetime('20160201'): pd.to_timedelta(75, unit='D'), # 5月
                                      pd.to_datetime('20160301'): pd.to_timedelta(75, unit='D'), # 6月
                                  })

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 16 columns):
 #   Column            Dtype         
---  ------            -----         
 0   User_id           int64         
 1   Merchant_id       int64         
 2   Coupon_id         int64         
 3   Discount_rate     object        
 4   Distance          int64         
 5   Date_received     datetime64[ns]
 6   Date              datetime64[ns]
 7   normal_consume    int64         
 8   coupon_consume    int64         
 9   no_consume        int64         
 10  no_distance       int64         
 11  is_full_discount  int64         
 12  discount_x        int64         
 13  discount_y        int64         
 14  discount_rate     float64       
 15  discount_type     int64         
dtypes: datetime64[ns](2), float64(1), int64(12), object(1)
memory usage: 214.2+ MB
None

features time range 2016-01-01 00:00:00 : 2016-03-16 00:00:00, size 726858
labels time range 2016-03-17 00:00:00 

检查生成的数据的分布是否符合要求

In [142]:
feats[0][feats[0]['Coupon_id'] == 0]['Date'].describe()

count                  209245
unique                     76
top       2016-01-23 00:00:00
freq                     3846
first     2016-01-01 00:00:00
last      2016-03-16 00:00:00
Name: Date, dtype: object

In [143]:
feats[1][feats[1]['Coupon_id'] == 0]['Date'].describe()

count                  260002
unique                     76
top       2016-03-26 00:00:00
freq                     9167
first     2016-02-01 00:00:00
last      2016-04-16 00:00:00
Name: Date, dtype: object

In [156]:
feats[2][feats[2]['Coupon_id'] == 0]['Date'].describe()

count                  325881
unique                     76
top       2016-03-26 00:00:00
freq                     9167
first     2016-03-01 00:00:00
last      2016-05-15 00:00:00
Name: Date, dtype: object

In [157]:
feats[0]['Date_received'].max(), feats[1]['Date_received'].max(), feats[2]['Date_received'].max()

(Timestamp('2016-03-16 00:00:00'),
 Timestamp('2016-04-16 00:00:00'),
 Timestamp('2016-05-15 00:00:00'))

In [164]:
(labels[0]['Coupon_id'] == 0).any(), (labels[1]['Coupon_id'] == 0).any(), (labels[2]['Coupon_id'] == 0).any()

(False, False, False)

In [159]:
labels[0]['Date_received'].describe()

count                  150140
unique                     30
top       2016-03-26 00:00:00
freq                    13719
first     2016-03-17 00:00:00
last      2016-04-15 00:00:00
Name: Date_received, dtype: object

In [160]:
labels[1]['Date_received'].describe()

count                  132635
unique                     30
top       2016-04-22 00:00:00
freq                     7381
first     2016-04-17 00:00:00
last      2016-05-16 00:00:00
Name: Date_received, dtype: object

In [161]:
labels[2]['Date_received'].describe()

count                  249111
unique                     30
top       2016-05-21 00:00:00
freq                    19859
first     2016-05-16 00:00:00
last      2016-06-14 00:00:00
Name: Date_received, dtype: object

In [166]:
feats[0].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 726858 entries, 0 to 1754872
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   User_id           726858 non-null  int64         
 1   Merchant_id       726858 non-null  int64         
 2   Coupon_id         726858 non-null  int64         
 3   Discount_rate     726858 non-null  object        
 4   Distance          726858 non-null  int64         
 5   Date_received     726858 non-null  datetime64[ns]
 6   Date              726858 non-null  datetime64[ns]
 7   normal_consume    726858 non-null  int64         
 8   coupon_consume    726858 non-null  int64         
 9   no_consume        726858 non-null  int64         
 10  no_distance       726858 non-null  int64         
 11  is_full_discount  726858 non-null  int64         
 12  discount_x        726858 non-null  int64         
 13  discount_y        726858 non-null  int64         
 14  dis

In [167]:
labels[0]

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,normal_consume,coupon_consume,no_consume,no_distance,is_full_discount,discount_x,discount_y,discount_rate,discount_type
3,1439408,2632,1078,20:1,0,2016-03-19,1970-01-01,0,0,1,0,1,20,1,0.950000,14
20,94107,3381,7610,200:20,2,2016-04-12,1970-01-01,0,0,1,0,1,200,20,0.900000,12
23,253750,8390,7531,20:5,0,2016-03-27,1970-01-01,0,0,1,0,1,20,5,0.750000,7
38,2881376,8390,7531,20:5,0,2016-03-21,2016-03-29,0,1,0,0,1,20,5,0.750000,7
44,4061024,7555,9871,30:5,10,2016-04-09,1970-01-01,0,0,1,0,1,30,5,0.833333,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1754825,261622,3381,7610,200:20,1,2016-04-07,1970-01-01,0,0,1,0,1,200,20,0.900000,12
1754869,188086,6568,4723,30:1,0,2016-04-15,1970-01-01,0,0,1,0,1,30,1,0.966667,15
1754873,212662,2934,5686,30:5,2,2016-03-21,2016-03-30,0,1,0,0,1,30,5,0.833333,9
1754876,212662,3532,5267,30:5,1,2016-03-22,1970-01-01,0,0,1,0,1,30,5,0.833333,9


检查完毕

In [236]:
no_date = pd.to_datetime(0)

def reset_type(df: pd.DataFrame, cols, typs):
    for col, typ in zip(cols, typs):
        df[col] = df[col].astype(typ)
    return df

def attach_labels(label_set: pd.DataFrame) -> pd.DataFrame:
    """
    给 label_set 打上标签
    :param label_set:
    :return: label_set
    """
    label_set['label'] = 0
    label_set.loc[(label_set['Date'] != no_date) &  # 要求有 Date
                  (label_set['Date'] - label_set['Date_received'] <= pd.to_timedelta(15, 'D')),  # 并且小于 15 天
                  'label'] = 1
    return label_set

def attach_base_feat(label_set: pd.DataFrame) -> pd.DataFrame:
    """
    打上不依赖其他数据集的特征
    :param label_set:
    :return:
    """
    # 和日期有关的特征
    label_set['weekday'] = label_set['Date_received'].dt.weekday
    label_set = pd.get_dummies(label_set, columns=['weekday'])
    label_set['day'] = label_set['Date_received'].dt.day
    label_set['month'] = label_set['Date_received'].dt.month

    # 用户当前领券数量
    label_set = label_set.merge(label_set.groupby('User_id').size().reset_index(name='_u0'),
                                on='User_id', how='left').fillna(0)
    # 用户当前领取特定优惠券数
    label_set = label_set.merge(label_set.groupby(['User_id', 'Coupon_id']).size().reset_index(name='_u1'),
                                on=['User_id', 'Coupon_id'], how='left').fillna(0)
    # 用户当前当天领券数
    label_set = label_set.merge(label_set.groupby(['User_id', 'Date_received']).size().reset_index(name='_u2'),
                                on=['User_id', 'Date_received'], how='left').fillna(0)
    # 用户当天领取特定优惠券数
    label_set = label_set.merge(label_set.groupby(['User_id', 'Coupon_id', 'Date_received']).size()
                                       .reset_index(name='_u3'),on=['User_id', 'Coupon_id', 'Date_received'],
                                       how='left').fillna(0)
    # 用户是否在当天重复领取了特定优惠券
    label_set['_u4'] = (label_set['_u3'] > 1).astype(int)
    int_cols = ['_u0', '_u1', '_u2', '_u3']
    reset_type(label_set, int_cols, [int]*len(int_cols))

    return label_set

In [237]:
def attach_features(feat_set: pd.DataFrame, label_set: pd.DataFrame):
    """
    特征工程
    处理单独的一组 feature set & label set
    :param feat_set: 特征集
    :param label_set: 标签集
    :return:
    """
    print('attach features to label...')
    # 无消费券消费 Coupon_ID=0，Date_received=no_date, Date!=no_date
    normal_consumed = feat_set[feat_set['normal_consume'] == 1]
    # 有消费券消费 Coupon_ID=!0，Date_received!=no_date, Date!=no_date
    coupon_consumed = feat_set[feat_set['coupon_consume'] == 1]
    # 有消费券未消费 Coupon_ID=!0，Date_received!=no_date, Date=no_date
    no_consumed = feat_set[feat_set['no_consume'] == 1]
    # 断言相等
    assert len(no_consumed) + len(coupon_consumed) + len(normal_consumed) == len(feat_set)
    label_set = attach_base_feat(label_set)

    '''用户相关特征'''
    # 领券并消费了的数量
    label_set = label_set.merge(coupon_consumed.groupby('User_id').size().reset_index(name='u0'),
                                on='User_id', how='left').fillna(0)
    # 领券未消费的数量
    label_set = label_set.merge(no_consumed.groupby('User_id').size().reset_index(name='u1'),
                                on='User_id', how='left').fillna(0)
    # 用户历史领券的数量
    label_set['u2'] = label_set['u0'] + label_set['u1']

    # 领券并消费的占比
    label_set['u3'] = (label_set['u0'] / label_set['u2']).fillna(0).astype(float)
    # 领取并消费优惠券的平均折扣率
    label_set = label_set.merge(coupon_consumed.groupby('User_id').discount_rate.mean().reset_index(name='u4'),
                                on='User_id', how='left').fillna(1)
    # 领取并消费优惠券的平均距离
    label_set = label_set.merge(coupon_consumed[coupon_consumed['Distance'] != -1].groupby('User_id')
                                .Distance.mean().reset_index(name='u5'),
                                on='User_id', how='left').fillna(-1)
    # 领取但没有消费优惠券的平均折扣率
    label_set = label_set.merge(no_consumed.groupby('User_id').discount_rate.mean().reset_index(name='u6'),
                                on='User_id', how='left').fillna(1)
    # 在多少个不同商家领取并消费优惠券
    label_set = label_set.merge(coupon_consumed.groupby(['User_id', 'Merchant_id']).size()
                                .groupby('User_id').size().reset_index(name='u7'), on='User_id', how='left').fillna(0)
    # 在多少个不同商家领取但没有消费优惠券
    label_set = label_set.merge(no_consumed.groupby(['User_id', 'Merchant_id']).size()
                                .groupby('User_id').size().reset_index(name='u8'), on='User_id', how='left').fillna(0)
    # 在多少个不同商家领取优惠券
    label_set['u9'] = label_set['u7'] + label_set['u8']
    # 领取特定优惠券并消费的数量
    label_set = label_set.merge(coupon_consumed.groupby(['User_id', 'Coupon_id']).size()
                                .reset_index(name='u10'), on=['User_id', 'Coupon_id'], how='left').fillna(0)
    # 领取特定优惠券未消费的数量
    label_set = label_set.merge(no_consumed.groupby(['User_id', 'Coupon_id']).size()
                                .reset_index(name='u11'), on=['User_id', 'Coupon_id'], how='left').fillna(0)
    # 领取特定优惠券数量
    label_set['u12'] = label_set['u11'] + label_set['u10']
    # 当天领券数并消费数
    label_set = label_set.merge(coupon_consumed.groupby(['User_id', 'Date_received']).size()
                                .reset_index(name='u13'), on=['User_id', 'Date_received'], how='left').fillna(0)
    # 当天领取未消费数
    label_set = label_set.merge(no_consumed.groupby(['User_id', 'Date_received']).size()
                                .reset_index(name='u14'), on=['User_id', 'Date_received'], how='left').fillna(0)
    label_set['u15'] = label_set['u13'] + label_set['u14']

    int_cols = ['u0', 'u1', 'u2', 'u7', 'u8', 'u9', 'u10', 'u11', 'u12', 'u13', 'u14', 'u15']
    reset_type(label_set, int_cols, [int]*len(int_cols))
    return label_set

In [238]:
i = 0
for feat, label in zip(feats, labels):
    out = attach_features(feat, label)
    out = attach_labels(out)
    out.to_csv(f'./processed_data_{i}.csv', index=None)
    i+=1

attach features to label...
attach features to label...
attach features to label...


下面是测试数据工程效果的部分

In [239]:
import lightgbm as lgb

def train_model(train_data: pd.DataFrame, valid_data: pd.DataFrame, feature_cols,
                save_to='', stopping_round=20, num_boost_round=1000):
    X = train_data[feature_cols]
    y = train_data['label']
    X_v = valid_data[feature_cols]
    y_v = valid_data['label']
    train = lgb.Dataset(X, y)
    valid = lgb.Dataset(X_v, y_v, reference=train)
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',        # 设置提升类型
        'objective': 'binary',          # 目标函数
        'metric': {'l2', 'auc'},        # 评估函数
        'num_leaves': 31,               # 叶子节点数
        'learning_rate': 0.10,          # 学习率
        'feature_fraction': 0.9,        # 建树的特征选择比例
        'bagging_fraction': 0.8,        # 建树的样本采样比例
        'bagging_freq': 5,              # 每 bagging_freq 次迭代执行bagging
        'verbose': 1                    # 提示
    }
    print('Start training...')
    _model = lgb.train(params, train, num_boost_round=num_boost_round, valid_sets=valid,
                      early_stopping_rounds=stopping_round)
    if save_to != '':
        print('Saving model')
        _model.save_model(save_to)
    return _model

In [240]:
train_data0 = pd.read_csv('./processed_data_0.csv', parse_dates=['Date_received', 'Date'])
train_data1 = pd.read_csv('./processed_data_1.csv', parse_dates=['Date_received', 'Date'])
train_data2 = pd.read_csv('./processed_data_2.csv', parse_dates=['Date_received', 'Date'])

In [241]:
feat_cols = ['Distance', 'no_distance', 'is_full_discount',
             'discount_x', 'discount_y', 'discount_rate', 'discount_type',
             *['u'+str(i) for i in range(0, 16)],
             *['_u'+str(i) for i in range(0, 5)],
             *['weekday_'+str(i) for i in range(0, 7)]]

In [242]:
concat_data_01 = pd.concat([train_data0, train_data1])
concat_data_01[feat_cols]

Unnamed: 0,Distance,no_distance,is_full_discount,discount_x,discount_y,discount_rate,discount_type,u0,u1,u2,...,_u2,_u3,_u4,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,0,0,1,20,1,0.950000,14,0,1,1,...,1,1,0,0,0,0,0,0,1,0
1,2,0,1,200,20,0.900000,12,0,0,0,...,1,1,0,0,1,0,0,0,0,0
2,0,0,1,20,5,0.750000,7,0,0,0,...,1,1,0,0,0,0,0,0,0,1
3,0,0,1,20,5,0.750000,7,0,2,2,...,1,1,0,1,0,0,0,0,0,0
4,10,0,1,30,5,0.833333,9,0,2,2,...,1,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132630,10,0,0,-1,-1,0.950000,14,0,0,0,...,1,1,0,0,1,0,0,0,0,0
132631,-1,1,1,150,30,0.800000,8,0,0,0,...,1,1,0,1,0,0,0,0,0,0
132632,-1,1,1,20,1,0.950000,14,0,0,0,...,1,1,0,0,0,0,0,0,1,0
132633,6,0,1,30,1,0.966667,15,2,1,3,...,1,1,0,0,0,1,0,0,0,0


In [243]:
train_data2[feat_cols]

Unnamed: 0,Distance,no_distance,is_full_discount,discount_x,discount_y,discount_rate,discount_type,u0,u1,u2,...,_u2,_u3,_u4,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,1,0,1,150,20,0.866667,11,0,1,1,...,1,1,0,0,0,0,0,0,1,0
1,0,0,1,20,1,0.950000,14,0,1,1,...,1,1,0,1,0,0,0,0,0,0
2,0,0,1,20,1,0.950000,14,0,1,1,...,1,1,0,1,0,0,0,0,0,0
3,0,0,1,30,5,0.833333,9,0,0,0,...,1,1,0,1,0,0,0,0,0,0
4,0,0,1,20,1,0.950000,14,0,0,0,...,1,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249106,0,0,1,30,5,0.833333,9,0,0,0,...,1,1,0,1,0,0,0,0,0,0
249107,0,0,1,30,5,0.833333,9,0,0,0,...,1,1,0,0,1,0,0,0,0,0
249108,-1,1,1,100,30,0.700000,6,0,1,1,...,1,1,0,1,0,0,0,0,0,0
249109,6,0,1,50,10,0.800000,8,0,0,0,...,1,1,0,1,0,0,0,0,0,0


In [244]:
model = train_model(concat_data_01, train_data2, feat_cols, stopping_round=-1)

Start training...
[LightGBM] [Info] Number of positive: 25659, number of negative: 257116
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 659
[LightGBM] [Info] Number of data points in the train set: 282775, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.090740 -> initscore=-2.304633
[LightGBM] [Info] Start training from score -2.304633
[1]	valid_0's l2: 0.0798357	valid_0's auc: 0.731031
[2]	valid_0's l2: 0.0782826	valid_0's auc: 0.742954
[3]	valid_0's l2: 0.0767568	valid_0's auc: 0.767574
[4]	valid_0's l2: 0.0757311	valid_0's auc: 0.768513
[5]	valid_0's l2: 0.0749947	valid_0's auc: 0.76789
[6]	valid_0's l2: 0.0743034	valid_0's auc: 0.772051
[7]	valid_0's l2: 0.0737805	valid_0's auc: 0.773315
[8]	valid_0's l2: 0.0733509	valid_0's auc: 0.774378
[9]	valid_0's l2: 0.0730106	valid_0's auc: 0.775564
[10]	valid_0's l2: 0.0726886	valid_0's auc: 0.775196
[11]	va

In [245]:
pd.DataFrame(model.feature_importance(), index=feat_cols, columns=['importance'])

Unnamed: 0,importance
Distance,2684
no_distance,112
is_full_discount,193
discount_x,1805
discount_y,769
discount_rate,2444
discount_type,185
u0,559
u1,1272
u2,1500


In [246]:
test_data = pd.read_csv('./dataset_cleaned/ccf_off_test.csv', parse_dates=['Date_received'])

test_feat_range = pd.date_range('2016/04/02', periods=75)
test_feat_off = off_train[off_train['Date_received'].isin(test_feat_range) |  # 领券日期在 time_range 内
                          ((off_train['Coupon_id'] == 0) & off_train['Date'].isin(test_feat_range))]

In [247]:
test_feat_off.Date_received.describe()

count                  790340
unique                     76
top       1970-01-01 00:00:00
freq                   349816
first     1970-01-01 00:00:00
last      2016-06-15 00:00:00
Name: Date_received, dtype: object

In [248]:
test = attach_features(test_feat_off, test_data)

attach features to label...


In [249]:
concat_all = pd.concat([train_data0, train_data1, train_data2])
model_full = train_model(concat_all, concat_data_01, feat_cols, stopping_round=-1)

Start training...
[LightGBM] [Info] Number of positive: 48175, number of negative: 483711
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 772
[LightGBM] [Info] Number of data points in the train set: 531886, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.090574 -> initscore=-2.306647
[LightGBM] [Info] Start training from score -2.306647
[1]	valid_0's l2: 0.0803308	valid_0's auc: 0.790206
[2]	valid_0's l2: 0.0786907	valid_0's auc: 0.794628
[3]	valid_0's l2: 0.0774159	valid_0's auc: 0.79842
[4]	valid_0's l2: 0.076372	valid_0's auc: 0.799604
[5]	valid_0's l2: 0.0755278	valid_0's auc: 0.802393
[6]	valid_0's l2: 0.0748438	valid_0's auc: 0.802299
[7]	valid_0's l2: 0.0742754	valid_0's auc: 0.803049
[8]	valid_0's l2: 0.073814	valid_0's auc: 0.802641
[9]	valid_0's l2: 0.0734437	valid_0's auc: 0.802752
[10]	valid_0's l2: 0.0731149	valid_0's auc: 0.803656
[11]	vali

In [250]:
pred = model_full.predict(test[feat_cols], num_iteration=model_full.best_iteration)

In [251]:
pred

array([0.05570649, 0.06166085, 0.0032231 , ..., 0.03793597, 0.15023105,
       0.15815034])

In [252]:
pd.DataFrame(model.feature_importance(), index=feat_cols, columns=['importance'])

Unnamed: 0,importance
Distance,2684
no_distance,112
is_full_discount,193
discount_x,1805
discount_y,769
discount_rate,2444
discount_type,185
u0,559
u1,1272
u2,1500


In [253]:
submit = test.copy()[['User_id', 'Coupon_id', 'Date_received']]
submit['Probability'] = pred
submit

Unnamed: 0,User_id,Coupon_id,Date_received,Probability
0,4129537,9983,2016-07-12,0.055706
1,6949378,3429,2016-07-06,0.061661
2,2166529,6928,2016-07-27,0.003223
3,2166529,1808,2016-07-27,0.014533
4,6172162,6500,2016-07-08,0.035139
...,...,...,...,...
113635,5828093,10418,2016-07-16,0.007637
113636,6626813,7595,2016-07-07,0.117016
113637,6626813,7590,2016-07-12,0.037936
113638,4547069,13602,2016-07-17,0.150231


In [254]:
submit.to_csv('submit.csv', index=False, header=False, date_format='%Y%m%d')