# 这是任务二的特征工程代码

## AutoML

- 自动特征工程
    - 特征的自动提取
- 特征筛选
- 模型融合
    - AutoGluon
- Stacking


In [1]:
import pandas as pd
import warnings
import itertools
import tqdm
from multiprocessing import Process

warnings.filterwarnings('ignore')

In [2]:
# Aggregation specs. The part before ':' is the column the function is applied to;
# 'count' gives the same result whatever the column, so it carries no column name.
# Distance and no_distance exist only in the offline data sets.
agg_funcs = ['count',
             'discount_rate:mean', 'Distance:mean', 'discount_x:mean', 'discount_y:mean',
             'discount_rate:max', 'Distance:max', 'discount_x:max', 'discount_y:max',
             'discount_rate:min', 'Distance:min', 'discount_x:min', 'discount_y:min',
             'discount_rate:std', 'Distance:std', 'discount_x:std', 'discount_y:std',
             'is_full_discount:sum', 'no_distance:sum']
# Extra aggregation specs. These are strongly correlated with the label, so they must
# not be used when features are extracted from the label set's own rows.
# fixed_consume and is_click exist only in the online data sets.
agg_funcs_extra = ['coupon_consume:sum', 'normal_consume:sum',
                  'no_consume:sum', 'fixed_consume:sum', 'is_click:sum']
# Columns that may be used as groupby keys.
groupby_cols = ['User_id', 'Coupon_id', 'Merchant_id', 'Date_received']


def pick(arr, deep=1):
    """
    Enumerate every combination of elements of ``arr`` whose size is 1..``deep``.

    :param arr:     collection the combinations are drawn from
    :param deep:    largest combination size to generate (inclusive)
    :return:        list of tuples, ordered by size first and, within one size,
                    in the order produced by ``itertools.combinations``
    """
    return [
        combo
        for size in range(1, deep + 1)
        for combo in itertools.combinations(arr, size)
    ]

In [3]:
def attach_one_feat(
        _label: pd.DataFrame,
        _feat: pd.DataFrame,
        groupby_col=None,
        agg_func=None,
        feat_name='',
        na=0
):
    """
    Compute one aggregated feature from ``_feat`` and left-merge it onto ``_label``.

    :param _label:      pandas.DataFrame, the label set. The new feature is merged onto
                        it and the merged frame is returned; ``_label`` is not modified.
    :param _feat:       pandas.DataFrame, the feature set the aggregation is computed on.
                        When this is the label set itself, take care not to extract
                        features that are derived from the label.
    :param groupby_col: list of str for a single groupby, or a list of such lists for
                        chained groupbys (one entry per stage).
    :param agg_func:    str for a single aggregation, or a list of str for chained
                        aggregations (same length as ``groupby_col``). Each spec is
                        either ``'count'`` or ``'<column>:<function>'``.
    :param feat_name:   str, name of the new feature column.
    :param na:          int|float, value given to label rows that have no matching
                        group. Only the column(s) added by the merge are filled;
                        missing values already present in ``_label`` are left alone.
                        ``None`` disables the filling.
    :return:            pandas.DataFrame, the label set with the feature merged in.
    :raises ValueError: for chained aggregation, when ``agg_func`` is empty or its
                        length differs from ``groupby_col``.
    """
    # One groupby stage. Returns a Series indexed by the groupby keys.
    def group_once(data, _pivots, _agg):
        if _agg == 'count':
            return data.groupby(_pivots).size()
        col, func = _agg.split(':')
        return data.groupby(_pivots)[col].agg(func)

    if isinstance(agg_func, str):
        # Single groupby
        merge_pivots = groupby_col
        grouped = group_once(_feat, groupby_col, agg_func)
    else:
        # Chained groupbys: each stage aggregates the previous stage's result
        if not agg_func or groupby_col is None or len(groupby_col) != len(agg_func):
            raise ValueError(
                'agg_func and groupby_col must be non-empty sequences of equal length '
                'when several groupby stages are requested'
            )
        grouped = _feat
        merge_pivots = None
        for pivots, agg in zip(groupby_col, agg_func):
            grouped = group_once(grouped, pivots, agg)
            merge_pivots = pivots  # the keys of the last stage drive the final merge

    # Only the final result is turned into a frame and given the feature name
    grouped = grouped.reset_index(name=feat_name)
    merged = _label.merge(grouped, on=merge_pivots, how='left')

    # Fill only what this call added. A frame-wide fillna would also overwrite
    # missing values in unrelated label columns (dates, distances, ...).
    new_cols = [c for c in merged.columns if c not in _label.columns]
    if na is not None and new_cols:
        merged[new_cols] = merged[new_cols].fillna(na)
    return merged

In [4]:
def auto_attach_feats(label, feat, feat_id, is_off=True, drop_all_zero=True):
    """
    Generate every single- and two-stage groupby feature from ``feat`` and merge
    them onto ``label``.

    Single-stage features: every combination of 1..3 columns of ``groupby_cols``
    crossed with every applicable aggregation spec.
    Two-stage features: for every multi-column combination, count the rows per
    combination, then count those groups per single column of the combination.
    Only count -> count is generated, to keep the running time bounded.

    :param label:           pandas.DataFrame the features are merged onto
    :param feat:            pandas.DataFrame the features are computed from. When it is
                            the very same object as ``label``, the label-correlated
                            specs in ``agg_funcs_extra`` are left out
    :param feat_id:         str, prefix of every generated column name
    :param is_off:          True when ``feat`` is an offline set (online-only specs are
                            removed), False for an online set (offline-only specs are
                            removed)
    :param drop_all_zero:   when True (default), generated columns that are zero in
                            every row are dropped. Columns that were already in
                            ``label`` are never dropped. Pass False to get the same
                            column set for every data set (e.g. train and test)
    :return:                pandas.DataFrame, ``label`` plus the generated features
    """
    # Remember what was there on entry so pruning can be limited to new columns
    original_cols = set(label.columns)
    groupbys = pick(groupby_cols, deep=3)
    aggs = agg_funcs
    # Add the extra specs unless features are extracted from the label set itself
    if label is not feat:
        print('enabling attach feats with extra aggs')
        aggs = agg_funcs + agg_funcs_extra
    if not is_off:
        # Online set: remove the offline-only columns
        aggs = [i for i in aggs if not i.startswith(('no_distance', 'Distance'))]
    else:
        # Offline set: remove the online-only columns
        aggs = [i for i in aggs if not i.startswith(('fixed_consume', 'is_click'))]
    print(f"all groupby pivots is {groupbys}")
    print(f'all aggs is {aggs}')
    # Single-stage groupby
    count = 0
    for by in groupbys:
        print(f'group pivots is {by}, start enumerate agg funcs to merge:')
        for agg in tqdm.tqdm(aggs):
            label = attach_one_feat(
                label, feat,
                groupby_col=list(by), agg_func=agg, feat_name=f'{feat_id}_1_{"-".join(by)}_{agg.replace(":", "-")}'
            )
            count += 1
    # Two-stage groupby
    for by_0 in [i for i in groupbys if len(i) > 1]:
        second_groupbys = pick(by_0, deep=1)
        for by_1 in second_groupbys:
            print(f'group pivots is {by_0} -> {by_1}, start to merge:')
            # Pruned to avoid a combinatorial blow-up: only count -> count is generated
            label = attach_one_feat(
                label, feat,
                groupby_col=[list(by_0), list(by_1)], agg_func=['count', 'count'],
                feat_name=f'{feat_id}_2_{"-".join(by_0)}_count_{"-".join(by_1)}_count'
            )
            count += 1
    print(f'finish auto attach feats, merged feat count is {count}')
    if not drop_all_zero:
        return label
    # Drop generated columns that are zero everywhere. Pre-existing columns are kept
    # even when all-zero, so the layout of the incoming frame never changes.
    new_cols = [c for c in label.columns if c not in original_cols]
    all_zero = (label[new_cols] == 0).all()
    return label.drop(columns=list(all_zero[all_zero].index))

In [5]:
def attach_feat(off1, off2, on1, on2, label, save, label_dates=None):
    """
    Build the full feature table for one label set and write it to ``save``.

    Features are attached in this order: first offline set, first online set,
    second offline set, second online set, and finally the label set itself.

    :param off1:        path of the first offline feature CSV
    :param off2:        path of the second offline feature CSV
    :param on1:         path of the first online feature CSV
    :param on2:         path of the second online feature CSV
    :param label:       path of the label-set CSV
    :param save:        path the resulting CSV is written to (without the index)
    :param label_dates: columns of the label CSV to parse as dates; defaults to
                        ['Date_received', 'Date']
    :return:            None
    """
    if label_dates is None:
        label_dates = ['Date_received', 'Date']

    def load_pair(off_path, on_path):
        # Read one offline/online pair of feature sets, offline first
        offline = pd.read_csv(off_path, parse_dates=['Date_received', 'Date'])
        online = pd.read_csv(on_path, parse_dates=['Date_received', 'Date'])
        return offline, online

    def attach_pair(target, offline, online, stage):
        # Attach the offline features, then the online features, of one stage
        print(f'======= start attach off{stage} =======')
        target = auto_attach_feats(target, offline, f'off{stage}')
        print(f'======= start attach on{stage} =======')
        return auto_attach_feats(target, online, f'on{stage}', is_off=False)

    offline_set, online_set = load_pair(off1, on1)
    label_set = pd.read_csv(label, parse_dates=label_dates)
    label_set = attach_pair(label_set, offline_set, online_set, 1)

    offline_set, online_set = load_pair(off2, on2)
    label_set = attach_pair(label_set, offline_set, online_set, 2)

    # Features computed from the label set's own rows
    print('======= start attach self =======')
    label_set = auto_attach_feats(label_set, label_set, 'self')
    label_set.to_csv(save, index=False)

# Build the feature tables for label sets 1 and 2.
# The arguments are positional and follow attach_feat's order:
# (off1, off2, on1, on2, label, save).
for i in range(1, 3):
    attach_feat(
        f'./dataset_split/feat_{i}_1_off.csv',
        f'./dataset_split/feat_{i}_2_off.csv',
        f'./dataset_split/feat_{i}_1_on.csv',
        f'./dataset_split/feat_{i}_2_on.csv',
        f'./dataset_split/label_{i}.csv',
        f'./dataset_processed/auto_full_features_set_{i}.csv',
    )

print('end.')

enabling attach feats with extra aggs
all groupby pivots is [('User_id',), ('Coupon_id',), ('Merchant_id',), ('Date_received',), ('User_id', 'Coupon_id'), ('User_id', 'Merchant_id'), ('User_id', 'Date_received'), ('Coupon_id', 'Merchant_id'), ('Coupon_id', 'Date_received'), ('Merchant_id', 'Date_received'), ('User_id', 'Coupon_id', 'Merchant_id'), ('User_id', 'Coupon_id', 'Date_received'), ('User_id', 'Merchant_id', 'Date_received'), ('Coupon_id', 'Merchant_id', 'Date_received')]
all aggs is ['count', 'discount_rate:mean', 'Distance:mean', 'discount_x:mean', 'discount_y:mean', 'discount_rate:max', 'Distance:max', 'discount_x:max', 'discount_y:max', 'discount_rate:min', 'Distance:min', 'discount_x:min', 'discount_y:min', 'discount_rate:std', 'Distance:std', 'discount_x:std', 'discount_y:std', 'is_full_discount:sum', 'no_distance:sum', 'coupon_consume:sum', 'normal_consume:sum', 'no_consume:sum']
group pivots is ('User_id',), start enumerate agg funcs to merge:


  0%|          | 0/22 [00:00<?, ?it/s]

KeyboardInterrupt



In [6]:
# Same pipeline for the test set. Only 'Date_received' is parsed as a date here
# (presumably test.csv has no 'Date' column -- confirm against the data).
attach_feat(
        './dataset_split/feat_test_1_off.csv',
        './dataset_split/feat_test_2_off.csv',
        './dataset_split/feat_test_1_on.csv',
        './dataset_split/feat_test_2_on.csv',
        './dataset_split/test.csv',
        './dataset_processed/auto_full_features_set_test.csv',
        ['Date_received']
)

enabling attach feats with extra aggs
all groupby pivots is [('User_id',), ('Coupon_id',), ('Merchant_id',), ('Date_received',), ('User_id', 'Coupon_id'), ('User_id', 'Merchant_id'), ('User_id', 'Date_received'), ('Coupon_id', 'Merchant_id'), ('Coupon_id', 'Date_received'), ('Merchant_id', 'Date_received'), ('User_id', 'Coupon_id', 'Merchant_id'), ('User_id', 'Coupon_id', 'Date_received'), ('User_id', 'Merchant_id', 'Date_received'), ('Coupon_id', 'Merchant_id', 'Date_received')]
all aggs is ['count', 'discount_rate:mean', 'Distance:mean', 'discount_x:mean', 'discount_y:mean', 'discount_rate:max', 'Distance:max', 'discount_x:max', 'discount_y:max', 'discount_rate:min', 'Distance:min', 'discount_x:min', 'discount_y:min', 'discount_rate:std', 'Distance:std', 'discount_x:std', 'discount_y:std', 'is_full_discount:sum', 'no_distance:sum', 'coupon_consume:sum', 'normal_consume:sum', 'no_consume:sum']
group pivots is ('User_id',), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:05<00:00,  4.06it/s]


group pivots is ('Coupon_id',), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:04<00:00,  5.50it/s]


group pivots is ('Merchant_id',), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:04<00:00,  5.04it/s]


group pivots is ('Date_received',), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:04<00:00,  4.88it/s]


group pivots is ('User_id', 'Coupon_id'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:09<00:00,  2.44it/s]


group pivots is ('User_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:08<00:00,  2.50it/s]


group pivots is ('User_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:09<00:00,  2.21it/s]


group pivots is ('Coupon_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:07<00:00,  3.05it/s]


group pivots is ('Coupon_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:07<00:00,  2.90it/s]


group pivots is ('Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:07<00:00,  2.77it/s]


group pivots is ('User_id', 'Coupon_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:13<00:00,  1.67it/s]


group pivots is ('User_id', 'Coupon_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:12<00:00,  1.70it/s]


group pivots is ('User_id', 'Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:14<00:00,  1.57it/s]


group pivots is ('Coupon_id', 'Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:10<00:00,  2.19it/s]


group pivots is ('User_id', 'Coupon_id') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Coupon_id') -> ('Coupon_id',), start to merge:
group pivots is ('User_id', 'Merchant_id') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Merchant_id') -> ('Merchant_id',), start to merge:
group pivots is ('User_id', 'Date_received') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('Coupon_id', 'Merchant_id') -> ('Coupon_id',), start to merge:
group pivots is ('Coupon_id', 'Merchant_id') -> ('Merchant_id',), start to merge:
group pivots is ('Coupon_id', 'Date_received') -> ('Coupon_id',), start to merge:
group pivots is ('Coupon_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('Merchant_id', 'Date_received') -> ('Merchant_id',), start to merge:
group pivots is ('Merchant_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('User_id', 

100%|██████████| 19/19 [00:09<00:00,  1.92it/s]


group pivots is ('Coupon_id',), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:06<00:00,  2.73it/s]


group pivots is ('Merchant_id',), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:07<00:00,  2.68it/s]


group pivots is ('Date_received',), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:07<00:00,  2.57it/s]


group pivots is ('User_id', 'Coupon_id'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:17<00:00,  1.10it/s]


group pivots is ('User_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:25<00:00,  1.32s/it]


group pivots is ('User_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:16<00:00,  1.16it/s]


group pivots is ('Coupon_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:10<00:00,  1.77it/s]


group pivots is ('Coupon_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:11<00:00,  1.70it/s]


group pivots is ('Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:11<00:00,  1.68it/s]


group pivots is ('User_id', 'Coupon_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:27<00:00,  1.44s/it]


group pivots is ('User_id', 'Coupon_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:19<00:00,  1.01s/it]


group pivots is ('User_id', 'Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:28<00:00,  1.48s/it]


group pivots is ('Coupon_id', 'Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:13<00:00,  1.39it/s]


group pivots is ('User_id', 'Coupon_id') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Coupon_id') -> ('Coupon_id',), start to merge:
group pivots is ('User_id', 'Merchant_id') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Merchant_id') -> ('Merchant_id',), start to merge:
group pivots is ('User_id', 'Date_received') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('Coupon_id', 'Merchant_id') -> ('Coupon_id',), start to merge:
group pivots is ('Coupon_id', 'Merchant_id') -> ('Merchant_id',), start to merge:
group pivots is ('Coupon_id', 'Date_received') -> ('Coupon_id',), start to merge:
group pivots is ('Coupon_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('Merchant_id', 'Date_received') -> ('Merchant_id',), start to merge:
group pivots is ('Merchant_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('User_id', 

100%|██████████| 22/22 [00:07<00:00,  3.10it/s]


group pivots is ('Coupon_id',), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:07<00:00,  3.06it/s]


group pivots is ('Merchant_id',), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:07<00:00,  2.86it/s]


group pivots is ('Date_received',), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:07<00:00,  2.76it/s]


group pivots is ('User_id', 'Coupon_id'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:08<00:00,  2.49it/s]


group pivots is ('User_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:09<00:00,  2.38it/s]


group pivots is ('User_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:09<00:00,  2.28it/s]


group pivots is ('Coupon_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:09<00:00,  2.26it/s]


group pivots is ('Coupon_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:10<00:00,  2.17it/s]


group pivots is ('Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:10<00:00,  2.10it/s]


group pivots is ('User_id', 'Coupon_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:11<00:00,  1.98it/s]


group pivots is ('User_id', 'Coupon_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:11<00:00,  1.89it/s]


group pivots is ('User_id', 'Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:12<00:00,  1.81it/s]


group pivots is ('Coupon_id', 'Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 22/22 [00:12<00:00,  1.82it/s]


group pivots is ('User_id', 'Coupon_id') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Coupon_id') -> ('Coupon_id',), start to merge:
group pivots is ('User_id', 'Merchant_id') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Merchant_id') -> ('Merchant_id',), start to merge:
group pivots is ('User_id', 'Date_received') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('Coupon_id', 'Merchant_id') -> ('Coupon_id',), start to merge:
group pivots is ('Coupon_id', 'Merchant_id') -> ('Merchant_id',), start to merge:
group pivots is ('Coupon_id', 'Date_received') -> ('Coupon_id',), start to merge:
group pivots is ('Coupon_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('Merchant_id', 'Date_received') -> ('Merchant_id',), start to merge:
group pivots is ('Merchant_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('User_id', 

100%|██████████| 19/19 [00:09<00:00,  2.07it/s]


group pivots is ('Coupon_id',), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:08<00:00,  2.27it/s]


group pivots is ('Merchant_id',), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:08<00:00,  2.18it/s]


group pivots is ('Date_received',), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:08<00:00,  2.12it/s]


group pivots is ('User_id', 'Coupon_id'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:11<00:00,  1.69it/s]


group pivots is ('User_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:12<00:00,  1.49it/s]


group pivots is ('User_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:11<00:00,  1.58it/s]


group pivots is ('Coupon_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:10<00:00,  1.77it/s]


group pivots is ('Coupon_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:10<00:00,  1.74it/s]


group pivots is ('Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:11<00:00,  1.68it/s]


group pivots is ('User_id', 'Coupon_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:14<00:00,  1.31it/s]


group pivots is ('User_id', 'Coupon_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:14<00:00,  1.36it/s]


group pivots is ('User_id', 'Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:15<00:00,  1.22it/s]


group pivots is ('Coupon_id', 'Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:13<00:00,  1.42it/s]


group pivots is ('User_id', 'Coupon_id') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Coupon_id') -> ('Coupon_id',), start to merge:
group pivots is ('User_id', 'Merchant_id') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Merchant_id') -> ('Merchant_id',), start to merge:
group pivots is ('User_id', 'Date_received') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('Coupon_id', 'Merchant_id') -> ('Coupon_id',), start to merge:
group pivots is ('Coupon_id', 'Merchant_id') -> ('Merchant_id',), start to merge:
group pivots is ('Coupon_id', 'Date_received') -> ('Coupon_id',), start to merge:
group pivots is ('Coupon_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('Merchant_id', 'Date_received') -> ('Merchant_id',), start to merge:
group pivots is ('Merchant_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('User_id', 

100%|██████████| 19/19 [00:10<00:00,  1.89it/s]


group pivots is ('Coupon_id',), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:13<00:00,  1.45it/s]


group pivots is ('Merchant_id',), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:16<00:00,  1.13it/s]


group pivots is ('Date_received',), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:20<00:00,  1.08s/it]


group pivots is ('User_id', 'Coupon_id'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:25<00:00,  1.34s/it]


group pivots is ('User_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:30<00:00,  1.59s/it]


group pivots is ('User_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:34<00:00,  1.80s/it]


group pivots is ('Coupon_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:39<00:00,  2.09s/it]


group pivots is ('Coupon_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:42<00:00,  2.24s/it]


group pivots is ('Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:47<00:00,  2.51s/it]


group pivots is ('User_id', 'Coupon_id', 'Merchant_id'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:53<00:00,  2.84s/it]


group pivots is ('User_id', 'Coupon_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [00:59<00:00,  3.14s/it]


group pivots is ('User_id', 'Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [01:05<00:00,  3.45s/it]


group pivots is ('Coupon_id', 'Merchant_id', 'Date_received'), start enumerate agg funcs to merge:


100%|██████████| 19/19 [01:14<00:00,  3.90s/it]


group pivots is ('User_id', 'Coupon_id') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Coupon_id') -> ('Coupon_id',), start to merge:
group pivots is ('User_id', 'Merchant_id') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Merchant_id') -> ('Merchant_id',), start to merge:
group pivots is ('User_id', 'Date_received') -> ('User_id',), start to merge:
group pivots is ('User_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('Coupon_id', 'Merchant_id') -> ('Coupon_id',), start to merge:
group pivots is ('Coupon_id', 'Merchant_id') -> ('Merchant_id',), start to merge:
group pivots is ('Coupon_id', 'Date_received') -> ('Coupon_id',), start to merge:
group pivots is ('Coupon_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('Merchant_id', 'Date_received') -> ('Merchant_id',), start to merge:
group pivots is ('Merchant_id', 'Date_received') -> ('Date_received',), start to merge:
group pivots is ('User_id', 

## 筛选特征

In [6]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

# Sentinel timestamp (the Unix epoch) meaning "no consumption date"
no_date = pd.to_datetime(0)


def attach_labels(_label_set: pd.DataFrame) -> pd.DataFrame:
    """
    Add a binary ``label`` column to ``_label_set`` (modified in place).

    A row is labelled 1 when its ``Date`` is not the ``no_date`` sentinel and
    ``Date - Date_received`` is at most 15 days; every other row is labelled 0.

    :param _label_set:  frame with datetime columns ``Date`` and ``Date_received``
    :return:            the same frame object, now carrying the ``label`` column
    """
    _label_set['label'] = 0
    has_date = _label_set['Date'] != no_date
    elapsed = _label_set['Date'] - _label_set['Date_received']
    within_window = elapsed <= pd.to_timedelta(15, 'D')
    _label_set.loc[has_date & within_window, 'label'] = 1
    return _label_set

In [7]:
# Load the generated feature tables: two labelled sets and the test set
df1 = pd.read_csv('./dataset_processed/auto_full_features_set_1.csv', parse_dates=['Date', 'Date_received'])
df2 = pd.read_csv('./dataset_processed/auto_full_features_set_2.csv', parse_dates=['Date', 'Date_received'])
df3 = pd.read_csv('./dataset_processed/auto_full_features_set_test.csv', parse_dates=['Date_received'])

# Calendar features taken from the coupon receipt date
for _frame in (df1, df2, df3):
    _frame['day'] = _frame.Date_received.dt.day
    _frame['weekday'] = _frame.Date_received.dt.weekday

train_set = df1
# valid_set = df2.copy(deep=False).iloc[:len(df2)//2, :]
valid_set = df2
# test_set = pd.read_csv('./dataset_processed/auto_full_test_set.csv', parse_dates=['Date_received'])
test_set = df3

In [8]:
# Label both splits (attach_labels adds the column in place and returns the frame),
# then stack them row-wise into one frame with a fresh index.
train_set = attach_labels(train_set)
valid_set = attach_labels(valid_set)
full_set = pd.concat([train_set, valid_set], ignore_index=True, axis=0)

In [9]:
# Feature columns: 'Distance' plus every column from position 9 up to, but not
# including, the last column.
# NOTE(review): this is positional -- it presumably relies on 'label' being the last
# column and on the first nine columns being ids/dates/raw fields; confirm whenever
# the column layout of the feature CSVs changes.
feat_cols = ['Distance', *full_set.columns[9:-1]]
feat_cols, len(feat_cols)

(['Distance',
  'no_distance',
  'is_full_discount',
  'discount_x',
  'discount_y',
  'discount_rate',
  'discount_type',
  'off1_1_User_id_count',
  'off1_1_User_id_discount_rate-mean',
  'off1_1_User_id_Distance-mean',
  'off1_1_User_id_discount_x-mean',
  'off1_1_User_id_discount_y-mean',
  'off1_1_User_id_discount_rate-max',
  'off1_1_User_id_Distance-max',
  'off1_1_User_id_discount_x-max',
  'off1_1_User_id_discount_y-max',
  'off1_1_User_id_discount_rate-min',
  'off1_1_User_id_Distance-min',
  'off1_1_User_id_discount_x-min',
  'off1_1_User_id_discount_y-min',
  'off1_1_User_id_discount_rate-std',
  'off1_1_User_id_Distance-std',
  'off1_1_User_id_discount_x-std',
  'off1_1_User_id_discount_y-std',
  'off1_1_User_id_is_full_discount-sum',
  'off1_1_User_id_no_distance-sum',
  'off1_1_User_id_coupon_consume-sum',
  'off1_1_User_id_normal_consume-sum',
  'off1_1_User_id_no_consume-sum',
  'off1_1_Coupon_id_count',
  'off1_1_Coupon_id_discount_rate-mean',
  'off1_1_Coupon_id_Dist

In [10]:
import lightgbm as lgb

def train_model(train_data: pd.DataFrame, valid_data: pd.DataFrame, feature_cols,
                save_to='', stopping_round=20, num_boost_round=5000, par=None):
    """
    Train a LightGBM model on ``train_data`` with early stopping on ``valid_data``.

    :param train_data:       frame holding ``feature_cols`` and a ``label`` column
    :param valid_data:       frame with the same columns, used as the validation set
    :param feature_cols:     columns used as model inputs
    :param save_to:          path to save the trained model to; '' means do not save
    :param stopping_round:   value passed to ``lgb.train`` as ``early_stopping_rounds``
    :param num_boost_round:  value passed to ``lgb.train`` as ``num_boost_round``
    :param par:              LightGBM parameter dict, passed to ``lgb.train`` unchanged
    :return:                 the object returned by ``lgb.train``
    """
    train = lgb.Dataset(train_data[feature_cols], train_data['label'])
    # The validation set refers to the training set via ``reference``
    valid = lgb.Dataset(valid_data[feature_cols], valid_data['label'], reference=train)
    print('Start training...')
    booster = lgb.train(
        par,
        train,
        num_boost_round=num_boost_round,
        valid_sets=valid,
        early_stopping_rounds=stopping_round,
    )
    if save_to == '':
        return booster
    print('Saving model')
    booster.save_model(save_to)
    return booster

In [12]:
# LightGBM parameters for the first pass over all features.
# NOTE(review): 'metric' is a set literal, so the order of the two metrics is not
# defined by this code.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',        # boosting type
    'objective': 'binary',          # objective function
    'metric': {'l2', 'auc'},        # evaluation metrics
    'num_leaves': 31,              # number of leaves per tree
    'learning_rate': 0.05,          # learning rate
    'feature_fraction': 0.9,        # fraction of features sampled per tree
    'bagging_fraction': 0.8,        # fraction of rows sampled per tree
    'bagging_freq': 5,              # run bagging every bagging_freq iterations
    'verbose': 1,                   # verbosity
    'min_child_weight': 1.0,
}
bad_cols = []
# Over-fitting check: train and validate on the same data to verify the features are complete
model = train_model(full_set, full_set, feat_cols, num_boost_round=500, stopping_round=10, par=params)

Start training...
[LightGBM] [Info] Number of positive: 32118, number of negative: 358771
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 39120
[LightGBM] [Info] Number of data points in the train set: 390889, number of used features: 616
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.082167 -> initscore=-2.413268
[LightGBM] [Info] Start training from score -2.413268
[1]	valid_0's l2: 0.073545	valid_0's auc: 0.851927
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.0718917	valid_0's auc: 0.858727
[3]	valid_0's l2: 0.0704253	valid_0's auc: 0.861517
[4]	valid_0's l2: 0.0690958	valid_0's auc: 0.863858
[5]	valid_0's l2: 0.0678983	valid_0's auc: 0.863655
[6]	valid_0's l2: 0.0667937	valid_0's auc: 0.871217
[7]	valid_0's l2: 0.0657833	valid_0's auc: 0.872132
[8]	valid_0's l2: 0.0648749	valid_0's auc: 0.873267
[9]	valid_0's l2: 0.0640492	valid_0's auc: 0.873

In [13]:
# Feature importances of the first model, indexed by feature name
importance = pd.DataFrame(model.feature_importance(), index=model.feature_name(), columns=['importance'])
# importance = importance.sort_values('importance', ascending=False).reset_index()
# important_cols = importance[(importance.importance > 100) & (~importance['index'].str.contains('self'))]
# important_cols
# All feature names, most important first
important_cols = list(importance.sort_values('importance', ascending=False).index)

In [14]:
importance

Unnamed: 0,importance
Distance,71
no_distance,5
is_full_discount,0
discount_x,73
discount_y,5
...,...
self_2_Coupon_id-Merchant_id-Date_received_count_Coupon_id_count,11
self_2_Coupon_id-Merchant_id-Date_received_count_Merchant_id_count,219
self_2_Coupon_id-Merchant_id-Date_received_count_Date_received_count,3
day,175


In [9]:
# Remove the features currently listed in bad_cols, keeping the importance order
important_cols = [i for i in important_cols if i not in bad_cols]
important_cols, len(important_cols)

(['day',
  'self_1_Date_received_discount_rate-std',
  'self_2_Coupon_id-Date_received_count_Coupon_id_count',
  'self_1_Coupon_id_Distance-mean',
  'self_1_Coupon_id_Distance-std',
  'off1_1_Merchant_id_Distance-mean',
  'self_1_Date_received_discount_rate-mean',
  'self_1_Coupon_id-Date_received_Distance-mean',
  'self_1_Date_received_Distance-mean',
  'self_1_Date_received_discount_y-mean',
  'self_2_Coupon_id-Merchant_id-Date_received_count_Merchant_id_count',
  'self_1_Coupon_id_count',
  'off1_1_User_id_discount_rate-mean',
  'self_1_Coupon_id-Date_received_Distance-std',
  'self_1_Coupon_id-Date_received_count',
  'self_2_Coupon_id-Date_received_count_Date_received_count',
  'self_1_Merchant_id_Distance-std',
  'self_1_Date_received_discount_y-std',
  'self_1_Merchant_id-Date_received_Distance-std',
  'self_1_User_id_discount_rate-std',
  'off1_1_User_id_discount_rate-std',
  'self_1_Date_received_no_distance-sum',
  'self_1_Merchant_id-Date_received_Distance-mean',
  'self_1_Us

In [388]:
# LightGBM parameters for evaluating the selected features (larger trees than the first pass)
params = {
    'task': 'train',
    'boosting_type': 'gbdt',        # boosting type
    'objective': 'binary',          # objective function
    'metric': {'l2', 'auc'},        # evaluation metrics
    'num_leaves': 63,               # number of leaves per tree
    'learning_rate': 0.05,          # learning rate
    'feature_fraction': 0.9,        # fraction of features sampled per tree
    'bagging_fraction': 0.8,        # fraction of rows sampled per tree
    'bagging_freq': 5,              # run bagging every bagging_freq iterations
    'verbose': 1,                   # verbosity
}

print(len(valid_set))
print(valid_set.Date_received.describe())
print()
print(train_set.Date_received.describe())
# Evaluate the selected features: train on valid_set and validate on 30000 shuffled
# rows of train_set.
# NOTE(review): sample() is called without random_state, so the validation rows
# differ from run to run.
model_m = train_model(valid_set, train_set.sample(frac=1).iloc[:30000, :], important_cols, num_boost_round=2000, stopping_round=100, par=params)

138303
count                  138303
unique                     31
top       2016-04-22 00:00:00
freq                     7381
first     2016-04-15 00:00:00
last      2016-05-15 00:00:00
Name: Date_received, dtype: object

count                  252586
unique                     31
top       2016-05-21 00:00:00
freq                    19859
first     2016-05-16 00:00:00
last      2016-06-15 00:00:00
Name: Date_received, dtype: object
Start training...
[LightGBM] [Info] Number of positive: 9247, number of negative: 129056
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22180
[LightGBM] [Info] Number of data points in the train set: 138303, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066860 -> initscore=-2.635947
[LightGBM] [Info] Start training from score -2.635947
[1]	valid_0's l2: 0.0807735	valid_0's auc: 0.786154
Training until validation scores do

In [389]:
# Feature importances of the evaluation model, indexed by feature name
importance_m = pd.DataFrame(model_m.feature_importance(), index=model_m.feature_name(), columns=['importance'])
importance_m

Unnamed: 0,importance
self_1_Date_received_discount_rate-std,68
day,56
self_1_Coupon_id-Date_received_Distance-mean,73
self_1_Coupon_id-Date_received_Distance-std,71
self_1_Coupon_id_Distance-mean,122
...,...
self_1_User_id-Merchant_id-Date_received_discount_x-mean,20
self_1_User_id-Merchant_id-Date_received_discount_y-std,19
off2_1_User_id_discount_x-max,6
self_1_User_id-Merchant_id_discount_y-max,6


In [390]:
# Features whose importance in the evaluation model is below 5 become removal candidates
bad_cols = list(importance_m[importance_m.importance < 5].index)
bad_cols

[]

In [411]:
# Final model: trained on the full labelled data, which is also used as the validation set
model_p = train_model(full_set, full_set, important_cols, num_boost_round=1000, stopping_round=10, par=params)

Start training...
[LightGBM] [Info] Number of positive: 32118, number of negative: 358771
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25492
[LightGBM] [Info] Number of data points in the train set: 390889, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.082167 -> initscore=-2.413268
[LightGBM] [Info] Start training from score -2.413268
[1]	valid_0's l2: 0.073326	valid_0's auc: 0.855498
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.0715112	valid_0's auc: 0.868946
[3]	valid_0's l2: 0.0698904	valid_0's auc: 0.876297
[4]	valid_0's l2: 0.0684274	valid_0's auc: 0.877749
[5]	valid_0's l2: 0.0670845	valid_0's auc: 0.879005
[6]	valid_0's l2: 0.0658617	valid_0's auc: 0.881281
[7]	valid_0's l2: 0.0647729	valid_0's auc: 0.882739
[8]	valid_0's l2: 0.0637788	valid_0's auc: 0.883831
[9]	valid_0's l2: 0.062872	valid_0's auc: 0.8842

In [412]:
def pred_test(_model, _test_set, _feat_cols, _save_name):
    """
    Score ``_test_set`` with ``_model`` and write a submission file.

    The file ``./submits/<_save_name>.csv`` has no header and no index; its columns
    are User_id, Coupon_id, Date_received (formatted as YYYYMMDD) and the score.

    :param _model:      trained model exposing ``predict`` and ``best_iteration``
    :param _test_set:   frame holding ``_feat_cols`` plus the three id/date columns
    :param _feat_cols:  feature columns passed to the model
    :param _save_name:  file name (without extension) of the submission
    :return:            None
    """
    scores = _model.predict(_test_set[_feat_cols], num_iteration=_model.best_iteration)
    id_cols = ['User_id', 'Coupon_id', 'Date_received']
    submit = _test_set[id_cols].assign(Probability=scores)
    out_path = f'./submits/{_save_name}.csv'
    submit.to_csv(out_path, index=False, header=False, date_format='%Y%m%d')

In [413]:
# Add this feature to the test set as an all-zero column.
# NOTE(review): presumably it is in important_cols but missing from the test CSV
# because all-zero columns are dropped per data set during feature generation -- confirm.
test_set['on2_1_User_id_fixed_consume-sum'] = 0

In [414]:
# Score the test set with the final model and write the submission file
pred_test(model_p, test_set, important_cols, 'importance_feats_submit_200_full_feats')

In [409]:
# Number of features used by the final model
len(important_cols)

200