In [1]:
# -*- coding:utf-8 -*-
"""

Author:
    ruiyan zry,15617240@qq.com

"""

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score
from scipy.stats import entropy
from gensim.models import Word2Vec
import time
import gc

pd.set_option('display.max_columns', None)

from core.utils import timeit, reduce_mem

In [2]:
##################################################################################################################
# Notes from the original author (translated):
# 1. Some explanations were already given in the previous open-source release and are not repeated here.
# 2. Four groups of features are built:
#    (1) history: the previous day's click count, exposure count and click-through rate;
#    (2) time gaps between the current exposure and the previous / next x exposures. The gap to the
#        *next* x exposures is a look-ahead ("time travel") feature, and it is the strongest one;
#    (3) second-order cross features;
#    (4) embeddings.
# 3. The plan was to keep open-sourcing until the end, but sharing is not allowed after the 15th, so this
#    is probably the last release; the other two tables are left for readers to explore.
# 4. Good luck - may everybody take first place.
##################################################################################################################


# Root data directory and its sub-folders (every entry keeps its trailing slash).
path = "/media/ryan/F/deep-learning-data/turing/vedio-predict/"
path_sub, path_npy, path_data, path_model, path_result, path_pickle, path_profile = (
    path + 'sub/',
    path + 'npy/',
    path + 'raw/',
    path + 'model/',
    path + 'result/',
    path + 'pickle/',
    path + 'profile/',
)

debug_small = False  # True -> read the "*_small" pickles
sub_sample = False   # True -> keep only devices whose id ends in '1' (ignored in debug mode)

if debug_small:
    train_file, test_file = 'train_small.pickle', 'test_small.pickle'
else:
    train_file, test_file = 'train.pickle', 'test.pickle'

train_df = pd.read_pickle(path_pickle + train_file)
test_df = pd.read_pickle(path_pickle + test_file)
# The app / user tables are not loaded:
# app = pd.read_pickle(path_pickle + 'app.pickle')    (or 'app_small.pickle' in debug mode)
# user = pd.read_pickle(path_pickle + 'user.pickle')  (or 'user_small.pickle' in debug mode)

if sub_sample and not debug_small:
    train_df = train_df[train_df['deviceid'].str[-1].eq('1')]
    test_df = test_df[test_df['deviceid'].str[-1].eq('1')]



In [3]:
print('=============================================== read train ===============================================')
t = time.time()  # start of the stopwatch that the later 'runtime:' prints are measured against
# train_df = pd.read_csv('dataset/train.csv')

# `ts` is divided by 1000 before being passed to time.localtime, so it is handled as epoch
# milliseconds and rendered in the machine's local time zone.
train_df['date'] = pd.to_datetime(
    train_df['ts'].map(lambda ms: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ms / 1000)))
)

# Original notes (translated): in the training set day=7 has only 11 rows while day=8 has 3,674,871
# (the note also gives a rough size for days 9 and 10). day=7 is a vanishingly small share, i.e. an
# anomaly. Open questions left by a reader: is folding it away reasonable, how does it behave on the
# leaderboard, why not simply drop those rows, and why is only `day` changed rather than ts/timestamp?
train_df['day'] = train_df['date'].dt.day.replace(7, 8)
train_df['hour'] = train_df['date'].dt.hour
train_df['minute'] = train_df['date'].dt.minute

labels = train_df['target'].values
train_num = len(train_df)
print('runtime:', time.time() - t)

runtime: 22.409448385238647


In [4]:
print('=============================================== click data ===============================================')
# Clicked exposures only, ordered by click time.
click_df = train_df.loc[train_df['target'] == 1].sort_values('timestamp').reset_index(drop=True)
# Keep rows whose click time is not earlier than the exposure time (non-negative gap).
click_df = click_df.loc[(click_df['timestamp'] - click_df['ts']) >= 0].reset_index(drop=True)
# Day of month of the *click* time (local time); day 7 is folded into day 8 as for the exposures.
click_df['day'] = pd.to_datetime(
    click_df['timestamp'].map(lambda ms: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ms / 1000)))
).dt.day.replace(7, 8)

# The label and the click time must not remain in the feature frame.
train_df.drop(['target', 'timestamp'], axis=1, inplace=True)

# Reader's question (translated): why are these fields removed from click_df?
click_df = click_df.drop(['date', 'timestamp', 'ts', 'target', 'hour', 'minute'], axis=1)
print('runtime:', time.time() - t)

runtime: 34.62693381309509


In [5]:
print('=============================================== read test ===============================================')
# Same millisecond -> local datetime conversion as for the training set.
test_df['date'] = pd.to_datetime(
    test_df['ts'].map(lambda ms: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ms / 1000)))
)

# Original note (translated): in the test set day=10 has 32 rows and day=11 has 3,653,560, roughly
# 1 in 100,000 - an anomaly, so folding it into day 11 is considered reasonable.
test_df['day'] = test_df['date'].dt.day.replace(10, 11)
test_df['hour'] = test_df['date'].dt.hour
test_df['minute'] = test_df['date'].dt.minute

# Train rows first, test rows after; the original row labels are kept (ignore_index=False).
df = pd.concat([train_df, test_df], axis=0, ignore_index=False)
del train_df, test_df
df.drop('date', axis=1, inplace=True)
gc.collect()
print('runtime:', time.time() - t)

runtime: 59.27189826965332


In [6]:
print('============================================= category encoding =============================================')
# Combined location key built from the string forms of longitude and latitude.
df['lng_lat'] = df['lng'].astype('str') + '_' + df['lat'].astype('str')
del df['guid']
click_df['lng_lat'] = click_df['lng'].astype('str') + '_' + click_df['lat'].astype('str')
# Time-ordered copy of the full frame; the embedding step builds its sequences from it.
sort_df = df.sort_values('ts').reset_index(drop=True)
cate_cols = [
    'deviceid', 'newsid', 'pos', 'app_version', 'device_vendor',
    'netmodel', 'osversion', 'device_version', 'lng', 'lat', 'lng_lat'
]
for f in cate_cols:
    print(f)
    # Label-encode with codes 0..k-1 in order of first appearance; missing values become -1.
    # The codes are derived from the non-null uniques only: pairing `unique()` (which contains NaN)
    # with `range(nunique())` (which does not count NaN) made zip() drop the last real category,
    # so that category was silently encoded as -1 whenever the column had missing values.
    categories = df[f].dropna().unique()
    map_dict = dict(zip(categories, range(len(categories))))
    df[f] = df[f].map(map_dict).fillna(-1).astype('int32')
    click_df[f] = click_df[f].map(map_dict).fillna(-1).astype('int32')
    sort_df[f] = sort_df[f].map(map_dict).fillna(-1).astype('int32')
    # Frequency feature: number of rows (train + test) sharing the same encoded value.
    df[f + '_count'] = df[f].map(df[f].value_counts())
df = reduce_mem(df)
click_df = reduce_mem(click_df)
sort_df = reduce_mem(sort_df)
print('runtime:', time.time() - t)

deviceid
newsid
pos
app_version
device_vendor
netmodel
osversion
device_version
lng
lat
lng_lat
2580.12 Mb, 1390.40 Mb (46.11 %)
77.91 Mb, 46.97 Mb (39.71 %)
1204.05 Mb, 673.70 Mb (44.05 %)
runtime: 185.4272334575653


In [7]:
print('============================================= feat engineer =============================================')

print('*************************** history stats ***************************')
for f in [
    ['deviceid'],
    ['pos', 'deviceid'],
    # ...
]:
    prefix = '_'.join(f)
    print('------------------ {} ------------------'.format(prefix))
    click_col = prefix + '_prev_day_click_count'
    count_col = prefix + '_prev_day_count'
    ctr_col = prefix + '_prev_day_ctr'
    keys = f + ['day']

    # Previous-day click count: count clicks per (keys, day), then shift `day` forward by one so
    # that the left merge attaches day d-1's count to the rows of day d.
    # `count().reset_index(name=...)` is used instead of the dict-renaming form
    # `agg({name: 'count'})`, which is deprecated and raises SpecificationError on pandas >= 1.0.
    tmp = click_df[keys + ['id']].groupby(keys)['id'].count().reset_index(name=click_col)
    tmp['day'] += 1
    df = df.merge(tmp, on=keys, how='left')
    df[click_col] = df[click_col].fillna(0)
    # Day 8 is the earliest day in the data, so it has no previous day: mark as missing, not 0.
    df.loc[df['day'] == 8, click_col] = np.nan

    # Previous-day exposure count, built the same way from the full (train + test) frame.
    tmp = df[keys + ['id']].groupby(keys)['id'].count().reset_index(name=count_col)
    tmp['day'] += 1
    df = df.merge(tmp, on=keys, how='left')
    df[count_col] = df[count_col].fillna(0)
    df.loc[df['day'] == 8, count_col] = np.nan

    # Previous-day click-through rate, smoothed by adding the column mean of the exposure count
    # to the denominator.
    df[ctr_col] = df[click_col] / (df[count_col] + df[count_col].mean())

    del tmp
    print('runtime:', time.time() - t)
del click_df
df = reduce_mem(df)

*************************** history stats ***************************
------------------ deviceid ------------------
runtime: 197.60648822784424
------------------ pos_deviceid ------------------
runtime: 212.74193501472473
2078.43 Mb, 1562.40 Mb (24.83 %)


In [None]:
print('*************************** cross feat (second order) ***************************')
# Second-order cross features (disabled); higher-order crosses could be added the same way.
# cross_cols = ['deviceid', 'newsid', 'pos', 'netmodel', 'lng_lat']
# for f in cross_cols:
#     for col in cross_cols:
#         if col == f:
#             continue
#         print('------------------ {} {} ------------------'.format(f, col))
#         df = df.merge(df[[f, col]].groupby(f, as_index=False)[col].agg({
#             'cross_{}_{}_nunique'.format(f, col): 'nunique',
#             'cross_{}_{}_ent'.format(f, col): lambda x: entropy(x.value_counts() / x.shape[0])  # entropy
#         }), on=f, how='left')
#         if 'cross_{}_{}_count'.format(f, col) not in df.columns.values and 'cross_{}_{}_count'.format(col,
#                                                                                                       f) not in df.columns.values:
#             df = df.merge(df[[f, col, 'id']].groupby([f, col], as_index=False)['id'].agg({
#                 'cross_{}_{}_count'.format(f, col): 'count'  # co-occurrence count
#             }), on=[f, col], how='left')
#         if 'cross_{}_{}_count_ratio'.format(col, f) not in df.columns.values:
#             df['cross_{}_{}_count_ratio'.format(col, f)] = df['cross_{}_{}_count'.format(f, col)] / df[
#                 f + '_count']  # preference ratio
#         if 'cross_{}_{}_count_ratio'.format(f, col) not in df.columns.values:
#             df['cross_{}_{}_count_ratio'.format(f, col)] = df['cross_{}_{}_count'.format(f, col)] / df[
#                 col + '_count']  # preference ratio
#         df['cross_{}_{}_nunique_ratio_{}_count'.format(f, col, f)] = df['cross_{}_{}_nunique'.format(f, col)] / df[
#             f + '_count']
#         print('runtime:', time.time() - t)
#     df = reduce_mem(df)

# The row identifier was only needed for the count aggregations; remove it from the feature frame.
df.drop('id', axis=1, inplace=True)
gc.collect()

In [8]:
print('*************************** embedding ***************************')


# A friend once gave what I think is a very vivid analogy for embeddings:
# on the dating show "If You Are the One", if you want to understand a female guest, look at which
# male guests she has shown interest in; and the other way round, if you want to get to know a male
# guest, look at which female guests he has picked.


def emb(df, f1, f2, emb_size=8):
    """Embed each ``f1`` value as the mean Word2Vec vector of its ``f2`` values.

    Rows of ``df`` are grouped by ``f1``; the ``f2`` values of every group, in row order and
    converted to strings, form one "sentence". A Word2Vec model is trained on all sentences and
    each group is represented by the mean vector of those of its tokens that are in the model
    vocabulary (a zero vector when none are).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``f1`` and ``f2``.
    f1 : str
        Grouping column; the result has one row per distinct value of it.
    f2 : str
        Column whose values are used as the tokens of each sentence.
    emb_size : int, default 8
        Dimensionality of the Word2Vec vectors and number of output columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``f1`` and ``'{f1}_{f2}_emb_{i}'`` for ``i`` in ``range(emb_size)``, after being
        passed through ``reduce_mem``.

    Notes
    -----
    Prints the elapsed time relative to the module-level ``t``.
    ``size=`` is the gensim 3.x keyword (gensim 4 renamed it to ``vector_size``).
    According to the gensim documentation a fixed ``seed`` alone does not make multi-worker
    training fully reproducible; that additionally needs ``workers=1``.
    """
    print('====================================== {} {} ======================================'.format(f1, f2))
    list_col = '{}_{}_list'.format(f1, f2)
    # `apply(list).reset_index(name=...)` replaces the dict-renaming `agg({name: list})`, which is
    # deprecated and raises SpecificationError on pandas >= 1.0.
    tmp = df.groupby(f1)[f2].apply(list).reset_index(name=list_col)
    # Word2Vec expects string tokens.
    sentences = [[str(x) for x in seq] for seq in tmp[list_col]]
    del tmp[list_col]
    model = Word2Vec(sentences, size=emb_size, window=5, min_count=5, sg=0, hs=1, seed=2019)
    # Look words up through the KeyedVectors object; `model[w]` / `w in model` are deprecated.
    wv = model.wv

    # One row per sentence, preallocated with zeros so that groups without any in-vocabulary
    # token (every token rarer than min_count) keep an all-zero vector and the array is always 2-D.
    emb_matrix = np.zeros((len(sentences), emb_size), dtype=np.float32)
    for row, seq in enumerate(sentences):
        vec = [wv[w] for w in seq if w in wv]
        if vec:
            emb_matrix[row] = np.mean(vec, axis=0)

    for i in range(emb_size):
        tmp['{}_{}_emb_{}'.format(f1, f2, i)] = emb_matrix[:, i]
    del model, wv, emb_matrix, sentences
    tmp = reduce_mem(tmp)
    print('runtime:', time.time() - t)
    return tmp


emb_cols = [
    ['deviceid', 'newsid'],
    ['deviceid', 'lng_lat'],
    ['newsid', 'lng_lat'],
    # ...
]
for f1, f2 in emb_cols:
    # Embed the pair in both directions: f1 described by its f2 sequence, then f2 by its f1 sequence.
    for key_col, token_col in ((f1, f2), (f2, f1)):
        df = df.merge(emb(sort_df, key_col, token_col), on=key_col, how='left')
# The time-ordered copy is only needed for building the sequences.
del sort_df
gc.collect()


*************************** embedding ***************************


INFO:MainThread:gensim.models.word2vec:collecting all words and their counts
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 1903504 words, keeping 448947 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 3645506 words, keeping 652836 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 5610194 words, keeping 821406 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 7484615 words, keeping 947066 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 9175854 words, keeping 1050483 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 11009015 words, keeping 1147302 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 12879752 words, 

INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 46.05% examples, 486889 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 48.42% examples, 485480 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 51.09% examples, 484130 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 53.58% examples, 482114 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 56.14% examples, 482830 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 58.72% examples, 481748 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 61.21% examples, 480397 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 63.75% examples, 478860 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:

INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 81.94% examples, 491121 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 99.87% examples, 492182 words/s, in_qsize 1, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:MainThread:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:MainThread:gensim.models.base_any2vec:EPOCH - 4 : training on 15030273 raw words (13004444 effective words) took 26.4s, 492247 effective words/s
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 3.10% examples, 549844 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 5.71% examples, 530687 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROG

8.74 Mb, 3.06 Mb (65.00 %)
runtime: 551.7352843284607


INFO:MainThread:gensim.models.word2vec:collecting all words and their counts
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 3187644 words, keeping 107895 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 4476881 words, keeping 110527 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 5376595 words, keeping 111711 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 6072945 words, keeping 112426 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 6610584 words, keeping 112810 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 7130372 words, keeping 113121 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 7581639 words, keep

INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #670000, processed 13612133 words, keeping 114552 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #680000, processed 13643937 words, keeping 114555 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #690000, processed 13672737 words, keeping 114555 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #700000, processed 13701995 words, keeping 114557 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #710000, processed 13728227 words, keeping 114557 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #720000, processed 13756050 words, keeping 114558 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #730000, processed 13781796 words, keeping 114559 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #740000, processed 13807174 words, keeping 114559 word types
INFO:MainThread:gensim.m

INFO:MainThread:gensim.models.word2vec:effective_min_count=5 leaves 15005254 word corpus (99% of original 15030273, drops 25019)
INFO:MainThread:gensim.models.word2vec:deleting the raw counts dictionary of 114584 items
INFO:MainThread:gensim.models.word2vec:sample=0.001 downsamples 4 most-common words
INFO:MainThread:gensim.models.word2vec:downsampling leaves estimated 14929795 word corpus (99.5% of prior 15005254)
INFO:MainThread:gensim.models.word2vec:constructing a huffman tree from 107489 words
INFO:MainThread:gensim.models.word2vec:built huffman tree with maximum node depth 22
INFO:MainThread:gensim.models.base_any2vec:estimated required memory for 107489 words and 8 dimensions: 85561244 bytes
INFO:MainThread:gensim.models.word2vec:resetting layer weights
INFO:MainThread:gensim.models.base_any2vec:training model with 3 workers on 107489 vocabulary and 8 features, using sg=0 hs=1 sample=0.001 negative=5 window=5
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 0.05

INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 0.45% examples, 584633 words/s, in_qsize 3, out_qsize 2
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 0.70% examples, 582990 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 1.01% examples, 585857 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 1.40% examples, 587455 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 1.85% examples, 587723 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 2.41% examples, 591123 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 3.07% examples, 589757 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 3.86% examples, 586852 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:gensim.m

INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 10.93% examples, 595431 words/s, in_qsize 3, out_qsize 2
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 13.56% examples, 596832 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 16.57% examples, 594566 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 21.24% examples, 596893 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 27.58% examples, 598077 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 35.66% examples, 597266 words/s, in_qsize 5, out_qsize 2
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 46.77% examples, 595638 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 62.12% examples, 592569 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:

100.68 Mb, 35.24 Mb (65.00 %)
runtime: 960.2155692577362


INFO:MainThread:gensim.models.word2vec:collecting all words and their counts
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 1903504 words, keeping 52930 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 3645506 words, keeping 104101 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 5610194 words, keeping 156870 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 7484615 words, keeping 209120 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 9175854 words, keeping 260716 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 11009015 words, keeping 312796 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 12879752 words, kee

INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 6.19% examples, 1103699 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 12.60% examples, 1112076 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 19.04% examples, 1094972 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 25.16% examples, 1102263 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 31.49% examples, 1102753 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 38.35% examples, 1100393 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 44.91% examples, 1096183 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 51.09% examples, 1094150 words/s, in_qsize 5, out_qsize 0
INFO:Main

8.74 Mb, 3.06 Mb (65.00 %)
runtime: 1201.9578239917755


INFO:MainThread:gensim.models.word2vec:collecting all words and their counts
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 3010741 words, keeping 72347 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 3563037 words, keeping 74398 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 4080032 words, keeping 76356 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 4585441 words, keeping 78410 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 5111333 words, keeping 80524 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 5650600 words, keeping 82462 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 6228623 words, keeping 84

INFO:MainThread:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:MainThread:gensim.models.base_any2vec:EPOCH - 1 : training on 15030273 raw words (12549494 effective words) took 8.9s, 1413956 effective words/s
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 6.24% examples, 1613319 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 12.90% examples, 1622958 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 19.23% examples, 1612006 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 26.28% examples, 1608195 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 33.00% examples, 1601870 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 55.58% examples, 1547924 words/s, in_qsize 5, out_qsize 0
INFO:M

35.64 Mb, 12.48 Mb (65.00 %)
runtime: 1423.9153490066528


INFO:MainThread:gensim.models.word2vec:collecting all words and their counts
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 3187644 words, keeping 321038 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 4476881 words, keeping 365291 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 5376595 words, keeping 387271 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 6072945 words, keeping 401861 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 6610584 words, keeping 410887 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 7130372 words, keeping 418301 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 7581639 words, keep

INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #670000, processed 13612133 words, keeping 463157 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #680000, processed 13643937 words, keeping 463263 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #690000, processed 13672737 words, keeping 463347 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #700000, processed 13701995 words, keeping 463448 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #710000, processed 13728227 words, keeping 463537 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #720000, processed 13756050 words, keeping 463632 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #730000, processed 13781796 words, keeping 463708 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #740000, processed 13807174 words, keeping 463791 word types
INFO:MainThread:gensim.m

INFO:MainThread:gensim.models.word2vec:effective_min_count=5 leaves 14612974 word corpus (97% of original 15030273, drops 417299)
INFO:MainThread:gensim.models.word2vec:deleting the raw counts dictionary of 467201 items
INFO:MainThread:gensim.models.word2vec:sample=0.001 downsamples 3 most-common words
INFO:MainThread:gensim.models.word2vec:downsampling leaves estimated 12442394 word corpus (85.1% of prior 14612974)
INFO:MainThread:gensim.models.word2vec:constructing a huffman tree from 326144 words
INFO:MainThread:gensim.models.word2vec:built huffman tree with maximum node depth 21
INFO:MainThread:gensim.models.base_any2vec:estimated required memory for 326144 words and 8 dimensions: 259610624 bytes
INFO:MainThread:gensim.models.word2vec:resetting layer weights
INFO:MainThread:gensim.models.base_any2vec:training model with 3 workers on 326144 vocabulary and 8 features, using sg=0 hs=1 sample=0.001 negative=5 window=5
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 0.

INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 0.80% examples, 512574 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 1.13% examples, 508863 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 1.50% examples, 506023 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 2.03% examples, 509617 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 2.57% examples, 503904 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 3.33% examples, 504706 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 4.15% examples, 502449 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 5.04% examples, 498731 words/s, in_qsize 5, out_qsize 1
INFO:MainThread:gensim.m

INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 10.80% examples, 495557 words/s, in_qsize 4, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 13.14% examples, 494397 words/s, in_qsize 6, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 16.27% examples, 495142 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 20.03% examples, 493291 words/s, in_qsize 5, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 25.40% examples, 492932 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 32.20% examples, 490857 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 42.37% examples, 490975 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 59.21% examples, 493632 words/s, in_qsize 5, out_qsize 1
INFO:MainThread:

100.68 Mb, 35.24 Mb (65.00 %)
runtime: 1886.4034266471863


INFO:MainThread:gensim.models.word2vec:collecting all words and their counts
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 3010741 words, keeping 565871 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 3563037 words, keeping 627713 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 4080032 words, keeping 681175 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 4585441 words, keeping 729792 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 5111333 words, keeping 777973 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 5650600 words, keeping 826258 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 6228623 words, keep

INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 23.68% examples, 463551 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 25.94% examples, 463548 words/s, in_qsize 6, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 28.20% examples, 464875 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 30.49% examples, 464800 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 32.57% examples, 462320 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 35.07% examples, 462796 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 46.87% examples, 465097 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 63.99% examples, 469690 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:

INFO:MainThread:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:MainThread:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:MainThread:gensim.models.base_any2vec:EPOCH - 3 : training on 15030273 raw words (11206052 effective words) took 24.4s, 459526 effective words/s
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 1.97% examples, 490275 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 4.12% examples, 484582 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 6.35% examples, 476298 words/s, in_qsize 5, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 8.59% examples, 474894 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 10.78% examples, 471657 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_an

35.64 Mb, 12.48 Mb (65.00 %)
runtime: 2255.636177778244


0

In [9]:
print('======================================== prepare train & valid  =============================================')
# The first `train_num` rows of the combined frame are the training rows, the rest the test rows.
train_df = df.iloc[:train_num].reset_index(drop=True)
test_df = df.iloc[train_num:].reset_index(drop=True)
del df
gc.collect()

# Time-based hold-out: days before the 10th are used for fitting, day 10 for validation.
# After the reset above, row labels equal row positions, so positions can index `labels` too.
train_idx = np.flatnonzero(train_df['day'].values < 10).tolist()
val_idx = np.flatnonzero(train_df['day'].values == 10).tolist()

train_x = train_df.iloc[train_idx].reset_index(drop=True)
val_x = train_df.iloc[val_idx].reset_index(drop=True)
train_y = labels[train_idx]
val_y = labels[val_idx]

# `day` was only needed for the split; remove it from every frame that is fed to the model.
for frame in (train_x, val_x, train_df, test_df):
    del frame['day']
gc.collect()
print('runtime:', time.time() - t)
print('========================================================================================================')


runtime: 2270.176098585129


In [14]:
# train_df.to_pickle(path_pickle + "81_all_final_train.pickle")
# test_df.to_pickle(path_pickle + "81_all_final_test.pickle")

In [18]:
print('=============================================== training validate ===============================================')
fea_imp_list = []

# NOTE(review): per the LightGBM parameter docs, `subsample` only takes effect together with a
# non-zero `subsample_freq`; as written row subsampling is presumably inactive - confirm intent.
valid_params = dict(
    learning_rate=0.01,
    n_estimators=5000,  # upper bound; early stopping picks the actual number of rounds
    num_leaves=255,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=2019,
    metric=None,
)
clf = LGBMClassifier(**valid_params)

print('************** training **************')
# Fit on the earlier days and monitor AUC on the held-out day, stopping once it has not improved
# for 200 rounds.
clf.fit(
    train_x,
    train_y,
    eval_set=[(val_x, val_y)],
    eval_metric='auc',
    early_stopping_rounds=200,
    categorical_feature=cate_cols,
    verbose=50,
)
print('runtime:', time.time() - t)

print('************** validate predict **************')
best_auc = clf.best_score_['valid_0']['auc']
best_rounds = clf.best_iteration_  # reused as n_estimators when refitting on all training rows
val_pred = clf.predict_proba(val_x)[:, 1]  # probability of the positive class
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

************** training **************
Training until validation scores don't improve for 200 rounds
[50]	valid_0's auc: 0.900812
[100]	valid_0's auc: 0.904232
[150]	valid_0's auc: 0.905775
[200]	valid_0's auc: 0.906636
[250]	valid_0's auc: 0.907599
[300]	valid_0's auc: 0.9082
[350]	valid_0's auc: 0.908885
[400]	valid_0's auc: 0.909591
[450]	valid_0's auc: 0.909915
[500]	valid_0's auc: 0.910137
[550]	valid_0's auc: 0.910315
[600]	valid_0's auc: 0.910425
[650]	valid_0's auc: 0.91048
[700]	valid_0's auc: 0.910514
[750]	valid_0's auc: 0.910545
[800]	valid_0's auc: 0.910553
[850]	valid_0's auc: 0.910536
[900]	valid_0's auc: 0.910497
[950]	valid_0's auc: 0.910465
Early stopping, best iteration is:
[797]	valid_0's auc: 0.910567
runtime: 7443.5404760837555
************** validate predict **************
runtime: 9096.90061211586


In [19]:
print('=============================================== training predict ===============================================')
# Refit on the complete training frame (validation day included), using the
# number of boosting rounds selected by early stopping in the validation run.
# NOTE(review): in LightGBM `subsample` only takes effect when
# `subsample_freq` > 0 (default 0) — confirm whether bagging was intended.
final_params = dict(
    learning_rate=0.01,
    n_estimators=best_rounds,
    num_leaves=255,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=2019,
)
clf = LGBMClassifier(**final_params)

print('************** training **************')
# The training data doubles as the eval set, so the logged metric (every 50
# rounds) is training loss only — it is a progress indicator, not a
# generalisation estimate.
final_fit_options = dict(
    eval_set=[(train_df, labels)],
    categorical_feature=cate_cols,
    verbose=50,
)
clf.fit(train_df, labels, **final_fit_options)
print('runtime:', time.time() - t)

************** training **************
[50]	training's binary_logloss: 0.250592
[100]	training's binary_logloss: 0.217604
[150]	training's binary_logloss: 0.20104
[200]	training's binary_logloss: 0.191788
[250]	training's binary_logloss: 0.186001
[300]	training's binary_logloss: 0.182126
[350]	training's binary_logloss: 0.17927
[400]	training's binary_logloss: 0.1769
[450]	training's binary_logloss: 0.174873
[500]	training's binary_logloss: 0.173101
[550]	training's binary_logloss: 0.171569
[600]	training's binary_logloss: 0.170175
[650]	training's binary_logloss: 0.168934
[700]	training's binary_logloss: 0.167762
[750]	training's binary_logloss: 0.166652
runtime: 12322.643056631088


In [20]:
print('************** test predict **************')
# Load the submission template and attach the positive-class probability
# predicted by the full-data model for every test row.
sub = pd.read_csv(path_data + 'sample.csv').assign(
    target=clf.predict_proba(test_df)[:, 1]
)
# Record this model's importances next to those already in `fea_imp_list`.
fea_imp_list.extend([clf.feature_importances_])
print('runtime:', time.time() - t)


************** test predict **************
runtime: 13446.176961421967


In [21]:
print('=============================================== feat importances ===============================================')
# The feature importances are worth a careful look.
# Average the importance vectors collected in `fea_imp_list`, pair them with
# the training column names, and list them from most to least important.
mean_importance = np.mean(fea_imp_list, axis=0)
fea_imp_dict = {
    col: score for col, score in zip(train_df.columns.values, mean_importance)
}
fea_imp_item = sorted(fea_imp_dict.items(), key=lambda pair: pair[1], reverse=True)
for col, score in fea_imp_item:
    print('{} = {}'.format(col, score))

deviceid = 73627.0
newsid = 31846.5
device_version = 28110.5
lat = 14211.5
lng = 13346.0
lng_lat = 7782.0
deviceid_count = 2736.5
lng_lat_count = 2446.5
deviceid_newsid_emb_6 = 1794.5
deviceid_newsid_emb_5 = 1572.0
netmodel_count = 1354.5
deviceid_newsid_emb_0 = 1344.5
pos = 1298.5
lng_lat_newsid_emb_1 = 1193.0
pos_count = 1122.0
deviceid_newsid_emb_2 = 1081.0
deviceid_prev_day_ctr = 873.5
pos_deviceid_prev_day_ctr = 805.0
deviceid_newsid_emb_4 = 742.0
pos_deviceid_prev_day_count = 735.0
newsid_count = 671.0
hour = 668.0
deviceid_newsid_emb_1 = 659.0
deviceid_newsid_emb_7 = 652.0
deviceid_prev_day_count = 634.5
lng_lat_newsid_emb_7 = 619.5
deviceid_newsid_emb_3 = 556.5
lng_count = 549.0
lng_lat_newsid_emb_0 = 516.0
ts = 507.0
newsid_deviceid_emb_6 = 456.0
newsid_lng_lat_emb_6 = 442.0
lat_count = 419.0
newsid_deviceid_emb_3 = 382.0
newsid_lng_lat_emb_1 = 378.5
lng_lat_newsid_emb_6 = 321.5
netmodel = 313.5
lng_lat_newsid_emb_4 = 309.0
newsid_deviceid_emb_0 = 306.0
minute = 305.0
app_vers

In [22]:
print('=============================================== threshold search ===============================================')
# F1 is sensitive to the decision threshold, so run a simple grid search:
# 201 candidate thresholds from 0.05 to 0.45 in steps of 0.002.
t0 = 0.05
v = 0.002
best_t = t0
best_f1 = 0

# Work on an array view of the validation probabilities so every candidate
# threshold is applied with one vectorized comparison instead of a Python
# loop over every row.
val_prob = np.asarray(val_pred)
for step in range(201):
    curr_t = t0 + step * v
    curr_f1 = f1_score(val_y, (val_prob >= curr_t).astype(int))
    # Strict '>' keeps the lowest threshold when several give the same F1.
    if curr_f1 > best_f1:
        best_t = curr_t
        best_f1 = curr_f1
        print('step: {}   best threshold: {}   best f1: {}'.format(step, best_t, best_f1))
print('search finish.')

# Keep the thresholded labels in their own variable. `val_pred` must stay as
# probabilities, otherwise re-running this cell would search thresholds over
# 0/1 values and silently report a meaningless best threshold.
val_pred_label = (val_prob >= best_t).astype(int)
print('\nbest auc:', best_auc)
print('best f1:', f1_score(val_y, val_pred_label))
print('validate mean:', np.mean(val_pred_label))
print('runtime:', time.time() - t)

step: 0   best threshold: 0.05   best f1: 0.4429750057829941
step: 1   best threshold: 0.052000000000000005   best f1: 0.44584488302145736
step: 2   best threshold: 0.054000000000000006   best f1: 0.4485423035291598
step: 3   best threshold: 0.056   best f1: 0.45125752296864097
step: 4   best threshold: 0.058   best f1: 0.45392608805128903
step: 5   best threshold: 0.060000000000000005   best f1: 0.45646384513097654
step: 6   best threshold: 0.062   best f1: 0.458943098941178
step: 7   best threshold: 0.064   best f1: 0.4613622522809429
step: 8   best threshold: 0.066   best f1: 0.46378214076022384
step: 9   best threshold: 0.068   best f1: 0.46629928417225314
step: 10   best threshold: 0.07   best f1: 0.46879860699202425
step: 11   best threshold: 0.07200000000000001   best f1: 0.47121908724499845
step: 12   best threshold: 0.07400000000000001   best f1: 0.4735719134904471
step: 13   best threshold: 0.07600000000000001   best f1: 0.4759132629855955
step: 14   best threshold: 0.078   b

In [24]:
print('=============================================== sub save ===============================================')
# First export: raw probabilities. The file name records the validation AUC,
# the best validation F1 and the mean predicted probability.
prob_file = 'sub_prob_{}_{}_{}.csv'.format(best_auc, best_f1, sub['target'].mean())
sub.to_csv(prob_file, index=False)

# Second export: hard 0/1 labels obtained with the threshold chosen on the
# validation day. The mean in this file name is the predicted positive rate.
sub['target'] = sub['target'].ge(best_t).astype('int64')
label_file = 'sub_{}_{}_{}.csv'.format(best_auc, best_f1, sub['target'].mean())
sub.to_csv(label_file, index=False)
print('runtime:', time.time() - t)
print('finish.')
print('========================================================================================================')


runtime: 15559.546869516373
finish.
