In [1]:
# -*- coding:utf-8 -*-
"""

Author:
    ruiyan zry,15617240@qq.com

"""

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score
from scipy.stats import entropy
from gensim.models import Word2Vec
import time
import gc

pd.set_option('display.max_columns', None)

from core.utils import timeit, reduce_mem

In [2]:
##################################################################################################################
# Notes from the original author (translated):
# 1. Some explanations were already given in the previous open-source release and are not repeated here.
# 2. Four groups of features are built:
#    (1) history: the previous day's click count, exposure count and click-through rate;
#    (2) time gaps from the current exposure to the x-th previous / x-th next exposure.
#        The "next" gaps look into the future (leakage) and are the strongest features;
#    (3) second-order cross features;
#    (4) embeddings.
# 3. The author planned to keep releasing code, but open-sourcing was not allowed after the 15th,
#    so this is the last release; the other two tables are left for readers to explore.
# 4. Good luck to everyone.
##################################################################################################################


# Data root and its sub-directories.
path = "/media/ryan/F/deep-learning-data/turing/vedio-predict/"
path_sub = path + 'sub/'
path_npy = path + 'npy/'
path_data = path + 'raw/'
path_model = path + 'model/'
path_result = path + 'result/'
path_pickle = path + 'pickle/'
path_profile = path + 'profile/'

# debug_small: load the small pickles instead of the full ones.
# sub_sample: on the full data only, keep devices whose id ends in '1'.
debug_small = False
sub_sample = False

# Choose the pickle file names for the selected mode, then load both frames.
# (The app and user tables are not loaded in this notebook.)
if debug_small:
    train_pickle, test_pickle = 'train_small.pickle', 'test_small.pickle'
else:
    train_pickle, test_pickle = 'train.pickle', 'test.pickle'
train_df = pd.read_pickle(path_pickle + train_pickle)
test_df = pd.read_pickle(path_pickle + test_pickle)

# Sub-sampling is only ever applied to the full data set.
if sub_sample and not debug_small:
    train_df = train_df[train_df.deviceid.str[-1] == '1']
    test_df = test_df[test_df.deviceid.str[-1] == '1']



In [3]:
print('=============================================== read train ===============================================')
t = time.time()  # start of the wall-clock timer reported by every later 'runtime:' print

# Convert the millisecond exposure timestamp `ts` into a datetime via a formatted string.
# NOTE(review): time.localtime uses the timezone of the machine running the notebook, so
# day/hour/minute depend on where this is executed -- confirm the intended timezone.
ts_format = '%Y-%m-%d %H:%M:%S'
train_df['date'] = pd.to_datetime(
    train_df['ts'].map(lambda ms: time.strftime(ts_format, time.localtime(ms / 1000)))
)
exposure_dt = train_df['date'].dt
train_df['day'] = exposure_dt.day

# Original author's notes (translated): in the training set day=7 has only 11 rows while
# day=8 has 3,674,871 (the note also gives a rough figure of "40w" for days 9 and 10).
# day=7 is under one in a million rows, i.e. an anomaly. Open questions left by the author:
# is folding it away reasonable, how does it behave online, why not simply drop the rows,
# and why is only `day` changed rather than ts / timestamp themselves?
is_day7 = train_df['day'] == 7
train_df.loc[is_day7, 'day'] = 8
train_df['hour'] = exposure_dt.hour
train_df['minute'] = exposure_dt.minute

# Remember the train size and labels before train and test are concatenated later.
train_num = train_df.shape[0]
labels = train_df['target'].values
print('runtime:', time.time() - t)

runtime: 22.79059624671936


In [4]:
print('=============================================== click data ===============================================')
# Clicked rows only, ordered by click time.
clicked_rows = train_df['target'] == 1
click_df = train_df[clicked_rows].sort_values('timestamp').reset_index(drop=True)

# Keep clicks whose click time is not earlier than the exposure time.
exposure_click_gap = click_df['timestamp'] - click_df['ts']
click_df = click_df[exposure_click_gap >= 0].reset_index(drop=True)

# For clicks, `day` is derived from the click time (`timestamp`), not the exposure time.
# NOTE(review): time.localtime depends on the machine's timezone -- confirm the intended one.
ts_format = '%Y-%m-%d %H:%M:%S'
click_df['date'] = pd.to_datetime(
    click_df['timestamp'].map(lambda ms: time.strftime(ts_format, time.localtime(ms / 1000)))
)
click_df['day'] = click_df['date'].dt.day
# Same day-7 -> day-8 folding as applied to the training exposures.
click_df.loc[click_df['day'] == 7, 'day'] = 8

# The label and click time must not remain in the feature frame.
for label_col in ('target', 'timestamp'):
    del train_df[label_col]

# Drop the columns that the later per-day click statistics do not use.
# (The original author left a note asking why these fields are removed.)
click_df = click_df.drop(columns=['date', 'timestamp', 'ts', 'target', 'hour', 'minute'])
print('runtime:', time.time() - t)

runtime: 28.208784103393555


In [5]:
print('=============================================== read test ===============================================')
# Same ts -> date/day/hour/minute derivation as for the training frame.
# NOTE(review): time.localtime depends on the machine's timezone -- confirm the intended one.
ts_format = '%Y-%m-%d %H:%M:%S'
test_df['date'] = pd.to_datetime(
    test_df['ts'].map(lambda ms: time.strftime(ts_format, time.localtime(ms / 1000)))
)
test_dt = test_df['date'].dt
test_df['day'] = test_dt.day

# Original author's note (translated): in the test set day=10 has 32 rows versus 3,653,560
# for day=11 (about 1 in 100,000), an anomaly, so those rows are folded into day 11.
is_day10 = test_df['day'] == 10
test_df.loc[is_day10, 'day'] = 11
test_df['hour'] = test_dt.hour
test_df['minute'] = test_dt.minute

# Stack train on top of test; the first `train_num` rows are the training rows.
df = pd.concat([train_df, test_df], axis=0, ignore_index=False)
del df['date']
del train_df, test_df
gc.collect()
print('runtime:', time.time() - t)

runtime: 39.0877902507782


In [6]:
print('============================================= category encoding =============================================')
# Combined location key built from the string forms of longitude and latitude.
df['lng_lat'] = df['lng'].astype('str') + '_' + df['lat'].astype('str')
del df['guid']
click_df['lng_lat'] = click_df['lng'].astype('str') + '_' + click_df['lat'].astype('str')

# Time-ordered copy of the full frame; used later for the time-gap and embedding features.
sort_df = df.sort_values('ts').reset_index(drop=True)

cate_cols = [
    'deviceid', 'newsid', 'pos', 'app_version', 'device_vendor',
    'netmodel', 'osversion', 'device_version', 'lng', 'lat', 'lng_lat'
]
for f in cate_cols:
    print(f)
    # Label-encode in order of first appearance. Missing values are excluded from the
    # mapping on purpose: `unique()` keeps NaN while `nunique()` does not count it, so
    # zipping the two (as before) silently dropped the last distinct value whenever the
    # column contained NaN. Anything absent from the map -- NaN included -- becomes -1.
    categories = df[f].dropna().unique()
    map_dict = dict(zip(categories, range(len(categories))))
    df[f] = df[f].map(map_dict).fillna(-1).astype('int32')
    click_df[f] = click_df[f].map(map_dict).fillna(-1).astype('int32')
    sort_df[f] = sort_df[f].map(map_dict).fillna(-1).astype('int32')
    # Frequency of each encoded value over train + test.
    df[f + '_count'] = df[f].map(df[f].value_counts())

# Project helper from core.utils; it prints before/after sizes (see the cell output).
df = reduce_mem(df)
click_df = reduce_mem(click_df)
sort_df = reduce_mem(sort_df)
print('runtime:', time.time() - t)

deviceid
newsid
pos
app_version
device_vendor
netmodel
osversion
device_version
lng
lat
lng_lat
2580.12 Mb, 1390.40 Mb (46.11 %)
77.91 Mb, 46.97 Mb (39.71 %)
1204.05 Mb, 673.70 Mb (44.05 %)
runtime: 169.77264666557312


In [7]:
print('============================================= feat engineer =============================================')

print('*************************** history stats ***************************')


def _counts_for_next_day(frame, keys, out_col):
    """Count rows of `frame` per (keys, day) and label each count with day + 1.

    The result has the columns `keys + ['day', out_col]`; merging it on
    `keys + ['day']` attaches to every row the count observed on the previous day.

    Uses `.count()` plus `.rename()` instead of `agg({'new_name': 'count'})`:
    the dict-renaming form raises SpecificationError on pandas >= 1.0.
    """
    counts = (
        frame[keys + ['day', 'id']]
        .groupby(keys + ['day'], as_index=False)['id']
        .count()
        .rename(columns={'id': out_col})
    )
    counts['day'] += 1
    return counts


for f in [
    ['deviceid'],
    ['pos', 'deviceid'],
    # ...
]:
    key_name = '_'.join(f)
    print('------------------ {} ------------------'.format(key_name))
    click_col = key_name + '_prev_day_click_count'
    expo_col = key_name + '_prev_day_count'
    ctr_col = key_name + '_prev_day_ctr'

    # Previous-day click count. Groups without clicks get 0; day 8 is the first day
    # in the data, so it has no history and is set to missing instead of 0.
    df = df.merge(_counts_for_next_day(click_df, f, click_col), on=f + ['day'], how='left')
    df[click_col] = df[click_col].fillna(0)
    df.loc[df['day'] == 8, click_col] = None

    # Previous-day exposure count, with the same conventions.
    df = df.merge(_counts_for_next_day(df, f, expo_col), on=f + ['day'], how='left')
    df[expo_col] = df[expo_col].fillna(0)
    df.loc[df['day'] == 8, expo_col] = None

    # Previous-day click-through rate; the mean exposure count is added to the
    # denominator as a smoothing term.
    df[ctr_col] = df[click_col] / (df[expo_col] + df[expo_col].mean())

    print('runtime:', time.time() - t)
del click_df
df = reduce_mem(df)

*************************** history stats ***************************
------------------ deviceid ------------------
runtime: 182.9914472103119
------------------ pos_deviceid ------------------
runtime: 198.81624746322632
2078.43 Mb, 1562.40 Mb (24.83 %)


In [8]:
print('*************************** exposure_ts_gap ***************************')
# Key combinations for which exposure time gaps are computed.
gap_key_sets = [
    ['deviceid'], ['newsid'], ['lng_lat'],
    ['pos', 'deviceid'], ['pos', 'newsid'], ['pos', 'lng_lat'],
    ['pos', 'deviceid', 'lng_lat'],
    ['netmodel', 'deviceid'],
    ['pos', 'netmodel', 'deviceid'],
    ['netmodel', 'lng_lat'], ['deviceid', 'lng_lat'],
    ['netmodel', 'deviceid', 'lng_lat'], ['pos', 'netmodel', 'lng_lat'],
    ['pos', 'netmodel', 'deviceid', 'lng_lat']
]
for f in gap_key_sets:
    key_name = '_'.join(f)
    print('------------------ {} ------------------'.format(key_name))

    # sort_df is ordered by ts, so shifting inside a group walks through that group's
    # exposures in time order.
    grouped_ts = sort_df[f + ['ts']].groupby(f)['ts']
    current_ts = grouped_ts.shift(0)

    # Time difference between the current exposure and the gap-th previous / next one.
    for gap in [1, 2, 3, 5, 10]:
        prev_col = '{}_prev{}_exposure_ts_gap'.format(key_name, gap)
        next_col = '{}_next{}_exposure_ts_gap'.format(key_name, gap)
        sort_df[prev_col] = current_ts - grouped_ts.shift(gap)
        sort_df[next_col] = grouped_ts.shift(-gap) - current_ts

        # One row per (keys, ts) so the left merge cannot multiply rows of df.
        gap_table = (
            sort_df[f + ['ts', prev_col, next_col]]
            .drop_duplicates(f + ['ts'])
            .reset_index(drop=True)
        )
        df = df.merge(gap_table, on=f + ['ts'], how='left')

        # The gap columns are only scratch space on sort_df.
        for scratch_col in (prev_col, next_col):
            del sort_df[scratch_col]
        del gap_table

    del grouped_ts, current_ts
    df = reduce_mem(df)
    print('runtime:', time.time() - t)
del df['ts']
gc.collect()

*************************** exposure_ts_gap ***************************
------------------ deviceid ------------------
2709.12 Mb, 2135.76 Mb (21.16 %)
runtime: 344.2962772846222
------------------ newsid ------------------
3282.48 Mb, 2709.12 Mb (17.47 %)
runtime: 487.00451612472534
------------------ lng_lat ------------------
3855.84 Mb, 3282.48 Mb (14.87 %)
runtime: 621.0728437900543
------------------ pos_deviceid ------------------
4429.20 Mb, 3855.84 Mb (12.94 %)
runtime: 762.2747118473053
------------------ pos_newsid ------------------
5002.56 Mb, 4429.20 Mb (11.46 %)
runtime: 927.0758574008942
------------------ pos_lng_lat ------------------
5575.92 Mb, 5002.56 Mb (10.28 %)
runtime: 1082.5440983772278
------------------ pos_deviceid_lng_lat ------------------
6149.28 Mb, 5575.92 Mb (9.32 %)
runtime: 1247.3648397922516
------------------ netmodel_deviceid ------------------
6722.64 Mb, 6149.28 Mb (8.53 %)
runtime: 1411.937433719635
------------------ pos_netmodel_deviceid ---

0

In [9]:
print('*************************** cross feat (second order) ***************************')
# Second-order cross features; higher-order crosses could be built the same way.
# NOTE: the whole computation below is commented out, so this cell currently only
# drops the 'id' column and triggers garbage collection.
# cross_cols = ['deviceid', 'newsid', 'pos', 'netmodel', 'lng_lat']
# for f in cross_cols:
#     for col in cross_cols:
#         if col == f:
#             continue
#         print('------------------ {} {} ------------------'.format(f, col))
#         df = df.merge(df[[f, col]].groupby(f, as_index=False)[col].agg({
#             'cross_{}_{}_nunique'.format(f, col): 'nunique',
#             'cross_{}_{}_ent'.format(f, col): lambda x: entropy(x.value_counts() / x.shape[0])  # entropy
#         }), on=f, how='left')
#         if 'cross_{}_{}_count'.format(f, col) not in df.columns.values and 'cross_{}_{}_count'.format(col,
#                                                                                                       f) not in df.columns.values:
#             df = df.merge(df[[f, col, 'id']].groupby([f, col], as_index=False)['id'].agg({
#                 'cross_{}_{}_count'.format(f, col): 'count'  # co-occurrence count
#             }), on=[f, col], how='left')
#         if 'cross_{}_{}_count_ratio'.format(col, f) not in df.columns.values:
#             df['cross_{}_{}_count_ratio'.format(col, f)] = df['cross_{}_{}_count'.format(f, col)] / df[
#                 f + '_count']  # ratio preference
#         if 'cross_{}_{}_count_ratio'.format(f, col) not in df.columns.values:
#             df['cross_{}_{}_count_ratio'.format(f, col)] = df['cross_{}_{}_count'.format(f, col)] / df[
#                 col + '_count']  # ratio preference
#         df['cross_{}_{}_nunique_ratio_{}_count'.format(f, col, f)] = df['cross_{}_{}_nunique'.format(f, col)] / df[
#             f + '_count']
#         print('runtime:', time.time() - t)
#     df = reduce_mem(df)
# 'id' was only needed as the counting column in the feature cells above.
del df['id']
gc.collect()

*************************** cross feat (second order) ***************************


0

In [10]:
print('*************************** embedding ***************************')


# A friend once gave what the author considers a very vivid analogy for embeddings:
# on the dating show "If You Are the One", to understand a female guest you can look at
# which male guests she has shown interest in; and the other way round, to get to know
# a male guest you can look at which female guests he has chosen.


def emb(df, f1, f2, emb_size=8):
    """Build one embedding vector per `f1` value from the `f2` values it co-occurs with.

    For every value of `f1`, the `f2` values of its rows (in the row order of `df`)
    form one "sentence". A Word2Vec model is trained on these sentences, and each
    `f1` value is represented by the mean vector of the in-vocabulary tokens of its
    own sentence (all zeros when none of its tokens made it into the vocabulary).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns `f1` and `f2`.
    f1 : str
        Grouping column; the result has one row per distinct value.
    f2 : str
        Column whose values are used as Word2Vec tokens.
    emb_size : int, default 8
        Dimensionality of the embedding.

    Returns
    -------
    pandas.DataFrame
        Columns `f1` and `'{f1}_{f2}_emb_{i}'` for i in range(emb_size), passed
        through `reduce_mem`.

    Notes
    -----
    Relies on the notebook-level names `t`, `reduce_mem` and `Word2Vec`.
    The seed is fixed, but the captured logs show training with 3 worker threads,
    so results are presumably not bit-for-bit reproducible between runs.
    """
    print('====================================== {} {} ======================================'.format(f1, f2))
    list_col = '{}_{}_list'.format(f1, f2)

    # `.apply(list)` instead of `agg({'name': list})`: the dict-renaming form raises
    # SpecificationError on pandas >= 1.0.
    tmp = df.groupby(f1)[f2].apply(list).reset_index(name=list_col)
    # Word2Vec expects string tokens; pop() also removes the list column from tmp.
    sentences = [[str(x) for x in seq] for seq in tmp.pop(list_col)]

    w2v_kwargs = dict(window=5, min_count=5, sg=0, hs=1, seed=2019)
    try:
        # gensim >= 4 names the dimensionality argument `vector_size`.
        model = Word2Vec(sentences, vector_size=emb_size, **w2v_kwargs)
    except TypeError:
        # gensim 3.x only accepts `size`.
        model = Word2Vec(sentences, size=emb_size, **w2v_kwargs)
    # Word lookups go through `model.wv`: indexing / `in` on the model object itself
    # is deprecated in gensim 3.x and removed in gensim 4.
    vectors = model.wv

    # Rows stay zero for groups without any in-vocabulary token.
    emb_matrix = np.zeros((len(sentences), emb_size), dtype=np.float32)
    for row, seq in enumerate(sentences):
        known = [vectors[w] for w in seq if w in vectors]
        if known:
            emb_matrix[row] = np.mean(known, axis=0)

    for i in range(emb_size):
        tmp['{}_{}_emb_{}'.format(f1, f2, i)] = emb_matrix[:, i]
    # Release the large intermediates before reduce_mem makes its own copies.
    del model, vectors, emb_matrix, sentences
    tmp = reduce_mem(tmp)
    print('runtime:', time.time() - t)
    return tmp


# Column pairs to embed; each pair is embedded in both directions.
emb_cols = [
    ['deviceid', 'newsid'],
    ['deviceid', 'lng_lat'],
    ['newsid', 'lng_lat'],
    # ...
]
for f1, f2 in emb_cols:
    # First f1 described by its f2 sequence, then f2 described by its f1 sequence.
    # Sequences come from sort_df, i.e. they are ordered by exposure time.
    for key_col, token_col in ((f1, f2), (f2, f1)):
        df = df.merge(emb(sort_df, key_col, token_col), on=key_col, how='left')
del sort_df
gc.collect()


*************************** embedding ***************************


INFO:MainThread:gensim.models.word2vec:collecting all words and their counts
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 1903504 words, keeping 448947 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 3645506 words, keeping 652836 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 5610194 words, keeping 821406 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 7484615 words, keeping 947066 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 9175854 words, keeping 1050483 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 11009015 words, keeping 1147302 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 12879752 words, 

INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 47.05% examples, 498804 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 49.82% examples, 499124 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 52.70% examples, 498640 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 55.35% examples, 498547 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 57.89% examples, 497816 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 60.57% examples, 497984 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 63.40% examples, 496944 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 66.60% examples, 496868 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:

INFO:MainThread:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:MainThread:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:MainThread:gensim.models.base_any2vec:EPOCH - 4 : training on 15030273 raw words (13004444 effective words) took 26.4s, 492091 effective words/s
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 3.14% examples, 562652 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 5.91% examples, 552407 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 8.59% examples, 535551 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 11.16% examples, 517791 words/s, in_qsize 6, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 14.11% examples, 513767 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:gensim.models.base_a

8.74 Mb, 3.06 Mb (65.00 %)
runtime: 3753.4011998176575


INFO:MainThread:gensim.models.word2vec:collecting all words and their counts
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 3187644 words, keeping 107895 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 4476881 words, keeping 110527 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 5376595 words, keeping 111711 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 6072945 words, keeping 112426 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 6610584 words, keeping 112810 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 7130372 words, keeping 113121 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 7581639 words, keep

INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #670000, processed 13612133 words, keeping 114552 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #680000, processed 13643937 words, keeping 114555 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #690000, processed 13672737 words, keeping 114555 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #700000, processed 13701995 words, keeping 114557 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #710000, processed 13728227 words, keeping 114557 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #720000, processed 13756050 words, keeping 114558 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #730000, processed 13781796 words, keeping 114559 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #740000, processed 13807174 words, keeping 114559 word types
INFO:MainThread:gensim.m

INFO:MainThread:gensim.models.word2vec:effective_min_count=5 leaves 15005254 word corpus (99% of original 15030273, drops 25019)
INFO:MainThread:gensim.models.word2vec:deleting the raw counts dictionary of 114584 items
INFO:MainThread:gensim.models.word2vec:sample=0.001 downsamples 4 most-common words
INFO:MainThread:gensim.models.word2vec:downsampling leaves estimated 14929795 word corpus (99.5% of prior 15005254)
INFO:MainThread:gensim.models.word2vec:constructing a huffman tree from 107489 words
INFO:MainThread:gensim.models.word2vec:built huffman tree with maximum node depth 22
INFO:MainThread:gensim.models.base_any2vec:estimated required memory for 107489 words and 8 dimensions: 85561244 bytes
INFO:MainThread:gensim.models.word2vec:resetting layer weights
INFO:MainThread:gensim.models.base_any2vec:training model with 3 workers on 107489 vocabulary and 8 features, using sg=0 hs=1 sample=0.001 negative=5 window=5
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 0.06

INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 0.75% examples, 607908 words/s, in_qsize 5, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 1.10% examples, 614054 words/s, in_qsize 6, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 1.49% examples, 613012 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 2.06% examples, 621727 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 2.67% examples, 618329 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 3.43% examples, 615106 words/s, in_qsize 5, out_qsize 2
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 4.30% examples, 614776 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 5.19% examples, 608331 words/s, in_qsize 6, out_qsize 1
INFO:MainThread:gensim.m

INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 9.72% examples, 608998 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 12.03% examples, 610278 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 15.04% examples, 612158 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 18.69% examples, 609366 words/s, in_qsize 3, out_qsize 2
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 23.99% examples, 610610 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 30.39% examples, 607543 words/s, in_qsize 6, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 39.37% examples, 605769 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 52.21% examples, 603546 words/s, in_qsize 4, out_qsize 2
INFO:MainThread:g

100.68 Mb, 35.24 Mb (65.00 %)
runtime: 4179.900171041489


INFO:MainThread:gensim.models.word2vec:collecting all words and their counts
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 1903504 words, keeping 52930 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 3645506 words, keeping 104101 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 5610194 words, keeping 156870 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 7484615 words, keeping 209120 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 9175854 words, keeping 260716 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 11009015 words, keeping 312796 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 12879752 words, kee

INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 6.19% examples, 1100441 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 12.57% examples, 1097761 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 19.11% examples, 1098706 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 25.01% examples, 1092671 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 31.15% examples, 1089919 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 37.73% examples, 1081071 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 44.11% examples, 1075026 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 50.42% examples, 1073861 words/s, in_qsize 6, out_qsize 0
INFO:Main

8.74 Mb, 3.06 Mb (65.00 %)
runtime: 4438.07319021225


INFO:MainThread:gensim.models.word2vec:collecting all words and their counts
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 3010741 words, keeping 72347 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 3563037 words, keeping 74398 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 4080032 words, keeping 76356 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 4585441 words, keeping 78410 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 5111333 words, keeping 80524 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 5650600 words, keeping 82462 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 6228623 words, keeping 84

INFO:MainThread:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:MainThread:gensim.models.base_any2vec:EPOCH - 1 : training on 15030273 raw words (12549494 effective words) took 9.0s, 1389421 effective words/s
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 6.11% examples, 1587627 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 12.59% examples, 1576793 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 18.51% examples, 1564742 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 25.57% examples, 1572858 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 32.36% examples, 1571642 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 53.02% examples, 1532069 words/s, in_qsize 5, out_qsize 0
INFO:M

35.64 Mb, 12.48 Mb (65.00 %)
runtime: 4679.168709039688


INFO:MainThread:gensim.models.word2vec:collecting all words and their counts
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 3187644 words, keeping 321038 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 4476881 words, keeping 365291 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 5376595 words, keeping 387271 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 6072945 words, keeping 401861 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 6610584 words, keeping 410887 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 7130372 words, keeping 418301 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 7581639 words, keep

INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #670000, processed 13612133 words, keeping 463157 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #680000, processed 13643937 words, keeping 463263 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #690000, processed 13672737 words, keeping 463347 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #700000, processed 13701995 words, keeping 463448 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #710000, processed 13728227 words, keeping 463537 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #720000, processed 13756050 words, keeping 463632 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #730000, processed 13781796 words, keeping 463708 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #740000, processed 13807174 words, keeping 463791 word types
INFO:MainThread:gensim.m

INFO:MainThread:gensim.models.word2vec:effective_min_count=5 leaves 14612974 word corpus (97% of original 15030273, drops 417299)
INFO:MainThread:gensim.models.word2vec:deleting the raw counts dictionary of 467201 items
INFO:MainThread:gensim.models.word2vec:sample=0.001 downsamples 3 most-common words
INFO:MainThread:gensim.models.word2vec:downsampling leaves estimated 12442394 word corpus (85.1% of prior 14612974)
INFO:MainThread:gensim.models.word2vec:constructing a huffman tree from 326144 words
INFO:MainThread:gensim.models.word2vec:built huffman tree with maximum node depth 21
INFO:MainThread:gensim.models.base_any2vec:estimated required memory for 326144 words and 8 dimensions: 259610624 bytes
INFO:MainThread:gensim.models.word2vec:resetting layer weights
INFO:MainThread:gensim.models.base_any2vec:training model with 3 workers on 326144 vocabulary and 8 features, using sg=0 hs=1 sample=0.001 negative=5 window=5
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 0.

INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 0.33% examples, 530678 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 0.54% examples, 515441 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 0.79% examples, 506047 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 1.12% examples, 507287 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 1.46% examples, 501047 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 1.94% examples, 500374 words/s, in_qsize 6, out_qsize 2
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 2.55% examples, 502309 words/s, in_qsize 3, out_qsize 2
INFO:MainThread:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 3.23% examples, 499343 words/s, in_qsize 5, out_qsize 1
INFO:MainThread:gensim.m

INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 9.12% examples, 496274 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 11.08% examples, 497180 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 13.39% examples, 494777 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 16.41% examples, 493385 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 20.35% examples, 492481 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 25.82% examples, 492509 words/s, in_qsize 5, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 32.78% examples, 490641 words/s, in_qsize 5, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 42.60% examples, 488621 words/s, in_qsize 5, out_qsize 2
INFO:MainThread:g

100.68 Mb, 35.24 Mb (65.00 %)
runtime: 5161.076705694199


INFO:MainThread:gensim.models.word2vec:collecting all words and their counts
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 3010741 words, keeping 565871 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 3563037 words, keeping 627713 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 4080032 words, keeping 681175 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 4585441 words, keeping 729792 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 5111333 words, keeping 777973 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 5650600 words, keeping 826258 word types
INFO:MainThread:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 6228623 words, keep

INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 23.70% examples, 463999 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 25.98% examples, 462990 words/s, in_qsize 5, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 28.07% examples, 461785 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 30.28% examples, 459458 words/s, in_qsize 4, out_qsize 1
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 32.47% examples, 458331 words/s, in_qsize 6, out_qsize 2
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 34.72% examples, 457366 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 45.32% examples, 456934 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 59.59% examples, 458791 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:

INFO:MainThread:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:MainThread:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:MainThread:gensim.models.base_any2vec:EPOCH - 3 : training on 15030273 raw words (11206052 effective words) took 23.6s, 475131 effective words/s
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 1.93% examples, 466359 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 3.86% examples, 452857 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 5.96% examples, 450815 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 8.12% examples, 448328 words/s, in_qsize 6, out_qsize 0
INFO:MainThread:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 10.18% examples, 448699 words/s, in_qsize 5, out_qsize 0
INFO:MainThread:gensim.models.base_an

35.64 Mb, 12.48 Mb (65.00 %)
runtime: 5547.510270833969


0

In [11]:
# Checkpoint: write the feature frame `df` to a pickle file under path_pickle
# before it is split into train/test parts and deleted.
df.to_pickle(path_pickle + "81_all_final_df.pickle")


In [12]:
print('======================================== prepare train & valid  =============================================')
# The first `train_num` rows of `df` are the training part, the rest is test.
train_df = df[:train_num].reset_index(drop=True)
test_df = df[train_num:].reset_index(drop=True)
del df
gc.collect()

# Hold-out split by day: rows with day < 10 are used for fitting, rows with
# day == 10 for validation. Because the index was just reset, index labels
# equal row positions, so the same lists work for .iloc and for `labels`.
# NOTE(review): assumes `labels` supports indexing with a list of integer
# positions (e.g. a NumPy array) -- confirm where `labels` is built.
train_idx = train_df.index[train_df['day'] < 10].tolist()
val_idx = train_df.index[train_df['day'] == 10].tolist()

train_x = train_df.iloc[train_idx].reset_index(drop=True)
val_x = train_df.iloc[val_idx].reset_index(drop=True)
train_y = labels[train_idx]
val_y = labels[val_idx]

# 'day' was only needed to build the split; remove it from every frame in place.
del train_x['day']
del val_x['day']
del train_df['day']
del test_df['day']
gc.collect()
print('runtime:', time.time() - t)
print('========================================================================================================')


runtime: 7192.464190006256


In [None]:
# train_df.to_pickle(path_pickle + "81_all_final_train.pickle")
# test_df.to_pickle(path_pickle + "81_all_final_test.pickle")

In [13]:
print('=============================================== training validate ===============================================')
# Collects one feature-importance vector per fitted model.
fea_imp_list = []

# Validation run: n_estimators is only an upper bound here, because fit() is
# asked to stop early once the validation AUC has not improved for 200 rounds.
clf = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.01,
    num_leaves=255,
    colsample_bytree=0.8,
    subsample=0.9,
    metric=None,
    random_state=2019
)

print('************** training **************')
clf.fit(
    X=train_x,
    y=train_y,
    eval_set=[(val_x, val_y)],
    eval_metric='auc',
    early_stopping_rounds=200,
    categorical_feature=cate_cols,
    verbose=50
)
print('runtime:', time.time() - t)

print('************** validate predict **************')
# Keep the best validation score and iteration count, plus the validation
# scores taken from column 1 of predict_proba, for use in later cells.
best_auc = clf.best_score_['valid_0']['auc']
best_rounds = clf.best_iteration_
val_pred = clf.predict_proba(val_x)[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

************** training **************


New categorical_feature is ['app_version', 'device_vendor', 'device_version', 'deviceid', 'lat', 'lng', 'lng_lat', 'netmodel', 'newsid', 'osversion', 'pos']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[50]	valid_0's auc: 0.966704
[100]	valid_0's auc: 0.968927
[150]	valid_0's auc: 0.970494
[200]	valid_0's auc: 0.971524
[250]	valid_0's auc: 0.972321
[300]	valid_0's auc: 0.973264
[350]	valid_0's auc: 0.973946
[400]	valid_0's auc: 0.974556
[450]	valid_0's auc: 0.975058
[500]	valid_0's auc: 0.975489
[550]	valid_0's auc: 0.975821
[600]	valid_0's auc: 0.976138
[650]	valid_0's auc: 0.976369
[700]	valid_0's auc: 0.976521
[750]	valid_0's auc: 0.976658
[800]	valid_0's auc: 0.976784
[850]	valid_0's auc: 0.976876
[900]	valid_0's auc: 0.976963
[950]	valid_0's auc: 0.977034
[1000]	valid_0's auc: 0.977091
[1050]	valid_0's auc: 0.977133
[1100]	valid_0's auc: 0.977172
[1150]	valid_0's auc: 0.977205
[1200]	valid_0's auc: 0.977248
[1250]	valid_0's auc: 0.977277
[1300]	valid_0's auc: 0.977305
[1350]	valid_0's auc: 0.977331
[1400]	valid_0's auc: 0.977353
[1450]	valid_0's auc: 0.977369
[1500]	valid_0's auc: 0.977387
[1550]	valid_0's auc: 0.9774

In [1]:
print('=============================================== training predict ===============================================')
# Final model: same hyper-parameters as the validation run, but the number of
# trees is fixed to `best_rounds` and no early stopping is requested.
clf = LGBMClassifier(
    n_estimators=best_rounds,
    learning_rate=0.01,
    num_leaves=255,
    colsample_bytree=0.8,
    subsample=0.9,
    random_state=2019
)

print('************** training **************')
# The eval_set is the training data itself, so the metric logged every 50
# rounds is a training metric, not a hold-out score.
clf.fit(
    X=train_df,
    y=labels,
    eval_set=[(train_df, labels)],
    categorical_feature=cate_cols,
    verbose=50
)
print('runtime:', time.time() - t)



NameError: name 'LGBMClassifier' is not defined

In [None]:
print('************** test predict **************')
# Load the submission template and attach column 1 of predict_proba for the
# test rows as the 'target' column.
sub = pd.read_csv(path_data + 'sample.csv')
sub = sub.assign(target=clf.predict_proba(test_df)[:, 1])
# tmp_df = clf.predict_proba(test_df)[:, 1]
# Record this model's importances next to the ones already collected.
fea_imp_list += [clf.feature_importances_]
print('runtime:', time.time() - t)


In [None]:
print('=============================================== feat importances ===============================================')
# Feature importances are worth a careful look.
# Average the importance vectors collected in fea_imp_list element-wise and
# pair each mean with the corresponding train_df column name.
fea_imp_dict = {
    col: score
    for col, score in zip(train_df.columns.values, np.mean(fea_imp_list, axis=0))
}
# Order from most to least important before printing.
fea_imp_item = list(fea_imp_dict.items())
fea_imp_item.sort(key=lambda pair: pair[1], reverse=True)
for f, imp in fea_imp_item:
    print('{} = {}'.format(f, imp))

In [None]:
print('=============================================== threshold search ===============================================')
# F1 is sensitive to the decision threshold, so run a simple grid search over
# candidate thresholds t0, t0 + v, ..., t0 + 200 * v and keep the best one.
t0 = 0.05
v = 0.002
best_t = t0
best_f1 = 0

# Work on an array view of the validation scores. `val_pred` itself is never
# overwritten, so this cell can be re-run and still sees the original scores
# instead of labels binarised by a previous run.
val_prob = np.asarray(val_pred)

for step in range(201):
    curr_t = t0 + step * v
    # Vectorised thresholding: same 0/1 labels as a per-element comparison,
    # without a Python-level loop over every validation row at each step.
    y = (val_prob >= curr_t).astype(int)
    curr_f1 = f1_score(val_y, y)
    if curr_f1 > best_f1:
        best_t = curr_t
        best_f1 = curr_f1
        print('step: {}   best threshold: {}   best f1: {}'.format(step, best_t, best_f1))
print('search finish.')

# Hard labels at the selected threshold, kept under their own name.
val_pred_label = (val_prob >= best_t).astype(int)
print('\nbest auc:', best_auc)
print('best f1:', f1_score(val_y, val_pred_label))
print('validate mean:', np.mean(val_pred_label))
print('runtime:', time.time() - t)

In [None]:
print('=============================================== sub save ===============================================')
# First file: the raw scores. The file name embeds the validation AUC, the
# best F1 and the mean of the 'target' column at the time of writing.
sub.to_csv(
    'sub_prob_{}_{}_{}.csv'.format(best_auc, best_f1, sub['target'].mean()),
    index=False
)
# Second file: scores binarised at the selected threshold (1 when >= best_t).
sub['target'] = (sub['target'] >= best_t).astype('int64')
sub.to_csv(
    'sub_{}_{}_{}.csv'.format(best_auc, best_f1, sub['target'].mean()),
    index=False
)
print('runtime:', time.time() - t)
print('finish.')
print('========================================================================================================')
