In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score
from scipy.stats import entropy
from gensim.models import Word2Vec
import time
import gc
import os

import tqdm                                                                                                   
import concurrent.futures
import multiprocessing

# Show every column when a DataFrame is rendered.
pd.options.display.max_columns = None

# Number of CPUs reported by the OS (printed for reference only).
num_processes = multiprocessing.cpu_count()
print("total cpu count", num_processes)

# NOTE(review): pandas is already imported above; numexpr may read this variable
# at import time, so confirm that setting it here still takes effect.
os.environ.update(NUMEXPR_MAX_THREADS='8')

from core.utils import timeit, reduce_mem

total cpu count 8


In [29]:
# Root directory of the project data.
# NOTE(review): this is an absolute, machine-specific path — adjust before running elsewhere.
path = "/media/ryan/F/deep-learning-data/turing/vedio-predict/"

# One sub-directory per artefact type, all directly below the root.
(path_sub, path_npy, path_data, path_model,
 path_result, path_pickle, path_profile) = (
    path + folder
    for folder in ('sub/', 'npy/', 'raw/', 'model/', 'result/', 'pickle/', 'profile/')
)

# When True the *_small pickles are loaded instead of the full ones.
debug_small = False

# Pick the pickle pair that matches the debug flag; the sample submission is the
# same file in both modes.
train_file, test_file = (
    ('train_small.pickle', 'test_small.pickle') if debug_small
    else ('train.pickle', 'test.pickle')
)
train_df = pd.read_pickle(path_pickle + train_file)
test_df = pd.read_pickle(path_pickle + test_file)
sub = pd.read_csv(path_data + 'sample.csv')

# The app / user tables are currently not loaded:
# app = pd.read_pickle(path_pickle + 'app.pickle')    # or 'app_small.pickle' in debug mode
# user = pd.read_pickle(path_pickle + 'user.pickle')  # or 'user_small.pickle' in debug mode


In [3]:
# Down-sample both frames to the devices whose id ends with the character '1'.
# NOTE(review): the ids look like hex digests, so this is presumably a ~1/16 sample — confirm.
# .copy() makes the result an independent frame: later cells add columns
# (date/day/hour/minute) to it, which on a boolean-mask slice triggers
# SettingWithCopyWarning.
train_df = train_df[train_df.deviceid.str[-1] == '1'].copy()
test_df = test_df[test_df.deviceid.str[-1] == '1'].copy()

In [5]:
# Display the (filtered) test frame; pandas elides the middle rows in the rendered output.
test_df

Unnamed: 0,id,deviceid,newsid,guid,pos,app_version,device_vendor,netmodel,osversion,lng,lat,device_version,ts
101,test_102,fc2537a764aeebad1d9738bd835830c1,1026782592186002955,7a7e251a3a8a3e51f304558189d920f8,3,2.1.5,OPPO,o,8.1.0,1.060673e+02,2.796026e+01,PBCM10,1573450717181
102,test_103,fc2537a764aeebad1d9738bd835830c1,1114886426938376594,7a7e251a3a8a3e51f304558189d920f8,4,2.1.5,OPPO,o,8.1.0,4.940656e-324,4.940656e-324,PBCM10,1573445772769
103,test_104,fc2537a764aeebad1d9738bd835830c1,1121986237192882413,7a7e251a3a8a3e51f304558189d920f8,2,2.1.5,OPPO,o,8.1.0,1.060673e+02,2.796026e+01,PBCM10,1573450525650
104,test_105,fc2537a764aeebad1d9738bd835830c1,1192685813446768337,7a7e251a3a8a3e51f304558189d920f8,4,2.1.5,OPPO,o,8.1.0,1.060673e+02,2.796026e+01,PBCM10,1573449290652
105,test_106,fc2537a764aeebad1d9738bd835830c1,1394102932340986804,7a7e251a3a8a3e51f304558189d920f8,1,2.1.5,OPPO,o,8.1.0,1.060673e+02,2.796026e+01,PBCM10,1573449626574
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653372,test_3653373,6676636e0343e2a2b02b9aec8be3ae31,3200871213877970645,7f02e3b78cfc49257dc5f326619a7086,1,2.1.5,HONOR,w,9,1.172218e+02,3.070950e+01,JSN-AL00,1573439588837
3653382,test_3653383,f0948752a64440a9cf8a11e330656f81,1497816818221073735,1f5918b9c85e3bf02008720c565b277d,1,2.0.8,vivo,o,8.1.0,4.940656e-324,4.940656e-324,V1813BT,1573480884142
3653383,test_3653384,f0948752a64440a9cf8a11e330656f81,3860129264619378056,1f5918b9c85e3bf02008720c565b277d,2,2.0.8,vivo,o,8.1.0,4.940656e-324,4.940656e-324,V1813BT,1573480717292
3653384,test_3653385,f0948752a64440a9cf8a11e330656f81,4052357272046746699,1f5918b9c85e3bf02008720c565b277d,0,2.0.8,vivo,w,8.1.0,4.940656e-324,4.940656e-324,V1813BT,1573481111925


In [6]:
# Keep only the submission rows whose id is present in the (sampled) test frame.
# .copy() detaches the result from the original frame, because a later cell
# assigns sub['target'] and would otherwise write into a slice.
sub = sub[sub.id.isin(test_df.id)].copy()
sub

Unnamed: 0,id,target
101,test_102,1
102,test_103,0
103,test_104,1
104,test_105,0
105,test_106,1
...,...,...
3653372,test_3653373,0
3653382,test_3653383,0
3653383,test_3653384,1
3653384,test_3653385,0


In [30]:
print('=============================================== read train ===============================================')
# Notebook-wide stopwatch: the 'runtime:' prints in later cells are measured from here.
t = time.time()
# train_df = pd.read_csv('dataset/train.csv')

# Convert the millisecond exposure timestamp `ts` to a datetime with second precision.
# NOTE(review): time.localtime() uses the kernel's local time zone, so day/hour
# depend on the machine this runs on — confirm the intended zone.
train_df['date'] = pd.to_datetime(
    train_df['ts'].map(lambda ms: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ms / 1000)))
)
train_df['day'] = train_df['date'].dt.day

# Original author's notes (translated):
# - In the training set only 11 rows have day=7, against 3,674,871 rows with day=8
#   (the note also mentions days 9 and 10, at roughly 400k — unverified).
# - day=7 is a vanishingly small share and is treated as an anomaly. Is removing it
#   reasonable? How would this behave online, and why not simply delete the rows?
# - Why is only `day` rewritten here instead of fixing ts / timestamp directly?
train_df.loc[train_df['day'].eq(7), 'day'] = 8
train_df['hour'] = train_df['date'].dt.hour
train_df['minute'] = train_df['date'].dt.minute

# Row count and labels are kept aside: the frame is later concatenated with the
# test rows and its target column is deleted.
train_num = len(train_df)
labels = train_df['target'].values
print('runtime:', time.time() - t)

runtime: 22.714237213134766


In [8]:
print('=============================================== click data ===============================================')
# Clicked exposures only, ordered by click time.
click_df = train_df.loc[train_df['target'].eq(1)].sort_values('timestamp').reset_index(drop=True)

# Drop rows whose click time precedes the exposure time.
click_df['exposure_click_gap'] = click_df['timestamp'].sub(click_df['ts'])
click_df = click_df.loc[click_df['exposure_click_gap'].ge(0)].reset_index(drop=True)

# Here the day is derived from the click time (`timestamp`), not from the exposure time.
click_df['date'] = pd.to_datetime(
    click_df['timestamp'].map(lambda ms: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ms / 1000)))
)
click_df['day'] = click_df['date'].dt.day
# Same day==7 -> 8 adjustment as for the training frame.
click_df.loc[click_df['day'].eq(7), 'day'] = 8

# The label and click time must not remain in the feature frame.
train_df.drop(columns=['target', 'timestamp'], inplace=True)

# Original author's question (translated): why are these columns removed from click_df?
# The later cells shown here only group click_df by category columns + 'day' and count 'id'.
click_df.drop(
    columns=['date', 'exposure_click_gap', 'timestamp', 'ts', 'target', 'hour', 'minute'],
    inplace=True,
)
print('runtime:', time.time() - t)

runtime: 7.2453343868255615


In [9]:
print('=============================================== read test ===============================================')
# Same timestamp handling as for the training frame (kernel-local time zone).
test_df['date'] = pd.to_datetime(
    test_df['ts'].map(lambda ms: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ms / 1000)))
)
test_df['day'] = test_df['date'].dt.day

# Original author's note (translated): in the test set only 32 rows have day=10
# versus 3,653,560 with day=11 (about 1 in 100,000); treated as an anomaly and
# folded into day 11.
test_df.loc[test_df['day'].eq(10), 'day'] = 11
test_df['hour'] = test_df['date'].dt.hour
test_df['minute'] = test_df['date'].dt.minute

# Stack train on top of test; the original row labels are kept (no re-indexing).
df = pd.concat((train_df, test_df))
df.drop(columns=['date'], inplace=True)
del train_df, test_df
gc.collect()
print('runtime:', time.time() - t)

runtime: 11.870017051696777


In [10]:
print('============================================= category encoding =============================================')
# Combined location key built from the string forms of longitude and latitude.
df['lng_lat'] = df['lng'].astype('str') + '_' + df['lat'].astype('str')
del df['guid']
click_df['lng_lat'] = click_df['lng'].astype('str') + '_' + click_df['lat'].astype('str')

# Time-ordered copy of the full frame, used later for the exposure-gap features.
sort_df = df.sort_values('ts').reset_index(drop=True)
cate_cols = [
    'deviceid', 'newsid', 'pos', 'app_version', 'device_vendor',
    'netmodel', 'osversion', 'device_version', 'lng', 'lat', 'lng_lat'
]
for f in cate_cols:
    print(f)
    # Integer code per distinct non-null value, in order of first appearance.
    # The codes are built from the non-null uniques only: unique() includes NaN
    # while nunique() does not, so zipping the two drops the last real category
    # whenever the column contains NaN. Missing/unseen values become -1 below.
    categories = df[f].dropna().unique()
    map_dict = dict(zip(categories, range(len(categories))))
    df[f] = df[f].map(map_dict).fillna(-1).astype('int32')
    click_df[f] = click_df[f].map(map_dict).fillna(-1).astype('int32')
    sort_df[f] = sort_df[f].map(map_dict).fillna(-1).astype('int32')
    # Frequency of each category over the combined train + test frame.
    df[f + '_count'] = df[f].map(df[f].value_counts())

# reduce_mem comes from core.utils; per the printed output it reports the memory
# footprint before/after.
df = reduce_mem(df)
click_df = reduce_mem(click_df)
sort_df = reduce_mem(sort_df)
print('runtime:', time.time() - t)


deviceid
newsid
pos
app_version
device_vendor
netmodel
osversion
device_version
lng
lat
lng_lat
166.76 Mb, 81.53 Mb (51.11 %)
4.87 Mb, 2.29 Mb (52.94 %)
77.82 Mb, 35.20 Mb (54.76 %)
runtime: 23.73936653137207


In [11]:
print('============================================= feat engineer =============================================')

print('*************************** history stats ***************************')
for f in [
    ['deviceid'],
    ['pos', 'deviceid'],
    # ...
]:
    prefix = '_'.join(f)
    click_col = prefix + '_prev_day_click_count'
    expo_col = prefix + '_prev_day_count'
    ctr_col = prefix + '_prev_day_ctr'
    print('------------------ {} ------------------'.format(prefix))

    # Clicks per group on the previous day: count per (group, day), then shift the
    # day forward by one so it joins onto the following day's rows.
    # count().reset_index(name=...) replaces the dict-renaming agg({'name': 'count'}),
    # which pandas >= 1.0 rejects with SpecificationError.
    tmp = (
        click_df[f + ['day', 'id']]
        .groupby(f + ['day'])['id']
        .count()
        .reset_index(name=click_col)
    )
    tmp['day'] += 1
    df = df.merge(tmp, on=f + ['day'], how='left')
    df[click_col] = df[click_col].fillna(0)
    # Day 8 is the first day, so it has no previous day: mark as missing, not 0.
    df.loc[df['day'] == 8, click_col] = None

    # Exposures per group on the previous day, built the same way from the full frame.
    tmp = (
        df[f + ['day', 'id']]
        .groupby(f + ['day'])['id']
        .count()
        .reset_index(name=expo_col)
    )
    tmp['day'] += 1
    df = df.merge(tmp, on=f + ['day'], how='left')
    df[expo_col] = df[expo_col].fillna(0)
    df.loc[df['day'] == 8, expo_col] = None

    # Previous-day click-through rate; the denominator is padded with the mean
    # exposure count.
    df[ctr_col] = df[click_col] / (df[expo_col] + df[expo_col].mean())

    del tmp
    print('runtime:', time.time() - t)
del click_df
df = reduce_mem(df)

*************************** history stats ***************************
------------------ deviceid ------------------
runtime: 24.4334557056427
------------------ pos_deviceid ------------------
runtime: 25.21677041053772
126.00 Mb, 92.64 Mb (26.47 %)


In [12]:
print('*************************** exposure_ts_gap ***************************')
for f in [
    ['deviceid'], ['newsid'], ['lng_lat'],
    ['pos', 'deviceid'], ['pos', 'newsid'], ['pos', 'lng_lat'],
    ['pos', 'deviceid', 'lng_lat'],
    ['netmodel', 'deviceid'],
    ['pos', 'netmodel', 'deviceid'],
    ['netmodel', 'lng_lat'], ['deviceid', 'lng_lat'],
    ['netmodel', 'deviceid', 'lng_lat'], ['pos', 'netmodel', 'lng_lat'],
    ['pos', 'netmodel', 'deviceid', 'lng_lat']
]:
    group_name = '_'.join(f)
    print('------------------ {} ------------------'.format(group_name))

    # sort_df is ordered by ts, so shifting inside a group walks that group's
    # exposures in time order.
    tmp = sort_df[f + ['ts']].groupby(f)
    ts_now = tmp['ts'].shift(0)
    # Time difference between the current exposure and the gap-th previous / next
    # exposure of the same group.
    for gap in [1, 2, 3, 5, 10]:
        prev_col = '{}_prev{}_exposure_ts_gap'.format(group_name, gap)
        next_col = '{}_next{}_exposure_ts_gap'.format(group_name, gap)
        sort_df[prev_col] = ts_now - tmp['ts'].shift(gap)
        sort_df[next_col] = tmp['ts'].shift(-gap) - ts_now
        # One row per (group, ts) so the left merge below cannot multiply rows in df.
        tmp2 = (
            sort_df[f + ['ts', prev_col, next_col]]
            .drop_duplicates(f + ['ts'])
            .reset_index(drop=True)
        )
        df = df.merge(tmp2, on=f + ['ts'], how='left')
        # The helper columns are only needed for this merge.
        sort_df.drop(columns=[prev_col, next_col], inplace=True)
        del tmp2

    del tmp, ts_now
    df = reduce_mem(df)
    print('runtime:', time.time() - t)
del df['ts']
gc.collect()

*************************** exposure_ts_gap ***************************
------------------ deviceid ------------------
166.76 Mb, 129.70 Mb (22.22 %)
runtime: 34.02491521835327
------------------ newsid ------------------
203.82 Mb, 166.76 Mb (18.18 %)
runtime: 39.63182067871094
------------------ lng_lat ------------------
240.87 Mb, 203.82 Mb (15.38 %)
runtime: 44.98241329193115
------------------ pos_deviceid ------------------
277.93 Mb, 240.87 Mb (13.33 %)
runtime: 50.920658349990845
------------------ pos_newsid ------------------
314.99 Mb, 277.93 Mb (11.76 %)
runtime: 57.91858744621277
------------------ pos_lng_lat ------------------
352.05 Mb, 314.99 Mb (10.53 %)
runtime: 64.49775505065918
------------------ pos_deviceid_lng_lat ------------------
389.10 Mb, 352.05 Mb (9.52 %)
runtime: 71.71031022071838
------------------ netmodel_deviceid ------------------
426.16 Mb, 389.10 Mb (8.70 %)
runtime: 78.98814296722412
------------------ pos_netmodel_deviceid ------------------
46

0

In [None]:
# print('*************************** cross feat (second order) ***************************')
# # 二阶交叉特征，可以继续做更高阶的交叉。
# def build_cross_feat(df, f, col):
#     print('------------------ {} {} ------------------'.format(f, col))
#     df = df.merge(df[[f, col]].groupby(f, as_index=False)[col].agg({
#         'cross_{}_{}_nunique'.format(f, col): 'nunique',
#         'cross_{}_{}_ent'.format(f, col): lambda x: entropy(x.value_counts() / x.shape[0])  # 熵
#     }), on=f, how='left')
#     if 'cross_{}_{}_count'.format(f, col) not in df.columns.values and 'cross_{}_{}_count'.format(col,
#                                                                                                   f) not in df.columns.values:
#         df = df.merge(df[[f, col, 'id']].groupby([f, col], as_index=False)['id'].agg({
#             'cross_{}_{}_count'.format(f, col): 'count'  # 共现次数
#         }), on=[f, col], how='left')
#     if 'cross_{}_{}_count_ratio'.format(col, f) not in df.columns.values:
#         df['cross_{}_{}_count_ratio'.format(col, f)] = df['cross_{}_{}_count'.format(f, col)] / df[
#             f + '_count']  # 比例偏好
#     if 'cross_{}_{}_count_ratio'.format(f, col) not in df.columns.values:
#         df['cross_{}_{}_count_ratio'.format(f, col)] = df['cross_{}_{}_count'.format(f, col)] / df[
#             col + '_count']  # 比例偏好
#     df['cross_{}_{}_nunique_ratio_{}_count'.format(f, col, f)] = df['cross_{}_{}_nunique'.format(f, col)] / df[
#         f + '_count']
#     print('runtime:', time.time() - t)
#     df = reduce_mem(df)
#     return df
        
# cross_cols = ['deviceid', 'newsid', 'pos', 'netmodel', 'lng_lat']
# f_col_tuple_list = []
# for f in cross_cols:
#     for col in cross_cols:
#         if col == f:
#             continue
#         f_col_tuple_list.append((f, col))
        
# print(f_col_tuple_list)
# # with concurrent.futures.ProcessPoolExecutor(num_processes) as pool:
# #     df = list(tqdm.tqdm(pool.map(build_cross_feat, cross_cols, chunksize=10, total=df.shape[0])))
# for tuple_o in tqdm.tqdm(f_col_tuple_list):
#     print(tuple_o)
#     df = build_cross_feat(df, tuple_o[0], tuple_o[1])

# del df['id']
# gc.collect()

In [13]:
# List the feature columns built so far.
df.columns.values

array(['id', 'deviceid', 'newsid', 'pos', 'app_version', 'device_vendor',
       'netmodel', 'osversion', 'lng', 'lat', 'device_version', 'day',
       'hour', 'minute', 'lng_lat', 'deviceid_count', 'newsid_count',
       'pos_count', 'app_version_count', 'device_vendor_count',
       'netmodel_count', 'osversion_count', 'device_version_count',
       'lng_count', 'lat_count', 'lng_lat_count',
       'deviceid_prev_day_click_count', 'deviceid_prev_day_count',
       'deviceid_prev_day_ctr', 'pos_deviceid_prev_day_click_count',
       'pos_deviceid_prev_day_count', 'pos_deviceid_prev_day_ctr',
       'deviceid_prev1_exposure_ts_gap', 'deviceid_next1_exposure_ts_gap',
       'deviceid_prev2_exposure_ts_gap', 'deviceid_next2_exposure_ts_gap',
       'deviceid_prev3_exposure_ts_gap', 'deviceid_next3_exposure_ts_gap',
       'deviceid_prev5_exposure_ts_gap', 'deviceid_next5_exposure_ts_gap',
       'deviceid_prev10_exposure_ts_gap',
       'deviceid_next10_exposure_ts_gap', 'newsid_prev1_ex

In [14]:
print('*************************** cross feat (second order) ***************************')
# Second-order cross features; higher-order crosses could be built the same way.
cross_cols = ['deviceid', 'newsid', 'pos', 'netmodel', 'lng_lat']
for f in cross_cols:
    for col in cross_cols:
        if col == f:
            continue
        print('------------------ {} {} ------------------'.format(f, col))
        nunique_col = 'cross_{}_{}_nunique'.format(f, col)
        ent_col = 'cross_{}_{}_ent'.format(f, col)
        count_col = 'cross_{}_{}_count'.format(f, col)
        count_col_rev = 'cross_{}_{}_count'.format(col, f)
        ratio_col = 'cross_{}_{}_count_ratio'.format(f, col)
        ratio_col_rev = 'cross_{}_{}_count_ratio'.format(col, f)

        # Per value of f: number of distinct `col` values and the entropy of their
        # distribution. The guard keeps a re-run from merging the columns twice
        # (which would create *_x / *_y duplicates).
        # A list of (name, func) pairs replaces the dict-renaming agg({...}),
        # which pandas >= 1.0 rejects with SpecificationError.
        if nunique_col not in df.columns.values:
            cross_stats = df[[f, col]].groupby(f)[col].agg([
                (nunique_col, 'nunique'),
                (ent_col, lambda x: entropy(x.value_counts() / x.shape[0])),  # entropy
            ]).reset_index()
            df = df.merge(cross_stats, on=f, how='left')
            del cross_stats
        # Co-occurrence count of the (f, col) pair; skipped when the same pair was
        # already counted under the reversed name.
        if count_col not in df.columns.values and count_col_rev not in df.columns.values:
            pair_count = (
                df[[f, col, 'id']]
                .groupby([f, col])['id']
                .count()
                .reset_index(name=count_col)
            )
            df = df.merge(pair_count, on=[f, col], how='left')
            del pair_count
        # Pair count relative to the frequency of f, then relative to the frequency of col.
        if ratio_col_rev not in df.columns.values:
            df[ratio_col_rev] = df[count_col] / df[f + '_count']
        if ratio_col not in df.columns.values:
            df[ratio_col] = df[count_col] / df[col + '_count']
        df['cross_{}_{}_nunique_ratio_{}_count'.format(f, col, f)] = df[nunique_col] / df[f + '_count']
        print('runtime:', time.time() - t)
    df = reduce_mem(df)
del df['id']
gc.collect()

*************************** cross feat (second order) ***************************
------------------ deviceid newsid ------------------
runtime: 140.0743055343628
------------------ deviceid pos ------------------
runtime: 147.7860164642334
------------------ deviceid netmodel ------------------
runtime: 156.31755542755127
------------------ deviceid lng_lat ------------------
runtime: 165.10156059265137
781.91 Mb, 645.73 Mb (17.42 %)
------------------ newsid deviceid ------------------
runtime: 454.40870666503906
------------------ newsid pos ------------------
runtime: 742.6436910629272
------------------ newsid netmodel ------------------
runtime: 1017.4372234344482
------------------ newsid lng_lat ------------------
runtime: 1303.300873041153
801.37 Mb, 682.78 Mb (14.80 %)
------------------ pos deviceid ------------------
runtime: 1308.1814727783203
------------------ pos newsid ------------------
runtime: 1310.4886124134064
------------------ pos netmodel ------------------
run

0

In [15]:
# Persist the engineered feature frame so the training cells can restart from here.
df.to_pickle(path_pickle + "df_sample01_081_cross.pickle")
print("success build df_sample01_081_cross.pickle")

# sort_df is not used by any later cell.
del sort_df
# Display the number of training rows (needed below to split df back into train/test).
train_num

success build df_sample01_081_cross.pickle


734116

In [16]:
# Reload the feature frame written by the previous cell.
df =pd.read_pickle(path_pickle + "df_sample01_081_cross.pickle")


In [17]:
# Re-declared so the training cells can run after loading df from the pickle.
# NOTE(review): train_num and labels are NOT re-created here; they must still be
# in the kernel from the 'read train' cell.
# train_num =11376681
cate_cols = [
    'deviceid', 'newsid', 'pos', 'app_version', 'device_vendor',
    'netmodel', 'osversion', 'device_version', 'lng', 'lat', 'lng_lat'
]

In [18]:
print('======================================== prepare train & valid  =============================================')
# df was built as train rows followed by test rows, so the first train_num rows are the training set.
train_df = df.iloc[:train_num].reset_index(drop=True)
test_df = df.iloc[train_num:].reset_index(drop=True)
del df
gc.collect()

# Time-based hold-out: days before 10 are used for fitting, day 10 for validation.
train_idx = train_df.index[(train_df['day'] < 10).values].tolist()
val_idx = train_df.index[(train_df['day'] == 10).values].tolist()

train_x, train_y = train_df.iloc[train_idx].reset_index(drop=True), labels[train_idx]
val_x, val_y = train_df.iloc[val_idx].reset_index(drop=True), labels[val_idx]

# 'day' was only needed for the split; it is not a model feature.
for frame in (train_x, val_x, train_df, test_df):
    frame.drop(columns=['day'], inplace=True)
del frame
gc.collect()
print('runtime:', time.time() - t)
print('========================================================================================================')



runtime: 1481.3427867889404


In [19]:
print('=============================================== training validate ===============================================')
# Collects one feature-importance vector per fitted model.
fea_imp_list = []

lgb_params = dict(
    n_jobs=7,
    learning_rate=0.01,
    n_estimators=5000,
    num_leaves=255,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=2019,
    metric=None,
)
clf = LGBMClassifier(**lgb_params)

print('************** training **************')
# Early stopping on validation AUC; progress is logged every 50 rounds.
# NOTE(review): early_stopping_rounds / verbose as fit() arguments are tied to the
# LightGBM version this notebook was run with — confirm before upgrading.
fit_kwargs = dict(
    eval_set=[(val_x, val_y)],
    eval_metric='auc',
    categorical_feature=cate_cols,
    early_stopping_rounds=200,
    verbose=50,
)
clf.fit(train_x, train_y, **fit_kwargs)
print('runtime:', time.time() - t)

************** training **************


New categorical_feature is ['app_version', 'device_vendor', 'device_version', 'deviceid', 'lat', 'lng', 'lng_lat', 'netmodel', 'newsid', 'osversion', 'pos']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[50]	valid_0's auc: 0.966111
[100]	valid_0's auc: 0.967396
[150]	valid_0's auc: 0.968283
[200]	valid_0's auc: 0.968857
[250]	valid_0's auc: 0.969554
[300]	valid_0's auc: 0.969995
[350]	valid_0's auc: 0.970615
[400]	valid_0's auc: 0.971105
[450]	valid_0's auc: 0.971563
[500]	valid_0's auc: 0.97186
[550]	valid_0's auc: 0.972066
[600]	valid_0's auc: 0.97225
[650]	valid_0's auc: 0.972352
[700]	valid_0's auc: 0.972417
[750]	valid_0's auc: 0.972431
[800]	valid_0's auc: 0.972444
[850]	valid_0's auc: 0.972444
[900]	valid_0's auc: 0.972478
[950]	valid_0's auc: 0.972455
[1000]	valid_0's auc: 0.972458
[1050]	valid_0's auc: 0.972457
Early stopping, best iteration is:
[896]	valid_0's auc: 0.972479
runtime: 4915.117074012756


In [20]:
gc.collect()

print('************** validate predict **************')
# Best round and its validation AUC, as recorded by early stopping.
best_rounds, best_auc = clf.best_iteration_, clf.best_score_['valid_0']['auc']
# Predicted probability of the positive class for every validation row.
val_pred = clf.predict_proba(val_x)[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

************** validate predict **************
runtime: 4984.874499082565


In [None]:


print('=============================================== training predict ===============================================')
# Same hyper-parameters as the validation model, but with the tree count fixed
# to the best round found by early stopping.
final_params = dict(
    learning_rate=0.01,
    n_estimators=best_rounds,
    num_leaves=255,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=2019,
)
clf = LGBMClassifier(**final_params)

In [None]:
print('************** training using all the data **************')
# Refit on every training row; the eval set is the training data itself, so the
# logged metric is a training metric only.
full_fit_kwargs = dict(
    eval_set=[(train_df, labels)],
    categorical_feature=cate_cols,
    verbose=50,
)
clf.fit(train_df, labels, **full_fit_kwargs)
print('runtime:', time.time() - t)

In [22]:
print('************** test predict **************')
# sub = pd.read_csv(path_data + 'sample.csv')

# Positive-class probability for every test row, stored in the submission frame.
# NOTE(review): this uses whatever model `clf` currently holds in the kernel.
test_prob = clf.predict_proba(test_df)
sub['target'] = test_prob[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

************** test predict **************
runtime: 5134.657577991486


In [23]:
# Force a garbage-collection pass; the displayed number is gc.collect()'s return value.
gc.collect()

6

In [24]:
print('=============================================== feat importances ===============================================')
# Worth a close look: mean importance per feature over all models collected so far.
mean_imp = np.atleast_1d(np.mean(fea_imp_list, axis=0))
feature_names = train_df.columns.values
# zip() silently truncates, so a stale fea_imp_list (or a train_df with different
# columns) would print importances under the wrong names. Fail loudly instead.
if len(feature_names) != len(mean_imp):
    raise ValueError(
        'feature importance length {} does not match the {} columns of train_df'.format(
            len(mean_imp), len(feature_names))
    )
fea_imp_dict = dict(zip(feature_names, mean_imp))
fea_imp_item = sorted(fea_imp_dict.items(), key=lambda x: x[1], reverse=True)
for f, imp in fea_imp_item:
    print('{} = {}'.format(f, imp))

deviceid = 29432.0
device_version = 18247.0
newsid = 8640.0
lng = 8072.0
lat = 7996.0
lng_lat = 6657.0
netmodel_deviceid_lng_lat_next1_exposure_ts_gap = 3749.0
netmodel_deviceid_next1_exposure_ts_gap = 2886.0
deviceid_next3_exposure_ts_gap = 2681.0
pos_netmodel_deviceid_next1_exposure_ts_gap = 2377.0
netmodel_deviceid_next3_exposure_ts_gap = 2259.0
netmodel_deviceid_lng_lat_next2_exposure_ts_gap = 2122.0
deviceid_lng_lat_next3_exposure_ts_gap = 2033.0
deviceid_next1_exposure_ts_gap = 2030.0
netmodel_deviceid_next2_exposure_ts_gap = 1988.0
netmodel_deviceid_lng_lat_next3_exposure_ts_gap = 1763.0
cross_deviceid_newsid_count = 1752.0
netmodel_lng_lat_next1_exposure_ts_gap = 1745.0
pos_netmodel_deviceid_lng_lat_next1_exposure_ts_gap = 1731.0
cross_deviceid_pos_ent = 1655.0
cross_lng_lat_pos_ent = 1642.0
newsid_next1_exposure_ts_gap = 1584.0
deviceid_next5_exposure_ts_gap = 1576.0
pos_count = 1574.0
deviceid_lng_lat_next1_exposure_ts_gap = 1567.0
netmodel_deviceid_next5_exposure_ts_gap = 15

In [25]:
print('=============================================== threshold search ===============================================')
# F1 is sensitive to the decision threshold, so run a simple grid search over it:
# 201 thresholds starting at t0 in steps of v.
t0 = 0.05
v = 0.002
best_t = t0
best_f1 = 0
# Work on an array view of the probabilities so each threshold is one vectorised comparison.
val_prob = np.asarray(val_pred)
for step in range(201):
    curr_t = t0 + step * v
    y = (val_prob >= curr_t).astype(int)
    curr_f1 = f1_score(val_y, y)
    if curr_f1 > best_f1:
        best_t = curr_t
        best_f1 = curr_f1
        print('step: {}   best threshold: {}   best f1: {}'.format(step, best_t, best_f1))
print('search finish.')

# The hard labels go into their own variable: val_pred keeps the probabilities,
# so re-running this cell repeats the same search instead of searching over
# already-thresholded 0/1 values.
val_label = (val_prob >= best_t).astype(int)
print('\nbest auc:', best_auc)
print('best f1:', f1_score(val_y, val_label))
print('validate mean:', np.mean(val_label))
print('runtime:', time.time() - t)


step: 0   best threshold: 0.05   best f1: 0.6979839444078625
step: 1   best threshold: 0.052000000000000005   best f1: 0.7004941782702353
step: 2   best threshold: 0.054000000000000006   best f1: 0.7026545166402536
step: 3   best threshold: 0.056   best f1: 0.7047535111161654
step: 4   best threshold: 0.058   best f1: 0.7067306319426597
step: 5   best threshold: 0.060000000000000005   best f1: 0.708621901033825
step: 6   best threshold: 0.062   best f1: 0.7109849903488806
step: 7   best threshold: 0.064   best f1: 0.7127444174476455
step: 8   best threshold: 0.066   best f1: 0.7143126751157761
step: 9   best threshold: 0.068   best f1: 0.7160324700875876
step: 10   best threshold: 0.07   best f1: 0.7174730435545932
step: 11   best threshold: 0.07200000000000001   best f1: 0.718934867812633
step: 12   best threshold: 0.07400000000000001   best f1: 0.7205986330426585
step: 13   best threshold: 0.07600000000000001   best f1: 0.7218207345008498
step: 14   best threshold: 0.078   best f1: 0

step: 146   best threshold: 0.34199999999999997   best f1: 0.7736118223717902
step: 149   best threshold: 0.348   best f1: 0.7736891368583333
search finish.

best auc: 0.9724794667703146
best f1: 0.7736891368583333
validate mean: 0.10427252394333887
runtime: 5213.824611902237


In [None]:
print('=============================================== sub save ===============================================')
# First file: raw probabilities. The name carries validation AUC, best F1 and the mean prediction.
# NOTE(review): both files are written to the current working directory; path_sub is not used.
prob_file = 'sub_prob_{}_{}_{}.csv'.format(best_auc, best_f1, sub['target'].mean())
sub.to_csv(prob_file, index=False)

# Second file: hard 0/1 labels at the threshold found on the validation day.
# This overwrites sub['target'], so the cell must not be re-run without re-predicting.
sub['target'] = (sub['target'] >= best_t).astype(int)
label_file = 'sub_{}_{}_{}.csv'.format(best_auc, best_f1, sub['target'].mean())
sub.to_csv(label_file, index=False)
print('runtime:', time.time() - t)
print('finish.')
print('========================================================================================================')

In [None]:
## Start predicting on the full dataset (the cells above ran on the sampled data)

In [43]:
# Reload the raw frames for the full-data run.
if debug_small:
    train_df = pd.read_pickle(path_pickle + 'train_small.pickle')
    test_df = pd.read_pickle(path_pickle + 'test_small.pickle')

    # app = pd.read_pickle(path_pickle + 'app_small.pickle')
    # user = pd.read_pickle(path_pickle + 'user_small.pickle')
else:
    train_df = pd.read_pickle(path_pickle + 'train.pickle')
    test_df = pd.read_pickle(path_pickle + 'test.pickle')

    # app = pd.read_pickle(path_pickle + 'app.pickle')
    # user = pd.read_pickle(path_pickle + 'user.pickle')

# The sample submission is reloaded in BOTH modes, as in the first loading cell.
# Otherwise the debug branch would keep the `sub` left over from the sampled run,
# which was filtered to the sampled ids and holds thresholded targets.
sub = pd.read_csv(path_data + 'sample.csv')


In [44]:
print('=============================================== read train ===============================================')
# Restart the notebook-wide stopwatch for the full-data run.
t = time.time()

# Only the row count and the labels are taken from the raw frame here; the
# features are loaded from a pickle in a following cell.
train_num = len(train_df)
labels = train_df['target'].values
print('runtime:', time.time() - t)

runtime: 21.906771421432495


In [49]:
# Load the full-data feature frame.
# NOTE(review): "df_081_cross.pickle" is not written by any cell shown in this
# notebook — presumably produced by an earlier full-data run; confirm its row
# order matches train.pickle followed by test.pickle.
df =pd.read_pickle(path_pickle + "df_081_cross.pickle")

In [65]:
# Duplicated merge artefacts (*_x / *_y) present in the full-data frame; per the
# comparison cells below they are the columns that the sampled frame does not have.
miss_cols = [
    'cross_deviceid_newsid_nunique_x',
    'cross_deviceid_newsid_ent_x',
    'cross_deviceid_pos_nunique_x',
    'cross_deviceid_pos_ent_x',
    'cross_deviceid_newsid_nunique_y',
    'cross_deviceid_newsid_ent_y',
    'cross_deviceid_pos_nunique_y',
    'cross_deviceid_pos_ent_y',
]
# Removed one at a time, in place; re-running after a successful run raises KeyError.
for mis in miss_cols:
    del df[mis]

In [67]:
print('======================================== prepare train & valid  =============================================')
# The first train_num rows of df are taken as the training set, the rest as the test set.
train_df = df.iloc[:train_num].reset_index(drop=True)
test_df = df.iloc[train_num:].reset_index(drop=True)
del df
gc.collect()

# Time-based hold-out: days before 10 are used for fitting, day 10 for validation.
train_idx = train_df.index[(train_df['day'] < 10).values].tolist()
val_idx = train_df.index[(train_df['day'] == 10).values].tolist()

train_x, train_y = train_df.iloc[train_idx].reset_index(drop=True), labels[train_idx]
val_x, val_y = train_df.iloc[val_idx].reset_index(drop=True), labels[val_idx]

# 'day' was only needed for the split; it is not a model feature.
for frame in (train_x, val_x, train_df, test_df):
    frame.drop(columns=['day'], inplace=True)
del frame
gc.collect()
print('runtime:', time.time() - t)
print('========================================================================================================')



runtime: 1823.8565957546234


In [51]:
# Load the sampled feature frame as df_2, used by the (now commented-out) cells
# below to compare its columns with the full-data frame.
df_2 =pd.read_pickle(path_pickle + "df_sample01_081_cross.pickle")


In [68]:
gc.collect()

print('************** validate predict **************')
# NOTE(review): `clf` is whatever model is currently in the kernel; no cell of the
# full-data section fits a validation model before this point — confirm this is intended.
best_rounds, best_auc = clf.best_iteration_, clf.best_score_['valid_0']['auc']
# Predicted probability of the positive class for every validation row.
val_pred = clf.predict_proba(val_x)[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

************** validate predict **************
runtime: 2352.062028646469


In [53]:
# df_2.columns.values

array(['deviceid', 'newsid', 'pos', 'app_version', 'device_vendor',
       'netmodel', 'osversion', 'lng', 'lat', 'device_version', 'day',
       'hour', 'minute', 'lng_lat', 'deviceid_count', 'newsid_count',
       'pos_count', 'app_version_count', 'device_vendor_count',
       'netmodel_count', 'osversion_count', 'device_version_count',
       'lng_count', 'lat_count', 'lng_lat_count',
       'deviceid_prev_day_click_count', 'deviceid_prev_day_count',
       'deviceid_prev_day_ctr', 'pos_deviceid_prev_day_click_count',
       'pos_deviceid_prev_day_count', 'pos_deviceid_prev_day_ctr',
       'deviceid_prev1_exposure_ts_gap', 'deviceid_next1_exposure_ts_gap',
       'deviceid_prev2_exposure_ts_gap', 'deviceid_next2_exposure_ts_gap',
       'deviceid_prev3_exposure_ts_gap', 'deviceid_next3_exposure_ts_gap',
       'deviceid_prev5_exposure_ts_gap', 'deviceid_next5_exposure_ts_gap',
       'deviceid_prev10_exposure_ts_gap',
       'deviceid_next10_exposure_ts_gap', 'newsid_prev1_exposure

In [66]:
# miss_cols =[]
# for col in df.columns.values:
#     if col not in df_2.columns.values:
#         miss_cols.append(col)
# print(miss_cols)


[]


In [63]:
# df[miss_cols].head(5)

Unnamed: 0,cross_deviceid_newsid_nunique_x,cross_deviceid_newsid_ent_x,cross_deviceid_pos_nunique_x,cross_deviceid_pos_ent_x,cross_deviceid_newsid_nunique_y,cross_deviceid_newsid_ent_y,cross_deviceid_pos_nunique_y,cross_deviceid_pos_ent_y
0,6,1.735352,3,0.995117,6,1.735352,3,0.995117
1,6,1.735352,3,0.995117,6,1.735352,3,0.995117
2,19,2.945312,3,1.086914,19,2.945312,3,1.086914
3,19,2.945312,3,1.086914,19,2.945312,3,1.086914
4,40,3.634766,5,1.234375,40,3.634766,5,1.234375


In [73]:
print('=============================================== training predict ===============================================')
# Final model: tree count fixed to best_rounds, other hyper-parameters unchanged.
final_params = dict(
    learning_rate=0.01,
    n_estimators=best_rounds,
    num_leaves=255,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=2019,
)
clf = LGBMClassifier(**final_params)



In [74]:
print('************** training using all the data **************')
# Refit on every training row; the eval set is the training data itself, so the
# logged binary_logloss is a training metric only.
full_fit_kwargs = dict(
    eval_set=[(train_df, labels)],
    categorical_feature=cate_cols,
    verbose=50,
)
clf.fit(train_df, labels, **full_fit_kwargs)
print('runtime:', time.time() - t)

************** training using all the data **************


New categorical_feature is ['app_version', 'device_vendor', 'device_version', 'deviceid', 'lat', 'lng', 'lng_lat', 'netmodel', 'newsid', 'osversion', 'pos']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[50]	training's binary_logloss: 0.211156
[100]	training's binary_logloss: 0.166179
[150]	training's binary_logloss: 0.143335
[200]	training's binary_logloss: 0.129926
[250]	training's binary_logloss: 0.121307
[300]	training's binary_logloss: 0.115278
[350]	training's binary_logloss: 0.110885
[400]	training's binary_logloss: 0.107303
[450]	training's binary_logloss: 0.104661
[500]	training's binary_logloss: 0.102335
[550]	training's binary_logloss: 0.100249
[600]	training's binary_logloss: 0.0983435
[650]	training's binary_logloss: 0.0967053
[700]	training's binary_logloss: 0.0952016
[750]	training's binary_logloss: 0.0938171
[800]	training's binary_logloss: 0.0925661
[850]	training's binary_logloss: 0.0914056
runtime: 13965.7793571949


In [69]:
print('************** test predict **************')
# sub = pd.read_csv(path_data + 'sample.csv')

# Positive-class probability for every test row, stored in the submission frame.
# NOTE(review): this cell's execution count (69) is lower than that of the final
# fit cell (74) — confirm which model produced the saved predictions.
test_prob = clf.predict_proba(test_df)
sub['target'] = test_prob[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

************** test predict **************
runtime: 3522.3794405460358


In [70]:
print('=============================================== feat importances ===============================================')
# Worth a close look: mean importance per feature over all models collected so far.
# NOTE(review): fea_imp_list is never reset in the full-data section, so it may
# still hold vectors from the sampled run.
mean_imp = np.atleast_1d(np.mean(fea_imp_list, axis=0))
feature_names = train_df.columns.values
# zip() silently truncates, so a stale fea_imp_list (or a train_df with different
# columns) would print importances under the wrong names. Fail loudly instead.
if len(feature_names) != len(mean_imp):
    raise ValueError(
        'feature importance length {} does not match the {} columns of train_df'.format(
            len(mean_imp), len(feature_names))
    )
fea_imp_dict = dict(zip(feature_names, mean_imp))
fea_imp_item = sorted(fea_imp_dict.items(), key=lambda x: x[1], reverse=True)
for f, imp in fea_imp_item:
    print('{} = {}'.format(f, imp))

deviceid = 29432.0
device_version = 18247.0
newsid = 8640.0
lng = 8072.0
lat = 7996.0
lng_lat = 6657.0
netmodel_deviceid_lng_lat_next1_exposure_ts_gap = 3749.0
netmodel_deviceid_next1_exposure_ts_gap = 2886.0
deviceid_next3_exposure_ts_gap = 2681.0
pos_netmodel_deviceid_next1_exposure_ts_gap = 2377.0
netmodel_deviceid_next3_exposure_ts_gap = 2259.0
netmodel_deviceid_lng_lat_next2_exposure_ts_gap = 2122.0
deviceid_lng_lat_next3_exposure_ts_gap = 2033.0
deviceid_next1_exposure_ts_gap = 2030.0
netmodel_deviceid_next2_exposure_ts_gap = 1988.0
netmodel_deviceid_lng_lat_next3_exposure_ts_gap = 1763.0
cross_deviceid_newsid_count_ratio = 1752.0
netmodel_lng_lat_next1_exposure_ts_gap = 1745.0
pos_netmodel_deviceid_lng_lat_next1_exposure_ts_gap = 1731.0
cross_deviceid_pos_nunique_ratio_deviceid_count = 1655.0
cross_lng_lat_pos_ent = 1642.0
newsid_next1_exposure_ts_gap = 1584.0
deviceid_next5_exposure_ts_gap = 1576.0
pos_count = 1574.0
deviceid_lng_lat_next1_exposure_ts_gap = 1567.0
netmodel_devi

In [71]:
print('=============================================== threshold search ===============================================')
# F1 is sensitive to the decision threshold, so run a simple grid search over it:
# 201 thresholds starting at t0 in steps of v.
t0 = 0.05
v = 0.002
best_t = t0
best_f1 = 0
# Work on an array view of the probabilities so each threshold is one vectorised comparison.
val_prob = np.asarray(val_pred)
for step in range(201):
    curr_t = t0 + step * v
    y = (val_prob >= curr_t).astype(int)
    curr_f1 = f1_score(val_y, y)
    if curr_f1 > best_f1:
        best_t = curr_t
        best_f1 = curr_f1
        print('step: {}   best threshold: {}   best f1: {}'.format(step, best_t, best_f1))
print('search finish.')

# The hard labels go into their own variable: val_pred keeps the probabilities,
# so re-running this cell repeats the same search instead of searching over
# already-thresholded 0/1 values.
val_label = (val_prob >= best_t).astype(int)
print('\nbest auc:', best_auc)
print('best f1:', f1_score(val_y, val_label))
print('validate mean:', np.mean(val_label))
print('runtime:', time.time() - t)


step: 0   best threshold: 0.05   best f1: 0.6997430713166896
step: 1   best threshold: 0.052000000000000005   best f1: 0.7023112620488454
step: 2   best threshold: 0.054000000000000006   best f1: 0.7047099608620907
step: 3   best threshold: 0.056   best f1: 0.7068569901490088
step: 4   best threshold: 0.058   best f1: 0.7087415502199624
step: 5   best threshold: 0.060000000000000005   best f1: 0.710591678219499
step: 6   best threshold: 0.062   best f1: 0.7122296888635893
step: 7   best threshold: 0.064   best f1: 0.7137043205994118
step: 8   best threshold: 0.066   best f1: 0.7150138315006197
step: 9   best threshold: 0.068   best f1: 0.7163322620434747
step: 10   best threshold: 0.07   best f1: 0.7174975604048065
step: 11   best threshold: 0.07200000000000001   best f1: 0.7183878005054323
step: 12   best threshold: 0.07400000000000001   best f1: 0.7192221056495739
step: 13   best threshold: 0.07600000000000001   best f1: 0.7199101832375536
step: 14   best threshold: 0.078   best f1: 

In [72]:
print('=============================================== sub save ===============================================')
# First file: raw probabilities. The name carries validation AUC, best F1 and the mean prediction.
# NOTE(review): both files are written to the current working directory; path_sub is not used.
prob_file = 'sub_prob_{}_{}_{}.csv'.format(best_auc, best_f1, sub['target'].mean())
sub.to_csv(prob_file, index=False)

# Second file: hard 0/1 labels at the threshold found on the validation day.
# This overwrites sub['target'], so the cell must not be re-run without re-predicting.
sub['target'] = (sub['target'] >= best_t).astype(int)
label_file = 'sub_{}_{}_{}.csv'.format(best_auc, best_f1, sub['target'].mean())
sub.to_csv(label_file, index=False)
print('runtime:', time.time() - t)
print('finish.')
print('========================================================================================================')

runtime: 4523.013119459152
finish.
