In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score
from scipy.stats import entropy
from gensim.models import Word2Vec
import time
import gc
import os

import tqdm                                                                                                   
import concurrent.futures
import multiprocessing

pd.set_option('display.max_columns', None)
num_processes = multiprocessing.cpu_count()
print("total cpu count", +num_processes) 

os.environ['NUMEXPR_MAX_THREADS'] = '8'

from core.utils import timeit, reduce_mem

total cpu count 8


In [2]:
path = "/media/ryan/F/deep-learning-data/turing/vedio-predict/"

path_sub = path + 'sub/'
path_npy = path + 'npy/'
path_data = path + 'raw/'
path_model = path + 'model/'
path_result = path + 'result/'
path_pickle = path + 'pickle/'
path_profile = path + 'profile/'

debug_small = False

if debug_small:
    train_df = pd.read_pickle(path_pickle + 'train_small.pickle')
    test_df = pd.read_pickle(path_pickle + 'test_small.pickle')
    sub = pd.read_csv(path_data + 'sample.csv')

    # app = pd.read_pickle(path_pickle + 'app_small.pickle')
    # user = pd.read_pickle(path_pickle + 'user_small.pickle')
else:
    train_df = pd.read_pickle(path_pickle + 'train.pickle')
    test_df = pd.read_pickle(path_pickle + 'test.pickle')
    sub = pd.read_csv(path_data + 'sample.csv')

    # app = pd.read_pickle(path_pickle + 'app.pickle')
    # user = pd.read_pickle(path_pickle + 'user.pickle')


In [None]:
# train_df = train_df[train_df.deviceid.str[-1] == '1']
# test_df = test_df[test_df.deviceid.str[-1] == '1']

In [None]:
# test_df

In [None]:
# sub = sub[sub.id.isin(test_df.id) ]
# sub

In [3]:
print('=============================================== read train ===============================================')
t = time.time()
# train_df = pd.read_csv('dataset/train.csv')
train_df['date'] = pd.to_datetime(
    train_df['ts'].apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x / 1000)))
)
train_df['day'] = train_df['date'].dt.day

# 训练集中，day=7的个数为11个，day=8的为3,674,871。 day9，10也是解决40w
# day=7占比不到1/百万，属于异常情况，去掉合理？ 线上的表现又会如何，为啥不是直接删除，这样有点过了
# 这里为啥只是改了day，不去直接改ts和timestamp呢？
train_df.loc[train_df['day'] == 7, 'day'] = 8
train_df['hour'] = train_df['date'].dt.hour
train_df['minute'] = train_df['date'].dt.minute
train_num = train_df.shape[0]
labels = train_df['target'].values
print('runtime:', time.time() - t)

runtime: 22.337076902389526


In [5]:
train_num

11376681

In [6]:
df =pd.read_pickle(path_pickle + "df_081_emd_all.pickle")


In [7]:
train_num =11376681
cate_cols = [
    'deviceid', 'newsid', 'pos', 'app_version', 'device_vendor',
    'netmodel', 'osversion', 'device_version', 'lng', 'lat', 'lng_lat'
]

In [8]:
print('======================================== prepare train & valid  =============================================')
train_df = df[:train_num].reset_index(drop=True)
test_df = df[train_num:].reset_index(drop=True)
del df
gc.collect()



0

In [9]:
train_df.shape

(11376681, 317)

In [10]:


train_idx = train_df[train_df['day'] < 10].index.tolist()
val_idx = train_df[train_df['day'] == 10].index.tolist()

train_x = train_df.iloc[train_idx].reset_index(drop=True)
train_y = labels[train_idx]
val_x = train_df.iloc[val_idx].reset_index(drop=True)
val_y = labels[val_idx]

del train_x['day'], val_x['day'], train_df['day'], test_df['day']
gc.collect()
print('runtime:', time.time() - t)
print('========================================================================================================')



runtime: 238.82718586921692


In [11]:
def learning_rate_callback(env):
    delta_lr = 0.0001
    iteration = env.iteration
    if iteration % 10 == 0:
        learning_rate = env.params['learning_rate'] - delta_lr
        env.params['learning_rate'] = learning_rate
        
        print('---- current learning rate:' + str(learning_rate) + '----')
        

In [13]:
print('=============================================== training validate ===============================================')
fea_imp_list = []
clf = LGBMClassifier(
    n_jobs=7,
    learning_rate=0.02,
    n_estimators=5000,
    num_leaves=255,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=2019,
    metric=None,
    
    feature_fraction=0.8, # 随机特征采样
    bagging_fraction=0.8, # 随机样本采样
    bagging_freq=5       # k means perform bagging at every k iteration
)

print('************** training **************')
clf.fit(
    train_x, train_y,
    eval_set=[(val_x, val_y)],
    eval_metric='auc',
    categorical_feature=cate_cols,
    early_stopping_rounds=200,
    verbose=50,
    callbacks=[learning_rate_callback]
)
print('runtime:', time.time() - t)

************** training **************


New categorical_feature is ['app_version', 'device_vendor', 'device_version', 'deviceid', 'lat', 'lng', 'lng_lat', 'netmodel', 'newsid', 'osversion', 'pos']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


---- current learning rate:0.0199----
Training until validation scores don't improve for 200 rounds
---- current learning rate:0.0198----
---- current learning rate:0.019700000000000002----
---- current learning rate:0.019600000000000003----
---- current learning rate:0.019500000000000003----
[50]	valid_0's auc: 0.970956
---- current learning rate:0.019400000000000004----
---- current learning rate:0.019300000000000005----
---- current learning rate:0.019200000000000005----
---- current learning rate:0.019100000000000006----
---- current learning rate:0.019000000000000006----
[100]	valid_0's auc: 0.97365
---- current learning rate:0.018900000000000007----
---- current learning rate:0.018800000000000008----
---- current learning rate:0.01870000000000001----
---- current learning rate:0.01860000000000001----
---- current learning rate:0.01850000000000001----
[150]	valid_0's auc: 0.975558
---- current learning rate:0.01840000000000001----
---- current learning rate:0.01830000000000001----

In [15]:
gc.collect()

print('************** validate predict **************')
best_rounds = clf.best_iteration_
best_auc = clf.best_score_['valid_0']['auc']
val_pred = clf.predict_proba(val_x)[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

************** validate predict **************
runtime: 17309.106901168823


In [None]:


print('=============================================== training predict ===============================================')
clf = LGBMClassifier(
    learning_rate=0.01,
    n_estimators=best_rounds,
    num_leaves=255,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=2019
)

In [None]:
print('************** training using all the data **************')
clf.fit(
    train_df, labels,
    eval_set=[(train_df, labels)],
    categorical_feature=cate_cols,
    verbose=50
)
print('runtime:', time.time() - t)

In [16]:
print('************** test predict **************')
# sub = pd.read_csv(path_data + 'sample.csv')

sub['target'] = clf.predict_proba(test_df)[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

************** test predict **************
runtime: 59716.57367801666


In [17]:
gc.collect()

16

In [18]:
print('=============================================== feat importances ===============================================')
# 特征重要性可以好好看看
fea_imp_dict = dict(zip(train_df.columns.values, np.mean(fea_imp_list, axis=0)))
fea_imp_item = sorted(fea_imp_dict.items(), key=lambda x: x[1], reverse=True)
for f, imp in fea_imp_item:
    print('{} = {}'.format(f, imp))

deviceid = 70524.0
newsid = 36244.0
device_version = 29596.0
lat = 13653.0
lng = 13354.0
lng_lat = 9820.0
netmodel_deviceid_lng_lat_next1_exposure_ts_gap = 2935.0
netmodel_deviceid_next1_exposure_ts_gap = 2448.0
deviceid_next3_exposure_ts_gap = 1878.0
netmodel_deviceid_next3_exposure_ts_gap = 1611.0
pos_netmodel_deviceid_next1_exposure_ts_gap = 1546.0
deviceid_lng_lat_next3_exposure_ts_gap = 1504.0
netmodel_deviceid_lng_lat_next2_exposure_ts_gap = 1475.0
pos_count = 1443.0
cross_deviceid_newsid_count = 1419.0
cross_lng_lat_pos_ent = 1401.0
netmodel_deviceid_next2_exposure_ts_gap = 1372.0
pos_netmodel_deviceid_lng_lat_next1_exposure_ts_gap = 1342.0
pos = 1340.0
deviceid_next1_exposure_ts_gap = 1209.0
netmodel_deviceid_lng_lat_next3_exposure_ts_gap = 1183.0
deviceid_newsid_emb_6 = 1177.0
deviceid_next5_exposure_ts_gap = 809.0
deviceid_lng_lat_next1_exposure_ts_gap = 805.0
cross_deviceid_pos_ent_x = 795.0
deviceid_next2_exposure_ts_gap = 773.0
netmodel_deviceid_next10_exposure_ts_gap = 75

In [None]:
print('=============================================== threshold search ===============================================')
# f1阈值敏感，所以对阈值做一个简单的迭代搜索。
t0 = 0.05
v = 0.002
best_t = t0
best_f1 = 0
for step in range(201):
    curr_t = t0 + step * v
    y = [1 if x >= curr_t else 0 for x in val_pred]
    curr_f1 = f1_score(val_y, y)
    if curr_f1 > best_f1:
        best_t = curr_t
        best_f1 = curr_f1
        print('step: {}   best threshold: {}   best f1: {}'.format(step, best_t, best_f1))
print('search finish.')

val_pred = [1 if x >= best_t else 0 for x in val_pred]
print('\nbest auc:', best_auc)
print('best f1:', f1_score(val_y, val_pred))
print('validate mean:', np.mean(val_pred))
print('runtime:', time.time() - t)


step: 0   best threshold: 0.05   best f1: 0.6850870278409747
step: 1   best threshold: 0.052000000000000005   best f1: 0.6882450875517184
step: 2   best threshold: 0.054000000000000006   best f1: 0.6910326188891379
step: 3   best threshold: 0.056   best f1: 0.6938570740696238
step: 4   best threshold: 0.058   best f1: 0.6965791884245541
step: 5   best threshold: 0.060000000000000005   best f1: 0.6992329151704849


In [None]:
print('=============================================== sub save ===============================================')
sub.to_csv('sub_prob_{}_{}_{}.csv'.format(best_auc, best_f1, sub['target'].mean()), index=False)
sub['target'] = sub['target'].apply(lambda x: 1 if x >= best_t else 0)
sub.to_csv('sub_{}_{}_{}.csv'.format(best_auc, best_f1, sub['target'].mean()), index=False)
print('runtime:', time.time() - t)
print('finish.')
print('========================================================================================================')