In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score
from scipy.stats import entropy
from gensim.models import Word2Vec
import time
import gc
import os

import tqdm                                                                                                   
import concurrent.futures
import multiprocessing

pd.set_option('display.max_columns', None)
num_processes = multiprocessing.cpu_count()
print("total cpu count", +num_processes) 

os.environ['NUMEXPR_MAX_THREADS'] = '8'

from core.utils import timeit, reduce_mem

total cpu count 8


In [2]:
path = "/media/ryan/F/deep-learning-data/turing/vedio-predict/"

path_sub = path + 'sub/'
path_npy = path + 'npy/'
path_data = path + 'raw/'
path_model = path + 'model/'
path_result = path + 'result/'
path_pickle = path + 'pickle/'
path_profile = path + 'profile/'

debug_small = False

if debug_small:
    train_df = pd.read_pickle(path_pickle + 'train_small.pickle')
    test_df = pd.read_pickle(path_pickle + 'test_small.pickle')
    sub = pd.read_csv(path_data + 'sample.csv')

    # app = pd.read_pickle(path_pickle + 'app_small.pickle')
    # user = pd.read_pickle(path_pickle + 'user_small.pickle')
else:
    train_df = pd.read_pickle(path_pickle + 'train.pickle')
    test_df = pd.read_pickle(path_pickle + 'test.pickle')
    sub = pd.read_csv(path_data + 'sample.csv')

    # app = pd.read_pickle(path_pickle + 'app.pickle')
    # user = pd.read_pickle(path_pickle + 'user.pickle')


In [None]:
# train_df = train_df[train_df.deviceid.str[-1] == '1']
# test_df = test_df[test_df.deviceid.str[-1] == '1']

In [None]:
# test_df

In [None]:
# sub = sub[sub.id.isin(test_df.id) ]
# sub

In [3]:
print('=============================================== read train ===============================================')
t = time.time()
# train_df = pd.read_csv('dataset/train.csv')
train_df['date'] = pd.to_datetime(
    train_df['ts'].apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x / 1000)))
)
train_df['day'] = train_df['date'].dt.day

# 训练集中，day=7的个数为11个，day=8的为3,674,871。 day9，10也是解决40w
# day=7占比不到1/百万，属于异常情况，去掉合理？ 线上的表现又会如何，为啥不是直接删除，这样有点过了
# 这里为啥只是改了day，不去直接改ts和timestamp呢？
train_df.loc[train_df['day'] == 7, 'day'] = 8
train_df['hour'] = train_df['date'].dt.hour
train_df['minute'] = train_df['date'].dt.minute
train_num = train_df.shape[0]
labels = train_df['target'].values
print('runtime:', time.time() - t)

runtime: 23.693913221359253


In [None]:
print('=============================================== click data ===============================================')
click_df = train_df[train_df['target'] == 1].sort_values('timestamp').reset_index(drop=True)
click_df['exposure_click_gap'] = click_df['timestamp'] - click_df['ts']
click_df = click_df[click_df['exposure_click_gap'] >= 0].reset_index(drop=True)
click_df['date'] = pd.to_datetime(
    click_df['timestamp'].apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x / 1000)))
)
click_df['day'] = click_df['date'].dt.day
# 同上对day==7的修改
click_df.loc[click_df['day'] == 7, 'day'] = 8

del train_df['target'], train_df['timestamp']

# 这里为啥要把click_df的这些字段删除呢？
for f in ['date', 'exposure_click_gap', 'timestamp', 'ts', 'target', 'hour', 'minute']:
    del click_df[f]
print('runtime:', time.time() - t)

In [None]:
print('=============================================== read test ===============================================')
test_df['date'] = pd.to_datetime(
    test_df['ts'].apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x / 1000)))
)
test_df['day'] = test_df['date'].dt.day

# 测试集中，day=10的个数为32个，day=11的为3,653,560占比 1/十万，属于异常情况，去掉合理
test_df.loc[test_df['day'] == 10, 'day'] = 11
test_df['hour'] = test_df['date'].dt.hour
test_df['minute'] = test_df['date'].dt.minute
df = pd.concat([train_df, test_df], axis=0, ignore_index=False)
del train_df, test_df, df['date']
gc.collect()
print('runtime:', time.time() - t)

In [None]:
print('============================================= category encoding =============================================')
df['lng_lat'] = df['lng'].astype('str') + '_' + df['lat'].astype('str')
del df['guid']
click_df['lng_lat'] = click_df['lng'].astype('str') + '_' + click_df['lat'].astype('str')
sort_df = df.sort_values('ts').reset_index(drop=True)
cate_cols = [
    'deviceid', 'newsid', 'pos', 'app_version', 'device_vendor',
    'netmodel', 'osversion', 'device_version', 'lng', 'lat', 'lng_lat'
]
for f in cate_cols:
    print(f)
    map_dict = dict(zip(df[f].unique(), range(df[f].nunique())))
    df[f] = df[f].map(map_dict).fillna(-1).astype('int32')
    click_df[f] = click_df[f].map(map_dict).fillna(-1).astype('int32')
    sort_df[f] = sort_df[f].map(map_dict).fillna(-1).astype('int32')
    df[f + '_count'] = df[f].map(df[f].value_counts())
df = reduce_mem(df)
click_df = reduce_mem(click_df)
sort_df = reduce_mem(sort_df)
print('runtime:', time.time() - t)


In [None]:
print('============================================= feat engineer =============================================')

print('*************************** history stats ***************************')
for f in [
    ['deviceid'],
    ['pos', 'deviceid'],
    # ...
]:
    print('------------------ {} ------------------'.format('_'.join(f)))

    # 对前一天的点击次数进行统计
    tmp = click_df[f + ['day', 'id']].groupby(f + ['day'], as_index=False)['id'].agg(
        {'_'.join(f) + '_prev_day_click_count': 'count'})
    tmp['day'] += 1
    df = df.merge(tmp, on=f + ['day'], how='left')
    df['_'.join(f) + '_prev_day_click_count'] = df['_'.join(f) + '_prev_day_click_count'].fillna(0)
    df.loc[df['day'] == 8, '_'.join(f) + '_prev_day_click_count'] = None

    # 对前一天的曝光量进行统计
    tmp = df[f + ['day', 'id']].groupby(f + ['day'], as_index=False)['id'].agg(
        {'_'.join(f) + '_prev_day_count': 'count'})
    tmp['day'] += 1
    df = df.merge(tmp, on=f + ['day'], how='left')
    df['_'.join(f) + '_prev_day_count'] = df['_'.join(f) + '_prev_day_count'].fillna(0)
    df.loc[df['day'] == 8, '_'.join(f) + '_prev_day_count'] = None

    # 计算前一天的点击率
    df['_'.join(f) + '_prev_day_ctr'] = df['_'.join(f) + '_prev_day_click_count'] / (
            df['_'.join(f) + '_prev_day_count'] + df['_'.join(f) + '_prev_day_count'].mean())

    del tmp
    print('runtime:', time.time() - t)
del click_df
df = reduce_mem(df)

In [None]:
print('*************************** exposure_ts_gap ***************************')
for f in [
    ['deviceid'], ['newsid'], ['lng_lat'],
    ['pos', 'deviceid'], ['pos', 'newsid'], ['pos', 'lng_lat'],
    ['pos', 'deviceid', 'lng_lat'],
    ['netmodel', 'deviceid'],
    ['pos', 'netmodel', 'deviceid'],
    ['netmodel', 'lng_lat'], ['deviceid', 'lng_lat'],
    ['netmodel', 'deviceid', 'lng_lat'], ['pos', 'netmodel', 'lng_lat'],
    ['pos', 'netmodel', 'deviceid', 'lng_lat']
]:
    print('------------------ {} ------------------'.format('_'.join(f)))

    tmp = sort_df[f + ['ts']].groupby(f)
    # 前x次、后x次曝光到当前的时间差
    for gap in [1, 2, 3, 5, 10]:
        sort_df['{}_prev{}_exposure_ts_gap'.format('_'.join(f), gap)] = tmp['ts'].shift(0) - tmp['ts'].shift(gap)
        sort_df['{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)] = tmp['ts'].shift(-gap) - tmp['ts'].shift(0)
        tmp2 = sort_df[
            f + ['ts', '{}_prev{}_exposure_ts_gap'.format('_'.join(f), gap),
                 '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]
            ].drop_duplicates(f + ['ts']).reset_index(drop=True)
        df = df.merge(tmp2, on=f + ['ts'], how='left')
        del sort_df['{}_prev{}_exposure_ts_gap'.format('_'.join(f), gap)]
        del sort_df['{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]
        del tmp2

    del tmp
    df = reduce_mem(df)
    print('runtime:', time.time() - t)
del df['ts']
gc.collect()

In [None]:
# print('*************************** cross feat (second order) ***************************')
# # 二阶交叉特征，可以继续做更高阶的交叉。
# def build_cross_feat(df, f, col):
#     print('------------------ {} {} ------------------'.format(f, col))
#     df = df.merge(df[[f, col]].groupby(f, as_index=False)[col].agg({
#         'cross_{}_{}_nunique'.format(f, col): 'nunique',
#         'cross_{}_{}_ent'.format(f, col): lambda x: entropy(x.value_counts() / x.shape[0])  # 熵
#     }), on=f, how='left')
#     if 'cross_{}_{}_count'.format(f, col) not in df.columns.values and 'cross_{}_{}_count'.format(col,
#                                                                                                   f) not in df.columns.values:
#         df = df.merge(df[[f, col, 'id']].groupby([f, col], as_index=False)['id'].agg({
#             'cross_{}_{}_count'.format(f, col): 'count'  # 共现次数
#         }), on=[f, col], how='left')
#     if 'cross_{}_{}_count_ratio'.format(col, f) not in df.columns.values:
#         df['cross_{}_{}_count_ratio'.format(col, f)] = df['cross_{}_{}_count'.format(f, col)] / df[
#             f + '_count']  # 比例偏好
#     if 'cross_{}_{}_count_ratio'.format(f, col) not in df.columns.values:
#         df['cross_{}_{}_count_ratio'.format(f, col)] = df['cross_{}_{}_count'.format(f, col)] / df[
#             col + '_count']  # 比例偏好
#     df['cross_{}_{}_nunique_ratio_{}_count'.format(f, col, f)] = df['cross_{}_{}_nunique'.format(f, col)] / df[
#         f + '_count']
#     print('runtime:', time.time() - t)
#     df = reduce_mem(df)
#     return df
        
# cross_cols = ['deviceid', 'newsid', 'pos', 'netmodel', 'lng_lat']
# f_col_tuple_list = []
# for f in cross_cols:
#     for col in cross_cols:
#         if col == f:
#             continue
#         f_col_tuple_list.append((f, col))
        
# print(f_col_tuple_list)
# # with concurrent.futures.ProcessPoolExecutor(num_processes) as pool:
# #     df = list(tqdm.tqdm(pool.map(build_cross_feat, cross_cols, chunksize=10, total=df.shape[0])))
# for tuple_o in tqdm.tqdm(f_col_tuple_list):
#     print(tuple_o)
#     df = build_cross_feat(df, tuple_o[0], tuple_o[1])

# del df['id']
# gc.collect()

In [None]:
df.columns.values

In [None]:
print('*************************** cross feat (second order) ***************************')
# 二阶交叉特征，可以继续做更高阶的交叉。
cross_cols = ['deviceid', 'newsid', 'pos', 'netmodel', 'lng_lat']
for f in cross_cols:
    for col in cross_cols:
        if col == f:
            continue
        print('------------------ {} {} ------------------'.format(f, col))
        if  'cross_{}_{}_nunique'.format(f, col) not in df.columns.values:
            df = df.merge(df[[f, col]].groupby(f, as_index=False)[col].agg({
                'cross_{}_{}_nunique'.format(f, col): 'nunique',
                'cross_{}_{}_ent'.format(f, col): lambda x: entropy(x.value_counts() / x.shape[0])  # 熵
            }), on=f, how='left')
        if 'cross_{}_{}_count'.format(f, col) not in df.columns.values and 'cross_{}_{}_count'.format(col,
                                                                                                      f) not in df.columns.values:
            df = df.merge(df[[f, col, 'id']].groupby([f, col], as_index=False)['id'].agg({
                'cross_{}_{}_count'.format(f, col): 'count'  # 共现次数
            }), on=[f, col], how='left')
        if 'cross_{}_{}_count_ratio'.format(col, f) not in df.columns.values:
            df['cross_{}_{}_count_ratio'.format(col, f)] = df['cross_{}_{}_count'.format(f, col)] / df[
                f + '_count']  # 比例偏好
        if 'cross_{}_{}_count_ratio'.format(f, col) not in df.columns.values:
            df['cross_{}_{}_count_ratio'.format(f, col)] = df['cross_{}_{}_count'.format(f, col)] / df[
                col + '_count']  # 比例偏好
        df['cross_{}_{}_nunique_ratio_{}_count'.format(f, col, f)] = df['cross_{}_{}_nunique'.format(f, col)] / df[
            f + '_count']
        print('runtime:', time.time() - t)
    df = reduce_mem(df)
del df['id']
gc.collect()

In [None]:
df.to_pickle(path_pickle + "df_081_cross.pickle")
print("success build df_081_cross.pickle")

sort_df.to_pickle(path_pickle + "sort_df_081_cross.pickle")
print("success build sort_df_081_cross.pickle")

In [4]:
df =pd.read_pickle(path_pickle + "df_081_cross.pickle")


In [None]:
print('*************************** embedding ***************************')


# 之前有个朋友给embedding做了一个我认为非常形象的比喻：
# 在非诚勿扰上面，如果你想了解一个女嘉宾，那么你可以看看她都中意过哪些男嘉宾；
# 反过来也一样，如果你想认识一个男嘉宾，那么你也可以看看他都选过哪些女嘉宾。


def emb(df, f1, f2):
    emb_size = 8
    print('====================================== {} {} ======================================'.format(f1, f2))
    tmp = df.groupby(f1, as_index=False)[f2].agg({'{}_{}_list'.format(f1, f2): list})
    sentences = tmp['{}_{}_list'.format(f1, f2)].values.tolist()
    del tmp['{}_{}_list'.format(f1, f2)]
    for i in range(len(sentences)):
        sentences[i] = [str(x) for x in sentences[i]]
    model = Word2Vec(sentences, size=emb_size, window=5, min_count=5, sg=0, hs=1, seed=2019)
    emb_matrix = []
    for seq in sentences:
        vec = []
        for w in seq:
            if w in model:
                vec.append(model[w])
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)

    # 为了支持数组多维处理，需要先做一个变换
    emb_matrix = np.array(emb_matrix)

    for i in range(emb_size):
        tmp['{}_{}_emb_{}'.format(f1, f2, i)] = emb_matrix[:, i]
    del model, emb_matrix, sentences
    tmp = reduce_mem(tmp)
    print('runtime:', time.time() - t)
    return tmp


emb_cols = [
    ['deviceid', 'newsid'],
    ['deviceid', 'lng_lat'],
    ['newsid', 'lng_lat'],
    # ...
]
for f1, f2 in tqdm.tqdm(emb_cols):
    df = df.merge(emb(sort_df, f1, f2), on=f1, how='left')
    df = df.merge(emb(sort_df, f2, f1), on=f2, how='left')
del sort_df
gc.collect()

In [None]:
df.to_pickle(path_pickle + "df_081_emd_all.pickle")
print("success build df_081_all.pickle")


In [None]:
train_num

In [None]:
df =pd.read_pickle(path_pickle + "df_081_emd_all.pickle")


In [5]:
train_num =11376681
cate_cols = [
    'deviceid', 'newsid', 'pos', 'app_version', 'device_vendor',
    'netmodel', 'osversion', 'device_version', 'lng', 'lat', 'lng_lat'
]

In [None]:
print('======================================== prepare train & valid  =============================================')
train_df = df[:train_num].reset_index(drop=True)
test_df = df[train_num:].reset_index(drop=True)
del df
gc.collect()

In [10]:
train_df.shape

(11376681, 268)

In [11]:


train_idx = train_df[train_df['day'] < 10].index.tolist()
val_idx = train_df[train_df['day'] == 10].index.tolist()

train_x = train_df.iloc[train_idx].reset_index(drop=True)
train_y = labels[train_idx]
val_x = train_df.iloc[val_idx].reset_index(drop=True)
val_y = labels[val_idx]

del train_x['day'], val_x['day'], train_df['day'], test_df['day']
gc.collect()
print('runtime:', time.time() - t)
print('========================================================================================================')



KeyError: 'day'

In [17]:
def learning_rate_callback(env):
    iteration = env.iteration
    if iteration % 10 == 0:
        learning_rate = env.params['learning_rate'] * 0.99
        env.params['learning_rate'] = learning_rate
        
        print('---- current learning rate:' + str(learning_rate) + '----')
        

In [18]:
print('=============================================== training validate ===============================================')
fea_imp_list = []
clf = LGBMClassifier(
    n_jobs=7,
    learning_rate=0.02,
    n_estimators=5000,
    num_leaves=255,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=2019,
    metric=None
)

print('************** training **************')
clf.fit(
    train_x, train_y,
    eval_set=[(val_x, val_y)],
    eval_metric='auc',
    categorical_feature=cate_cols,
    early_stopping_rounds=200,
    verbose=50,
    callbacks=[learning_rate_callback]
)
print('runtime:', time.time() - t)

************** training **************
---- current learning rate:0.0198----
Training until validation scores don't improve for 200 rounds
---- current learning rate:0.019602----
---- current learning rate:0.01940598----
---- current learning rate:0.0192119202----
---- current learning rate:0.019019800998----
[50]	valid_0's auc: 0.970397
---- current learning rate:0.01882960298802----
---- current learning rate:0.0186413069581398----
---- current learning rate:0.0184548938885584----
---- current learning rate:0.018270344949672817----
---- current learning rate:0.01808764150017609----
[100]	valid_0's auc: 0.973233
---- current learning rate:0.017906765085174327----
---- current learning rate:0.017727697434322585----
---- current learning rate:0.01755042045997936----
---- current learning rate:0.017374916255379565----
---- current learning rate:0.01720116709282577----
[150]	valid_0's auc: 0.975124
---- current learning rate:0.017029155421897514----
---- current learning rate:0.0168588638

In [19]:
gc.collect()

print('************** validate predict **************')
best_rounds = clf.best_iteration_
best_auc = clf.best_score_['valid_0']['auc']
val_pred = clf.predict_proba(val_x)[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

************** validate predict **************
runtime: 16980.81683063507


In [None]:


print('=============================================== training predict ===============================================')
clf = LGBMClassifier(
    learning_rate=0.01,
    n_estimators=best_rounds,
    num_leaves=255,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=2019
)

In [None]:
print('************** training using all the data **************')
clf.fit(
    train_df, labels,
    eval_set=[(train_df, labels)],
    categorical_feature=cate_cols,
    verbose=50
)
print('runtime:', time.time() - t)

In [20]:
print('************** test predict **************')
# sub = pd.read_csv(path_data + 'sample.csv')

sub['target'] = clf.predict_proba(test_df)[:, 1]
clf.predict_proba(test_df)[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

************** test predict **************
runtime: 25565.922644615173


In [21]:
gc.collect()

16

In [22]:
print('=============================================== feat importances ===============================================')
# 特征重要性可以好好看看
fea_imp_dict = dict(zip(train_df.columns.values, np.mean(fea_imp_list, axis=0)))
fea_imp_item = sorted(fea_imp_dict.items(), key=lambda x: x[1], reverse=True)
for f, imp in fea_imp_item:
    print('{} = {}'.format(f, imp))

deviceid = 87117.0
newsid = 43714.0
device_version = 33658.0
lat = 15132.0
lng = 14444.0
lng_lat = 10186.0
netmodel_deviceid_lng_lat_next1_exposure_ts_gap = 3269.0
netmodel_deviceid_next1_exposure_ts_gap = 2787.0
deviceid_next3_exposure_ts_gap = 2426.0
netmodel_deviceid_lng_lat_next2_exposure_ts_gap = 1942.0
netmodel_deviceid_next3_exposure_ts_gap = 1800.0
pos_netmodel_deviceid_lng_lat_next1_exposure_ts_gap = 1763.0
pos = 1741.0
deviceid_lng_lat_next3_exposure_ts_gap = 1735.0
netmodel_deviceid_next2_exposure_ts_gap = 1734.0
deviceid_next1_exposure_ts_gap = 1696.0
pos_netmodel_deviceid_next1_exposure_ts_gap = 1682.0
cross_deviceid_newsid_count = 1579.0
pos_count = 1561.0
cross_lng_lat_pos_ent = 1536.0
cross_deviceid_pos_ent_x = 1458.0
netmodel_deviceid_lng_lat_next3_exposure_ts_gap = 1447.0
deviceid_next2_exposure_ts_gap = 1191.0
netmodel_deviceid_next10_exposure_ts_gap = 1010.0
deviceid_next5_exposure_ts_gap = 998.0
deviceid_lng_lat_next1_exposure_ts_gap = 993.0
netmodel_deviceid_next5

cross_lng_lat_netmodel_nunique_ratio_lng_lat_count = 48.0
osversion_count = 46.0
pos_lng_lat_prev3_exposure_ts_gap = 43.0
pos_deviceid_lng_lat_prev2_exposure_ts_gap = 42.0
netmodel = 24.0
cross_deviceid_netmodel_nunique = 24.0
cross_deviceid_newsid_ent_y = 24.0
cross_netmodel_deviceid_nunique_ratio_netmodel_count = 23.0
cross_newsid_netmodel_nunique = 19.0
cross_pos_lng_lat_ent = 18.0
cross_deviceid_pos_nunique = 13.0
cross_pos_newsid_nunique_ratio_pos_count = 13.0
cross_netmodel_deviceid_nunique = 13.0
cross_lng_lat_netmodel_nunique = 13.0
cross_netmodel_lng_lat_ent = 12.0
osversion = 11.0
lng_lat_count = 11.0
cross_netmodel_newsid_nunique_ratio_netmodel_count = 5.0
cross_deviceid_newsid_nunique_y = 4.0
cross_netmodel_lng_lat_nunique_ratio_netmodel_count = 4.0
cross_deviceid_newsid_ent = 3.0
cross_netmodel_pos_nunique_ratio_netmodel_count = 3.0
cross_pos_newsid_nunique = 1.0
cross_pos_netmodel_nunique_ratio_pos_count = 1.0
cross_netmodel_newsid_nunique = 1.0
cross_lng_lat_deviceid_nun

In [23]:
print('=============================================== threshold search ===============================================')
# f1阈值敏感，所以对阈值做一个简单的迭代搜索。
t0 = 0.05
v = 0.002
best_t = t0
best_f1 = 0
for step in range(201):
    curr_t = t0 + step * v
    y = [1 if x >= curr_t else 0 for x in val_pred]
    curr_f1 = f1_score(val_y, y)
    if curr_f1 > best_f1:
        best_t = curr_t
        best_f1 = curr_f1
        print('step: {}   best threshold: {}   best f1: {}'.format(step, best_t, best_f1))
print('search finish.')

val_pred = [1 if x >= best_t else 0 for x in val_pred]
print('\nbest auc:', best_auc)
print('best f1:', f1_score(val_y, val_pred))
print('validate mean:', np.mean(val_pred))
print('runtime:', time.time() - t)


step: 0   best threshold: 0.05   best f1: 0.6741247259698413
step: 1   best threshold: 0.052000000000000005   best f1: 0.677214481657178
step: 2   best threshold: 0.054000000000000006   best f1: 0.6802106795934408
step: 3   best threshold: 0.056   best f1: 0.6831118601178485
step: 4   best threshold: 0.058   best f1: 0.6859091355624608
step: 5   best threshold: 0.060000000000000005   best f1: 0.688564232436159
step: 6   best threshold: 0.062   best f1: 0.6912090360858483
step: 7   best threshold: 0.064   best f1: 0.6937056551492506
step: 8   best threshold: 0.066   best f1: 0.6961010887426297
step: 9   best threshold: 0.068   best f1: 0.698368284377157
step: 10   best threshold: 0.07   best f1: 0.700670622072013
step: 11   best threshold: 0.07200000000000001   best f1: 0.7028456949514735
step: 12   best threshold: 0.07400000000000001   best f1: 0.705000116236027
step: 13   best threshold: 0.07600000000000001   best f1: 0.7070492886434312
step: 14   best threshold: 0.078   best f1: 0.70

step: 120   best threshold: 0.29   best f1: 0.7897447697856753
step: 121   best threshold: 0.292   best f1: 0.7899719267635249
step: 122   best threshold: 0.294   best f1: 0.7902039481641082
step: 123   best threshold: 0.296   best f1: 0.7903516433983188
step: 124   best threshold: 0.298   best f1: 0.7905416936121291
step: 125   best threshold: 0.3   best f1: 0.790703687631673
step: 126   best threshold: 0.302   best f1: 0.7908918585489288
step: 127   best threshold: 0.304   best f1: 0.7911330704375198
step: 128   best threshold: 0.306   best f1: 0.791313552149607
step: 129   best threshold: 0.308   best f1: 0.7914395242975015
step: 130   best threshold: 0.31   best f1: 0.7916364529021616
step: 131   best threshold: 0.312   best f1: 0.7917843720479263
step: 132   best threshold: 0.314   best f1: 0.7919713947768023
step: 133   best threshold: 0.316   best f1: 0.7921508162324553
step: 134   best threshold: 0.318   best f1: 0.7923015710615464
step: 135   best threshold: 0.32   best f1: 0.

In [24]:
print('=============================================== sub save ===============================================')
sub.to_csv('sub_prob_{}_{}_{}.csv'.format(best_auc, best_f1, sub['target'].mean()), index=False)
sub['target'] = sub['target'].apply(lambda x: 1 if x >= best_t else 0)
sub.to_csv('sub_{}_{}_{}.csv'.format(best_auc, best_f1, sub['target'].mean()), index=False)
print('runtime:', time.time() - t)
print('finish.')
print('========================================================================================================')

runtime: 26305.168647289276
finish.
