In [1]:
import os

# NumExpr reads NUMEXPR_MAX_THREADS when it is first imported, and pandas imports
# NumExpr (when it is installed) during `import pandas`. The variable therefore has
# to be set before any pandas import to take effect.
os.environ['NUMEXPR_MAX_THREADS'] = '30'

import concurrent.futures
import gc
import multiprocessing
import time

import numpy as np
import pandas as pd
import tqdm
from gensim.models import Word2Vec
from lightgbm.sklearn import LGBMClassifier
from scipy.stats import entropy
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold

from core.utils import timeit, reduce_mem

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
num_processes = multiprocessing.cpu_count()
print("total cpu count", num_processes)

total cpu count 32


In [2]:
# Root data directory; every artefact type has its own sub-folder below it.
path = "/root/ryan/data/"

path_sub, path_npy, path_data = path + 'sub/', path + 'npy/', path + 'raw/'
path_model, path_result = path + 'model/', path + 'result/'
path_pickle, path_profile = path + 'pickle/', path + 'profile/'

# Set to True to read the "*_small" pickles instead of the full ones.
debug_small = False

train_pickle, test_pickle = (
    ('train_small.pickle', 'test_small.pickle') if debug_small
    else ('train.pickle', 'test.pickle')
)
train_df = pd.read_pickle(path_pickle + train_pickle)
test_df = pd.read_pickle(path_pickle + test_pickle)
# The sample submission file is the same in both modes.
sub = pd.read_csv(path_data + 'sample.csv')

# Currently unused inputs (small / full variants):
# app = pd.read_pickle(path_pickle + 'app_small.pickle')  /  'app.pickle'
# user = pd.read_pickle(path_pickle + 'user_small.pickle')  /  'user.pickle'


In [3]:
# Keep only rows whose deviceid ends in '1' (presumably a fixed subsample for faster
# iteration -- confirm). `.copy()` makes the result an independent frame: later cells
# add columns to train_df / test_df, which on a bare boolean-mask slice triggers
# pandas' SettingWithCopyWarning.
train_df = train_df[train_df.deviceid.str[-1] == '1'].copy()
test_df = test_df[test_df.deviceid.str[-1] == '1'].copy()

In [4]:
# Display the test rows that remain after the deviceid filter.
test_df

Unnamed: 0,id,deviceid,newsid,guid,pos,app_version,device_vendor,netmodel,osversion,lng,lat,device_version,ts
101,test_102,fc2537a764aeebad1d9738bd835830c1,1026782592186002955,7a7e251a3a8a3e51f304558189d920f8,3,2.1.5,OPPO,o,8.1.0,1.060673e+02,2.796026e+01,PBCM10,1573450717181
102,test_103,fc2537a764aeebad1d9738bd835830c1,1114886426938376594,7a7e251a3a8a3e51f304558189d920f8,4,2.1.5,OPPO,o,8.1.0,4.940656e-324,4.940656e-324,PBCM10,1573445772769
103,test_104,fc2537a764aeebad1d9738bd835830c1,1121986237192882413,7a7e251a3a8a3e51f304558189d920f8,2,2.1.5,OPPO,o,8.1.0,1.060673e+02,2.796026e+01,PBCM10,1573450525650
104,test_105,fc2537a764aeebad1d9738bd835830c1,1192685813446768337,7a7e251a3a8a3e51f304558189d920f8,4,2.1.5,OPPO,o,8.1.0,1.060673e+02,2.796026e+01,PBCM10,1573449290652
105,test_106,fc2537a764aeebad1d9738bd835830c1,1394102932340986804,7a7e251a3a8a3e51f304558189d920f8,1,2.1.5,OPPO,o,8.1.0,1.060673e+02,2.796026e+01,PBCM10,1573449626574
106,test_107,fc2537a764aeebad1d9738bd835830c1,1844115029060315821,7a7e251a3a8a3e51f304558189d920f8,2,2.1.5,OPPO,o,8.1.0,1.060673e+02,2.796026e+01,PBCM10,1573450382179
107,test_108,fc2537a764aeebad1d9738bd835830c1,186936397887951974,7a7e251a3a8a3e51f304558189d920f8,3,2.1.5,OPPO,o,8.1.0,1.060673e+02,2.796026e+01,PBCM10,1573450731378
108,test_109,fc2537a764aeebad1d9738bd835830c1,1911556780026317588,7a7e251a3a8a3e51f304558189d920f8,0,2.1.5,OPPO,o,8.1.0,1.060673e+02,2.796026e+01,PBCM10,1573450530095
109,test_110,fc2537a764aeebad1d9738bd835830c1,2135048452071604306,7a7e251a3a8a3e51f304558189d920f8,0,2.1.5,OPPO,o,8.1.0,1.060516e+02,2.796990e+01,PBCM10,1573460843492
110,test_111,fc2537a764aeebad1d9738bd835830c1,2152135578886192092,7a7e251a3a8a3e51f304558189d920f8,3,2.1.5,OPPO,o,8.1.0,1.060673e+02,2.796026e+01,PBCM10,1573448713153


In [5]:
# Restrict the sample submission to the ids present in the (sampled) test set.
# `.copy()` because a later cell assigns sub['target'], which should not happen on a slice.
sub = sub[sub.id.isin(test_df.id)].copy()
# Predictions are later written into `sub` positionally from test_df, so both frames
# must list the same ids in the same order.
assert sub['id'].tolist() == test_df['id'].tolist(), \
    'sample submission rows are not aligned with test_df'
sub

Unnamed: 0,id,target
101,test_102,1
102,test_103,0
103,test_104,1
104,test_105,0
105,test_106,1
106,test_107,0
107,test_108,1
108,test_109,0
109,test_110,1
110,test_111,0


In [6]:
print('=============================================== read train ===============================================')
t = time.time()
# train_df = pd.read_csv('dataset/train.csv')

# `ts` is divided by 1000 before time.localtime, i.e. it is treated as epoch
# milliseconds; the resulting datetime is in this machine's local timezone.
train_df['date'] = pd.to_datetime(
    train_df['ts'].apply(
        lambda ms: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ms / 1000))
    )
)
exposure_date = train_df['date']
train_df['day'] = exposure_date.dt.day

# Original author's notes (translated):
# - In the training set day=7 has 11 rows and day=8 has 3,674,871; days 9 and 10
#   were also noted as being around 400k (original wording unclear).
# - day=7 is under one in a million rows, i.e. an anomaly. Is removing it reasonable?
#   How would that behave online, and why fold it into day 8 rather than delete it?
# - Open question: why is only `day` changed here and not ts / timestamp themselves?
train_df.loc[train_df['day'] == 7, 'day'] = 8
train_df['hour'] = exposure_date.dt.hour
train_df['minute'] = exposure_date.dt.minute

# Remember the training row count and labels before train and test are combined.
train_num = len(train_df)
labels = train_df['target'].values
print('runtime:', time.time() - t)

runtime: 1.9849631786346436


In [7]:
print('=============================================== click data ===============================================')
# Clicked impressions only, ordered by click time.
click_df = train_df.loc[train_df['target'] == 1].sort_values('timestamp').reset_index(drop=True)
click_df['exposure_click_gap'] = click_df['timestamp'] - click_df['ts']
# Discard rows where the click timestamp precedes the exposure timestamp.
click_df = click_df.loc[click_df['exposure_click_gap'].ge(0)].reset_index(drop=True)
# From here on `day` in click_df is derived from the click time, not the exposure time.
click_df['date'] = pd.to_datetime(
    click_df['timestamp'].apply(
        lambda ms: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ms / 1000))
    )
)
click_df['day'] = click_df['date'].dt.day
# Same day==7 -> 8 adjustment that is applied to train_df.
click_df.loc[click_df['day'] == 7, 'day'] = 8

# The label and click time must not remain in the feature frame.
for label_col in ('target', 'timestamp'):
    del train_df[label_col]

# Original author's open question (translated): why are these click_df columns dropped here?
click_df = click_df.drop(
    columns=['date', 'exposure_click_gap', 'timestamp', 'ts', 'target', 'hour', 'minute']
)
print('runtime:', time.time() - t)

runtime: 2.3846285343170166


In [8]:
print('=============================================== read test ===============================================')
# Same epoch-milliseconds -> local datetime conversion as for the training set.
test_df['date'] = pd.to_datetime(
    test_df['ts'].apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x / 1000)))
)
test_df['day'] = test_df['date'].dt.day

# Original author's note (translated): in the test set day=10 has 32 rows versus
# 3,653,560 for day=11 (about 1 in 100k), an anomaly that is reasonable to remove;
# here those rows are folded into day 11.
test_df.loc[test_df['day'] == 10, 'day'] = 11
test_df['hour'] = test_df['date'].dt.hour
test_df['minute'] = test_df['date'].dt.minute

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Keeping the two
# frames' own labels would presumably leave overlapping (non-unique) index values,
# which makes any later label-based alignment ambiguous. Row order is unchanged:
# training rows first, then test rows.
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
del train_df, test_df, df['date']
gc.collect()
print('runtime:', time.time() - t)

runtime: 3.795077323913574


In [9]:
print('============================================= category encoding =============================================')
# Combined location key built from the string forms of lng and lat.
df['lng_lat'] = df['lng'].astype('str') + '_' + df['lat'].astype('str')
del df['guid']
click_df['lng_lat'] = click_df['lng'].astype('str') + '_' + click_df['lat'].astype('str')
# Time-ordered copy of df, taken before the categorical columns are encoded.
sort_df = df.sort_values('ts').reset_index(drop=True)
cate_cols = [
    'deviceid', 'newsid', 'pos', 'app_version', 'device_vendor',
    'netmodel', 'osversion', 'device_version', 'lng', 'lat', 'lng_lat'
]
for f in cate_cols:
    print(f)
    # Label-encode by order of first appearance. The codes come from the non-null
    # unique values only: Series.unique() includes NaN while nunique() does not, so
    # zipping unique() with range(nunique()) silently drops the last category
    # whenever the column has missing values.
    categories = df[f].dropna().unique()
    map_dict = dict(zip(categories, range(len(categories))))
    # Missing values, and values unseen in df, become -1.
    df[f] = df[f].map(map_dict).fillna(-1).astype('int32')
    click_df[f] = click_df[f].map(map_dict).fillna(-1).astype('int32')
    sort_df[f] = sort_df[f].map(map_dict).fillna(-1).astype('int32')
    # Frequency feature: how often each code occurs in df.
    df[f + '_count'] = df[f].map(df[f].value_counts())
df = reduce_mem(df)
click_df = reduce_mem(click_df)
sort_df = reduce_mem(sort_df)
print('runtime:', time.time() - t)


deviceid
newsid
pos
app_version
device_vendor
netmodel
osversion
device_version
lng
lat
lng_lat
166.76 Mb, 81.53 Mb (51.11 %)
4.87 Mb, 2.29 Mb (52.94 %)
77.82 Mb, 35.20 Mb (54.76 %)
runtime: 16.35232663154602


In [10]:
print('============================================= feat engineer =============================================')

print('*************************** history stats ***************************')
for f in [
    ['deviceid'],
    ['pos', 'deviceid'],
    # ...
]:
    print('------------------ {} ------------------'.format('_'.join(f)))

    prefix = '_'.join(f)
    click_col = prefix + '_prev_day_click_count'
    count_col = prefix + '_prev_day_count'
    ctr_col = prefix + '_prev_day_ctr'

    # Previous-day click count per key. count().reset_index(name=...) replaces the
    # dict-renaming agg({'name': 'count'}), which pandas removed in 1.0.
    tmp = click_df.groupby(f + ['day'])['id'].count().reset_index(name=click_col)
    # Shift by one day so that a row on day d joins the counts of day d - 1.
    tmp['day'] += 1
    df = df.merge(tmp, on=f + ['day'], how='left')
    df[click_col] = df[click_col].fillna(0)
    # Rows on day 8 are marked missing rather than 0 (presumably because day 8 has
    # no previous day in the data -- confirm).
    df.loc[df['day'] == 8, click_col] = None

    # Previous-day exposure count per key, built the same way from df itself.
    tmp = df.groupby(f + ['day'])['id'].count().reset_index(name=count_col)
    tmp['day'] += 1
    df = df.merge(tmp, on=f + ['day'], how='left')
    df[count_col] = df[count_col].fillna(0)
    df.loc[df['day'] == 8, count_col] = None

    # Previous-day click-through rate; the column mean is added to the denominator.
    df[ctr_col] = df[click_col] / (df[count_col] + df[count_col].mean())

    del tmp
    print('runtime:', time.time() - t)
del click_df
df = reduce_mem(df)

*************************** history stats ***************************
------------------ deviceid ------------------
runtime: 17.288740396499634
------------------ pos_deviceid ------------------
runtime: 18.16665267944336
126.00 Mb, 92.64 Mb (26.47 %)


In [11]:
print('*************************** exposure_ts_gap ***************************')
for f in [
    ['deviceid'], ['newsid'], ['lng_lat'],
    ['pos', 'deviceid'], ['pos', 'newsid'], ['pos', 'lng_lat'],
    ['pos', 'deviceid', 'lng_lat'],
    ['netmodel', 'deviceid'],
    ['pos', 'netmodel', 'deviceid'],
    ['netmodel', 'lng_lat'], ['deviceid', 'lng_lat'],
    ['netmodel', 'deviceid', 'lng_lat'], ['pos', 'netmodel', 'lng_lat'],
    ['pos', 'netmodel', 'deviceid', 'lng_lat']
]:
    key_name = '_'.join(f)
    print('------------------ {} ------------------'.format(key_name))

    # sort_df is ordered by ts, so shifting within a key group walks that key's
    # exposures in time order.
    by_key = sort_df[f + ['ts']].groupby(f)
    # Time difference between the current exposure and the gap-th previous / next one.
    for gap in [1, 2, 3, 5, 10]:
        prev_col = '{}_prev{}_exposure_ts_gap'.format(key_name, gap)
        next_col = '{}_next{}_exposure_ts_gap'.format(key_name, gap)
        sort_df[prev_col] = by_key['ts'].shift(0) - by_key['ts'].shift(gap)
        sort_df[next_col] = by_key['ts'].shift(-gap) - by_key['ts'].shift(0)
        # One row per (key, ts) so the left merge cannot multiply rows of df.
        gap_feats = (
            sort_df[f + ['ts', prev_col, next_col]]
            .drop_duplicates(f + ['ts'])
            .reset_index(drop=True)
        )
        df = df.merge(gap_feats, on=f + ['ts'], how='left')
        # The helper columns are only needed for the merge.
        del sort_df[prev_col], sort_df[next_col]
        del gap_feats

    del by_key
    df = reduce_mem(df)
    print('runtime:', time.time() - t)
del df['ts']
gc.collect()

*************************** exposure_ts_gap ***************************
------------------ deviceid ------------------
166.76 Mb, 129.70 Mb (22.22 %)
runtime: 25.28958034515381
------------------ newsid ------------------
203.82 Mb, 166.76 Mb (18.18 %)
runtime: 33.32697057723999
------------------ lng_lat ------------------
240.87 Mb, 203.82 Mb (15.38 %)
runtime: 40.84906220436096
------------------ pos_deviceid ------------------
277.93 Mb, 240.87 Mb (13.33 %)
runtime: 49.61402606964111
------------------ pos_newsid ------------------
314.99 Mb, 277.93 Mb (11.76 %)
runtime: 60.088242292404175
------------------ pos_lng_lat ------------------
352.05 Mb, 314.99 Mb (10.53 %)
runtime: 69.57750463485718
------------------ pos_deviceid_lng_lat ------------------
389.10 Mb, 352.05 Mb (9.52 %)
runtime: 80.43958115577698
------------------ netmodel_deviceid ------------------
426.16 Mb, 389.10 Mb (8.70 %)
runtime: 91.39895296096802
------------------ pos_netmodel_deviceid ------------------
46

7

In [12]:
# Checkpoint the feature frame as it stands before the cross-feature step.
checkpoint_path = path_pickle + "df_081_without_cross.pickle"
df.to_pickle(checkpoint_path)
print("success build df_081_without_cross.pickle")

success build df_081_without_cross.pickle


In [13]:
## Reload the processed data from the checkpoint.
df = pd.read_pickle(path_pickle + 'df_081_without_cross.pickle')

# Label-encoded columns that are passed to LightGBM as categorical features.
cate_cols = [
    'deviceid', 'newsid', 'pos',
    'app_version', 'device_vendor', 'netmodel', 'osversion', 'device_version',
    'lng', 'lat', 'lng_lat',
]

# The row identifier is not a model feature.
del df['id']
gc.collect()

14

In [None]:
# from pyspark.sql import Row
# from pyspark import SparkConf
# from pyspark import SparkContext

# conf = SparkConf()
# conf.setAppName("[陈亮时/149675]-[tf_format_test]")
# sc = SparkContext(conf=conf)
    
# l = [('Ankit',25),('Jalfaizy',22),('saurabh',20),('Bala',26)]
# rdd = sc.parallelize(l)
# people = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))
# schemaPeople = sqlContext.createDataFrame(people)

In [None]:
# print('*************************** cross feat (second order) ***************************')
# # 二阶交叉特征，可以继续做更高阶的交叉。
# def build_cross_feat(df, f, col):
#     print('------------------ {} {} ------------------'.format(f, col))
#     df = df.merge(df[[f, col]].groupby(f, as_index=False)[col].agg({
#         'cross_{}_{}_nunique'.format(f, col): 'nunique',
#         'cross_{}_{}_ent'.format(f, col): lambda x: entropy(x.value_counts() / x.shape[0])  # 熵
#     }), on=f, how='left')
#     if 'cross_{}_{}_count'.format(f, col) not in df.columns.values and 'cross_{}_{}_count'.format(col,
#                                                                                                   f) not in df.columns.values:
#         df = df.merge(df[[f, col, 'id']].groupby([f, col], as_index=False)['id'].agg({
#             'cross_{}_{}_count'.format(f, col): 'count'  # 共现次数
#         }), on=[f, col], how='left')
#     if 'cross_{}_{}_count_ratio'.format(col, f) not in df.columns.values:
#         df['cross_{}_{}_count_ratio'.format(col, f)] = df['cross_{}_{}_count'.format(f, col)] / df[
#             f + '_count']  # 比例偏好
#     if 'cross_{}_{}_count_ratio'.format(f, col) not in df.columns.values:
#         df['cross_{}_{}_count_ratio'.format(f, col)] = df['cross_{}_{}_count'.format(f, col)] / df[
#             col + '_count']  # 比例偏好
#     df['cross_{}_{}_nunique_ratio_{}_count'.format(f, col, f)] = df['cross_{}_{}_nunique'.format(f, col)] / df[
#         f + '_count']
#     print('runtime:', time.time() - t)
#     df = reduce_mem(df)
#     return df
        
# cross_cols = ['deviceid', 'newsid', 'pos', 'netmodel', 'lng_lat']
# f_col_tuple_list = []
# for f in cross_cols:
#     for col in cross_cols:
#         if col == f:
#             continue
#         f_col_tuple_list.append((f, col))
        
# print(f_col_tuple_list)
# # with concurrent.futures.ProcessPoolExecutor(num_processes) as pool:
# #     df = list(tqdm.tqdm(pool.map(build_cross_feat, cross_cols, chunksize=10, total=df.shape[0])))
# for tuple_o in tqdm.tqdm(f_col_tuple_list):
#     print(tuple_o)
#     df = build_cross_feat(df, tuple_o[0], tuple_o[1])

# del df['id']
# gc.collect()

In [None]:
print('*************************** cross feat (second order) ***************************')
# Second-order cross features; higher-order crosses could be added the same way.
cross_cols = ['deviceid', 'newsid', 'pos', 'netmodel', 'lng_lat']
for f in cross_cols:
    for col in cross_cols:
        if col == f:
            continue
        print('------------------ {} {} ------------------'.format(f, col))
        nunique_col = 'cross_{}_{}_nunique'.format(f, col)
        ent_col = 'cross_{}_{}_ent'.format(f, col)
        count_col = 'cross_{}_{}_count'.format(f, col)
        count_col_rev = 'cross_{}_{}_count'.format(col, f)
        ratio_col = 'cross_{}_{}_count_ratio'.format(f, col)
        ratio_col_rev = 'cross_{}_{}_count_ratio'.format(col, f)

        # Per value of f: number of distinct `col` values and the entropy of their
        # distribution. Built from plain Series results because the dict-renaming
        # agg({'name': func}) form was removed in pandas 1.0.
        per_f = df.groupby(f)[col]
        stats = per_f.nunique().to_frame(nunique_col)
        stats[ent_col] = per_f.agg(lambda x: entropy(x.value_counts() / x.shape[0]))  # entropy
        df = df.merge(stats.reset_index(), on=f, how='left')

        # The co-occurrence count is symmetric in (f, col), so it is computed once per pair.
        if count_col not in df.columns.values and count_col_rev not in df.columns.values:
            # size() counts rows per pair and does not need the 'id' column, which an
            # earlier cell deletes right after reloading df. It equals the previous
            # count of 'id' as long as ids are never null.
            pair_counts = df.groupby([f, col]).size().reset_index(name=count_col)  # co-occurrence count
            df = df.merge(pair_counts, on=[f, col], how='left')
        if ratio_col_rev not in df.columns.values:
            df[ratio_col_rev] = df[count_col] / df[f + '_count']  # share of f's rows taken by this pair
        if ratio_col not in df.columns.values:
            df[ratio_col] = df[count_col] / df[col + '_count']  # share of col's rows taken by this pair
        df['cross_{}_{}_nunique_ratio_{}_count'.format(f, col, f)] = df[nunique_col] / df[f + '_count']
        print('runtime:', time.time() - t)
    df = reduce_mem(df)
# 'id' may already have been removed when df was reloaded; drop it only if present.
df = df.drop(columns=['id'], errors='ignore')
gc.collect()

In [None]:
print('*************************** embedding ***************************')


# Original author's note (translated): a friend once gave a very vivid analogy for
# embeddings. On the dating show "If You Are the One", to get to know a female guest
# you can look at which male guests she has favoured; conversely, to get to know a
# male guest you can look at which female guests he has picked.


def emb(df, f1, f2):
    """Build an 8-dimensional Word2Vec embedding of each `f1` value from its `f2` values.

    For every distinct value of `f1`, the `f2` values of its rows (in the row order of
    `df`) form one sentence. A Word2Vec model is trained on all sentences, and each
    `f1` value is represented by the mean vector of the words in its sentence that
    are in the model's vocabulary (all zeros when none are).

    Args:
        df: DataFrame containing columns `f1` and `f2`.
        f1: Name of the column whose values receive an embedding.
        f2: Name of the column whose values act as the words.

    Returns:
        DataFrame with one row per distinct `f1` value: the `f1` column plus
        `{f1}_{f2}_emb_0` .. `{f1}_{f2}_emb_7`, passed through `reduce_mem`.
    """
    emb_size = 8
    print('====================================== {} {} ======================================'.format(f1, f2))
    # One list of f2 values per f1 value. apply(list) replaces the dict-renaming
    # agg({'name': list}) form, which pandas removed in 1.0.
    grouped = df.groupby(f1)[f2].apply(list)
    tmp = pd.DataFrame({f1: grouped.index.values})
    # Word2Vec expects string tokens.
    sentences = [[str(x) for x in seq] for seq in grouped.values]
    del grouped
    # NOTE: `size=` is the gensim 3.x keyword (gensim 4 renamed it to `vector_size`).
    model = Word2Vec(sentences, size=emb_size, window=5, min_count=5, sg=0, hs=1, seed=2019)
    # Look words up through the KeyedVectors object: `w in model` / `model[w]` are
    # deprecated in gensim 3.x and removed in 4.0.
    word_vectors = model.wv
    emb_matrix = []
    for seq in sentences:
        # Words rarer than min_count are not in the vocabulary and are skipped.
        vec = [word_vectors[w] for w in seq if w in word_vectors]
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)

    # Convert to a 2-D array so that each embedding dimension can be sliced as a column.
    emb_matrix = np.array(emb_matrix)

    for i in range(emb_size):
        tmp['{}_{}_emb_{}'.format(f1, f2, i)] = emb_matrix[:, i]
    del model, word_vectors, emb_matrix, sentences
    tmp = reduce_mem(tmp)
    print('runtime:', time.time() - t)
    return tmp


emb_cols = [
    ['deviceid', 'newsid'],
    ['deviceid', 'lng_lat'],
    ['newsid', 'lng_lat'],
    # ...
]
# Each pair is embedded in both directions; sentences come from the time-ordered sort_df.
for f1, f2 in tqdm.tqdm(emb_cols):
    df = df.merge(emb(sort_df, f1, f2), on=f1, how='left')
    df = df.merge(emb(sort_df, f2, f1), on=f2, how='left')
del sort_df
gc.collect()

In [14]:
print('======================================== prepare train & valid  =============================================')
# Split the combined frame back by position: the first train_num rows are treated
# as the training rows, the remainder as the test rows.
train_df = df.iloc[:train_num].reset_index(drop=True)
test_df = df.iloc[train_num:].reset_index(drop=True)
del df
gc.collect()

# Time-based hold-out: days before 10 are used for fitting, day 10 for validation.
train_idx = np.flatnonzero((train_df['day'] < 10).values).tolist()
val_idx = np.flatnonzero((train_df['day'] == 10).values).tolist()

train_x, train_y = train_df.iloc[train_idx].reset_index(drop=True), labels[train_idx]
val_x, val_y = train_df.iloc[val_idx].reset_index(drop=True), labels[val_idx]

# `day` was only needed for the split; remove it from every frame fed to the model.
for frame in (train_x, val_x, train_df, test_df):
    del frame['day']
gc.collect()
print('runtime:', time.time() - t)
print('========================================================================================================')



runtime: 202.06735730171204


In [15]:
print('=============================================== training validate ===============================================')
# Collects one feature-importance vector per fitted model.
fea_imp_list = []

# n_estimators is only an upper bound here: the fit below stops early on validation AUC.
valid_params = dict(
    n_jobs=30,
    learning_rate=0.01,
    n_estimators=5000,
    num_leaves=255,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=2019,
    metric=None,
)
clf = LGBMClassifier(**valid_params)

print('************** training **************')
clf.fit(
    train_x,
    train_y,
    eval_set=[(val_x, val_y)],
    eval_metric='auc',
    categorical_feature=cate_cols,
    early_stopping_rounds=200,
    verbose=50,
)
print('runtime:', time.time() - t)

************** training **************


New categorical_feature is ['app_version', 'device_vendor', 'device_version', 'deviceid', 'lat', 'lng', 'lng_lat', 'netmodel', 'newsid', 'osversion', 'pos']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[50]	valid_0's auc: 0.964174
[100]	valid_0's auc: 0.965414
[150]	valid_0's auc: 0.965946
[200]	valid_0's auc: 0.966834
[250]	valid_0's auc: 0.967566
[300]	valid_0's auc: 0.967968
[350]	valid_0's auc: 0.968141
[400]	valid_0's auc: 0.968568
[450]	valid_0's auc: 0.968936
[500]	valid_0's auc: 0.969331
[550]	valid_0's auc: 0.969574
[600]	valid_0's auc: 0.96973
[650]	valid_0's auc: 0.969786
[700]	valid_0's auc: 0.969841
[750]	valid_0's auc: 0.96985
[800]	valid_0's auc: 0.969837
[850]	valid_0's auc: 0.969813
[900]	valid_0's auc: 0.969781
[950]	valid_0's auc: 0.9698
Early stopping, best iteration is:
[761]	valid_0's auc: 0.96986
runtime: 601.2647705078125


In [16]:
### Original note (translated): before feeding in new data again, try saving the
### model ("save mode" in the original, presumably "save model") and check how the
### submission performs.
from sklearn.externals import joblib

model_path = path_model + 'lgb.pkl'
joblib.dump(clf, model_path)
# Round-trip: continue with the model as read back from disk.
clf = joblib.load(model_path)


In [17]:
print('************** validate predict **************')
# Keep the early-stopping outcome of the validation model.
best_rounds = clf.best_iteration_
best_auc = clf.best_score_['valid_0']['auc']
# Probability of the positive class for every validation row.
val_pred = clf.predict_proba(val_x)[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

print('=============================================== training predict ===============================================')
# Fresh model for the final fit: same hyper-parameters, with the number of trees
# fixed to the best iteration found on validation.
final_params = dict(
    learning_rate=0.01,
    n_estimators=best_rounds,
    num_leaves=255,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=2019,
)
clf = LGBMClassifier(**final_params)

************** validate predict **************
runtime: 1106.656444311142


In [18]:
print('************** training using all the data **************')
# Final fit on every training row. The eval_set is the training data itself, so the
# logged metric is a training loss, not a validation score.
clf.fit(
    train_df, labels,
    eval_set=[(train_df, labels)],
    categorical_feature=cate_cols,
    verbose=50
)
print('runtime:', time.time() - t)

print('************** test predict **************')
# sub = pd.read_csv(path_data + 'sample.csv')

# Positive-class probability per test row, written into `sub` by position.
# Predict once: a second, discarded predict_proba call only doubled inference time.
sub['target'] = clf.predict_proba(test_df)[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

************** training using all the data **************
[50]	training's binary_logloss: 0.200409
[100]	training's binary_logloss: 0.152057
[150]	training's binary_logloss: 0.12613
[200]	training's binary_logloss: 0.110245
[250]	training's binary_logloss: 0.0991378
[300]	training's binary_logloss: 0.0909502
[350]	training's binary_logloss: 0.0843922
[400]	training's binary_logloss: 0.0790308
[450]	training's binary_logloss: 0.0741518
[500]	training's binary_logloss: 0.0698588
[550]	training's binary_logloss: 0.066097
[600]	training's binary_logloss: 0.0626686
[650]	training's binary_logloss: 0.0596007
[700]	training's binary_logloss: 0.0568337
[750]	training's binary_logloss: 0.0543628
runtime: 1583.5312888622284
************** test predict **************
runtime: 1596.2505402565002


In [19]:
# Force a garbage-collection pass; the cell displays gc.collect()'s return value.
gc.collect()

45

In [None]:
### 临时试试如该不对training内容，再fit一次的话，提交的效果如何
# print('************** test predict **************')
# sub = pd.read_csv(path_data + 'sample.csv')
# # sub['target'] = clf.predict_proba(test_df)[:, 1]
# clf.predict_proba(test_df)[:, 1]
# fea_imp_list.append(clf.feature_importances_)
# print('runtime:', time.time() - t)

In [20]:
print('=============================================== feat importances ===============================================')
# Worth a close look: importances averaged over every model recorded in fea_imp_list.
mean_importance = np.mean(fea_imp_list, axis=0)
fea_imp_dict = {name: score for name, score in zip(train_df.columns.values, mean_importance)}
# Most important feature first.
fea_imp_item = sorted(fea_imp_dict.items(), key=lambda pair: pair[1], reverse=True)
for feat_name, feat_imp in fea_imp_item:
    print('{} = {}'.format(feat_name, feat_imp))

deviceid = 32630.0
device_version = 18983.5
lat = 8493.5
newsid = 8058.5
lng = 7728.5
lng_lat = 7038.5
netmodel_deviceid_lng_lat_next1_exposure_ts_gap = 3528.0
pos_count = 3126.0
netmodel_deviceid_next1_exposure_ts_gap = 2802.0
deviceid_next3_exposure_ts_gap = 2475.0
netmodel_deviceid_next3_exposure_ts_gap = 2209.0
pos_netmodel_deviceid_next1_exposure_ts_gap = 2100.5
deviceid_lng_lat_next3_exposure_ts_gap = 2017.0
netmodel_deviceid_lng_lat_next2_exposure_ts_gap = 2000.0
netmodel_deviceid_next2_exposure_ts_gap = 1795.5
deviceid_next1_exposure_ts_gap = 1780.0
netmodel_deviceid_lng_lat_next3_exposure_ts_gap = 1643.5
pos_netmodel_deviceid_lng_lat_next1_exposure_ts_gap = 1637.0
pos_newsid_next1_exposure_ts_gap = 1599.5
deviceid_count = 1593.5
newsid_next1_exposure_ts_gap = 1579.0
deviceid_lng_lat_next1_exposure_ts_gap = 1526.5
netmodel_deviceid_next5_exposure_ts_gap = 1511.0
pos = 1468.5
deviceid_next5_exposure_ts_gap = 1396.0
netmodel_lng_lat_next1_exposure_ts_gap = 1322.0
netmodel_devicei

In [21]:
# Shape of the validation labels.
val_y.shape

(261555,)

In [22]:
# Shape of the validation feature frame (rows, features).
val_x.shape

(261555, 170)

In [23]:
# Shape of the validation predictions; should match val_y.
val_pred.shape

(261555,)

In [24]:
# Shape of the test feature frame (rows, features).
test_df.shape

(237323, 170)

In [25]:
# Repeat of the val_pred shape check.
val_pred.shape

(261555,)

In [26]:
print('=============================================== threshold search ===============================================')
# F1 is sensitive to the decision threshold, so scan a simple grid:
# 201 thresholds from t0 upwards in steps of v.
t0 = 0.05
v = 0.002
best_t = t0
best_f1 = 0
# Work on an array view of the probabilities. val_pred itself is left untouched so
# that this cell can be re-run (overwriting it with 0/1 labels would make a second
# run search thresholds on already-binarised values).
val_prob = np.asarray(val_pred)
for step in range(201):
    curr_t = t0 + step * v
    # Vectorised thresholding instead of a Python-level loop over every prediction.
    curr_f1 = f1_score(val_y, (val_prob >= curr_t).astype(int))
    if curr_f1 > best_f1:
        best_t = curr_t
        best_f1 = curr_f1
        print('step: {}   best threshold: {}   best f1: {}'.format(step, best_t, best_f1))
print('search finish.')

# Hard labels at the best threshold, kept separate from the probabilities.
val_pred_label = (val_prob >= best_t).astype(int)
print('\nbest auc:', best_auc)
print('best f1:', f1_score(val_y, val_pred_label))
print('validate mean:', np.mean(val_pred_label))
print('runtime:', time.time() - t)


step: 0   best threshold: 0.05   best f1: 0.6853515786002804
step: 1   best threshold: 0.052000000000000005   best f1: 0.6878322953060914
step: 2   best threshold: 0.054000000000000006   best f1: 0.6901318528799445
step: 3   best threshold: 0.056   best f1: 0.6921221080229265
step: 4   best threshold: 0.058   best f1: 0.6948123852375145
step: 5   best threshold: 0.060000000000000005   best f1: 0.6970874880288435
step: 6   best threshold: 0.062   best f1: 0.6992373756667657
step: 7   best threshold: 0.064   best f1: 0.7008457110368843
step: 8   best threshold: 0.066   best f1: 0.7028963784563117
step: 9   best threshold: 0.068   best f1: 0.7045480614819912
step: 10   best threshold: 0.07   best f1: 0.7061532925512773
step: 11   best threshold: 0.07200000000000001   best f1: 0.7080208649414077
step: 12   best threshold: 0.07400000000000001   best f1: 0.7097663219274452
step: 13   best threshold: 0.07600000000000001   best f1: 0.7114049009187402
step: 14   best threshold: 0.078   best f1:

In [27]:
print('=============================================== sub save ===============================================')
# 1) Raw probabilities; the file name records AUC, F1 and the mean predicted probability.
sub.to_csv('sub_prob_{}_{}_{}.csv'.format(best_auc, best_f1, sub['target'].mean()), index=False)
# 2) Hard labels at the threshold chosen on validation. They go into a copy so that
#    `sub` keeps the probabilities and re-running this cell writes the same two files
#    (thresholding `sub` in place would put 0/1 labels into the "prob" file on a re-run).
sub_label = sub.copy()
sub_label['target'] = (sub['target'] >= best_t).astype(int)
sub_label.to_csv('sub_{}_{}_{}.csv'.format(best_auc, best_f1, sub_label['target'].mean()), index=False)
print('runtime:', time.time() - t)
print('finish.')
print('========================================================================================================')

runtime: 1759.3171303272247
finish.
