In [1]:
import lightgbm as lgb
import os
import math
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

# Seed both random number generators used by this notebook (the stdlib one
# and NumPy's legacy global one) with the same fixed value.
random.seed(2019)
np.random.seed(2019)

# Keep printed DataFrames compact: only a few rows/columns are shown, on a
# wide line, with long cell contents truncated.
for _option, _value in (
    ('display.max_rows', 4),
    ('display.max_columns', 10),
    ('display.width', 280),
    ('display.max_colwidth', 150),
):
    pd.set_option(_option, _value)

# Directory roots referenced by later cells: `preprocess_path` holds the
# pickled feature tables that are read below.
# NOTE(review): `data_path` is an absolute, machine-specific path — adjust per environment.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [2]:
# Load the pre-computed per-user click statistics for the selected split and
# append log-scaled copies of the two click-volume columns.
flag ='test'
user_base_statics_df = pd.read_pickle(f'{preprocess_path}/{flag}_user_base_statics.pkl')

# Move the index into an ordinary column (the printed frame shows it as
# `user_id`) so later cells can merge on it.
user_base_statics_df = user_base_statics_df.reset_index()

# The previous per-row `math.log` raised ValueError on non-positive input.
# Keep that fail-fast behaviour explicitly, because np.log would silently
# produce -inf / NaN instead.
log_source_cols = ['click_times_sum', 'click_times_count']
if (user_base_statics_df[log_source_cols] <= 0).any().any():
    raise ValueError(f'{log_source_cols} must be strictly positive to take their logarithm')

# Vectorised natural log: one NumPy call per column instead of a Python-level
# function call per row.
user_base_statics_df['click_times_sum_log'] = np.log(user_base_statics_df['click_times_sum'].astype(float))
user_base_statics_df['click_times_count_log'] = np.log(user_base_statics_df['click_times_count'].astype(float))

# Cast every column (including the id column) to float, as before.
user_base_statics_df = user_base_statics_df.astype(float)
print(user_base_statics_df)


          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874
...           ...              ...                ...            ...                 ...                       ...                    ...               ...                  ...                    ...
999998  3999999.0             86.0               80.0           79.0                26.0                       4.0                   63.0              29.0             4.454347               4.382027


In [3]:
def merge_features(train_df,train_file,target_encode=False):
    """Join a pickled feature table onto ``train_df`` by ``user_id``.

    Args:
        train_df: DataFrame to extend; it is merged on ``user_id``.
        train_file: File name of the pickled feature table, looked up under
            the module-level ``preprocess_path`` directory.
        target_encode: When true, each column label of the loaded table is
            collapsed into a single string by joining its parts with ``'_'``.
            This presumes the labels are tuples of strings (e.g. MultiIndex
            columns) — a plain string label would get an underscore between
            every character.

    Returns:
        The merged DataFrame. ``merge`` is called with its default join type,
        so rows whose ``user_id`` is absent from the feature table are not
        kept. The merged frame is also printed before being returned.
    """
    feature_table = pd.read_pickle(f'{preprocess_path}/{train_file}')

    if target_encode:
        flattened_labels = []
        for label_parts in feature_table.columns.values:
            flattened_labels.append('_'.join(label_parts))
        feature_table.columns = flattened_labels

    merged_df = train_df.merge(feature_table, on='user_id')
    print(merged_df)
    return merged_df


In [4]:
# Start from the base per-user statistics and attach one target-encoded
# feature table per id-type, widening `test_df` on every iteration.
# NOTE(review): the files are named `*_age.pkl`, yet the captured output shows
# `gender0` / `gender1` columns — confirm the pickles hold the intended target.
target_encoded_keys = ('creative_id', 'ad_id', 'product_id', 'advertiser_id', 'industry', 'product_category')

test_df = user_base_statics_df
for key in target_encoded_keys:
    print(f'merge {key}...')
    encoder_file = f'test_user_target_encoder_{key}_age.pkl'
    test_df = merge_features(test_df, encoder_file, True)
    print(test_df)

merge creative_id...
          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  ...  creative_id_gender0_kfold_mean_std  creative_id_gender1_kfold_mean_min  creative_id_gender1_kfold_mean_max  creative_id_gender1_kfold_mean_mean  creative_id_gender1_kfold_mean_std
0       3000001.0             11.0               11.0           11.0                 7.0  ...                            0.187787                            0.008403                            0.560420                             0.151177                            0.187787
1       3000002.0             65.0               61.0           49.0                24.0  ...                            0.251634                            0.000000                            1.000000                             0.516239                            0.259620
...           ...              ...                ...            ...                 ...  ...                                 ...                         

In [5]:
# train_x = train_df.drop(['age','user_id','gender','advertiser_id_seq','industry_seq','product_id_seq'], axis=1)
# train_y = train_df['age'].astype(int)

# valid_x = valid_df.drop(['age','user_id','gender','advertiser_id_seq','industry_seq','product_id_seq'], axis=1)
# valid_y = valid_df['age'].astype(int)

# Remove the identifier and the two raw click totals so that only the
# remaining columns are handed to the model.
# Note: this rebinds `test_df`; re-running the cell on the already-trimmed
# frame raises KeyError because the columns are gone.
drop_list = ['user_id', 'click_times_sum', 'click_times_count']
test_df = test_df.drop(columns=drop_list)





In [6]:

# Restore the saved LightGBM booster from disk and show its per-feature
# importance values (default importance type) as a plain Python list.
gbm = lgb.Booster(model_file='model/age_target.model')
importances = gbm.feature_importance()
print('Feature importances:', list(importances))

Feature importances: [467, 539, 190, 560, 679, 555, 573, 41, 511, 834, 498, 128, 598, 1205, 581, 178, 598, 984, 569, 154, 439, 874, 541, 129, 445, 799, 477, 177, 529, 1022, 579, 102, 535, 941, 524, 56, 460, 852, 518, 41, 386, 732, 449, 24, 419, 575, 418, 31, 512, 665, 536, 170, 544, 1398, 676, 241, 671, 1191, 605, 201, 504, 1022, 605, 145, 442, 798, 470, 206, 486, 1022, 538, 118, 485, 1043, 502, 68, 441, 921, 447, 35, 443, 693, 469, 27, 388, 613, 399, 370, 461, 523, 425, 521, 537, 519, 402, 417, 514, 460, 319, 390, 460, 434, 365, 395, 481, 472, 453, 374, 469, 487, 387, 442, 452, 537, 357, 445, 342, 388, 329, 296, 376, 379, 355, 278, 486, 473, 417, 813, 526, 795, 546, 1589, 1051, 1228, 973, 1172, 922, 891, 682, 871, 1061, 636, 635, 867, 911, 859, 541, 979, 803, 757, 580, 992, 939, 722, 505, 878, 792, 637, 501, 679, 542, 588, 387, 474, 564, 811, 423, 790, 395, 863, 659, 377, 524, 531, 545, 307, 675, 542, 416, 579, 510, 774, 718, 654, 408, 637, 526, 496, 370, 603, 531, 563, 384, 563, 484,

In [7]:
# Score every row of the feature frame. The code below indexes the result as
# a 2-D array (one row per user, one column per class).
y_pred = gbm.predict(test_df.astype(float))
# Keep the raw per-class scores; `y_pred` itself is turned into a one-hot mask.
y_pred_precent = y_pred.copy()

# Flag the row-wise maximum with 1.0 and everything else with 0.0. Like the
# previous element-by-element loop, tied maxima are all flagged.
y_pred = (y_pred == y_pred.max(axis=1, keepdims=True)).astype(float)
print(y_pred)

# argmax returns the first maximal column, matching the former
# `age.tolist().index(1)` lookup; +1 converts the 0-based column to the label.
predicted_age = y_pred.argmax(axis=1) + 1

# Take the ids from the frame the features were built from instead of
# regenerating them from a hard-coded range(1000000) + 3000001. This relies on
# the merges keeping the left frame's row order — assumed for the default
# inner join with unique keys on the right; TODO confirm.
user_ids = user_base_statics_df['user_id'].astype(int).to_numpy()
if len(user_ids) != len(y_pred):
    # A feature merge dropped or duplicated users; pairing ids with
    # predictions by position would silently mislabel rows.
    raise ValueError(
        f'row count mismatch: {len(user_ids)} users vs {len(y_pred)} predictions'
    )

ret_df = pd.DataFrame({
    'user_id': user_ids,
    'predicted_age': predicted_age,
    # One 1-D score array per row, stored as an object column as before.
    'lgb_age_precent': pd.Series(list(y_pred_precent), dtype=object),
})

# Make sure the target directory exists before writing the result.
os.makedirs('output', exist_ok=True)
ret_df.to_pickle("output/lgb_test_output.pkl")


[[0. 0. 0. ... 1. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]
        user_id  predicted_age
0       3000001              8
1       3000002              1
...         ...            ...
999998  3999999              8
999999  4000000              8

[1000000 rows x 2 columns]
1    590953
8    203647
      ...  
6      1451
9      1036
Name: predicted_age, Length: 5, dtype: int64
