In [1]:
import lightgbm as lgb
import os
import math
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

np.random.seed(2019)
random.seed(2019)
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [2]:
flag ='test'
user_base_statics_df= pd.read_pickle(f'{preprocess_path}/{flag}_user_base_statics.pkl')
user_base_statics_df = user_base_statics_df.reset_index()
user_base_statics_df['click_times_sum_log'] = user_base_statics_df['click_times_sum'].apply(lambda x :math.log(x))
user_base_statics_df['click_times_count_log'] = user_base_statics_df['click_times_count'].apply(lambda x :math.log(x))
user_base_statics_df = user_base_statics_df.astype(float)
print(user_base_statics_df)


          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.583519               3.583519
...           ...              ...                ...            ...                 ...                       ...                    ...               ...                  ...                    ...


In [3]:
def merge_features(train_df,train_file,target_encode=False):
    train_features_df  = pd.read_pickle(f'{preprocess_path}/{train_file}')
    if target_encode:
        train_features_df.columns = [ '_'.join(i) for i in train_features_df.columns.values  ]

    train_df = train_df.merge(train_features_df,on='user_id')
    print(train_df)
    return train_df


In [4]:
test_df = user_base_statics_df
for i in ['creative_id','ad_id', 'product_id','advertiser_id','industry','product_category']:
    print(f'merge {i}...')
    test_df = merge_features(test_df,f'test_user_target_encoder_{i}_gender.pkl',True)
    print(test_df)

merge creative_id...
          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  creative_id_age0_kfold_mean_min  creative_id_age0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                              0.0                         0.101266   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                              0.0                         0.166667   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.58

          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  creative_id_age0_kfold_mean_min  creative_id_age0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                              0.0                         0.101266   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                              0.0                         0.166667   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.583519               3.

          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  creative_id_age0_kfold_mean_min  creative_id_age0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                              0.0                         0.101266   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                              0.0                         0.166667   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.583519               3.

          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  creative_id_age0_kfold_mean_min  creative_id_age0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                              0.0                         0.101266   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                              0.0                         0.166667   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.583519               3.

          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  creative_id_age0_kfold_mean_min  creative_id_age0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                              0.0                         0.101266   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                              0.0                         0.166667   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.583519               3.

          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  creative_id_age0_kfold_mean_min  creative_id_age0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                              0.0                         0.101266   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                              0.0                         0.166667   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.583519               3.

In [5]:
# train_x = train_df.drop(['age','user_id','gender','advertiser_id_seq','industry_seq','product_id_seq'], axis=1)
# train_y = train_df['age'].astype(int)

# valid_x = valid_df.drop(['age','user_id','gender','advertiser_id_seq','industry_seq','product_id_seq'], axis=1)
# valid_y = valid_df['age'].astype(int)

drop_list = ['user_id','click_times_sum','click_times_count']
test_df = test_df.drop(drop_list, axis=1)





In [6]:

gbm = lgb.Booster(model_file='model/age_target.model')
print('Feature importances:', list(gbm.feature_importance()))

Feature importances: [1072, 1240, 412, 1352, 1480, 1287, 1114, 111, 1280, 1846, 1313, 364, 1395, 2426, 1666, 482, 1397, 2252, 1519, 409, 1143, 2346, 1531, 329, 1120, 2037, 1419, 381, 1283, 2276, 1580, 237, 1278, 2092, 1472, 128, 1215, 2033, 1428, 93, 1106, 1661, 1256, 45, 1124, 1516, 1181, 131, 1236, 1694, 1312, 371, 1333, 2781, 1702, 576, 1422, 2554, 1529, 460, 1148, 2272, 1606, 362, 1049, 1914, 1455, 454, 1272, 2368, 1605, 313, 1261, 2393, 1384, 127, 1156, 2216, 1398, 77, 1061, 1694, 1288, 50, 1081, 1470, 1043, 1135, 1362, 1511, 1423, 1361, 1504, 1381, 1236, 1205, 1568, 1191, 1067, 1216, 1245, 1351, 1256, 1199, 1389, 1494, 1255, 1040, 1529, 1384, 1250, 1249, 1404, 1371, 1233, 1170, 1165, 1026, 1078, 947, 1063, 1016, 1073, 778, 1311, 1215, 1224, 1921, 1637, 2067, 1612, 3347, 2427, 2758, 2356, 2515, 2377, 2186, 1767, 2034, 2513, 1674, 1815, 2162, 2311, 1996, 1661, 2276, 2062, 1831, 1649, 2332, 2193, 1787, 1514, 2072, 2002, 1581, 1469, 1575, 1497, 1384, 1212, 1184, 1744, 1842, 1205, 184

In [None]:
y_pred = gbm.predict(test_df.astype(float))
for i in range(len(y_pred)):
        max_value=max(y_pred[i])
        for j in range(len(y_pred[i])):
            if max_value==y_pred[i][j]:
                y_pred[i][j]=1
            else:
                y_pred[i][j]=0
print(y_pred)             

ret = []
for user_id,age in zip(range(1000000),y_pred):
    ret.append([int(user_id+3000001),int(age.tolist().index(1) + 1)])

ret_df = pd.DataFrame(ret,columns=['user_id','predicted_age'])
ret_df.to_pickle("age_target_out2.pkl")
print(ret_df)
print(ret_df['predicted_age'].value_counts())