In [1]:
import lightgbm as lgb
import os
import math
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

# Seed both numpy's legacy global RNG and the stdlib RNG with the same value.
SEED = 2019
np.random.seed(SEED)
random.seed(SEED)

# pandas display settings: show only a few rows, but allow very wide frames.
for option_name, option_value in {
    'display.max_rows': 6,
    'display.max_columns': 500,
    'display.width': 280,
    'display.max_colwidth': 150,
}.items():
    pd.set_option(option_name, option_value)

# Input locations. `data_path` is an absolute, machine-specific path and is not
# referenced by the cells visible here; `preprocess_path` is relative to the
# notebook's working directory and is where the feature pickles are read from.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [2]:
# Load the per-user base statistics for the chosen split and turn the index
# back into an ordinary column.
flag = 'test'
user_base_statics_df = pd.read_pickle(
    f'{preprocess_path}/{flag}_user_base_statics.pkl'
).reset_index()

# Add a natural-log version of each click total next to the raw column.
# math.log raises ValueError on 0 or negative input, so this assumes every
# total is strictly positive.
for count_col in ('click_times_sum', 'click_times_count'):
    user_base_statics_df[f'{count_col}_log'] = user_base_statics_df[count_col].map(math.log)

# Cast every column (user_id included) to float before printing.
user_base_statics_df = user_base_statics_df.astype(float)
print(user_base_statics_df)


          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.583519               3.583519
...           ...              ...                ...            ...                 ...                       ...                    ...               ...                  ...                    ...


In [3]:
def merge_features(train_df,train_file,target_encode=False):
    """Join one pickled feature table onto ``train_df`` by ``user_id``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame to extend; it is not modified, a merged copy is returned.
    train_file : str
        File name of the pickle, looked up inside the module-level
        ``preprocess_path`` directory.
    target_encode : bool, default False
        When true, each column label of the loaded table is treated as a
        sequence of strings and collapsed into one string joined by ``'_'``
        before merging.

    Returns
    -------
    pandas.DataFrame
        Result of ``train_df.merge(..., on='user_id')``. This is pandas'
        default inner join, so users missing from the feature table are
        dropped. The merged frame is also printed.
    """
    feature_table = pd.read_pickle(f'{preprocess_path}/{train_file}')

    if target_encode:
        # Collapse each multi-part column label into a single flat name.
        feature_table.columns = list(map('_'.join, feature_table.columns.values))

    merged = train_df.merge(feature_table, on='user_id')
    print(merged)
    return merged


In [4]:
# Start the test feature matrix: base statistics joined with the product_id
# target-encoding table (its multi-part column labels are flattened).
test_df = merge_features(
    user_base_statics_df,
    'test_user_target_encoder_product_id.pkl',
    target_encode=True,
)


          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  product_id_age0_kfold_mean_min  product_id_age0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                        0.006556                        0.102945   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                        0.000000                        0.115073   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.583519               3.583519

In [5]:
# Add the industry target-encoding table. This rebinds test_df, so running the
# cell twice merges the same columns again (pandas then suffixes them _x/_y).
test_df = merge_features(
    test_df,
    'test_user_target_encoder_industry.pkl',
    target_encode=True,
)

          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  product_id_age0_kfold_mean_min  product_id_age0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                        0.006556                        0.102945   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                        0.000000                        0.115073   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.583519               3.583519

In [7]:
# Add the advertiser_id target-encoding table. This rebinds test_df, so running
# the cell twice merges the same columns again (pandas then suffixes them _x/_y).
test_df = merge_features(
    test_df,
    'test_user_target_encoder_advertiser_id.pkl',
    target_encode=True,
)

          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  product_id_age0_kfold_mean_min  product_id_age0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                        0.006556                        0.102945   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                        0.000000                        0.115073   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.583519               3.583519

In [8]:
# Add the product_category target-encoding table. This rebinds test_df, so running
# the cell twice merges the same columns again (pandas then suffixes them _x/_y).
test_df = merge_features(
    test_df,
    'test_user_target_encoder_product_category.pkl',
    target_encode=True,
)

          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  product_id_age0_kfold_mean_min  product_id_age0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                        0.006556                        0.102945   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                        0.000000                        0.115073   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.583519               3.583519

In [9]:
# Commented-out train/validation matrix construction, left in place for reference:
# train_x = train_df.drop(['age','user_id','gender','advertiser_id_seq','industry_seq','product_id_seq'], axis=1)
# train_y = train_df['age'].astype(int)

# valid_x = valid_df.drop(['age','user_id','gender','advertiser_id_seq','industry_seq','product_id_seq'], axis=1)
# valid_y = valid_df['age'].astype(int)

# Strip the identifier and the raw click totals from the test matrix; the
# *_log versions of the click totals stay. Re-running this cell raises
# KeyError because the columns are already gone.
drop_list = ['user_id','click_times_sum','click_times_count']
test_df = test_df.drop(columns=drop_list)





In [10]:

# Restore the saved LightGBM booster from its text dump (path is relative to
# the working directory) and show its per-feature importance scores.
gbm = lgb.Booster(model_file='age_emb1.txt')
feature_importances = list(gbm.feature_importance())
print('Feature importances:', feature_importances)

Feature importances: [1391, 1491, 533, 1683, 1737, 1418, 1260, 1488, 1635, 1978, 1754, 1743, 1914, 1609, 1534, 1575, 1983, 1501, 1376, 1531, 1693, 1612, 1522, 1568, 1809, 1762, 1522, 1392, 1869, 1753, 1482, 1670, 1826, 1589, 1522, 1514, 1528, 1266, 1446, 1187, 1406, 1273, 1400, 1011, 1660, 1484, 1560, 2159, 1475, 2496, 2198, 1297, 1831, 1517, 1864, 1207, 1977, 1615, 1697, 1813, 1793, 2151, 2171, 2180, 1469, 1975, 1990, 1725, 1202, 1791, 1699, 1649, 1261, 1695, 1585, 1537, 1475, 1568, 1642, 1540, 1479, 1511, 1714, 1686, 1532, 1915, 1643, 2282, 2211, 2823, 2106, 3844, 3186, 3727, 2813, 3384, 3143, 2990, 2235, 2529, 3084, 2528, 2116, 2649, 2814, 2619, 1899, 2674, 2484, 2677, 1905, 2691, 2656, 2545, 1803, 2584, 2480, 2530, 1790, 1883, 1958, 2224, 1675, 1382, 2301, 2237, 1611, 721, 439, 1631, 2182, 455, 447, 1313, 1043, 386, 568, 1476, 1303, 391, 906, 1753, 1376, 431, 565, 1643, 1218, 570, 683, 1507, 1498, 410, 408, 1479, 1223, 614, 346, 1326, 1180, 534, 388, 1064, 864, 502, 393, 1284, 984]

In [None]:
# Score every test row; the booster returns one row of per-class scores per user.
y_pred = gbm.predict(test_df.astype(float))

# Turn each row into a 0/1 indicator of its maximum score. This is the
# vectorized equivalent of the previous per-element Python loops: every entry
# equal to the row maximum becomes 1, everything else 0.
y_pred = (y_pred == y_pred.max(axis=1, keepdims=True)).astype(y_pred.dtype)
print(y_pred)

# Predicted age class = 1-based position of the first 1 in each row
# (argmax returns the first occurrence, matching list.index(1)).
predicted_age = y_pred.argmax(axis=1).astype(np.int64) + 1

# User ids are assigned by row position, starting at the first test user id.
# The row count now comes from the predictions themselves instead of a
# hard-coded 1000000, so no row is silently dropped.
# NOTE(review): user_id was dropped from test_df before scoring, so this relies
# on the rows still being every test user in ascending, contiguous user_id
# order - confirm the inner merges did not drop or reorder any user.
FIRST_TEST_USER_ID = 3000001
user_ids = np.arange(len(y_pred), dtype=np.int64) + FIRST_TEST_USER_ID

ret_df = pd.DataFrame({'user_id': user_ids, 'predicted_age': predicted_age})
ret_df.to_pickle("age_target_out1.pkl")
print(ret_df)
print(ret_df['predicted_age'].value_counts())