In [1]:
import lightgbm as lgb
import os
import math
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

# Fix the seeds of both global RNGs (stdlib `random` and NumPy) to the same
# value so that any stochastic step in this notebook is repeatable.
random.seed(2019)
np.random.seed(2019)

# Notebook-wide pandas display settings: show only a few rows of each printed
# frame, but allow very wide frames to be rendered.
for option_name, option_value in [
    ('display.max_rows', 6),
    ('display.max_columns', 500),
    ('display.width', 280),
    ('display.max_colwidth', 150),
]:
    pd.set_option(option_name, option_value)

# Directory locations. `preprocess_path` is the prefix used when reading the
# pickled feature tables; `data_path` is not referenced by the cells shown in
# this notebook.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [2]:
# Which split's pre-computed per-user statistics to load.
flag = 'test'

# Load the pickled statistics table and move its index into ordinary columns
# (the printed result shows `user_id` as the first column afterwards).
user_base_statics_df = pd.read_pickle(
    f'{preprocess_path}/{flag}_user_base_statics.pkl'
).reset_index()

# Add a natural-log version of each click counter as `<column>_log`.
# math.log raises ValueError for values <= 0, so both counters must be positive.
for source_col in ('click_times_sum', 'click_times_count'):
    user_base_statics_df[f'{source_col}_log'] = user_base_statics_df[source_col].map(math.log)

# Cast every column (including user_id) to float.
user_base_statics_df = user_base_statics_df.astype(float)
print(user_base_statics_df)


          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.583519               3.583519
...           ...              ...                ...            ...                 ...                       ...                    ...               ...                  ...                    ...


In [3]:
def merge_features(train_df,train_file,target_encode=False):
    """Join a pickled per-user feature table onto ``train_df``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame to extend; it is merged on the ``user_id`` key.
    train_file : str
        File name of the pickled feature table, resolved relative to the
        module-level ``preprocess_path``.
    target_encode : bool, default False
        When true, every column label of the loaded table is collapsed into a
        single string by joining its parts with ``'_'``. This expects each
        label to be a sequence of strings (e.g. the tuples of a two-level
        column index).

    Returns
    -------
    pandas.DataFrame
        Result of ``train_df.merge(features, on='user_id')``. This uses
        pandas' default join type (inner), so rows without a matching
        ``user_id`` in the feature table are not kept.

    The merged frame is also printed before it is returned.
    """
    feature_file = f'{preprocess_path}/{train_file}'
    features = pd.read_pickle(feature_file)
    if target_encode:
        # Flatten multi-part column labels, e.g. ('a', 'b') -> 'a_b'.
        flat_names = []
        for label_parts in features.columns.values:
            flat_names.append('_'.join(label_parts))
        features.columns = flat_names
    merged = train_df.merge(features, on='user_id')
    print(merged)
    return merged



In [4]:
# Build the model input by joining, one ad attribute at a time, the per-user
# gender target-encoding features onto the base statistics table.
test_df = user_base_statics_df
expected_rows = len(test_df)
for attribute in ['creative_id','ad_id', 'product_id','advertiser_id','industry','product_category']:
    print(f'merge {attribute}...')
    # merge_features already prints the merged frame, so it is not printed a
    # second time here (the old extra print only duplicated the output).
    test_df = merge_features(test_df,f'test_user_target_encoder_{attribute}_gender.pkl',True)
    # merge_features uses an inner join on user_id: a user missing from the
    # feature table would be dropped and a duplicated user_id would add rows.
    # The prediction cell later assigns user ids purely by row position, so
    # the row count must stay constant through every merge.
    assert len(test_df) == expected_rows, (
        f'merging {attribute} features changed the row count from '
        f'{expected_rows} to {len(test_df)}'
    )


merge creative_id...
          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  creative_id_gender0_kfold_mean_min  creative_id_gender0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                             0.43958                            0.991597   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                             0.00000                            1.000000   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.

          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  creative_id_gender0_kfold_mean_min  creative_id_gender0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                             0.43958                            0.991597   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                             0.00000                            1.000000   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.58351

          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  creative_id_gender0_kfold_mean_min  creative_id_gender0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                             0.43958                            0.991597   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                             0.00000                            1.000000   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.58351

          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  creative_id_gender0_kfold_mean_min  creative_id_gender0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                             0.43958                            0.991597   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                             0.00000                            1.000000   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.58351

          user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  creative_id_gender0_kfold_mean_min  creative_id_gender0_kfold_mean_max  \
0       3000001.0             11.0               11.0           11.0                 7.0                       4.0                   10.0               5.0             2.397895               2.397895                             0.43958                            0.991597   
1       3000002.0             65.0               61.0           49.0                24.0                       5.0                   41.0              21.0             4.174387               4.110874                             0.00000                            1.000000   
2       3000003.0             36.0               36.0           35.0                15.0                       5.0                   29.0              17.0             3.58351

In [5]:

# Columns removed before prediction: the user identifier and the two raw click
# counters (their log-transformed counterparts stay in the frame).
# Presumably this matches the feature set the saved model was trained on - TODO confirm.
# NOTE: test_df is overwritten, so re-running this cell on the already-trimmed
# frame raises KeyError; re-run the merge cell first.
drop_list = ['user_id','click_times_sum','click_times_count']
test_df = test_df.drop(columns=drop_list)


In [6]:

# Load the previously saved LightGBM booster from its model file.
gbm = lgb.Booster(model_file='model/gender_target_encode.model')

# Show the booster's per-feature importance values as a plain list.
feature_importances = list(gbm.feature_importance())
print('Feature importances:', feature_importances)


Feature importances: [1190, 1124, 579, 1340, 1289, 1288, 1173, 983, 1872, 2750, 2118, 779, 2542, 2675, 2297, 1530, 1621, 1489, 1520, 1544, 1428, 1581, 1029, 932, 656, 1894, 2233, 354, 471, 1557, 780, 1743, 2291, 1750, 1796, 1712, 2436, 1510, 1230, 1719, 1523, 1793, 2000, 1168, 1128, 1528, 967]


In [7]:
# Score every test row. The code below treats the result as a 2-D array with
# one row per user and one column per class (the printed output shows two
# columns).
y_pred = gbm.predict(test_df.astype(float))

# Turn each row into a 0/1 indicator of its maximum value. Vectorised
# replacement for the former element-by-element Python loop; y_pred keeps its
# float dtype and, as before, every entry equal to the row maximum becomes 1.
y_pred = (y_pred == y_pred.max(axis=1, keepdims=True)).astype(float)
print(y_pred)

# User ids are not carried through the feature frame; they are reconstructed
# from the row position, which is only valid if there is exactly one
# prediction per test user, in ascending user_id order.
first_test_user_id = 3000001
expected_test_users = 1000000
if len(y_pred) != expected_test_users:
    # Previously a count mismatch was silently truncated by zip(), which would
    # pair predictions with the wrong user ids.
    raise ValueError(
        f'expected {expected_test_users} predictions but got {len(y_pred)}; '
        'cannot assign user ids by position'
    )

# argmax returns the first maximal column, matching the former
# `list.index(1)` lookup; class labels are 1-based.
ret_df = pd.DataFrame({
    'user_id': np.arange(first_test_user_id, first_test_user_id + len(y_pred), dtype=np.int64),
    'predicted_gender': (y_pred.argmax(axis=1) + 1).astype(np.int64),
})
ret_df.to_pickle("gender_target_out2.pkl")
print(ret_df)
print(ret_df['predicted_gender'].value_counts())

[[1. 0.]
 [0. 1.]
 [0. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]
        user_id  predicted_gender
0       3000001                 1
1       3000002                 2
2       3000003                 2
...         ...               ...
999997  3999998                 1
999998  3999999                 1
999999  4000000                 1

[1000000 rows x 2 columns]
1    685629
2    314371
Name: predicted_gender, dtype: int64


In [9]:
# Load the two saved prediction tables, keeping only the key and the
# prediction column of each.
# NOTE: 'age_target_out2.pkl' is not produced by any cell shown in this
# notebook; it has to exist on disk already.
age_df = pd.read_pickle('age_target_out2.pkl').loc[:, ['user_id', 'predicted_age']]
gender_df = pd.read_pickle('gender_target_out2.pkl').loc[:, ['user_id', 'predicted_gender']]

# Combine both predictions per user (pandas' default inner join on user_id)
# and write the submission file without the index column.
output_df = pd.merge(age_df, gender_df, on='user_id')
output_df.to_csv("submission_target2.csv", index=False)