In [1]:
import lightgbm as lgb
import os
import math
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

# Reproducibility: seed NumPy's global RNG and the stdlib RNG with the same value.
SEED = 2019
np.random.seed(SEED)
random.seed(SEED)

# Keep printed DataFrames compact: few rows/columns, but wide lines and wide cells.
for _option, _value in {
    'display.max_rows': 4,
    'display.max_columns': 10,
    'display.width': 280,
    'display.max_colwidth': 150,
}.items():
    pd.set_option(_option, _value)

# Dataset and preprocessing directories (going by their names); neither is
# referenced by the cells visible in this notebook.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [2]:

def get_precent_age(path,f,o):
    """Load one pickled prediction frame and expose a single column under a new name.

    Args:
        path: Location of a pickled DataFrame; it is read with ``pd.read_pickle``.
        f: Name of the column in that frame to keep.
        o: Name the kept column carries in the returned frame.

    Returns:
        A DataFrame with ``user_id`` and the ``f`` column renamed to ``o``.
    """
    wanted_columns = ['user_id', f]
    predictions = pd.read_pickle(path)
    return predictions[wanted_columns].rename(columns={f: o})
    


In [15]:
# Model name -> (pickle file under output/, column in that file holding the prediction).
output_dic = {
    'bert': ('bert_test_output.pkl', 'age_percent'),
    'lgb': ('lgb_test_output.pkl', 'lgb_age_precent'),
}

# Join every model's predictions on user_id. The first model listed supplies the
# rows; later models are attached with a left join.
total_df = None
for model_name, (file_name, source_column) in output_dic.items():
    model_df = get_precent_age(f'output/{file_name}', source_column, f'{model_name}_age_percent')
    total_df = model_df if total_df is None else total_df.merge(model_df, on='user_id', how='left')

print(total_df)



        user_id                                                                                                                            bert_age_percent
0       3000001        [0.0038677964, 0.10122928, 0.72807115, 0.14541557, 0.018257076, 0.002646775, 0.00041883535, 7.36673e-05, 1.85203e-05, 1.2872738e-06]
1       3000002       [0.0017600106, 0.00016363547, 0.0002463058, 0.00028151763, 0.00900832, 0.10255116, 0.5898225, 0.28653073, 0.009434198, 0.00020163893]
...         ...                                                                                                                                         ...
999998  3999999  [0.01771404, 0.70296717, 0.27645692, 0.0028105238, 4.8052938e-05, 2.42413e-06, 7.7825234e-07, 7.2467905e-08, 1.9215419e-08, 3.3433242e-10]
999999  4000000          [0.00045651942, 0.0023482905, 0.04213559, 0.20981358, 0.43084055, 0.24747723, 0.06421742, 0.002637042, 7.31197e-05, 6.5277084e-07]

[1000000 rows x 2 columns]
        user_id                     

In [18]:
def mean_columns(df):
    """Average the per-model prediction vectors found in one row.

    For every key ``k`` of the module-level ``output_dic`` the value stored under
    ``f'{k}_age_percent'`` is read from ``df``; the element-wise mean across
    those values is returned.

    Args:
        df: A mapping-like row (for example the Series that
            ``DataFrame.apply(..., axis=1)`` passes in) indexable by column name.

    Returns:
        The result of ``np.mean`` taken across models (axis 0).
    """
    per_model = []
    for model_name in output_dic:
        per_model.append(np.array(df[f'{model_name}_age_percent']))
    return np.mean(per_model, axis=0)

# Blend the models: for each row, store the element-wise mean of the per-model
# prediction vectors in a new column, then show the frame.
total_df['total_age_precent'] = total_df.apply(func=mean_columns, axis='columns')
print(total_df)


        user_id                                                                                                                            bert_age_percent  \
0       3000001        [0.0038677964, 0.10122928, 0.72807115, 0.14541557, 0.018257076, 0.002646775, 0.00041883535, 7.36673e-05, 1.85203e-05, 1.2872738e-06]   
1       3000002       [0.0017600106, 0.00016363547, 0.0002463058, 0.00028151763, 0.00900832, 0.10255116, 0.5898225, 0.28653073, 0.009434198, 0.00020163893]   
...         ...                                                                                                                                         ...   
999998  3999999  [0.01771404, 0.70296717, 0.27645692, 0.0028105238, 4.8052938e-05, 2.42413e-06, 7.7825234e-07, 7.2467905e-08, 1.9215419e-08, 3.3433242e-10]   
999999  4000000          [0.00045651942, 0.0023482905, 0.04213559, 0.20981358, 0.43084055, 0.24747723, 0.06421742, 0.002637042, 7.31197e-05, 6.5277084e-07]   

                                             

In [22]:
# Turn each user's blended probability vector into a single predicted age class.
#
# The vectors are only read here, never written, so the probabilities stored in
# total_df['total_age_precent'] stay intact and this cell can be re-run safely.
age_probs = total_df['total_age_precent']

# np.argmax picks the position of the largest probability (the first one on
# ties); age classes are reported 1-based, hence the +1.
predicted_age = np.array([np.argmax(probs) for probs in age_probs], dtype='int64') + 1

# Take the user ids from total_df itself so every prediction stays attached to
# its own row, instead of assuming exactly 1,000,000 users with ids running
# 3000001..4000000 in order.
ret_df = pd.DataFrame({
    'user_id': total_df['user_id'].to_numpy(),
    'predicted_age': predicted_age,
})
print(ret_df['predicted_age'].value_counts())
ret_df.to_pickle('output/total_age_output.pkl')

3     287292
2     175609
       ...  
9      16232
10     13218
Name: predicted_age, Length: 10, dtype: int64


In [24]:
# Write the final per-user age predictions to disk and show them.
pd.to_pickle(ret_df, 'output/total_age_output.pkl')
print(ret_df)



        user_id  predicted_age
0       3000001              3
1       3000002              7
...         ...            ...
999998  3999999              2
999999  4000000              5

[1000000 rows x 2 columns]
