In [1]:
import lightgbm as lgb
import os
import math
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder
import cmath

# Seed NumPy's global RNG first, then the stdlib one, with the same fixed value
# so that any random draws made later in the notebook are repeatable.
for _seed_fn in (np.random.seed, random.seed):
    _seed_fn(2019)

# Keep printed DataFrames compact: only a few rows/columns, but wide lines and
# long cell contents so the probability lists stay readable.
_display_options = {
    'display.max_rows': 4,
    'display.max_columns': 10,
    'display.width': 280,
    'display.max_colwidth': 150,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# Locations used by the notebook.
# NOTE(review): data_path is an absolute, machine-specific path — confirm it
# exists on the machine running this notebook.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [8]:
# Load the two per-user prediction frames that are combined further down.
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects — only point it
# at files produced by this project's own pipeline.
valid_lgb_df = pd.read_pickle('output/lgb_test_output.pkl')
print(valid_lgb_df)

# Constant added to the transformer output's user ids before the frames are
# joined on 'user_id'.
# presumably the transformer pipeline numbers its users from a different base
# than the LightGBM output — TODO confirm where 2280000 comes from.
TRANS_USER_ID_OFFSET = 2280000

valid_trans_df = pd.read_pickle('output/transform_test_ret.pkl')
# Cast to integer and shift the whole column at once instead of calling a
# Python lambda for every row.
valid_trans_df['user_id'] = valid_trans_df['user_id'].astype('int64') + TRANS_USER_ID_OFFSET
print(valid_trans_df)

        user_id  predicted_age                                                                                                                                        lgb_age_precent
0       3000001              3  [0.0028889586486444854, 0.045188251216377734, 0.60632804379472, 0.3054728934737454, 0.03678698479029757, 0.0018154506690055414, 0.0012171000687226...
1       3000002              7  [0.0017253463481559117, 0.0026205938752497967, 0.006022812537324088, 0.006128980281137767, 0.009692144732614736, 0.07299333260728408, 0.5842831473...
...         ...            ...                                                                                                                                                    ...
999998  3999999              2  [0.09551533442342093, 0.5832584538815928, 0.2947450410770101, 0.016546191858453668, 0.0038013669310788473, 0.002146904157632699, 0.000987487043077...
999999  4000000              4  [0.01218308852772241, 0.05399057145956054, 0.0503661699727

In [10]:
# Names of the ten per-position probability columns produced for each model.
trans_cols = [f'trans_age{k}' for k in range(10)]
lgb_cols = [f'lgb_age{k}' for k in range(10)]

# Spread the transformer's list-valued 'age_percent' column over ten scalar
# columns, keeping the original row index so the assignment aligns row by row.
trans_probs = pd.DataFrame(
    valid_trans_df['age_percent'].values.tolist(),
    index=valid_trans_df.index,
)
valid_trans_df[trans_cols] = trans_probs
print(valid_trans_df)

# Same expansion for the LightGBM frame's 'lgb_age_precent' column.
lgb_probs = pd.DataFrame(
    valid_lgb_df['lgb_age_precent'].values.tolist(),
    index=valid_lgb_df.index,
)
valid_lgb_df[lgb_cols] = lgb_probs
print(valid_lgb_df)

# Join the two sets of probability columns on the shared user id
# (pandas' default join type, i.e. only ids present in both frames survive).
valid_df = pd.merge(
    valid_trans_df[['user_id'] + trans_cols],
    valid_lgb_df[['user_id'] + lgb_cols],
    on='user_id',
)

print(valid_df)

        user_id  predicted_age                                                                                                                              age_percent  trans_age0  trans_age1  ...  trans_age5  trans_age6  trans_age7  trans_age8    trans_age9
0       3000001              3     [0.00094938726, 0.030507457, 0.7374537, 0.2018274, 0.026994014, 0.0019912398, 0.000240724, 3.341317e-05, 1.9608422e-06, 7.18248e-07]    0.000949    0.030507  ...    0.001991    0.000241    0.000033    0.000002  7.182480e-07
1       3000002              8   [0.00027378518, 0.00012342796, 0.00027071242, 0.0002558099, 0.009641608, 0.044181608, 0.35961723, 0.52451736, 0.05347052, 0.007647908]    0.000274    0.000123  ...    0.044182    0.359617    0.524517    0.053471  7.647908e-03
...         ...            ...                                                                                                                                      ...         ...         ...  ...         ...         ...   

In [11]:

# Columns removed from the merged frame before it is handed to the model.
drop_list = ['user_id']

# Feature matrix: every column of valid_df except the ones listed above.
valid_x = valid_df.drop(columns=drop_list)

print(valid_x)





        trans_age0  trans_age1  trans_age2  trans_age3  trans_age4  ...  lgb_age5  lgb_age6  lgb_age7  lgb_age8  lgb_age9
0         0.000949    0.030507    0.737454    0.201827    0.026994  ...  0.001815  0.001217  0.000206  0.000077  0.000019
1         0.000274    0.000123    0.000271    0.000256    0.009642  ...  0.072993  0.584283  0.301634  0.013345  0.001555
...            ...         ...         ...         ...         ...  ...       ...       ...       ...       ...       ...
999998    0.031822    0.654280    0.303229    0.009348    0.001137  ...  0.002147  0.000987  0.002499  0.000444  0.000056
999999    0.012576    0.031715    0.102769    0.207963    0.372209  ...  0.142795  0.018154  0.003458  0.000785  0.000151

[1000000 rows x 20 columns]


In [12]:

# Restore the trained LightGBM booster saved at model/age_result.model.
gbm = lgb.Booster(model_file='model/age_result.model')

# Show the booster's per-feature importance values as a plain list.
importances = list(gbm.feature_importance())
print('Feature importances:', importances)

Feature importances: [2451, 2116, 2305, 2512, 2335, 2407, 2080, 2132, 2151, 2207, 2554, 2522, 2759, 2657, 2572, 2335, 2288, 2418, 2210, 2349]


In [13]:
# Score every row of the feature matrix; each row of the result holds one score
# per output class.
y_pred = gbm.predict(valid_x.astype(float))
# Untouched copy of the raw scores, taken because y_pred is overwritten with its
# one-hot form just below.
y_pred_precent = y_pred.copy()

# Position of the row maximum (first occurrence when there are ties).
predicted_class = y_pred.argmax(axis=1)

# One-hot encode each row: 1 wherever the value equals the row maximum, 0
# elsewhere. Vectorised equivalent of comparing every element against max(row)
# in nested Python loops.
y_pred = (y_pred == y_pred.max(axis=1, keepdims=True)).astype(y_pred.dtype)
print(y_pred)

# valid_x was derived from valid_df, so row i of the predictions belongs to row i
# of valid_df. Guard against the two having drifted apart (e.g. cells re-run out
# of order) rather than silently pairing predictions with the wrong users.
if len(valid_df) != len(y_pred):
    raise ValueError(
        f'valid_df has {len(valid_df)} rows but the model returned '
        f'{len(y_pred)} predictions'
    )

# Take the user ids from the frame the features came from instead of
# regenerating them as 3000001 + row position over a fixed range of 1000000:
# that only gave the right ids when valid_df held exactly those ids in ascending
# order. Class positions are 0-based; the exported age label is position + 1.
ret_df = pd.DataFrame({
    'user_id': valid_df['user_id'].to_numpy().astype('int64'),
    'predicted_age': (predicted_class + 1).astype('int64'),
})
ret_df.to_pickle("output/lgb_age_result_output.pkl")
print(ret_df)

[[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
