In [1]:
import lightgbm as lgb
import os
import math
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder
import cmath

# Seed NumPy's and Python's global RNGs with the same value so random draws repeat across runs.
SEED = 2019
np.random.seed(SEED)
random.seed(SEED)

# Keep printed DataFrames compact: few rows/columns, wide lines, long cell text.
for option_name, option_value in (
    ('display.max_rows', 4),
    ('display.max_columns', 10),
    ('display.width', 280),
    ('display.max_colwidth', 150),
):
    pd.set_option(option_name, option_value)

# Root directory of the raw dataset (absolute path on the original machine).
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
# Directory name for preprocessed artefacts (not referenced by the cells visible here).
preprocess_path = 'preprocess'

In [2]:
# Ground-truth table; later cells join it on `user_id` and read its `age` / `gender` columns.
label_csv = f'{data_path}/train_preliminary/user.csv'
label_df = pd.read_csv(label_csv)


In [3]:
# Saved per-user outputs of the two first-stage models for the validation split
# (the printed frames carry `user_id`, `predicted_age` and an `age_percent` list per row).
# NOTE: unpickling can execute arbitrary code -- only load files produced by this pipeline.
valid_lgb_df = pd.read_pickle('output/lgb_valid_output.pkl')
print(valid_lgb_df)

valid_trans_df = pd.read_pickle('output/transform_valid_ret.pkl')
print(valid_trans_df)

        user_id  predicted_age                                                                                                                                            age_percent  label_age
0        720001              2  [0.014440673427286577, 0.1636304453944477, 0.3173154942782703, 0.21344298184401694, 0.16753979356320084, 0.0855110277093806, 0.03272362842072226, ...          2
1        720002              1  [0.0045147017601383085, 0.42730889445895004, 0.3245581087949548, 0.18099460982524912, 0.040200509981416874, 0.013017909713077928, 0.00611303738128...          1
...         ...            ...                                                                                                                                                    ...        ...
179998   899999              2  [0.001455267175916176, 0.114343751781822, 0.27468935137885075, 0.15468374077748762, 0.20059416346393644, 0.14373274246675669, 0.09148457082934978,...          2
179999   900000              3  [0.

In [4]:
# Names of the ten scalar columns that each model's `age_percent` list is spread over.
trans_cols = [f'trans_age{i}' for i in range(10)]
lgb_cols = [f'lgb_age{i}' for i in range(10)]

# Expand the list-valued `age_percent` column into one column per entry; the original
# index is passed through so the new columns line up with their source rows.
valid_trans_df[trans_cols] = pd.DataFrame(
    valid_trans_df['age_percent'].tolist(), index=valid_trans_df.index
)
print(valid_trans_df)

valid_lgb_df[lgb_cols] = pd.DataFrame(
    valid_lgb_df['age_percent'].tolist(), index=valid_lgb_df.index
)
print(valid_lgb_df)

# Join both models' columns per user, attach the labels, then shift `age` down by one
# (presumably turning 1-based labels into 0-based class ids -- confirm against user.csv).
valid_df = (
    valid_trans_df[['user_id'] + trans_cols]
    .merge(valid_lgb_df[['user_id'] + lgb_cols], on='user_id')
    .merge(label_df, on='user_id', how='left')
    .assign(age=lambda frame: frame['age'] - 1)
)

print(valid_df)

        user_id  predicted_age                                                                                                                             age_percent  trans_age0  trans_age1  ...  trans_age5  trans_age6  trans_age7  trans_age8  trans_age9
0        720001              3        [0.019144528, 0.20491104, 0.30700263, 0.26462606, 0.11737743, 0.058445495, 0.01969487, 0.0040292162, 0.0026491152, 0.0021196164]    0.019145    0.204911  ...    0.058445    0.019695    0.004029    0.002649    0.002120
1        720002              3  [0.012328157, 0.34160623, 0.43078998, 0.17672162, 0.031835053, 0.0048781997, 0.0014903151, 0.00019775821, 9.150131e-05, 6.1132494e-05]    0.012328    0.341606  ...    0.004878    0.001490    0.000198    0.000092    0.000061
...         ...            ...                                                                                                                                     ...         ...         ...  ...         ...         ...         ... 

In [5]:
# Saved per-user outputs of the two first-stage models for the training split.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this pipeline.
train_lgb_df = pd.read_pickle('output/lgb_train_output.pkl')
print(train_lgb_df)

train_transform_df = pd.read_pickle('output/transform_train_ret.pkl')
print(train_transform_df)

        user_id  predicted_age                                                                                                                                            age_percent  label_age
0             1              4  [0.0028087757168940987, 0.026028784707379474, 0.12335767386647269, 0.3080929334969957, 0.20921210453779848, 0.22430468204056994, 0.080323505884134...          3
1             2              3  [0.01594388950367522, 0.10134185110480497, 0.2848891019492552, 0.22860966642266428, 0.17484493585539068, 0.08597178798670557, 0.051085602741700084...          9
...         ...            ...                                                                                                                                                    ...        ...
719998   719999              2  [0.0688645480104275, 0.6379783934896666, 0.1965069229077374, 0.06332483728734954, 0.023359544585813973, 0.006359034465862536, 0.002366002337563623...          3
719999   720000              4  [0.

In [6]:
# Names of the ten scalar columns that each model's probability list is spread over.
trans_cols = [f'trans_age{i}' for i in range(10)]
lgb_cols = [f'lgb_age{i}' for i in range(10)]

# Expand the list-valued columns into one column per entry, keeping the source index.
# Note the transformer column is `trans_age_percent` here, while the validation cell
# reads `age_percent` from its transformer frame.
train_transform_df[trans_cols] = pd.DataFrame(
    train_transform_df['trans_age_percent'].tolist(), index=train_transform_df.index
)
train_lgb_df[lgb_cols] = pd.DataFrame(
    train_lgb_df['age_percent'].tolist(), index=train_lgb_df.index
)

# Join both models' columns per user, attach the labels, then shift `age` down by one
# (presumably turning 1-based labels into 0-based class ids -- confirm against user.csv).
train_df = (
    train_transform_df[['user_id'] + trans_cols]
    .merge(train_lgb_df[['user_id'] + lgb_cols], on='user_id')
    .merge(label_df, on='user_id', how='left')
    .assign(age=lambda frame: frame['age'] - 1)
)
print(train_df)


        user_id  trans_age0  trans_age1  trans_age2  trans_age3  ...  lgb_age7  lgb_age8  lgb_age9  age  gender
0             1    0.013478    0.082695    0.127543    0.137634  ...  0.018560  0.006423  0.000888    3       1
1             2    0.015503    0.083090    0.186826    0.247610  ...  0.015210  0.008758  0.033346    9       1
...         ...         ...         ...         ...         ...  ...       ...       ...       ...  ...     ...
719998   719999    0.066974    0.610116    0.267866    0.045680  ...  0.000828  0.000263  0.000150    3       1
719999   720000    0.003307    0.035774    0.185940    0.299372  ...  0.013481  0.001739  0.000136    4       1

[720000 rows x 23 columns]


In [7]:

# Columns that must not be fed to the model: the target itself plus identifier/label columns.
drop_list = ['age', 'user_id', 'gender']

# Feature matrix = everything except `drop_list`; target = integer `age`.
train_x = train_df.drop(columns=drop_list)
train_y = train_df['age'].astype(int)

valid_x = valid_df.drop(columns=drop_list)
valid_y = valid_df['age'].astype(int)

for shown in (train_x, train_y, valid_x, valid_y):
    print(shown)

feature_names = list(train_x.columns)

# NOTE(review): the Dataset called `train_data` is built from the valid_* frames and
# `valid_data` from the train_* frames. This is reproduced exactly as found. It may be
# deliberate (fitting the second-stage model on rows the first-stage models were
# presumably not trained on -- confirm), but it also means the "valid_1" metric and
# early stopping downstream are computed on the train_* rows. Confirm the intent.
train_data = lgb.Dataset(
    valid_x.values,
    label=valid_y,
    feature_name=feature_names,
    free_raw_data=False,
)
valid_data = lgb.Dataset(
    train_x.values,
    label=train_y,
    feature_name=feature_names,
    free_raw_data=False,
    reference=train_data,
)



        trans_age0  trans_age1  trans_age2  trans_age3  trans_age4  ...  lgb_age5  lgb_age6  lgb_age7  lgb_age8  lgb_age9
0         0.013478    0.082695    0.127543    0.137634    0.255952  ...  0.224305  0.080324  0.018560  0.006423  0.000888
1         0.015503    0.083090    0.186826    0.247610    0.199560  ...  0.085972  0.051086  0.015210  0.008758  0.033346
...            ...         ...         ...         ...         ...  ...       ...       ...       ...       ...       ...
719998    0.066974    0.610116    0.267866    0.045680    0.006738  ...  0.006359  0.002366  0.000828  0.000263  0.000150
719999    0.003307    0.035774    0.185940    0.299372    0.219567  ...  0.068725  0.016117  0.013481  0.001739  0.000136

[720000 rows x 20 columns]
0         3
1         9
         ..
719998    3
719999    4
Name: age, Length: 720000, dtype: int64
        trans_age0  trans_age1  trans_age2  trans_age3  trans_age4  ...  lgb_age5  lgb_age6  lgb_age7  lgb_age8  lgb_age9
0         0.019145

In [8]:
# Second-stage LightGBM model: 10-class softmax objective, evaluated with multi_error.
params = {
    'boosting_type': 'gbdt',
    'objective': 'softmax',
    'num_class': 10,
    'metric': 'multi_error',
    'num_leaves': 32 + 1,
    'learning_rate': 0.1,
    'feature_fraction': 0.4,
    'bagging_fraction': 0.4,
    'bagging_freq': 5,
    'lambda_l1': 0.1,
    'verbose': 1
}
print('Starting training...')

# `early_stopping_rounds=` was removed from lgb.train() in LightGBM 4.0 (TypeError there);
# the callback form is accepted by both old and new releases and stops after 50 rounds
# without improvement on the non-training validation set.
callbacks = [lgb.early_stopping(stopping_rounds=50)]
# Newer releases only print per-iteration metrics when asked to; older releases that lack
# `log_evaluation` still print them by default, so the guard keeps the log on every version.
if hasattr(lgb, 'log_evaluation'):
    callbacks.append(lgb.log_evaluation(period=1))

# train
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=2000,
    valid_sets=[train_data, valid_data],
    callbacks=callbacks,
)

# save_model() does not create missing directories, so make sure the target exists first.
os.makedirs('model', exist_ok=True)
gbm.save_model('model/age_result.model')
print('Feature importances:', list(gbm.feature_importance()))

Starting training...
[1]	training's multi_error: 0.771367	valid_1's multi_error: 0.772138
Training until validation scores don't improve for 50 rounds.
[2]	training's multi_error: 0.729256	valid_1's multi_error: 0.728793
[3]	training's multi_error: 0.683756	valid_1's multi_error: 0.681471
[4]	training's multi_error: 0.640211	valid_1's multi_error: 0.634943
[5]	training's multi_error: 0.6077	valid_1's multi_error: 0.601782
[6]	training's multi_error: 0.586706	valid_1's multi_error: 0.5797
[7]	training's multi_error: 0.574933	valid_1's multi_error: 0.566907
[8]	training's multi_error: 0.566383	valid_1's multi_error: 0.559346
[9]	training's multi_error: 0.562256	valid_1's multi_error: 0.555044
[10]	training's multi_error: 0.558639	valid_1's multi_error: 0.551056
[11]	training's multi_error: 0.555983	valid_1's multi_error: 0.548104
[12]	training's multi_error: 0.554056	valid_1's multi_error: 0.545769
[13]	training's multi_error: 0.552144	valid_1's multi_error: 0.543811
[14]	training's mult