In [1]:
import lightgbm as lgb
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

np.random.seed(2019)
random.seed(2019)
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [2]:
total_final_df= pd.read_pickle(f'{preprocess_path}/total_final.pkl')
label_df = pd.read_csv(f'{data_path}/train_preliminary/user.csv')
total_final_df = total_final_df.merge(label_df,on='user_id')
total_final_df['age'] = total_final_df['age'] -1
print(total_final_df)


        user_id  active_days  click_times_total                                                                                                                                      advertiser_id_seq  \
0             1           10                 14                                                                    [7293, 9702, 29455, 14668, 11411, 14681, 17189, 367, 44865, 188, 23575, 188, 10988]   
1             2           28                 46  [22885, 10686, 18562, 25932, 22885, 34505, 768, 26006, 918, 34503, 17284, 13732, 20033, 27843, 25260, 27843, 42272, 26006, 38785, 2302, 42272, 149...   
2             3           23                 30  [32974, 9877, 18492, 14186, 17018, 9058, 8371, 2336, 39500, 45169, 14289, 13119, 36384, 48608, 16764, 9970, 15351, 14498, 55248, 16895, 52263, 918...   
...         ...          ...                ...                                                                                                                                                 

In [3]:
train_df = total_final_df[total_final_df.user_id <= 720000]
valid_df = total_final_df[total_final_df.user_id > 720000]
valid_df = valid_df[valid_df.user_id < 2000000]
print(valid_df)

del total_final_df

        user_id  active_days  click_times_total                                                                                                                                      advertiser_id_seq  \
720000   720001           19                 32  [19065, 918, 10985, 10985, 19065, 13732, 7733, 41149, 24894, 16320, 37513, 15736, 918, 20264, 41389, 41389, 38357, 10975, 6465, 6465, 23210, 953, ...   
720001   720002           21                 29  [19862, 1461, 19862, 14682, 11425, 18103, 2367, 18786, 14636, 15095, 11101, 10986, 10989, 10986, 36700, 7817, 7809, 14682, 14681, 52180, 14681, 25...   
720002   720003           18                 21             [10831, 14513, 14681, 15385, 12270, 18103, 18103, 18103, 18103, 14681, 14681, 14681, 19056, 6974, 14681, 14681, 4751, 14682, 19056, 17018]   
...         ...          ...                ...                                                                                                                                                 

In [4]:
train_industry_df = pd.read_pickle(f'{preprocess_path}/industry_top3_l16_train.pkl')
valid_industry_df = pd.read_pickle(f'{preprocess_path}/industry_top3_l16_valid.pkl')
print(valid_industry_df)
train_df = train_df.merge(train_industry_df,on='user_id')
valid_df = valid_df.merge(valid_industry_df,on='user_id')
#print(train_df)
print(valid_df)
del train_industry_df
del valid_industry_df

        user_id  industry_0  industry_1  industry_2  industry_3  industry_4  industry_5  industry_6  industry_7  industry_8  industry_9  industry_10  industry_11  industry_12  industry_13  industry_14  industry_15  industry_top0_0  industry_top0_1  industry_top0_2  \
0        720001   -1.989230   -2.485385   -0.775828   -2.405642   -0.579944    4.934428    6.225451   -2.301669   -2.214990    0.856269    -2.414757     1.870960     1.029048     3.209267    -1.862676    -1.774886        -1.735877         2.013226         1.120578   
1        720002   -1.502039    2.271748    0.350132    1.125167   -1.233660   -1.070818   -3.583268   -0.954239   -1.935579   -0.525664    -1.712189    -2.707626    -0.668274     1.354530     0.002641     3.774900        -1.735877         2.013226         1.120578   
2        720003    0.915773    2.172321   -1.929452   -1.091829   -0.282038    0.805745   -0.664731   -0.263200   -1.436507    2.145942     0.755215     0.446492    -1.664001     0.509256     3.54

        user_id  active_days  click_times_total                                                                                                                                      advertiser_id_seq  \
0        720001           19                 32  [19065, 918, 10985, 10985, 19065, 13732, 7733, 41149, 24894, 16320, 37513, 15736, 918, 20264, 41389, 41389, 38357, 10975, 6465, 6465, 23210, 953, ...   
1        720002           21                 29  [19862, 1461, 19862, 14682, 11425, 18103, 2367, 18786, 14636, 15095, 11101, 10986, 10989, 10986, 36700, 7817, 7809, 14682, 14681, 52180, 14681, 25...   
2        720003           18                 21             [10831, 14513, 14681, 15385, 12270, 18103, 18103, 18103, 18103, 14681, 14681, 14681, 19056, 6974, 14681, 14681, 4751, 14682, 19056, 17018]   
...         ...          ...                ...                                                                                                                                                 

In [5]:
train_advertiser_id_df = pd.read_pickle(f'{preprocess_path}/advertiser_id_top3_l32_train.pkl')
valid_advertiser_id_df = pd.read_pickle(f'{preprocess_path}/advertiser_id_top3_l32_valid.pkl')
print(valid_advertiser_id_df)
train_df = train_df.merge(train_advertiser_id_df,on='user_id')
valid_df = valid_df.merge(valid_advertiser_id_df,on='user_id')
#print(train_df)
print(valid_df)

del train_advertiser_id_df
del train_advertiser_id_df

        user_id  advertiser_id_0  advertiser_id_1  advertiser_id_2  advertiser_id_3  advertiser_id_4  advertiser_id_5  advertiser_id_6  advertiser_id_7  advertiser_id_8  advertiser_id_9  advertiser_id_10  advertiser_id_11  advertiser_id_12  advertiser_id_13  advertiser_id_14  \
0        720001         1.136321         0.213856         0.235502         0.533652         0.641636        -0.825572         0.474796        -3.250204        -1.069312        -0.434412         -4.983632          0.528394         -1.142879          1.671987         -0.639480   
1        720002        -0.444156        -1.480911        -1.698048         0.921614        -1.396144         0.542396         0.222608         0.949205        -3.359164        -0.913740          0.151295          1.423159          0.468865          2.096266         -1.824014   
2        720003        -0.316999        -0.413298        -1.352538         2.920290        -0.045855         1.507078         1.606123         1.657228        -0.6

        user_id  active_days  click_times_total                                                                                                                                      advertiser_id_seq  \
0        720001           19                 32  [19065, 918, 10985, 10985, 19065, 13732, 7733, 41149, 24894, 16320, 37513, 15736, 918, 20264, 41389, 41389, 38357, 10975, 6465, 6465, 23210, 953, ...   
1        720002           21                 29  [19862, 1461, 19862, 14682, 11425, 18103, 2367, 18786, 14636, 15095, 11101, 10986, 10989, 10986, 36700, 7817, 7809, 14682, 14681, 52180, 14681, 25...   
2        720003           18                 21             [10831, 14513, 14681, 15385, 12270, 18103, 18103, 18103, 18103, 14681, 14681, 14681, 19056, 6974, 14681, 14681, 4751, 14682, 19056, 17018]   
...         ...          ...                ...                                                                                                                                                 

NameError: name 'train_advertiser_id_df' is not defined

In [6]:
train_product_id_df = pd.read_pickle(f'{preprocess_path}/product_id_top3_l32_train.pkl')
valid_product_id_df = pd.read_pickle(f'{preprocess_path}/product_id_top3_l32_valid.pkl')
print(valid_product_id_df)
train_df = train_df.merge(train_product_id_df,on='user_id')
valid_df = valid_df.merge(valid_product_id_df,on='user_id')
#print(train_df)
print(valid_df)


del train_product_id_df
del train_product_id_df

        user_id  product_id_0  product_id_1  product_id_2  product_id_3  product_id_4  product_id_5  product_id_6  product_id_7  product_id_8  product_id_9  product_id_10  product_id_11  product_id_12  product_id_13  product_id_14  product_id_15  product_id_16  product_id_17  \
0        720001      0.204135     -2.837156     -4.074146      0.643553     -1.720915      3.232595     -3.547356     -0.452134      0.858969     -1.846138      -1.528601       1.528533      -5.126956       2.169204      -2.204114      -1.818891      -0.081513      -1.177031   
1        720002     -0.468107      0.210799      1.089811     -0.286071     -3.130891      1.369872     -0.434628      0.195874      0.886335     -1.720489       0.616350      -1.940804       2.639883      -2.568676      -1.609848      -1.415634       0.182829      -2.036438   
2        720003      0.000072     -1.743875     -0.553029     -1.631177     -2.417432     -0.004326     -0.595079      1.729431      0.268238      0.367174       0

        user_id  active_days  click_times_total                                                                                                                                      advertiser_id_seq  \
0        720001           19                 32  [19065, 918, 10985, 10985, 19065, 13732, 7733, 41149, 24894, 16320, 37513, 15736, 918, 20264, 41389, 41389, 38357, 10975, 6465, 6465, 23210, 953, ...   
1        720002           21                 29  [19862, 1461, 19862, 14682, 11425, 18103, 2367, 18786, 14636, 15095, 11101, 10986, 10989, 10986, 36700, 7817, 7809, 14682, 14681, 52180, 14681, 25...   
2        720003           18                 21             [10831, 14513, 14681, 15385, 12270, 18103, 18103, 18103, 18103, 14681, 14681, 14681, 19056, 6974, 14681, 14681, 4751, 14682, 19056, 17018]   
...         ...          ...                ...                                                                                                                                                 

NameError: name 'train_product_id_df' is not defined

In [8]:
train_x = train_df.drop(['age','user_id','gender','advertiser_id_seq','industry_seq','product_id_seq'], axis=1).astype(float)
train_y = train_df['age'].astype(int)

valid_x = valid_df.drop(['age','user_id','gender','advertiser_id_seq','industry_seq','product_id_seq'], axis=1).astype(float)
valid_y = valid_df['age'].astype(int)
#print(x)
#print(y)
print(train_x)
print(train_y)

print(valid_x)
print(valid_y)

#train_x,valid_x,train_y,valid_y =  train_test_split(x,y,test_size=0.20, random_state=42)
#print(train_x.shape)
train_data = lgb.Dataset(train_x.values, label=train_y, feature_name=list(train_x.columns),free_raw_data=False)
valid_data = lgb.Dataset(valid_x.values, label=valid_y, feature_name=list(train_x.columns),free_raw_data=False,reference=train_data)



        active_days  click_times_total  industry_0  industry_1  industry_2  industry_3  industry_4  industry_5  industry_6  industry_7  industry_8  industry_9  industry_10  industry_11  industry_12  industry_13  industry_14  industry_15  industry_top0_0  industry_top0_1  \
0              10.0               14.0   -2.407525   -2.480743    0.258189    0.068892    2.275061    0.609698   -1.124533   -2.170345   -3.801786   -1.069965    -1.431871    -0.779060    -2.454330     3.554383    -3.526832     2.944196        -3.168944        -9.633119   
1              28.0               46.0    0.528324    0.173733    0.413423   -2.342237    2.927688    2.095639    4.499999   -2.768379   -0.016797    0.830495    -3.164093     2.597350    -2.405309     4.605047    -3.062250    -0.915257        -1.735877         2.013226   
2              23.0               30.0   -3.704870   -3.906977    0.328102   -1.694905   -0.775484    2.218845   -2.914866   -2.651409    0.200202    2.303957     0.117288    -0.

In [11]:

gbm =  lgb.Booster(model_file='age_emb1.txt')
print('Feature importances:', list(gbm.feature_importance()))

Feature importances: [2046, 1820, 2427, 1887, 1979, 2219, 2849, 2031, 1826, 1772, 1961, 2335, 2430, 2312, 2378, 2187, 1895, 2719, 411, 395, 422, 375, 498, 384, 258, 442, 447, 473, 501, 536, 476, 330, 404, 469, 793, 889, 749, 722, 752, 982, 752, 665, 845, 875, 756, 869, 921, 851, 776, 654, 809, 603, 1156, 1060, 998, 1070, 1267, 1063, 851, 1114, 1117, 1109, 1138, 1199, 1087, 965, 899, 1003, 499, 1924, 1753, 59, 2601, 1995, 2238, 3130, 2230, 2177, 2732, 2030, 2952, 2731, 3769, 3829, 4102, 2836, 4661, 3136, 5179, 4479, 2464, 2185, 2545, 3247, 2252, 2088, 2597, 3141, 4048, 5965, 2832, 4537, 2991, 1726, 980, 720, 887, 949, 923, 851, 989, 757, 1014, 885, 897, 1066, 891, 877, 936, 934, 1211, 1182, 978, 883, 863, 881, 842, 919, 929, 918, 1037, 1249, 930, 973, 818, 843, 456, 1267, 1109, 1227, 1396, 1278, 1171, 1327, 1085, 1404, 1267, 1208, 1386, 1326, 1122, 1278, 1153, 1477, 1503, 1357, 1230, 1180, 1346, 1201, 1200, 1262, 1185, 1303, 1372, 1211, 1317, 1123, 1155, 363, 1505, 1462, 1353, 1675, 151

In [10]:
before_one_hot =  train_y.values.reshape([-1,1])
print(before_one_hot)
enc = OneHotEncoder()
enc.fit(before_one_hot)

one_hoted_y  = enc.transform(before_one_hot).toarray()
print(one_hoted_y.shape)


[[3]
 [9]
 [6]
 ...
 [3]
 [3]
 [4]]
(720000, 10)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [12]:
y_pred = gbm.predict(train_x.astype(float))
for i in range(len(y_pred)):
        max_value=max(y_pred[i])
        for j in range(len(y_pred[i])):
            if max_value==y_pred[i][j]:
                y_pred[i][j]=1
            else:
                y_pred[i][j]=0
                
print(precision_score(one_hoted_y, y_pred,average='micro'))

ret = []
for user_id,age in zip(range(1000000),y_pred):
    ret.append([int(user_id),int(age.tolist().index(1) + 1)])
ret_df = pd.DataFrame(ret,columns=['user_id','predicted_age'])
print(ret_df['predicted_age'].value_counts())

0.6491541666666667
3     233639
2     123271
4      94318
       ...  
8      22258
9      16694
10      9544
Name: predicted_age, Length: 10, dtype: int64


In [13]:

before_one_hot =  valid_y.values.reshape([-1,1])
print(before_one_hot)
enc = OneHotEncoder()
enc.fit(before_one_hot)

one_hoted_y  = enc.transform(before_one_hot).toarray()
print(one_hoted_y.shape)


[[2]
 [1]
 [1]
 ...
 [3]
 [2]
 [2]]
(180000, 10)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [16]:
y_pred = gbm.predict(valid_x)
for i in range(len(y_pred)):
        max_value=max(y_pred[i])
        for j in range(len(y_pred[i])):
            if max_value==y_pred[i][j]:
                y_pred[i][j]=1
            else:
                y_pred[i][j]=0
                
precision_score(one_hoted_y, y_pred,average='micro')


0.33785

In [17]:
ret = []
for user_id,age in zip(range(1000000),y_pred):
    ret.append([int(user_id),int(age.tolist().index(1) + 1)])
ret_df = pd.DataFrame(ret,columns=['user_id','predicted_age'])
print(ret_df['predicted_age'].value_counts())

3     64410
2     31073
5     24799
      ...  
8      3582
9      2777
10     1705
Name: predicted_age, Length: 10, dtype: int64
