In [1]:
import lightgbm as lgb
import os
import math
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

# Reproducibility: seed the stdlib RNG and NumPy's global RNG with the same value.
seed_value = 2019
random.seed(seed_value)
np.random.seed(seed_value)

# Keep printed DataFrames compact: few rows/columns, wide lines, long cell text allowed.
pd.set_option(
    'display.max_rows', 6,
    'display.max_columns', 10,
    'display.width', 280,
    'display.max_colwidth', 150,
)

# Input locations: raw competition data and the directory holding preprocessed pickles.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [2]:
# Load the per-user base statistics and flatten the column labels by joining their parts
# with '_' (presumably a two-level MultiIndex produced by a groupby/agg — confirm upstream).
user_base_statics_df = pd.read_pickle(f'{preprocess_path}/train_user_base_statics.pkl')
user_base_statics_df.columns = ['_'.join(i) for i in user_base_statics_df.columns.values]

# Attach the labels; the default inner merge keeps only users present on both sides.
label_df = pd.read_csv(f'{data_path}/train_preliminary/user.csv')
user_base_statics_df = user_base_statics_df.merge(label_df, on='user_id')

# Log-scaled click statistics, computed on whole columns instead of one Python call per row.
# NOTE: unlike math.log, np.log returns -inf (with a warning) for 0 rather than raising.
user_base_statics_df['click_times_sum_log'] = np.log(user_base_statics_df['click_times_sum'])
user_base_statics_df['click_times_count_log'] = np.log(user_base_statics_df['click_times_count'])

# Shift the age label down by one (presumably from 1-based to 0-based class ids).
user_base_statics_df['age'] = user_base_statics_df['age'] - 1
user_base_statics_df = user_base_statics_df.astype(float)
print(user_base_statics_df)


         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  ...  industry_nunique  age  gender  click_times_sum_log  click_times_count_log
0            1.0             14.0               13.0           12.0                 6.0  ...               9.0  3.0     1.0             2.639057               2.564949
1            2.0             46.0               45.0           42.0                20.0  ...              15.0  9.0     1.0             3.828641               3.806662
2            3.0             30.0               30.0           30.0                17.0  ...               8.0  6.0     2.0             3.401197               3.401197
...          ...              ...                ...            ...                 ...  ...               ...  ...     ...                  ...                    ...
899997  899998.0             15.0               14.0           14.0                 5.0  ...               5.0  3.0     2.0             2.708050               2

In [3]:
# Split by user id: ids up to 720000 are used for training, ids above it (and below
# 2000000) for validation.
user_ids = user_base_statics_df['user_id']
train_df = user_base_statics_df.loc[user_ids.le(720000)]
valid_df = user_base_statics_df.loc[user_ids.gt(720000) & user_ids.lt(2000000)]
print(valid_df)


         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  ...  industry_nunique  age  gender  click_times_sum_log  click_times_count_log
720000  720001.0             32.0               28.0           26.0                11.0  ...              11.0  2.0     2.0             3.465736               3.332205
720001  720002.0             29.0               29.0           26.0                 2.0  ...              19.0  1.0     2.0             3.367296               3.367296
720002  720003.0             21.0               20.0           17.0                 8.0  ...              10.0  1.0     1.0             3.044522               2.995732
...          ...              ...                ...            ...                 ...  ...               ...  ...     ...                  ...                    ...
899997  899998.0             15.0               14.0           14.0                 5.0  ...               5.0  3.0     2.0             2.708050               2

In [4]:
def merge_features(train_df,valid_df,train_file,valid_file,target_encode=False):
    """Join pickled feature frames onto the train and validation frames by ``user_id``.

    Args:
        train_df: frame to extend with the features stored in ``train_file``.
        valid_df: frame to extend with the features stored in ``valid_file``.
        train_file: pickle file name, resolved relative to ``preprocess_path``.
        valid_file: pickle file name, resolved relative to ``preprocess_path``.
        target_encode: when true, each column label of the loaded frames is collapsed
            with ``'_'.join`` before merging (presumably the labels are tuples from a
            MultiIndex — confirm against the files).

    Returns:
        ``(train_df, valid_df)`` after the merges. The merges use pandas' default join
        type, so users missing from a feature file are dropped.
    """
    def _load(file_name):
        # Read one feature pickle from the preprocess directory.
        return pd.read_pickle(f'{preprocess_path}/{file_name}')

    train_features_df = _load(train_file)
    valid_features_df = _load(valid_file)

    if target_encode:
        for features_df in (train_features_df, valid_features_df):
            features_df.columns = ['_'.join(parts) for parts in features_df.columns.values]

    merged_train = train_df.merge(train_features_df, on='user_id')
    merged_valid = valid_df.merge(valid_features_df, on='user_id')
    return merged_train, merged_valid


In [5]:
# Merge the target-encoded statistics of every id field into both splits.
# Note the train files carry an '_age' suffix while the validation files do not.
target_encoded_fields = ['creative_id','ad_id', 'product_id','advertiser_id','industry','product_category']
for field in target_encoded_fields:
    print(f'merge {field}...')
    train_df, valid_df = merge_features(
        train_df,
        valid_df,
        train_file=f'train_user_target_encoder_{field}_age.pkl',
        valid_file=f'valid_user_target_encoder_{field}.pkl',
        target_encode=True,
    )
    print(train_df)
    print(valid_df)

merge creative_id...
         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  ...  creative_id_age8_kfold_mean_std  creative_id_age9_kfold_mean_min  creative_id_age9_kfold_mean_max  creative_id_age9_kfold_mean_mean  creative_id_age9_kfold_mean_std
0            1.0             14.0               13.0           12.0                 6.0  ...                         0.022296                              0.0                         0.061728                          0.016455                         0.020560
1            2.0             46.0               45.0           42.0                20.0  ...                         0.026155                              0.0                         0.500000                          0.029619                         0.074746
2            3.0             30.0               30.0           30.0                17.0  ...                         0.061345                              0.0                         0.053333           

In [6]:
def merge_w2v_features(train_df,valid_df,train_file,valid_file,f=None,target_encode=False):
    """Join pickled embedding feature frames onto both splits, minus the ``top0`` columns.

    Args:
        train_df: frame to extend with the features stored in ``train_file``.
        valid_df: frame to extend with the features stored in ``valid_file``.
        train_file: pickle file name, resolved relative to ``preprocess_path``.
        valid_file: pickle file name, resolved relative to ``preprocess_path``.
        f: field-name prefix; columns ``{f}_top0_0`` .. ``{f}_top0_63`` are removed from
            each loaded frame. With the default ``None`` the labels become
            ``'None_top0_*'``, so callers are expected to pass a real prefix.
        target_encode: when true, each column label of the loaded frames is collapsed
            with ``'_'.join`` before merging.

    Returns:
        ``(train_df, valid_df)`` after merging on ``user_id`` with pandas' default join.
    """
    dropped_columns = [f'{f}_top0_{i}' for i in range(64)]

    train_features_df = pd.read_pickle(f'{preprocess_path}/{train_file}').drop(columns=dropped_columns)
    # Show the loaded train-side features (after the column drop) for inspection.
    print(train_features_df)
    valid_features_df = pd.read_pickle(f'{preprocess_path}/{valid_file}').drop(columns=dropped_columns)

    if target_encode:
        for features_df in (train_features_df, valid_features_df):
            features_df.columns = ['_'.join(parts) for parts in features_df.columns.values]

    merged_train = train_df.merge(train_features_df, on='user_id')
    merged_valid = valid_df.merge(valid_features_df, on='user_id')
    return merged_train, merged_valid


In [7]:
# Merge the embedding-based features of every id field into both splits.
# The same file (e.g. creative_id_top1_s64_clk.pkl) is used for train and validation.
w2v_fields = ['creative_id','ad_id', 'product_id','advertiser_id','industry']
for field in w2v_fields:
    print(f'merge {field}...')
    w2v_file = f'{field}_top1_s64_clk.pkl'
    train_df, valid_df = merge_w2v_features(
        train_df,
        valid_df,
        w2v_file,
        w2v_file,
        f=field,
        target_encode=False,
    )
    print(train_df)
    print(valid_df)

merge creative_id...
         user_id  creative_id_0  creative_id_1  creative_id_2  creative_id_3  ...  creative_id_top0_64  creative_id_mean  creative_id_std  creative_id_min  creative_id_max
0         2267.0      -1.437453      -0.779419       0.307058       0.399161  ...                    2          1.060976         0.239286                1                2
1       512898.0      -1.954691       0.417370      -0.402261      -0.314112  ...                    2          1.035088         0.184002                1                2
2       524600.0      -1.731089       1.084934      -0.111126       0.813327  ...                    2          1.029412         0.168958                1                2
...          ...            ...            ...            ...            ...  ...                  ...               ...              ...              ...              ...
899997  868400.0      -0.882293       1.556970       0.277414       1.389624  ...                    1          1.00000

In [8]:
# Columns that are not model inputs: the labels, the user key and the raw click totals
# (their log-scaled versions remain among the features).
drop_list = ['age','user_id','gender','click_times_sum','click_times_count']

# Feature matrices and integer age labels for both splits.
train_x, valid_x = (frame.drop(columns=drop_list) for frame in (train_df, valid_df))
train_y, valid_y = (frame['age'].astype(int) for frame in (train_df, valid_df))

print(train_x, train_y, sep='\n')
print(valid_x, valid_y, sep='\n')

# LightGBM datasets. The validation set reuses the training feature names and references
# the training dataset.
feature_names = list(train_x.columns)
train_data = lgb.Dataset(
    train_x.values,
    label=train_y,
    feature_name=feature_names,
    free_raw_data=False,
)
valid_data = lgb.Dataset(
    valid_x.values,
    label=valid_y,
    feature_name=feature_names,
    free_raw_data=False,
    reference=train_data,
)



        ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  ...  industry_top0_64  industry_mean  industry_std  industry_min  industry_max
0                12.0                 6.0                       3.0                   12.0               9.0  ...                 3       1.444444      0.684935             1             3
1                42.0                20.0                       3.0                   36.0              15.0  ...                 9       3.000000      2.422120             1             9
2                30.0                17.0                       6.0                   28.0               8.0  ...                 8       3.750000      2.633913             1             8
...               ...                 ...                       ...                    ...               ...  ...               ...            ...           ...           ...           ...
719997           38.0                21.0              

In [15]:
# Hyper-parameters of the 10-class age model. They are kept for reference only in this
# cell: no training call is made here, the booster is restored from a saved model file.
params = dict(
    boosting_type='gbdt',
    objective='softmax',
    num_class=10,
    metric='multi_error',
    num_leaves=64 + 1,
    learning_rate=0.1,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    bagging_freq=5,
    lambda_l2=0.03,
    verbose=1,
)
print('Starting training...')

# Restore the previously saved booster instead of training.
# (It was saved with: gbm.save_model('model/age_target.model'))
gbm = lgb.Booster(model_file='model/age_target.model')
feature_importances = list(gbm.feature_importance())
print('Feature importances:', feature_importances)

Starting training...
Feature importances: [369, 439, 163, 576, 566, 401, 380, 42, 493, 725, 413, 134, 506, 1239, 565, 165, 539, 1001, 451, 115, 428, 1067, 461, 104, 371, 835, 376, 145, 452, 941, 469, 80, 418, 924, 407, 37, 406, 787, 436, 23, 324, 661, 401, 34, 361, 611, 383, 41, 423, 696, 440, 159, 499, 1521, 621, 228, 532, 1177, 491, 186, 446, 1054, 462, 110, 358, 870, 395, 144, 455, 987, 484, 117, 438, 1147, 415, 55, 362, 951, 393, 30, 317, 702, 441, 32, 332, 580, 337, 346, 337, 337, 305, 440, 459, 396, 304, 362, 463, 312, 249, 338, 402, 309, 277, 362, 373, 402, 299, 316, 448, 380, 279, 423, 402, 303, 267, 375, 328, 229, 271, 247, 285, 218, 252, 216, 382, 257, 282, 790, 499, 743, 386, 1684, 944, 1092, 757, 1138, 883, 873, 522, 792, 1035, 693, 481, 764, 930, 606, 415, 932, 772, 676, 450, 1002, 855, 598, 361, 965, 730, 509, 374, 698, 502, 518, 343, 477, 570, 544, 336, 610, 293, 431, 423, 304, 426, 321, 365, 255, 590, 287, 340, 486, 445, 363, 446, 469, 359, 386, 355, 367, 300, 342, 352,

In [12]:
# One-hot encode the training labels so they can be scored against one-hot predictions.
# The category list is pinned to the configured class ids (0 .. num_class-1) instead of
# being learned from the data: if some class were absent from this split, a learned
# encoding would be narrower than the model's per-class output and the columns would no
# longer line up. NOTE(review): assumes params['num_class'] matches the loaded model.
before_one_hot = train_y.values.reshape([-1, 1])
print(before_one_hot)

class_ids = list(range(params['num_class']))
enc = OneHotEncoder(categories=[class_ids])
one_hoted_y = enc.fit_transform(before_one_hot).toarray()
print(one_hoted_y.shape)


[[3]
 [9]
 [6]
 ...
 [3]
 [3]
 [4]]
(720000, 10)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [14]:
# Keep the real user ids before the merged training frame is released. train_x was
# derived from train_df by dropping columns, so the rows are in the same order as the
# predictions below. Previously the ids were fabricated as "row position + 1", which is
# only right when no user was dropped by the earlier inner merges.
train_user_ids = train_df['user_id'].astype(int).values
del train_df
gc.collect()

# Per-class probabilities, one row per user.
y_pred_percent = gbm.predict(train_x.astype(float))

# Winning class per row, computed in one vectorized call instead of a Python double loop.
# np.argmax returns the first maximum, so exactly one class is marked per row.
predicted_age = np.argmax(y_pred_percent, axis=1)
y_pred = np.zeros_like(y_pred_percent)
y_pred[np.arange(len(predicted_age)), predicted_age] = 1

# Micro-averaged precision between the one-hot labels and the one-hot predictions.
print(precision_score(one_hoted_y, y_pred, average='micro'))

# One output row per user: id, predicted class, full probability vector, true label.
ret = [
    [int(user_id), int(age), age_percent, label_age]
    for user_id, age, age_percent, label_age in zip(train_user_ids, predicted_age, y_pred_percent, train_y)
]
ret_df = pd.DataFrame(ret, columns=['user_id','predicted_age','age_percent','label_age'])
print(ret_df)
ret_df.to_pickle('output/lgb_train_output.pkl')


0.20020416666666666
        user_id  predicted_age                                                                                                                                            age_percent  label_age
0             1              2  [0.03580419457322223, 0.1898360514870551, 0.29164262786260153, 0.22700110675902607, 0.06148453396238852, 0.06269398690908999, 0.08561426040271033,...          3
1             2              2  [0.1401433705682285, 0.19316240655524627, 0.23475777201241146, 0.15779094290523438, 0.08661455342297181, 0.04144567574685301, 0.11511808701112597,...          9
2             3              1  [0.08179875928343638, 0.2798110024675992, 0.21560536296149746, 0.1939984375222692, 0.0729170868976046, 0.07304765763862908, 0.047401501563739096, ...          6
...         ...            ...                                                                                                                                                    ...        ...
719997   719998

In [None]:

# One-hot encode the validation labels so they can be scored against one-hot predictions.
# The category list is pinned to the configured class ids (0 .. num_class-1) instead of
# being learned from the data: if some class were absent from the validation split, a
# learned encoding would be narrower than the model's per-class output and the columns
# would no longer line up. NOTE(review): assumes params['num_class'] matches the model.
before_one_hot = valid_y.values.reshape([-1, 1])
print(before_one_hot)

class_ids = list(range(params['num_class']))
enc = OneHotEncoder(categories=[class_ids])
one_hoted_y = enc.fit_transform(before_one_hot).toarray()
print(one_hoted_y.shape)


In [None]:
# Per-class probabilities for the validation users.
# (The 'precent' spelling is kept on purpose: the export cell reads this exact name.)
y_pred_precent = gbm.predict(valid_x.astype(float))

# Winning class per row, computed in one vectorized call instead of a Python double loop.
# np.argmax returns the first maximum, so exactly one class is marked per row.
winning_class = np.argmax(y_pred_precent, axis=1)
y_pred = np.zeros_like(y_pred_precent)
y_pred[np.arange(len(winning_class)), winning_class] = 1

# Micro-averaged precision between the one-hot labels and the one-hot predictions.
precision_score(one_hoted_y, y_pred, average='micro')


In [None]:
# Use the real user ids of the validation rows. valid_x (and therefore y_pred) was derived
# from valid_df by dropping columns, so the row order matches. Previously the ids were
# fabricated as "row position + 720001", which is only right when the validation users are
# contiguous and none was dropped by the earlier inner merges.
valid_user_ids = valid_df['user_id'].astype(int).values
if len(valid_user_ids) != len(y_pred):
    raise ValueError(
        f'valid_df has {len(valid_user_ids)} rows but there are {len(y_pred)} predictions'
    )

# y_pred holds one-hot rows at this point; argmax recovers the predicted class index.
predicted_ages = np.argmax(y_pred, axis=1)

# One output row per user: id, predicted class, full probability vector, true label.
ret = [
    [int(user_id), int(age), age_percent, label_age]
    for user_id, age, age_percent, label_age in zip(valid_user_ids, predicted_ages, y_pred_precent, valid_y)
]
ret_df = pd.DataFrame(ret, columns=['user_id','predicted_age','age_percent','label_age'])
print(ret_df)
print(ret_df['predicted_age'].value_counts())
ret_df.to_pickle('output/lgb_output.pkl')

In [None]:
# Rows where the predicted age class equals the label.
is_correct = ret_df['predicted_age'].eq(ret_df['label_age'])
true_user_df = ret_df.loc[is_correct]
print(true_user_df)


In [None]:
# Attach the base statistics to the correctly predicted users.
# Re-running this cell merges again onto the already merged frame.
true_user_df = pd.merge(true_user_df, user_base_statics_df, on='user_id')
print(true_user_df)

In [None]:
# How many correctly predicted users have each click_times_count value, ordered by value.
plt_values = true_user_df['click_times_count'].value_counts().sort_index()

# Print with a row limit of 100, then set the limit back to 6.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# The index values are passed to the plot as strings.
x_labels = list(map(str, plt_values.index))
plt.scatter(x_labels, plt_values.values)
plt.show()

In [None]:
# How many correctly predicted users have each click_times_sum value, ordered by value.
plt_values = true_user_df['click_times_sum'].value_counts().sort_index()

# Print with a row limit of 100, then set the limit back to 6.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# The index values are passed to the plot as strings.
x_labels = list(map(str, plt_values.index))
plt.scatter(x_labels, plt_values.values)
plt.show()

In [None]:
# Age-class counts among the correctly predicted users (value_counts order, not sorted by age).
plt_values = true_user_df['age'].value_counts()
print(plt_values)

# The index values are passed to the plot as strings.
x_labels = list(map(str, plt_values.index))
plt.scatter(x_labels, plt_values.values)
plt.show()

In [None]:
# Base statistics restricted to the validation users (user_id above 720000).
is_valid_user = user_base_statics_df['user_id'].gt(720000)
valid_user_base_statics_df = user_base_statics_df.loc[is_valid_user]
print(valid_user_base_statics_df)

In [None]:
# Validation users with more than 10 and at most 20 click records.
# NOTE(review): the variable name says 10-30 but the upper bound applied is 20 — confirm
# which one is intended.
click_counts = valid_user_base_statics_df['click_times_count']
len_10_30_df = valid_user_base_statics_df[(click_counts > 10) & (click_counts <= 20)]

# Show the filtered subset (the cell used to print the unfiltered validation frame here,
# which said nothing about the filter just applied).
print(len_10_30_df)

# Age-class counts within that subset.
plt_values = len_10_30_df['age'].value_counts()
print(plt_values)
x = [str(i) for i in plt_values.index]
y = plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:

# Age-class counts over all validation users (value_counts order, not sorted by age).
plt_values = valid_user_base_statics_df['age'].value_counts()
print(plt_values)

# The index values are passed to the plot as strings.
x_labels = list(map(str, plt_values.index))
plt.scatter(x_labels, plt_values.values)
plt.show()

In [None]:
# How many validation users have each click_times_count value, ordered by value.
plt_values = valid_user_base_statics_df['click_times_count'].value_counts().sort_index()

# Print with a row limit of 100, then set the limit back to 6.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# The index values are passed to the plot as strings.
x_labels = list(map(str, plt_values.index))
plt.scatter(x_labels, plt_values.values)
plt.show()

In [None]:
# How many validation users have each click_times_sum value, ordered by value.
plt_values = valid_user_base_statics_df['click_times_sum'].value_counts().sort_index()

# Print with a row limit of 100, then set the limit back to 6.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# The index values are passed to the plot as strings.
x_labels = list(map(str, plt_values.index))
plt.scatter(x_labels, plt_values.values)
plt.show()

In [None]:
# valid_user_base_statics_df is a filtered selection of user_base_statics_df; writing a
# column into it directly is ambiguous in pandas (SettingWithCopyWarning). Work on an
# explicit copy so the assignment is well defined and the source frame is left untouched.
valid_user_base_statics_df = valid_user_base_statics_df.copy()

# Log of the click total, computed on the whole column rather than one Python call per row.
# NOTE: unlike math.log, np.log returns -inf (with a warning) for 0 rather than raising.
valid_user_base_statics_df['click_times_sum_log'] = np.log(valid_user_base_statics_df['click_times_sum'])
print(valid_user_base_statics_df)

# How many validation users share each log value, ordered by value.
plt_values = valid_user_base_statics_df['click_times_sum_log'].value_counts().sort_index()

# Print with a row limit of 100, then set the limit back to 6.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

x = [str(i) for i in plt_values.index]
y = plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# Show the validation-user statistics frame again for inspection.
print(valid_user_base_statics_df)

In [None]:
# Rows where the predicted age class differs from the label, with the base statistics
# attached by user_id.
is_wrong = ret_df['predicted_age'].ne(ret_df['label_age'])
false_user_df = pd.merge(ret_df.loc[is_wrong], user_base_statics_df, on='user_id')
print(false_user_df)

In [None]:
# How many wrongly predicted users have each click_times_count value, ordered by value.
plt_values = false_user_df['click_times_count'].value_counts().sort_index()

# Print with a row limit of 100, then set the limit back to 6.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# The index values are passed to the plot as strings.
x_labels = list(map(str, plt_values.index))
plt.scatter(x_labels, plt_values.values)
plt.show()