In [1]:
import lightgbm as lgb
import os
import math
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

# Seed both NumPy's and the stdlib RNG so stochastic steps are repeatable.
SEED = 2019
np.random.seed(SEED)
random.seed(SEED)

# Display options: show only a few rows, but allow wide frames.
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)

# Raw dataset location. The default is the original absolute path; set the
# TENCENT_ADS_DATA_PATH environment variable to run on another machine
# without editing the notebook.
data_path = os.environ.get('TENCENT_ADS_DATA_PATH', '/data/workspace/kimi/tencent_ads/2020/dataset')
# Directory (relative to the working directory) holding the preprocessed pickles.
preprocess_path = 'preprocess'

In [2]:
# Load the per-user aggregate statistics produced by the preprocessing step.
user_base_statics_df = pd.read_pickle(f'{preprocess_path}/train_user_base_statics.pkl')
# Collapse each column label into a single '_'-joined string.
# NOTE(review): this presumes every column label is a tuple of strings
# (e.g. a two-level aggregate header) — confirm against the pickle.
user_base_statics_df.columns = ['_'.join(i) for i in user_base_statics_df.columns.values]

# Attach the labels from user.csv; the default inner join keeps only users present in both.
label_df = pd.read_csv(f'{data_path}/train_preliminary/user.csv')
user_base_statics_df = user_base_statics_df.merge(label_df, on='user_id')

# Natural-log transforms, computed in one vectorized call instead of a Python
# call per row. Unlike math.log, np.log yields -inf (with a warning) for 0
# rather than raising.
user_base_statics_df['click_times_sum_log'] = np.log(user_base_statics_df['click_times_sum'])
user_base_statics_df['click_times_count_log'] = np.log(user_base_statics_df['click_times_count'])

# Shift gender down by one so it can be used as a 0-based class index.
user_base_statics_df['gender'] = user_base_statics_df['gender'] - 1
user_base_statics_df = user_base_statics_df.astype(float)
print(user_base_statics_df)


         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique   age  gender  click_times_sum_log  click_times_count_log
0            1.0             14.0               13.0           12.0                 6.0                       3.0                   12.0               9.0   4.0     0.0             2.639057               2.564949
1            2.0             46.0               45.0           42.0                20.0                       3.0                   36.0              15.0  10.0     0.0             3.828641               3.806662
2            3.0             30.0               30.0           30.0                17.0                       6.0                   28.0               8.0   7.0     1.0             3.401197               3.401197
...          ...              ...                ...            ...                 ...                       ...                    ...            

In [3]:
# Hold-out split by user_id: ids up to 720000 train the model, ids in
# (720000, 2000000) are used for validation.
user_ids = user_base_statics_df['user_id']
train_df = user_base_statics_df[user_ids <= 720000]
valid_df = user_base_statics_df[(user_ids > 720000) & (user_ids < 2000000)]
print(valid_df)


         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  age  gender  click_times_sum_log  click_times_count_log
720000  720001.0             32.0               28.0           26.0                11.0                       2.0                   20.0              11.0  3.0     1.0             3.465736               3.332205
720001  720002.0             29.0               29.0           26.0                 2.0                       5.0                   18.0              19.0  2.0     1.0             3.367296               3.367296
720002  720003.0             21.0               20.0           17.0                 8.0                       3.0                   11.0              10.0  2.0     0.0             3.044522               2.995732
...          ...              ...                ...            ...                 ...                       ...                    ...               .

In [4]:
def merge_features(train_df,valid_df,train_file,valid_file,target_encode=False):
    """Join pickled per-user feature tables onto the train and validation frames.

    Args:
        train_df: frame with a ``user_id`` column to extend with train features.
        valid_df: frame with a ``user_id`` column to extend with validation features.
        train_file: pickle file name, relative to ``preprocess_path``, for train features.
        valid_file: pickle file name, relative to ``preprocess_path``, for validation features.
        target_encode: when true, each column label of the loaded tables is
            collapsed into a single ``'_'``-joined string before merging.

    Returns:
        ``(train_df, valid_df)`` after an inner merge on ``user_id``; users
        absent from a feature table are dropped from the corresponding frame.
        Both merged frames are also printed.
    """
    loaded_features = []
    for file_name in (train_file, valid_file):
        features_df = pd.read_pickle(f'{preprocess_path}/{file_name}')
        if target_encode:
            features_df.columns = ['_'.join(parts) for parts in features_df.columns.values]
        loaded_features.append(features_df)
    train_features_df, valid_features_df = loaded_features

    merged_train = train_df.merge(train_features_df, on='user_id')
    merged_valid = valid_df.merge(valid_features_df, on='user_id')
    print(merged_train)
    print(merged_valid)
    return merged_train, merged_valid


In [5]:
# Attach the gender target-encoding features to both splits. Re-running this
# cell merges the same columns a second time, so run it once per kernel.
train_df, valid_df = merge_features(
    train_df,
    valid_df,
    'train_user_target_encoder_gender.pkl',
    'valid_user_target_encoder_gender.pkl',
    target_encode=True,
)

         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique   age  gender  click_times_sum_log  click_times_count_log  product_id_gender0_kfold_mean_min  \
0            1.0             14.0               13.0           12.0                 6.0                       3.0                   12.0               9.0   4.0     0.0             2.639057               2.564949                           0.629201   
1            2.0             46.0               45.0           42.0                20.0                       3.0                   36.0              15.0  10.0     0.0             3.828641               3.806662                           0.572262   
2            3.0             30.0               30.0           30.0                17.0                       6.0                   28.0               8.0   7.0     1.0             3.401197               3.401197                           0.083135

In [6]:
# Build the feature matrices and labels for the gender model.
# (An earlier, commented-out variant of this cell targeted 'age' and also
# dropped the *_seq columns; a random train_test_split was tried as well.)
drop_list = ['age','user_id','gender','click_times_sum','click_times_count']

train_x = train_df.drop(columns=drop_list)
train_y = train_df['gender'].astype(int)

valid_x = valid_df.drop(columns=drop_list)
valid_y = valid_df['gender'].astype(int)

print(train_x, train_y, valid_x, valid_y, sep='\n')

# Both datasets share the training column names; the validation set is tied
# to the training set through `reference`.
feature_names = list(train_x.columns)
train_data = lgb.Dataset(
    train_x.values,
    label=train_y,
    feature_name=feature_names,
    free_raw_data=False,
)
valid_data = lgb.Dataset(
    valid_x.values,
    label=valid_y,
    feature_name=feature_names,
    free_raw_data=False,
    reference=train_data,
)



        ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  product_id_gender0_kfold_mean_min  product_id_gender0_kfold_mean_max  product_id_gender0_kfold_mean_mean  \
0                12.0                 6.0                       3.0                   12.0               9.0             2.639057               2.564949                           0.629201                           0.708436                            0.673089   
1                42.0                20.0                       3.0                   36.0              15.0             3.828641               3.806662                           0.572262                           0.897775                            0.706484   
2                30.0                17.0                       6.0                   28.0               8.0             3.401197               3.401197                           0.083135                           

In [13]:
# The two-class problem is trained with the softmax objective (num_class=2),
# so gbm.predict returns one probability column per class; the evaluation
# cells below rely on that two-column shape.
params = {
    'boosting_type': 'gbdt',
    'objective': 'softmax',
    'num_class':2,
    'metric': 'multi_error',
    'num_leaves': 128 + 1,
    'learning_rate': 0.1,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

# Create the output directory before training: otherwise a missing 'model/'
# folder is only discovered by save_model after the whole run has finished.
model_path = 'model/gender_target_encode.model'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

print('Starting training...')
# train
# NOTE(review): newer LightGBM releases replace the `early_stopping_rounds`
# keyword with the `lgb.early_stopping` callback — confirm against the
# installed version before upgrading.
gbm = lgb.train(params,
                train_data,
                valid_sets=[train_data,valid_data],
                num_boost_round = 2000,
                early_stopping_rounds=100
               )
gbm.save_model(model_path)
print('Feature importances:', list(gbm.feature_importance()))

Starting training...
[1]	training's multi_error: 0.330718	valid_1's multi_error: 0.329294
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_error: 0.330718	valid_1's multi_error: 0.329294
[3]	training's multi_error: 0.229989	valid_1's multi_error: 0.228172
[4]	training's multi_error: 0.138569	valid_1's multi_error: 0.137839
[5]	training's multi_error: 0.121279	valid_1's multi_error: 0.121233
[6]	training's multi_error: 0.113953	valid_1's multi_error: 0.114128
[7]	training's multi_error: 0.109483	valid_1's multi_error: 0.109683
[8]	training's multi_error: 0.106901	valid_1's multi_error: 0.107033
[9]	training's multi_error: 0.105125	valid_1's multi_error: 0.105439
[10]	training's multi_error: 0.103829	valid_1's multi_error: 0.104161
[11]	training's multi_error: 0.102894	valid_1's multi_error: 0.103544
[12]	training's multi_error: 0.102136	valid_1's multi_error: 0.102894
[13]	training's multi_error: 0.101587	valid_1's multi_error: 0.102328
[14]	training's

[116]	training's multi_error: 0.0929681	valid_1's multi_error: 0.0980167
[117]	training's multi_error: 0.0929347	valid_1's multi_error: 0.0980056
[118]	training's multi_error: 0.0929194	valid_1's multi_error: 0.0980056
[119]	training's multi_error: 0.0928597	valid_1's multi_error: 0.0979889
[120]	training's multi_error: 0.0928139	valid_1's multi_error: 0.0979333
[121]	training's multi_error: 0.0927833	valid_1's multi_error: 0.0979722
[122]	training's multi_error: 0.0927403	valid_1's multi_error: 0.0979556
[123]	training's multi_error: 0.0926833	valid_1's multi_error: 0.0979222
[124]	training's multi_error: 0.0926583	valid_1's multi_error: 0.0979444
[125]	training's multi_error: 0.0926389	valid_1's multi_error: 0.0979333
[126]	training's multi_error: 0.0926292	valid_1's multi_error: 0.0979667
[127]	training's multi_error: 0.0925764	valid_1's multi_error: 0.0979333
[128]	training's multi_error: 0.0925194	valid_1's multi_error: 0.0979278
[129]	training's multi_error: 0.0924778	valid_1's m

[229]	training's multi_error: 0.0883861	valid_1's multi_error: 0.0978778
[230]	training's multi_error: 0.0883417	valid_1's multi_error: 0.0978944
[231]	training's multi_error: 0.088275	valid_1's multi_error: 0.0979222
[232]	training's multi_error: 0.0881986	valid_1's multi_error: 0.0979222
[233]	training's multi_error: 0.0881319	valid_1's multi_error: 0.0979056
[234]	training's multi_error: 0.0880708	valid_1's multi_error: 0.0978889
[235]	training's multi_error: 0.0880431	valid_1's multi_error: 0.0978556
[236]	training's multi_error: 0.0880125	valid_1's multi_error: 0.0979278
[237]	training's multi_error: 0.0879694	valid_1's multi_error: 0.0979167
[238]	training's multi_error: 0.0879	valid_1's multi_error: 0.0978667
[239]	training's multi_error: 0.0878681	valid_1's multi_error: 0.0979111
[240]	training's multi_error: 0.0878417	valid_1's multi_error: 0.0979167
[241]	training's multi_error: 0.0878306	valid_1's multi_error: 0.0979167
[242]	training's multi_error: 0.0877889	valid_1's multi

In [11]:
# One-hot encode the training labels as an (n_samples, n_classes) dense array.
before_one_hot = train_y.values.reshape(-1, 1)
print(before_one_hot)

enc = OneHotEncoder()
one_hoted_y = enc.fit(before_one_hot).transform(before_one_hot).toarray()
print(one_hoted_y.shape)


[[0]
 [0]
 [1]
 ...
 [1]
 [0]
 [0]]
(720000, 2)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [12]:
gc.collect()
# Class probabilities for the training users, one column per class.
y_pred = gbm.predict(train_x.astype(float))

# Turn each row into a 0/1 indicator of its maximum probability. This is a
# vectorized equivalent of the former per-row Python loop; tied maxima are
# all marked, as before.
y_pred = (y_pred == y_pred.max(axis=1, keepdims=True)).astype(y_pred.dtype)

# Summarise the predicted classes. The ids come from train_df, which is
# row-aligned with train_x; the old range()-based ids were 0-based and did not
# match the real user ids. The +1 undoes the 0-based shift applied to the label.
# NOTE: the column is named 'predicted_age' for historical reasons but holds
# the predicted gender.
ret_df = pd.DataFrame({
    'user_id': train_df['user_id'].astype(int).values,
    'predicted_age': y_pred.argmax(axis=1) + 1,
})
print(ret_df['predicted_age'].value_counts())

ValueError: Multi-label binary indicator input with different numbers of labels

In [None]:

# One-hot encode the validation labels as an (n_samples, n_classes) dense array.
before_one_hot = valid_y.values.reshape(-1, 1)
print(before_one_hot)

enc = OneHotEncoder()
one_hoted_y = enc.fit(before_one_hot).transform(before_one_hot).toarray()
print(one_hoted_y.shape)


In [None]:
# Class probabilities for the validation users, one column per class.
y_pred = gbm.predict(valid_x.astype(float))

# Turn each row into a 0/1 indicator of its maximum probability. This is a
# vectorized equivalent of the former per-row Python loop; tied maxima are
# all marked, as before.
y_pred = (y_pred == y_pred.max(axis=1, keepdims=True)).astype(y_pred.dtype)

# Micro-averaged precision of the indicator predictions against the one-hot labels.
precision_score(one_hoted_y, y_pred,average='micro')


In [None]:
# Per-user validation results. y_pred and valid_y are both row-aligned with
# valid_df, so the real user ids are taken from it; the old
# `range(...) + 720001` reconstruction only worked while the validation ids
# were contiguous.
# NOTE: the columns are named '*_age' for historical reasons but hold the
# 0-based gender class; later cells depend on these names.
ret_df = pd.DataFrame({
    'user_id': valid_df['user_id'].astype(int).values,
    'predicted_age': y_pred.argmax(axis=1),
    'label_age': valid_y.values,
})
print(ret_df)
print(ret_df['predicted_age'].value_counts())

In [None]:
# Keep the validation users whose predicted class equals the label.
is_correct = ret_df['predicted_age'] == ret_df['label_age']
true_user_df = ret_df.loc[is_correct]
print(true_user_df)


In [None]:
# Bring in the per-user statistics for the correctly classified users.
true_user_df = pd.merge(true_user_df, user_base_statics_df, on='user_id')
print(true_user_df)

In [None]:
# Frequency of each click_times_count value among correctly classified users.
plt_values = true_user_df['click_times_count'].value_counts().sort_index()

# Temporarily raise the row limit so the table is printed in full.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# String labels make matplotlib treat the x axis as categorical.
x = list(map(str, plt_values.index))
y = plt_values.values
fig, ax = plt.subplots()
ax.scatter(x, y)
plt.show()

In [None]:
# Frequency of each click_times_sum value among correctly classified users.
plt_values = true_user_df['click_times_sum'].value_counts().sort_index()

# Temporarily raise the row limit so the table is printed in full.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# String labels make matplotlib treat the x axis as categorical.
x = list(map(str, plt_values.index))
y = plt_values.values
fig, ax = plt.subplots()
ax.scatter(x, y)
plt.show()

In [None]:
# Age distribution of the correctly classified users (most frequent first).
plt_values = true_user_df['age'].value_counts()
print(plt_values)

# String labels make matplotlib treat the x axis as categorical.
x = list(map(str, plt_values.index))
y = plt_values.values
fig, ax = plt.subplots()
ax.scatter(x, y)
plt.show()

In [None]:
# Statistics of every user with an id above the training range.
is_valid_user = user_base_statics_df['user_id'] > 720000
valid_user_base_statics_df = user_base_statics_df.loc[is_valid_user]
print(valid_user_base_statics_df)

In [None]:
# Validation users with 10 < click_times_count <= 20.
# NOTE(review): the variable name says "10_30" but the upper bound is 20 —
# confirm which one is intended.
click_counts = valid_user_base_statics_df['click_times_count']
len_10_30_df = valid_user_base_statics_df[(click_counts > 10) & (click_counts <= 20)]
# Print the filtered bucket; the cell previously printed the unfiltered frame.
print(len_10_30_df)

# Age distribution inside the bucket (most frequent first).
plt_values = len_10_30_df['age'].value_counts()
print(plt_values)
# String labels make matplotlib treat the x axis as categorical.
x = [str(i) for i in plt_values.index]
y = plt_values.values
plt.scatter(x,y)
plt.show()

In [None]:

# Age distribution of the validation users (most frequent first).
plt_values = valid_user_base_statics_df['age'].value_counts()
print(plt_values)

# String labels make matplotlib treat the x axis as categorical.
x = list(map(str, plt_values.index))
y = plt_values.values
fig, ax = plt.subplots()
ax.scatter(x, y)
plt.show()

In [None]:
# Frequency of each click_times_count value among the validation users.
plt_values = valid_user_base_statics_df['click_times_count'].value_counts().sort_index()

# Temporarily raise the row limit so the table is printed in full.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# String labels make matplotlib treat the x axis as categorical.
x = list(map(str, plt_values.index))
y = plt_values.values
fig, ax = plt.subplots()
ax.scatter(x, y)
plt.show()

In [None]:
# Frequency of each click_times_sum value among the validation users.
plt_values = valid_user_base_statics_df['click_times_sum'].value_counts().sort_index()

# Temporarily raise the row limit so the table is printed in full.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# String labels make matplotlib treat the x axis as categorical.
x = list(map(str, plt_values.index))
y = plt_values.values
fig, ax = plt.subplots()
ax.scatter(x, y)
plt.show()

In [None]:
# (Re)compute log(click_times_sum) for the validation users.
# valid_user_base_statics_df is a filtered slice of user_base_statics_df, so
# assigning a column on it in place triggers SettingWithCopyWarning and may
# not take effect; .assign builds a new frame instead.
valid_user_base_statics_df = valid_user_base_statics_df.assign(
    click_times_sum_log=np.log(valid_user_base_statics_df['click_times_sum'])
)
print(valid_user_base_statics_df)

# Frequency of each distinct log value, ordered by value.
plt_values = valid_user_base_statics_df['click_times_sum_log'].value_counts().sort_index()
# Temporarily raise the row limit so more of the table is printed.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)
# String labels make matplotlib treat the x axis as categorical.
x = [str(i) for i in plt_values.index]
y = plt_values.values
plt.scatter(x,y)
plt.show()

In [None]:
# Show the validation-user statistics frame again (truncated by the display.max_rows option).
print(valid_user_base_statics_df)

In [None]:
# Validation users whose predicted class differs from the label, joined with
# their per-user statistics.
is_wrong = ret_df['predicted_age'] != ret_df['label_age']
false_user_df = pd.merge(ret_df.loc[is_wrong], user_base_statics_df, on='user_id')
print(false_user_df)

In [None]:
# Frequency of each click_times_count value among the misclassified users.
plt_values = false_user_df['click_times_count'].value_counts().sort_index()

# Temporarily raise the row limit so the table is printed in full.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# String labels make matplotlib treat the x axis as categorical.
x = list(map(str, plt_values.index))
y = plt_values.values
fig, ax = plt.subplots()
ax.scatter(x, y)
plt.show()