In [1]:
import lightgbm as lgb
import os
import math
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

# Seed NumPy's and the standard library's global RNGs with the same value.
for _seed_rng in (np.random.seed, random.seed):
    _seed_rng(2019)

# Keep printed DataFrames compact: few rows and columns, but wide lines and cells.
for _option, _value in (
    ('display.max_rows', 6),
    ('display.max_columns', 10),
    ('display.width', 280),
    ('display.max_colwidth', 150),
):
    pd.set_option(_option, _value)

# data_path: dataset root (the label CSV is read from below it).
# preprocess_path: directory the preprocessed pickles are read from.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [4]:
# Per-user aggregate statistics. Each column label in the pickle is a sequence of
# strings that is flattened by joining with "_" (e.g. "click_times_sum").
user_base_statics_df = pd.read_pickle(f'{preprocess_path}/train_user_base_statics.pkl')
user_base_statics_df.columns = ["_".join(i) for i in user_base_statics_df.columns.values]
print(user_base_statics_df)

# Attach the labels from user.csv; the default inner join keeps only users
# present in both tables.
label_df = pd.read_csv(f'{data_path}/train_preliminary/user.csv')
user_base_statics_df = user_base_statics_df.merge(label_df, on='user_id')

# Natural log of the click counters, computed on the whole column at once
# instead of calling math.log once per row through .apply().
# NOTE: a zero counter would now yield -inf (with a RuntimeWarning) rather than
# raising ValueError as math.log does.
user_base_statics_df['click_times_sum_log'] = np.log(user_base_statics_df['click_times_sum'])
user_base_statics_df['click_times_count_log'] = np.log(user_base_statics_df['click_times_count'])

# Shift age down by one; it is later cast to int and used as the class label.
user_base_statics_df['age'] = user_base_statics_df['age'] - 1
user_base_statics_df = user_base_statics_df.astype(float)
print(user_base_statics_df)


          click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique
user_id                                                                                                                                           
1.0                  14.0                 13             12                   6                         3                     12                 9
2.0                  46.0                 45             42                  20                         3                     36                15
3.0                  30.0                 30             30                  17                         6                     28                 8
...                   ...                ...            ...                 ...                       ...                    ...               ...
899998.0             15.0                 14             14                   5                         4             

In [5]:
def merge_data(dim,df=None,top=1,size=64,pivot='user_id'):
    """Load the pickled table for ``dim`` and optionally join it onto ``df``.

    The table is read from ``{preprocess_path}/{dim}_top{top}_l{size}``.

    Args:
        dim: Dimension name (e.g. ``'ad_id'``). Must be a ``str`` because it is
            concatenated into the progress message.
        df: Frame to extend. When ``None`` the loaded table is returned as is.
        top: Value of the ``top`` component of the pickle file name.
        size: Value of the ``l`` component of the pickle file name.
        pivot: Column name to join on.

    Returns:
        The loaded table when ``df`` is ``None``; otherwise ``df`` left-joined
        with the loaded table on ``pivot``, so every row of ``df`` is kept.
    """
    print('start merge ' + dim)
    dim_table = pd.read_pickle(f'{preprocess_path}/{dim}_top{top}_l{size}')
    return dim_table if df is None else df.merge(dim_table, on=pivot, how='left')
    

In [6]:
# Dimensions whose pickled per-user tables are left-joined, in this order,
# onto the statistics frame.
dim_list = ['creative_id','ad_id','product_id','advertiser_id','industry']
for dim_name in dim_list:
    user_base_statics_df = merge_data(dim_name, df=user_base_statics_df)
print(user_base_statics_df)


start merge creative_id
start merge ad_id
start merge product_id
start merge advertiser_id
start merge industry
         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  age  gender  click_times_sum_log  click_times_count_log  creative_id_0  creative_id_1  creative_id_2  creative_id_3  \
0            1.0             14.0               13.0           12.0                 6.0                       3.0                   12.0               9.0  3.0     1.0             2.639057               2.564949       0.412025       0.547905       0.658964      -1.583155   
1            2.0             46.0               45.0           42.0                20.0                       3.0                   36.0              15.0  9.0     1.0             3.828641               3.806662       0.740653       0.077695      -0.587430      -0.909381   
2            3.0             30.0               30.0           

In [7]:
# Split by user_id: ids up to 720000 go to training, ids strictly between
# 720000 and 2000000 go to validation.
user_ids = user_base_statics_df.user_id
train_df = user_base_statics_df[user_ids <= 720000]
valid_df = user_base_statics_df[(user_ids > 720000) & (user_ids < 2000000)]
print(valid_df)


         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  age  gender  click_times_sum_log  click_times_count_log  creative_id_0  creative_id_1  creative_id_2  creative_id_3  \
720000  720001.0             32.0               28.0           26.0                11.0                       2.0                   20.0              11.0  2.0     2.0             3.465736               3.332205      -0.160892      -0.309415      -1.075792      -2.426498   
720001  720002.0             29.0               29.0           26.0                 2.0                       5.0                   18.0              19.0  1.0     2.0             3.367296               3.367296      -0.130559       1.684029       0.642178      -0.535968   
720002  720003.0             21.0               20.0           17.0                 8.0                       3.0                   11.0              10.0  1.0     1.0        

In [8]:
# Columns withheld from the model: both labels, the id, and the raw click
# counters (their *_log counterparts stay in as features).
drop_list = ['age','user_id','gender','click_times_sum','click_times_count']

# Feature matrices and integer age labels for each split.
train_x, valid_x = (frame.drop(columns=drop_list) for frame in (train_df, valid_df))
train_y, valid_y = (frame['age'].astype(int) for frame in (train_df, valid_df))

for shown in (train_x, train_y, valid_x, valid_y):
    print(shown)

# Both datasets use the training column names; the validation set references
# the training set.
feature_names = list(train_x.columns)
train_data = lgb.Dataset(
    train_x.values,
    label=train_y,
    feature_name=feature_names,
    free_raw_data=False,
)
valid_data = lgb.Dataset(
    valid_x.values,
    label=valid_y,
    feature_name=feature_names,
    free_raw_data=False,
    reference=train_data,
)



        ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  creative_id_0  creative_id_1  creative_id_2  creative_id_3  creative_id_4  creative_id_5  creative_id_6  creative_id_7  \
0                12.0                 6.0                       3.0                   12.0               9.0             2.639057               2.564949       0.412025       0.547905       0.658964      -1.583155       0.728377      -0.486789      -1.981210      -3.009078   
1                42.0                20.0                       3.0                   36.0              15.0             3.828641               3.806662       0.740653       0.077695      -0.587430      -0.909381       2.121953      -2.294639      -1.990913      -3.093746   
2                30.0                17.0                       6.0                   28.0               8.0             3.401197               3.401197       0.360184     

        ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  click_times_sum_log  click_times_count_log  creative_id_0  creative_id_1  creative_id_2  creative_id_3  creative_id_4  creative_id_5  creative_id_6  creative_id_7  \
720000           26.0                11.0                       2.0                   20.0              11.0             3.465736               3.332205      -0.160892      -0.309415      -1.075792      -2.426498       0.398152      -1.482154       0.168960      -1.471193   
720001           26.0                 2.0                       5.0                   18.0              19.0             3.367296               3.367296      -0.130559       1.684029       0.642178      -0.535968       0.445029       0.301784      -0.808392      -1.956358   
720002           17.0                 8.0                       3.0                   11.0              10.0             3.044522               2.995732       2.752069     

In [None]:
# 10-class softmax GBDT, evaluated by multi-class error rate.
params = {
    'boosting_type': 'gbdt',
    'objective': 'softmax',
    'num_class':10,
    'metric': 'multi_error',
    'num_leaves': 128,
    'learning_rate': 0.1,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'verbose': 1
}
print('Starting training...')

# Early stopping is requested through the callback API rather than the
# `early_stopping_rounds=` keyword: the keyword was removed from lgb.train in
# LightGBM 4.0, while the callback is accepted by old and new releases.
# On LightGBM >= 4.0 per-iteration metrics are only printed if a logging
# callback is added as well.
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=2000,
    valid_sets=[train_data, valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)
gbm.save_model('age_emb1.txt')
print('Feature importances:', list(gbm.feature_importance()))

Starting training...
[1]	training's multi_error: 0.773507	valid_1's multi_error: 0.772628
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_error: 0.772801	valid_1's multi_error: 0.772128
[3]	training's multi_error: 0.763828	valid_1's multi_error: 0.76445
[4]	training's multi_error: 0.754464	valid_1's multi_error: 0.756022
[5]	training's multi_error: 0.743831	valid_1's multi_error: 0.746789
[6]	training's multi_error: 0.73109	valid_1's multi_error: 0.736228
[7]	training's multi_error: 0.718589	valid_1's multi_error: 0.726661
[8]	training's multi_error: 0.707368	valid_1's multi_error: 0.717233
[9]	training's multi_error: 0.697119	valid_1's multi_error: 0.70925
[10]	training's multi_error: 0.687982	valid_1's multi_error: 0.702411
[11]	training's multi_error: 0.679885	valid_1's multi_error: 0.6971
[12]	training's multi_error: 0.67286	valid_1's multi_error: 0.692439
[13]	training's multi_error: 0.666951	valid_1's multi_error: 0.688011
[14]	training's multi

In [None]:
# Training labels as an (n, 1) column, the 2-D layout the encoder is fitted on.
before_one_hot = train_y.values.reshape(-1, 1)
print(before_one_hot)

# Fit the encoder on these labels and densify the encoded matrix in one step.
enc = OneHotEncoder()
one_hoted_y = enc.fit_transform(before_one_hot).toarray()
print(one_hoted_y.shape)


In [None]:
gc.collect()

# Score the training rows, then mark exactly one predicted class per row.
# argmax replaces the nested Python loops over every (row, class) cell and,
# unlike the equality test against the row maximum, never sets more than one
# 1 per row when probabilities tie (the first maximum wins).
train_proba = gbm.predict(train_x.astype(float))
train_pred_class = np.argmax(train_proba, axis=1)
y_pred = np.zeros_like(train_proba)
y_pred[np.arange(len(train_pred_class)), train_pred_class] = 1

print(precision_score(one_hoted_y, y_pred, average='micro'))

# Predicted age shifted back up by one (labels were shifted down before training).
# user_id comes from train_df, whose rows align with train_x, instead of a
# 0-based range() that did not match the real ids.
ret_df = pd.DataFrame({
    'user_id': train_df['user_id'].astype(int).values,
    'predicted_age': train_pred_class + 1,
})
print(ret_df['predicted_age'].value_counts())

In [None]:

# Validation labels as an (n, 1) column, the 2-D layout the encoder is fitted on.
before_one_hot = valid_y.values.reshape(-1, 1)
print(before_one_hot)

# Fit a fresh encoder on these labels and densify the encoded matrix in one step.
enc = OneHotEncoder()
one_hoted_y = enc.fit_transform(before_one_hot).toarray()
print(one_hoted_y.shape)


In [None]:
# Score the validation rows and turn each probability row into a one-hot row.
# argmax replaces the nested Python loops over every (row, class) cell and
# guarantees a single 1 per row even when two classes tie for the maximum
# (the first maximum wins).
valid_proba = gbm.predict(valid_x.astype(float))
y_pred = np.zeros_like(valid_proba)
y_pred[np.arange(len(valid_proba)), np.argmax(valid_proba, axis=1)] = 1

# Left as the cell's last expression so the notebook displays the score.
precision_score(one_hoted_y, y_pred,average='micro')


In [None]:
# One row per validation user: real user_id, predicted class and true class.
# The ids are read from valid_df (row-aligned with valid_x / y_pred / valid_y)
# instead of being synthesised as `position + 720001`, which assumed the
# validation ids are contiguous and silently truncated at 1,000,000 rows.
# argmax on the one-hot rows returns the position of the first 1.
ret_df = pd.DataFrame({
    'user_id': valid_df['user_id'].astype(int).values,
    'predicted_age': np.argmax(y_pred, axis=1),
    'label_age': valid_y.values,
})
print(ret_df)
print(ret_df['predicted_age'].value_counts())

In [None]:
# Rows where the predicted class equals the label.
is_correct = ret_df['predicted_age'] == ret_df['label_age']
true_user_df = ret_df[is_correct]
print(true_user_df)


In [None]:
# Attach every column of the statistics frame to the correctly predicted users
# (inner join on user_id).
true_user_df = pd.merge(true_user_df, user_base_statics_df, on='user_id')
print(true_user_df)

In [None]:
# Number of correctly predicted users per click_times_count value, ordered by value.
plt_values = true_user_df['click_times_count'].value_counts().sort_index()

# Raise the row limit for this one printout, then put it back to 6.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# The index is plotted as strings, so x is a categorical axis.
x = list(map(str, plt_values.index))
y = plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# Number of correctly predicted users per click_times_sum value, ordered by value.
plt_values = true_user_df['click_times_sum'].value_counts().sort_index()

# Raise the row limit for this one printout, then put it back to 6.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# The index is plotted as strings, so x is a categorical axis.
x = list(map(str, plt_values.index))
y = plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# Number of correctly predicted users per age value (most frequent first).
plt_values = true_user_df['age'].value_counts()
print(plt_values)

# The index is plotted as strings, so x is a categorical axis.
x = list(map(str, plt_values.index))
y = plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# Statistics rows of the validation users (user_id above 720000).
is_valid_user = user_base_statics_df['user_id'] > 720000
valid_user_base_statics_df = user_base_statics_df[is_valid_user]
print(valid_user_base_statics_df)

In [None]:
# Validation users whose click_times_count lies in (10, 20].
# NOTE(review): the variable name says "10_30" but the upper bound applied is
# 20 — confirm which one is intended.
click_counts = valid_user_base_statics_df.click_times_count
len_10_30_df = valid_user_base_statics_df[(click_counts > 10) & (click_counts <= 20)]

# Print the subset that was just built; the unfiltered validation frame was
# printed here before, which showed nothing about the filter.
print(len_10_30_df)

# Number of users per age value inside that band.
plt_values = len_10_30_df['age'].value_counts()
print(plt_values)

# The index is plotted as strings, so x is a categorical axis.
x = [str(i) for i in plt_values.index]
y = plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:

# Number of validation users per age value (most frequent first).
plt_values = valid_user_base_statics_df['age'].value_counts()
print(plt_values)

# The index is plotted as strings, so x is a categorical axis.
x = list(map(str, plt_values.index))
y = plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# Number of validation users per click_times_count value, ordered by value.
plt_values = valid_user_base_statics_df['click_times_count'].value_counts().sort_index()

# Raise the row limit for this one printout, then put it back to 6.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# The index is plotted as strings, so x is a categorical axis.
x = list(map(str, plt_values.index))
y = plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# Number of validation users per click_times_sum value, ordered by value.
plt_values = valid_user_base_statics_df['click_times_sum'].value_counts().sort_index()

# Raise the row limit for this one printout, then put it back to 6.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# The index is plotted as strings, so x is a categorical axis.
x = list(map(str, plt_values.index))
y = plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# (Re)compute the natural log of click_times_sum for the validation users.
# valid_user_base_statics_df is a boolean-filtered slice of another frame, so
# assigning a column on it in place raises SettingWithCopyWarning; .assign()
# builds a new frame instead. np.log works on the whole column at once rather
# than calling math.log per row.
valid_user_base_statics_df = valid_user_base_statics_df.assign(
    click_times_sum_log=np.log(valid_user_base_statics_df['click_times_sum'])
)
print(valid_user_base_statics_df)

# Number of validation users per log value, ordered by value.
plt_values = valid_user_base_statics_df['click_times_sum_log'].value_counts().sort_index()

# Raise the row limit for this one printout, then put it back to 6.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# The index is plotted as strings, so x is a categorical axis.
x = [str(i) for i in plt_values.index]
y = plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# Show the validation statistics frame again (abbreviated by the display options).
print(valid_user_base_statics_df)

In [None]:
# Rows where the predicted class differs from the label, joined (inner, on
# user_id) with every column of the statistics frame.
is_wrong = ret_df['predicted_age'] != ret_df['label_age']
false_user_df = pd.merge(ret_df[is_wrong], user_base_statics_df, on='user_id')
print(false_user_df)

In [None]:
# Number of wrongly predicted users per click_times_count value, ordered by value.
plt_values = false_user_df['click_times_count'].value_counts().sort_index()

# Raise the row limit for this one printout, then put it back to 6.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# The index is plotted as strings, so x is a categorical axis.
x = list(map(str, plt_values.index))
y = plt_values.values
plt.scatter(x, y)
plt.show()