In [1]:
import lightgbm as lgb
import os
import math
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

# Seed NumPy's and Python's global RNGs with a fixed value.
np.random.seed(2019)
random.seed(2019)

# Keep printed DataFrames compact: few rows/columns, but wide lines.
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)

# Data locations. The defaults are the original hard-coded values; set the
# environment variables below to run the notebook on another machine without
# editing this cell.
data_path = os.environ.get('TENCENT_ADS_DATA_PATH', '/data/workspace/kimi/tencent_ads/2020/dataset')
preprocess_path = os.environ.get('TENCENT_ADS_PREPROCESS_PATH', 'preprocess')

In [2]:
# Load the pickled per-user statistics and flatten every column label by
# joining its parts with '_' (assumes the labels are tuples of strings, e.g.
# from a multi-level aggregation -- TODO confirm against the preprocessing step).
user_base_statics_df = pd.read_pickle(f'{preprocess_path}/train_user_base_statics.pkl')
user_base_statics_df.columns = ['_'.join(parts) for parts in user_base_statics_df.columns.values]

# Attach the labels from user.csv (inner join on user_id).
label_df = pd.read_csv(f'{data_path}/train_preliminary/user.csv')
user_base_statics_df = user_base_statics_df.merge(label_df, on='user_id')

# Natural-log versions of the two click counters.
for count_col in ('click_times_sum', 'click_times_count'):
    user_base_statics_df[f'{count_col}_log'] = user_base_statics_df[count_col].map(math.log)

# Shift age down by one (presumably to get 0-based class ids for the model).
user_base_statics_df['age'] -= 1
user_base_statics_df = user_base_statics_df.astype(float)
print(user_base_statics_df)


         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  ...  industry_nunique  age  gender  click_times_sum_log  click_times_count_log
0            1.0             14.0               13.0           12.0                 6.0  ...               9.0  3.0     1.0             2.639057               2.564949
1            2.0             46.0               45.0           42.0                20.0  ...              15.0  9.0     1.0             3.828641               3.806662
2            3.0             30.0               30.0           30.0                17.0  ...               8.0  6.0     2.0             3.401197               3.401197
...          ...              ...                ...            ...                 ...  ...               ...  ...     ...                  ...                    ...
899997  899998.0             15.0               14.0           14.0                 5.0  ...               5.0  3.0     2.0             2.708050               2

In [3]:
# Hold-out split by user_id: ids up to 720000 train, ids in (720000, 2000000) validate.
user_ids = user_base_statics_df['user_id']
train_df = user_base_statics_df[user_ids <= 720000]
valid_df = user_base_statics_df[(user_ids > 720000) & (user_ids < 2000000)]
print(valid_df)


         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  ...  industry_nunique  age  gender  click_times_sum_log  click_times_count_log
720000  720001.0             32.0               28.0           26.0                11.0  ...              11.0  2.0     2.0             3.465736               3.332205
720001  720002.0             29.0               29.0           26.0                 2.0  ...              19.0  1.0     2.0             3.367296               3.367296
720002  720003.0             21.0               20.0           17.0                 8.0  ...              10.0  1.0     1.0             3.044522               2.995732
...          ...              ...                ...            ...                 ...  ...               ...  ...     ...                  ...                    ...
899997  899998.0             15.0               14.0           14.0                 5.0  ...               5.0  3.0     2.0             2.708050               2

In [4]:
def merge_features(train_df,valid_df,train_file,valid_file,target_encode=False):
    """Join pickled feature tables onto the train and validation frames.

    Args:
        train_df: frame to extend with the features stored in ``train_file``.
        valid_df: frame to extend with the features stored in ``valid_file``.
        train_file: pickle file name, relative to the global ``preprocess_path``.
        valid_file: pickle file name, relative to the global ``preprocess_path``.
        target_encode: when true, each column label of both loaded tables is
            replaced by its parts joined with ``'_'`` before merging.

    Returns:
        ``(train_df, valid_df)`` after a default (inner) merge on ``user_id``.
    """
    # NOTE: read_pickle must only be used on trusted, locally produced files.
    loaded = [pd.read_pickle(f'{preprocess_path}/{name}') for name in (train_file, valid_file)]

    if target_encode:
        for table in loaded:
            table.columns = ['_'.join(parts) for parts in table.columns.values]

    train_extra, valid_extra = loaded
    merged_train = train_df.merge(train_extra, on='user_id')
    merged_valid = valid_df.merge(valid_extra, on='user_id')
    return merged_train, merged_valid


In [5]:
# Join the target-encoding feature tables for each ad attribute onto both frames.
target_encoded_fields = ['creative_id','ad_id', 'product_id','advertiser_id','industry','product_category']
for field in target_encoded_fields:
    print(f'merge {field}...')
    train_file = f'train_user_target_encoder_{field}_age.pkl'
    valid_file = f'valid_user_target_encoder_{field}.pkl'
    train_df, valid_df = merge_features(train_df, valid_df, train_file, valid_file, target_encode=True)
    print(train_df)
    print(valid_df)

merge creative_id...
         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  ...  creative_id_age8_kfold_mean_std  creative_id_age9_kfold_mean_min  creative_id_age9_kfold_mean_max  creative_id_age9_kfold_mean_mean  creative_id_age9_kfold_mean_std
0            1.0             14.0               13.0           12.0                 6.0  ...                         0.022296                              0.0                         0.061728                          0.016455                         0.020560
1            2.0             46.0               45.0           42.0                20.0  ...                         0.026155                              0.0                         0.500000                          0.029619                         0.074746
2            3.0             30.0               30.0           30.0                17.0  ...                         0.061345                              0.0                         0.053333           

         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  ...  advertiser_id_age8_kfold_mean_std  advertiser_id_age9_kfold_mean_min  advertiser_id_age9_kfold_mean_max  advertiser_id_age9_kfold_mean_mean  advertiser_id_age9_kfold_mean_std
0            1.0             14.0               13.0           12.0                 6.0  ...                           0.015910                           0.001097                           0.045790                            0.014564                           0.013377
1            2.0             46.0               45.0           42.0                20.0  ...                           0.020615                           0.000000                           0.111888                            0.025346                           0.023220
2            3.0             30.0               30.0           30.0                17.0  ...                           0.053055                           0.000000                           0.07

         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  ...  product_category_age8_kfold_mean_std  product_category_age9_kfold_mean_min  product_category_age9_kfold_mean_max  product_category_age9_kfold_mean_mean  \
0            1.0             14.0               13.0           12.0                 6.0  ...                              0.002065                              0.009877                              0.015052                               0.012807   
1            2.0             46.0               45.0           42.0                20.0  ...                              0.014790                              0.013372                              0.091313                               0.017904   
2            3.0             30.0               30.0           30.0                17.0  ...                              0.013904                              0.013372                              0.091177                               0.018247   
... 

In [6]:
def merge_w2v_features(train_df,valid_df,train_file,valid_file,f=None,target_encode=False):
    """Join pickled feature tables onto both frames, dropping the ``top0`` block.

    Args:
        train_df: frame to extend with the features stored in ``train_file``.
        valid_df: frame to extend with the features stored in ``valid_file``.
        train_file: pickle file name, relative to the global ``preprocess_path``.
        valid_file: pickle file name, relative to the global ``preprocess_path``.
        f: column prefix. The 64 columns ``{f}_top0_0`` .. ``{f}_top0_63`` are
            removed from both loaded tables. With the default ``None`` nothing
            is dropped (previously the default produced ``'None_top0_i'``
            names and ``drop`` raised ``KeyError``).
        target_encode: when true, each column label of both loaded tables is
            replaced by its parts joined with ``'_'`` before merging.

    Returns:
        ``(train_df, valid_df)`` after a default (inner) merge on ``user_id``.

    Raises:
        KeyError: if ``f`` is given and any ``{f}_top0_{i}`` column is missing.
    """
    top0_columns = [] if f is None else [f'{f}_top0_{i}' for i in range(64)]

    # NOTE: read_pickle must only be used on trusted, locally produced files.
    train_features_df = pd.read_pickle(f'{preprocess_path}/{train_file}').drop(top0_columns, axis=1)
    print(train_features_df)
    valid_features_df = pd.read_pickle(f'{preprocess_path}/{valid_file}').drop(top0_columns, axis=1)

    if target_encode:
        train_features_df.columns = ['_'.join(parts) for parts in train_features_df.columns.values]
        valid_features_df.columns = ['_'.join(parts) for parts in valid_features_df.columns.values]

    train_df = train_df.merge(train_features_df, on='user_id')
    valid_df = valid_df.merge(valid_features_df, on='user_id')
    return train_df, valid_df


In [7]:
# Join the '<field>_top1_l64' feature table for each ad attribute; the same
# file is used for both the train and the validation frame.
w2v_fields = ['creative_id','ad_id', 'product_id','advertiser_id','industry']
for field in w2v_fields:
    print(f'merge {field}...')
    w2v_file = f'{field}_top1_l64'
    train_df, valid_df = merge_w2v_features(train_df, valid_df, w2v_file, w2v_file, f=field, target_encode=False)
    print(train_df)
    print(valid_df)

merge creative_id...
         user_id  creative_id_0  creative_id_1  creative_id_2  creative_id_3  ...  creative_id_63  creative_id_top0_64  creative_id_mean  creative_id_std  creative_id_min
0         2267.0      -0.586680       0.305217      -0.972132      -0.620510  ...       -0.376148                    2          1.060976         0.239286                1
1       512898.0      -0.323687       0.625864       0.309842       0.228552  ...       -0.414702                    2          1.035088         0.184002                1
2       524600.0      -0.038975       0.117351       0.293796       0.167996  ...       -1.145435                    2          1.029412         0.168958                1
...          ...            ...            ...            ...            ...  ...             ...                  ...               ...              ...              ...
899997  868400.0       0.454240       0.360373       0.742856      -0.225245  ...       -0.395166                    1      

         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  ...  product_id_63  product_id_top0_64  product_id_mean  product_id_std  product_id_min
0            1.0             14.0               13.0           12.0                 6.0  ...       0.366167                   7         2.166667        2.192158               1
1            2.0             46.0               45.0           42.0                20.0  ...       0.685685                  14         2.250000        3.014548               1
2            3.0             30.0               30.0           30.0                17.0  ...      -1.398228                  10         1.764706        2.101244               1
...          ...              ...                ...            ...                 ...  ...            ...                 ...              ...             ...             ...
719997  719998.0             41.0               38.0           38.0                21.0  ...      -1.293426        

         user_id  click_times_sum  click_times_count  ad_id_nunique  product_id_nunique  ...  industry_63  industry_top0_64  industry_mean  industry_std  industry_min
0            1.0             14.0               13.0           12.0                 6.0  ...    -0.725836                 3       1.444444      0.684935             1
1            2.0             46.0               45.0           42.0                20.0  ...    -0.171384                 9       3.000000      2.422120             1
2            3.0             30.0               30.0           30.0                17.0  ...     1.569111                 8       3.750000      2.633913             1
...          ...              ...                ...            ...                 ...  ...          ...               ...            ...           ...           ...
719997  719998.0             41.0               38.0           38.0                21.0  ...    -1.763492                 9       1.809524      1.815779             

In [8]:
# Columns excluded from the feature matrix: both labels, the id, and the raw
# click counters (their log versions remain as features).
drop_list = ['age','user_id','gender','click_times_sum','click_times_count']

# Feature matrix / integer age label for each split.
train_x, train_y = train_df.drop(columns=drop_list), train_df['age'].astype(int)
valid_x, valid_y = valid_df.drop(columns=drop_list), valid_df['age'].astype(int)

for frame in (train_x, train_y, valid_x, valid_y):
    print(frame)

# Both datasets share the training column names; the validation set is built
# with the training set as its reference.
feature_names = list(train_x.columns)
train_data = lgb.Dataset(train_x.values, label=train_y, feature_name=feature_names, free_raw_data=False)
valid_data = lgb.Dataset(valid_x.values, label=valid_y, feature_name=feature_names, free_raw_data=False, reference=train_data)



        ad_id_nunique  product_id_nunique  product_category_nunique  advertiser_id_nunique  industry_nunique  ...  industry_63  industry_top0_64  industry_mean  industry_std  industry_min
0                12.0                 6.0                       3.0                   12.0               9.0  ...    -0.725836                 3       1.444444      0.684935             1
1                42.0                20.0                       3.0                   36.0              15.0  ...    -0.171384                 9       3.000000      2.422120             1
2                30.0                17.0                       6.0                   28.0               8.0  ...     1.569111                 8       3.750000      2.633913             1
...               ...                 ...                       ...                    ...               ...  ...          ...               ...            ...           ...           ...
719997           38.0                21.0                   

In [None]:
# 10-class softmax GBDT, evaluated by multi-class error rate.
params = {
    'boosting_type': 'gbdt',
    'objective': 'softmax',
    'num_class':10,
    'metric': 'multi_error',
    'num_leaves': 128 + 1,
    'learning_rate': 0.1,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'verbose': 1
}
print('Starting training...')

# Early stopping goes through the callback API: the `early_stopping_rounds`
# keyword of lgb.train was removed in LightGBM 4, while the callback is
# accepted by both older and newer releases.
callbacks = [lgb.early_stopping(stopping_rounds=100)]
if hasattr(lgb, 'log_evaluation'):
    # Newer releases only print per-iteration metrics when asked to; older
    # releases (without this callback) already log every round by default.
    callbacks.append(lgb.log_evaluation(period=1))

gbm = lgb.train(params,
                train_data,
                num_boost_round=2000,
                valid_sets=[train_data, valid_data],
                callbacks=callbacks)

# Create the output directory first so the trained model is not lost to a
# missing-directory error at save time.
os.makedirs('model', exist_ok=True)
gbm.save_model('model/age_target.model')
print('Feature importances:', list(gbm.feature_importance()))

Starting training...
[1]	training's multi_error: 0.772056	valid_1's multi_error: 0.771106
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_error: 0.747074	valid_1's multi_error: 0.746678
[3]	training's multi_error: 0.712054	valid_1's multi_error: 0.713256
[4]	training's multi_error: 0.679428	valid_1's multi_error: 0.683833
[5]	training's multi_error: 0.647682	valid_1's multi_error: 0.655311
[6]	training's multi_error: 0.625335	valid_1's multi_error: 0.636683
[7]	training's multi_error: 0.609518	valid_1's multi_error: 0.623939
[8]	training's multi_error: 0.597982	valid_1's multi_error: 0.614267
[9]	training's multi_error: 0.589444	valid_1's multi_error: 0.607222
[10]	training's multi_error: 0.583014	valid_1's multi_error: 0.602411
[11]	training's multi_error: 0.577982	valid_1's multi_error: 0.598822
[12]	training's multi_error: 0.573633	valid_1's multi_error: 0.595939
[13]	training's multi_error: 0.570308	valid_1's multi_error: 0.593478
[14]	training's

[118]	training's multi_error: 0.463831	valid_1's multi_error: 0.562878
[119]	training's multi_error: 0.463003	valid_1's multi_error: 0.562822
[120]	training's multi_error: 0.462256	valid_1's multi_error: 0.562661
[121]	training's multi_error: 0.461336	valid_1's multi_error: 0.562528
[122]	training's multi_error: 0.460336	valid_1's multi_error: 0.562522
[123]	training's multi_error: 0.459531	valid_1's multi_error: 0.562556
[124]	training's multi_error: 0.458711	valid_1's multi_error: 0.562317
[125]	training's multi_error: 0.457971	valid_1's multi_error: 0.562233
[126]	training's multi_error: 0.457311	valid_1's multi_error: 0.562194
[127]	training's multi_error: 0.456456	valid_1's multi_error: 0.562294
[128]	training's multi_error: 0.455635	valid_1's multi_error: 0.56225
[129]	training's multi_error: 0.454779	valid_1's multi_error: 0.562217
[130]	training's multi_error: 0.454024	valid_1's multi_error: 0.56195
[131]	training's multi_error: 0.453244	valid_1's multi_error: 0.561933
[132]	tr

[234]	training's multi_error: 0.374964	valid_1's multi_error: 0.559794
[235]	training's multi_error: 0.374346	valid_1's multi_error: 0.559667
[236]	training's multi_error: 0.373588	valid_1's multi_error: 0.559567
[237]	training's multi_error: 0.372826	valid_1's multi_error: 0.559456
[238]	training's multi_error: 0.372224	valid_1's multi_error: 0.559667
[239]	training's multi_error: 0.371531	valid_1's multi_error: 0.559639
[240]	training's multi_error: 0.370793	valid_1's multi_error: 0.559428
[241]	training's multi_error: 0.370104	valid_1's multi_error: 0.559172
[242]	training's multi_error: 0.3694	valid_1's multi_error: 0.559183
[243]	training's multi_error: 0.368556	valid_1's multi_error: 0.559172
[244]	training's multi_error: 0.367872	valid_1's multi_error: 0.558822
[245]	training's multi_error: 0.367118	valid_1's multi_error: 0.558983
[246]	training's multi_error: 0.366387	valid_1's multi_error: 0.558783
[247]	training's multi_error: 0.365589	valid_1's multi_error: 0.558728
[248]	tr

In [None]:
# One-hot encode the training labels (as a column vector) so they can be
# compared against the one-hot predictions.
before_one_hot = train_y.values[:, np.newaxis]
print(before_one_hot)

enc = OneHotEncoder()
one_hoted_y = enc.fit_transform(before_one_hot).toarray()
print(one_hoted_y.shape)


In [None]:
gc.collect()
y_pred = gbm.predict(train_x.astype(float))

# Turn the per-class scores into a one-hot matrix of the arg-max class.
# Vectorised replacement for the former Python loop over every row and class;
# arg-max also guarantees exactly one predicted class per row (first wins on ties).
pred_class = np.asarray(y_pred).argmax(axis=1)
y_pred = np.eye(y_pred.shape[1])[pred_class]

# With exactly one class per row, micro-averaged precision equals accuracy.
print(precision_score(one_hoted_y, y_pred,average='micro'))

# Report predictions against the real user ids (train_x was derived from
# train_df, so rows line up); the +1 undoes the earlier `age - 1` shift.
ret_df = pd.DataFrame({
    'user_id': train_df['user_id'].astype(int).values,
    'predicted_age': pred_class + 1,
})
print(ret_df['predicted_age'].value_counts())

In [None]:

# One-hot encode the validation labels (as a column vector) so they can be
# compared against the one-hot predictions.
before_one_hot = valid_y.values[:, np.newaxis]
print(before_one_hot)

enc = OneHotEncoder()
one_hoted_y = enc.fit_transform(before_one_hot).toarray()
print(one_hoted_y.shape)


In [None]:
y_pred = gbm.predict(valid_x.astype(float))

# Turn the per-class scores into a one-hot matrix of the arg-max class.
# Vectorised replacement for the former Python loop over every row and class;
# arg-max also guarantees exactly one predicted class per row (first wins on ties).
# y_pred stays a one-hot 2-D array because the following cell reads it that way.
pred_class = np.asarray(y_pred).argmax(axis=1)
y_pred = np.eye(y_pred.shape[1])[pred_class]

# With exactly one class per row, micro-averaged precision equals accuracy.
precision_score(one_hoted_y, y_pred,average='micro')


In [None]:
# Per-user validation results: real user id, predicted class and true class
# (both 0-based). The ids come from valid_df rather than `position + 720001`,
# so the table stays aligned even if an inner merge dropped a user.
# arg-max returns the position of the 1 in each one-hot row of y_pred.
ret_df = pd.DataFrame({
    'user_id': valid_df['user_id'].astype(int).values,
    'predicted_age': np.asarray(y_pred).argmax(axis=1),
    'label_age': valid_y.values,
})
print(ret_df)
print(ret_df['predicted_age'].value_counts())

In [None]:
# Validation users whose predicted class matches the label.
is_correct = ret_df['predicted_age'].eq(ret_df['label_age'])
true_user_df = ret_df.loc[is_correct]
print(true_user_df)


In [None]:
# Correctly predicted users joined with their base statistics.
# Built from ret_df instead of merging true_user_df into itself, so re-running
# this cell gives the same frame rather than piling up `_x`/`_y` columns.
true_user_df = (
    ret_df[ret_df.predicted_age == ret_df.label_age]
    .merge(user_base_statics_df, on='user_id')
)
print(true_user_df)

In [None]:
# How many correctly predicted users have each click_times_count value.
plt_values = (
    true_user_df['click_times_count']
    .value_counts()
    .sort_index()
)

# Temporarily raise the row limit to print more of the distribution.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# String labels on the x-axis, user counts on the y-axis.
x, y = list(map(str, plt_values.index)), plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# How many correctly predicted users have each click_times_sum value.
plt_values = (
    true_user_df['click_times_sum']
    .value_counts()
    .sort_index()
)

# Temporarily raise the row limit to print more of the distribution.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# String labels on the x-axis, user counts on the y-axis.
x, y = list(map(str, plt_values.index)), plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# Age distribution of the correctly predicted users (value_counts order, most frequent first).
plt_values = true_user_df['age'].value_counts()
print(plt_values)

# String labels on the x-axis, user counts on the y-axis.
x, y = list(map(str, plt_values.index)), plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# Base statistics restricted to the validation user ids (user_id > 720000).
in_validation = user_base_statics_df['user_id'] > 720000
valid_user_base_statics_df = user_base_statics_df[in_validation]
print(valid_user_base_statics_df)

In [None]:
# Validation users with more than 10 and at most 20 click records.
# NOTE(review): the variable name suggests a 10-30 band, but the upper bound
# applied here is 20 -- confirm which one is intended.
click_counts = valid_user_base_statics_df['click_times_count']
len_10_30_df = valid_user_base_statics_df[(click_counts > 10) & (click_counts <= 20)]

# Show the filtered frame (the unfiltered validation frame was printed before).
print(len_10_30_df)

# Age distribution inside the band.
plt_values =  len_10_30_df['age'].value_counts()
print(plt_values)
x = [str(i) for i in plt_values.index]
y = plt_values.values
plt.scatter(x,y)
plt.show()

In [None]:

# Age distribution of all validation users (value_counts order, most frequent first).
plt_values = valid_user_base_statics_df['age'].value_counts()
print(plt_values)

# String labels on the x-axis, user counts on the y-axis.
x, y = list(map(str, plt_values.index)), plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# How many validation users have each click_times_count value.
plt_values = (
    valid_user_base_statics_df['click_times_count']
    .value_counts()
    .sort_index()
)

# Temporarily raise the row limit to print more of the distribution.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# String labels on the x-axis, user counts on the y-axis.
x, y = list(map(str, plt_values.index)), plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# How many validation users have each click_times_sum value.
plt_values = (
    valid_user_base_statics_df['click_times_sum']
    .value_counts()
    .sort_index()
)

# Temporarily raise the row limit to print more of the distribution.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# String labels on the x-axis, user counts on the y-axis.
x, y = list(map(str, plt_values.index)), plt_values.values
plt.scatter(x, y)
plt.show()

In [None]:
# Recompute log(click_times_sum) for the validation slice.
# `assign` returns a new frame, so nothing is written into a slice of
# user_base_statics_df (the former in-place column assignment triggered
# pandas' SettingWithCopy warning).
valid_user_base_statics_df = valid_user_base_statics_df.assign(
    click_times_sum_log=valid_user_base_statics_df['click_times_sum'].map(math.log)
)
print(valid_user_base_statics_df)

# How many validation users share each log value.
plt_values =  valid_user_base_statics_df['click_times_sum_log'].value_counts().sort_index()
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)
x = [str(i) for i in plt_values.index]
y = plt_values.values
plt.scatter(x,y)
plt.show()

In [None]:
# Print the validation slice again; when cells run in order, the previous cell
# has just (re)assigned its click_times_sum_log column.
print(valid_user_base_statics_df)

In [None]:
# Misclassified validation users joined with their base statistics.
is_wrong = ret_df['predicted_age'].ne(ret_df['label_age'])
false_user_df = pd.merge(ret_df.loc[is_wrong], user_base_statics_df, on='user_id')
print(false_user_df)

In [None]:
# How many misclassified users have each click_times_count value.
plt_values = (
    false_user_df['click_times_count']
    .value_counts()
    .sort_index()
)

# Temporarily raise the row limit to print more of the distribution.
pd.set_option('display.max_rows', 100)
print(plt_values)
pd.set_option('display.max_rows', 6)

# String labels on the x-axis, user counts on the y-axis.
x, y = list(map(str, plt_values.index)), plt_values.values
plt.scatter(x, y)
plt.show()