In [1]:
import lightgbm as lgb
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

np.random.seed(2019)
random.seed(2019)
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [2]:
def load_dataset(base_file,user_statics,product_id,product_category_df,train1_df):
    df =pd.read_pickle(base_file)
    df['age']  = df['age'] -1
    df = df.merge(user_statics_df,on='user_id')
    df = df.merge(product_id,on='user_id')
    df = df.merge(product_category_df,on='user_id')
    df = df.merge(train1_df,on='user_id')
    return df
    
    

In [4]:
trian1_df =pd.read_pickle(f'{preprocess_path}/train1.pkl')


In [4]:
user_id_industry_df  =pd.read_pickle(f'{preprocess_path}/user_id_industry_user_id_train_deepwalk_64.pkl')
user_id_industry_df['user_id'] = user_id_industry_df['user_id'].astype(float)
print(user_id_industry_df)

user_id                                              float64
user_id_industry_user_id_deepwalk_embedding_64_0     float64
user_id_industry_user_id_deepwalk_embedding_64_1     float64
                                                      ...   
user_id_industry_user_id_deepwalk_embedding_64_61    float64
user_id_industry_user_id_deepwalk_embedding_64_62    float64
user_id_industry_user_id_deepwalk_embedding_64_63    float64
Length: 65, dtype: object


In [12]:

user_statics_df =pd.read_pickle(f'{preprocess_path}/user_statics_train.pkl').drop(['click_times_total','active_days'] + ['active_days_std',  'week_active_days_std' ,'month_acitve_days_std'] ,axis=1)
product_category_df =pd.read_pickle(f'{preprocess_path}/product_category.pkl')
#train1_df = trian1_df[['user_id'] + [f'tfidf_{i}' for i in range(317)]]



train_product_id_df  = pd.read_pickle(f'{preprocess_path}/product_id_top3_l32_train.pkl').drop(['product_id_std'],axis=1).astype(float)
valid_product_id_df  = pd.read_pickle(f'{preprocess_path}/product_id_top3_l32_valid.pkl').drop(['product_id_std'],axis=1).astype(float)
#print(train_product_id_df)
train_df = load_dataset('train5.pkl',user_statics_df,train_product_id_df,product_category_df,user_id_industry_df)
valid_df = load_dataset('valid5.pkl',user_statics_df,valid_product_id_df,product_category_df,user_id_industry_df)



print(train_df)
print(valid_df)
print(train_df.columns)


        user_id  active_days  click_times_total                                                                                                                                      advertiser_id_seq  \
0             1           10                 14                                                                    [7293, 9702, 29455, 14668, 11411, 14681, 17189, 367, 44865, 188, 23575, 188, 10988]   
1             2           28                 46  [22885, 10686, 18562, 25932, 22885, 34505, 768, 26006, 918, 34503, 17284, 13732, 20033, 27843, 25260, 27843, 42272, 26006, 38785, 2302, 42272, 149...   
2             3           23                 30  [32974, 9877, 18492, 14186, 17018, 9058, 8371, 2336, 39500, 45169, 14289, 13119, 36384, 48608, 16764, 9970, 15351, 14498, 55248, 16895, 52263, 918...   
...         ...          ...                ...                                                                                                                                                 

In [13]:
final_train_x_df = train_df.drop(['age','user_id','gender','advertiser_id_seq','industry_seq'], axis=1)
# final_train_x_df = train_df[['active_days','click_times_total'] +\
#                             'active_days_max,active_days_min,active_days_mean'.split(',') + \
#                 'week_active_days_max,week_active_days_min,week_active_days_mean'.split(',') + \
#             'month_acitve_days_max,month_acitve_days_min,month_acitve_days_mean'.split(',') 
#                            ]
#final_train_x_df = train_df.drop(['age','user_id','gender','active_days'], axis=1)
final_train_y_df = train_df['age']

final_valid_x_df = valid_df.drop(['age','user_id','gender','advertiser_id_seq','industry_seq'], axis=1)
# final_valid_x_df = valid_df[['active_days','click_times_total'] + \
#                                           'active_days_max,active_days_min,active_days_mean'.split(',') + \
#                 'week_active_days_max,week_active_days_min,week_active_days_mean'.split(',') + \
#             'month_acitve_days_max,month_acitve_days_min,month_acitve_days_mean'.split(',') 
#                            ]
final_valid_y_df = valid_df['age']
num_normal_features = ['_clicks_max_click_cnt','_max_clicked_ratio','_clicks_min_click_cnt','_min_clicked_ratio','_clicks_len','_clicks_mean','_clicks_median','_clicks_std']
num_date_features  = [ '_clicks_max_click_cnt', '_clicks_min_click_cnt','_clicks_len','_clicks_mean','_clicks_median','_clicks_std']
num_features = ['click_times_total'] +\
                [f'date{i}'  for i in num_date_features] + \
                [f'wday{i}'  for i in num_date_features] + \
                [f'month{i}'  for i in num_date_features] + \
                 [f'product_id{i}'  for i in num_normal_features] + \
                 [f'product_category{i}'  for i in num_normal_features] + \
                [f'industry{i}'  for i in num_normal_features] + \
                [f'advertiser_id{i}'  for i in num_normal_features] 

#print(num_features)

c_features = ['industry_clicks_max_click','industry_clicks_min_click',
              'advertiser_id_clicks_max_click','advertiser_id_clicks_min_click',
              'product_id_clicks_max_click','product_id_clicks_min_click',
              'product_category_clicks_max_click','product_category_clicks_min_click',
             ]
features= num_features + c_features
topN = 3
def forfor(a): 
    return [item for sublist in a for item in sublist] 
features= ['active_days','click_times_total'] +  \
            [f"industry_{i}" for i in range(16)] + \
            [f"advertiser_id_{i}" for i in range(32)]  +\
            forfor([[f'industry_top{i}_{j}'  for j in range(16)]  for i in range(topN)]) + \
            forfor([[f'advertiser_id_top{i}_{j}'  for j in range(32)]  for i in range(topN)]) +\
            'active_days_max,active_days_min,active_days_mean'.split(',') + \
            'week_active_days_max,week_active_days_min,week_active_days_mean'.split(',') + \
            'month_acitve_days_max,month_acitve_days_min,month_acitve_days_mean'.split(',') + \
            [f"product_id_{i}" for i in range(32)]  +\
            forfor([[f'product_id_top{i}_{j}'  for j in range(32 +1)]  for i in range(topN)]) +\
            'product_id_mean,product_id_min'.split(',') +\
             [f'product_category_{i}' for i in range(18)] + \
            [f'product_category_{i}_percent' for i in range(18)] + \
            [f'user_id_industry_user_id_deepwalk_embedding_64_{i}' for i in range(64)]
            #[f'tfidf_{i}' for i in range(317)]
            
# features= ['active_days','click_times_total']   + \
#             'active_days_max,active_days_min,active_days_mean'.split(',') + \
#             'week_active_days_max,week_active_days_min,week_active_days_mean'.split(',') + \
#             'month_acitve_days_max,month_acitve_days_min,month_acitve_days_mean'.split(',')
print(features)
#train_data = lgb.Dataset(final_train_x_df, label=final_train_y_df, feature_name=[   'max_clicked_industry', 'max_clicked_advertiser_id' ], categorical_feature=['max_clicked_industry','max_clicked_advertiser_id'])
#train_data = lgb.Dataset(X_train, label=y_train, feature_name=features, categorical_feature=c_features,free_raw_data=False)
train_data = lgb.Dataset(final_train_x_df, label=final_train_y_df, feature_name=features,free_raw_data=False)
eval_data = lgb.Dataset(final_valid_x_df, label=final_valid_y_df, feature_name=features,free_raw_data=False,reference=train_data)
print(len(features))
#print(train_data)
#print(eval_data)
for f in  final_train_x_df.columns:
    if f not in features:
        print(f"not in data feature:{f}")

for f in features:
    if f not in final_train_x_df.columns:
        print(f"not in featues data:{f}")

['active_days', 'click_times_total', 'industry_0', 'industry_1', 'industry_2', 'industry_3', 'industry_4', 'industry_5', 'industry_6', 'industry_7', 'industry_8', 'industry_9', 'industry_10', 'industry_11', 'industry_12', 'industry_13', 'industry_14', 'industry_15', 'advertiser_id_0', 'advertiser_id_1', 'advertiser_id_2', 'advertiser_id_3', 'advertiser_id_4', 'advertiser_id_5', 'advertiser_id_6', 'advertiser_id_7', 'advertiser_id_8', 'advertiser_id_9', 'advertiser_id_10', 'advertiser_id_11', 'advertiser_id_12', 'advertiser_id_13', 'advertiser_id_14', 'advertiser_id_15', 'advertiser_id_16', 'advertiser_id_17', 'advertiser_id_18', 'advertiser_id_19', 'advertiser_id_20', 'advertiser_id_21', 'advertiser_id_22', 'advertiser_id_23', 'advertiser_id_24', 'advertiser_id_25', 'advertiser_id_26', 'advertiser_id_27', 'advertiser_id_28', 'advertiser_id_29', 'advertiser_id_30', 'advertiser_id_31', 'industry_top0_0', 'industry_top0_1', 'industry_top0_2', 'industry_top0_3', 'industry_top0_4', 'industr

In [15]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'softmax',
    'num_class':10,
    'metric': 'multi_error',
    'num_leaves': 128,
    'learning_rate': 0.1,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'verbose': 1
}
print('Starting training...')
# train

gbm = lgb.train(params,
                train_data,
                valid_sets=[train_data,eval_data],
                num_boost_round = 2000,
                early_stopping_rounds=100
               )
                #early_stopping_rounds=5)
gbm.save_model('age_emb.txt')
print('Feature importances:', list(gbm.feature_importance()))

Starting training...
[1]	training's multi_error: 0.773908	valid_1's multi_error: 0.773011
Training until validation scores don't improve for 100 rounds.
[2]	training's multi_error: 0.773518	valid_1's multi_error: 0.772839
[3]	training's multi_error: 0.767863	valid_1's multi_error: 0.767961
[4]	training's multi_error: 0.757625	valid_1's multi_error: 0.759544
[5]	training's multi_error: 0.749007	valid_1's multi_error: 0.751689
[6]	training's multi_error: 0.740776	valid_1's multi_error: 0.745294
[7]	training's multi_error: 0.732443	valid_1's multi_error: 0.738772
[8]	training's multi_error: 0.723789	valid_1's multi_error: 0.732356
[9]	training's multi_error: 0.716058	valid_1's multi_error: 0.726756
[10]	training's multi_error: 0.708925	valid_1's multi_error: 0.721706
[11]	training's multi_error: 0.702826	valid_1's multi_error: 0.717167
[12]	training's multi_error: 0.697249	valid_1's multi_error: 0.713394
[13]	training's multi_error: 0.692226	valid_1's multi_error: 0.709978
[14]	training's

[118]	training's multi_error: 0.544615	valid_1's multi_error: 0.6704
[119]	training's multi_error: 0.54345	valid_1's multi_error: 0.670111
[120]	training's multi_error: 0.542493	valid_1's multi_error: 0.66995
[121]	training's multi_error: 0.541593	valid_1's multi_error: 0.669828
[122]	training's multi_error: 0.540275	valid_1's multi_error: 0.669789
[123]	training's multi_error: 0.539338	valid_1's multi_error: 0.669878
[124]	training's multi_error: 0.538419	valid_1's multi_error: 0.6696
[125]	training's multi_error: 0.537499	valid_1's multi_error: 0.66945
[126]	training's multi_error: 0.536415	valid_1's multi_error: 0.669478
[127]	training's multi_error: 0.535542	valid_1's multi_error: 0.669156
[128]	training's multi_error: 0.534563	valid_1's multi_error: 0.669072
[129]	training's multi_error: 0.533421	valid_1's multi_error: 0.669172
[130]	training's multi_error: 0.532713	valid_1's multi_error: 0.669194
[131]	training's multi_error: 0.531692	valid_1's multi_error: 0.669161
[132]	trainin

[234]	training's multi_error: 0.441479	valid_1's multi_error: 0.666928
[235]	training's multi_error: 0.440618	valid_1's multi_error: 0.666833
[236]	training's multi_error: 0.439753	valid_1's multi_error: 0.666583
[237]	training's multi_error: 0.438979	valid_1's multi_error: 0.666933
[238]	training's multi_error: 0.438142	valid_1's multi_error: 0.66675
[239]	training's multi_error: 0.437297	valid_1's multi_error: 0.666928
[240]	training's multi_error: 0.436432	valid_1's multi_error: 0.667044
[241]	training's multi_error: 0.435636	valid_1's multi_error: 0.666828
[242]	training's multi_error: 0.434831	valid_1's multi_error: 0.666817
[243]	training's multi_error: 0.4339	valid_1's multi_error: 0.666517
[244]	training's multi_error: 0.433169	valid_1's multi_error: 0.666483
[245]	training's multi_error: 0.432378	valid_1's multi_error: 0.666667
[246]	training's multi_error: 0.431553	valid_1's multi_error: 0.666511
[247]	training's multi_error: 0.430643	valid_1's multi_error: 0.666506
[248]	tra

[350]	training's multi_error: 0.355787	valid_1's multi_error: 0.664333
[351]	training's multi_error: 0.355064	valid_1's multi_error: 0.664433
[352]	training's multi_error: 0.354411	valid_1's multi_error: 0.664356
[353]	training's multi_error: 0.353797	valid_1's multi_error: 0.664656
[354]	training's multi_error: 0.35311	valid_1's multi_error: 0.664606
[355]	training's multi_error: 0.352596	valid_1's multi_error: 0.664506
[356]	training's multi_error: 0.351946	valid_1's multi_error: 0.664444
[357]	training's multi_error: 0.351168	valid_1's multi_error: 0.664467
[358]	training's multi_error: 0.350565	valid_1's multi_error: 0.664561
[359]	training's multi_error: 0.349861	valid_1's multi_error: 0.664417
[360]	training's multi_error: 0.349146	valid_1's multi_error: 0.664378
[361]	training's multi_error: 0.348618	valid_1's multi_error: 0.664372
[362]	training's multi_error: 0.348037	valid_1's multi_error: 0.664433
[363]	training's multi_error: 0.347397	valid_1's multi_error: 0.664511
[364]	t

[466]	training's multi_error: 0.286033	valid_1's multi_error: 0.664628
[467]	training's multi_error: 0.285425	valid_1's multi_error: 0.664478
[468]	training's multi_error: 0.284918	valid_1's multi_error: 0.664328
[469]	training's multi_error: 0.284353	valid_1's multi_error: 0.66425
[470]	training's multi_error: 0.283782	valid_1's multi_error: 0.664156
[471]	training's multi_error: 0.283336	valid_1's multi_error: 0.66425
[472]	training's multi_error: 0.282794	valid_1's multi_error: 0.664306
[473]	training's multi_error: 0.282268	valid_1's multi_error: 0.664461
[474]	training's multi_error: 0.281682	valid_1's multi_error: 0.664439
[475]	training's multi_error: 0.281235	valid_1's multi_error: 0.664439
[476]	training's multi_error: 0.280672	valid_1's multi_error: 0.664472
[477]	training's multi_error: 0.280082	valid_1's multi_error: 0.664478
[478]	training's multi_error: 0.279504	valid_1's multi_error: 0.6644
[479]	training's multi_error: 0.27906	valid_1's multi_error: 0.664278
[480]	train

In [10]:
before_one_hot =  final_train_y_df.values.reshape([-1,1])
print(before_one_hot)
enc = OneHotEncoder()
enc.fit(before_one_hot)

one_hoted_y  = enc.transform(before_one_hot).toarray()
print(one_hoted_y.shape)


[[3]
 [9]
 [6]
 ...
 [3]
 [3]
 [4]]
(720000, 10)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [11]:
y_pred = gbm.predict(final_train_x_df)
for i in range(len(y_pred)):
        max_value=max(y_pred[i])
        for j in range(len(y_pred[i])):
            if max_value==y_pred[i][j]:
                y_pred[i][j]=1
            else:
                y_pred[i][j]=0
                
print(precision_score(one_hoted_y, y_pred,average='micro'))

ret = []
for user_id,age in zip(range(1000000),y_pred):
    ret.append([int(user_id),int(age.tolist().index(1) + 1)])
ret_df = pd.DataFrame(ret,columns=['user_id','predicted_age'])
print(ret_df['predicted_age'].value_counts())

MemoryError: 

In [None]:

before_one_hot =  final_valid_y_df.values.reshape([-1,1])
print(before_one_hot)
enc = OneHotEncoder()
enc.fit(before_one_hot)

one_hoted_y  = enc.transform(before_one_hot).toarray()
print(one_hoted_y.shape)


In [None]:
y_pred = gbm.predict(final_valid_x_df)
for i in range(len(y_pred)):
        max_value=max(y_pred[i])
        for j in range(len(y_pred[i])):
            if max_value==y_pred[i][j]:
                y_pred[i][j]=1
            else:
                y_pred[i][j]=0
                
precision_score(one_hoted_y, y_pred,average='micro')


In [None]:
ret = []
for user_id,age in zip(range(1000000),y_pred):
    ret.append([int(user_id),int(age.tolist().index(1) + 1)])
ret_df = pd.DataFrame(ret,columns=['user_id','predicted_age'])
print(ret_df['predicted_age'].value_counts())