In [1]:
import lightgbm as lgb
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder

np.random.seed(2019)
random.seed(2019)
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'

In [3]:
train_df =pd.read_pickle('train5.pkl')
train_df['age']  = train_df['age'] -1

valid_df =pd.read_pickle('valid5.pkl')
valid_df['age']  = valid_df['age'] -1

user_statics_df =pd.read_pickle(f'{preprocess_path}/user_statics_train.pkl').drop(['click_times_total','active_days'],axis=1)

train_df = train_df.merge(user_statics_df,on='user_id')
valid_df = valid_df.merge(user_statics_df,on='user_id')

print(train_df)
print(valid_df)


        user_id  active_days  click_times_total                                                                                                                                      advertiser_id_seq  \
0             1           10                 14                                                                    [7293, 9702, 29455, 14668, 11411, 14681, 17189, 367, 44865, 188, 23575, 188, 10988]   
1             2           28                 46  [22885, 10686, 18562, 25932, 22885, 34505, 768, 26006, 918, 34503, 17284, 13732, 20033, 27843, 25260, 27843, 42272, 26006, 38785, 2302, 42272, 149...   
2             3           23                 30  [32974, 9877, 18492, 14186, 17018, 9058, 8371, 2336, 39500, 45169, 14289, 13119, 36384, 48608, 16764, 9970, 15351, 14498, 55248, 16895, 52263, 918...   
...         ...          ...                ...                                                                                                                                                 

In [23]:
final_train_x_df = train_df.drop(['age','user_id','gender','advertiser_id_seq','industry_seq'], axis=1)
#final_train_x_df = train_df.drop(['age','user_id','gender','active_days'], axis=1)
final_train_y_df = train_df['age']
print(final_train_x_df)

final_valid_x_df = valid_df.drop(['age','user_id','gender','advertiser_id_seq','industry_seq'], axis=1)
final_valid_y_df = valid_df['age']
num_normal_features = ['_clicks_max_click_cnt','_max_clicked_ratio','_clicks_min_click_cnt','_min_clicked_ratio','_clicks_len','_clicks_mean','_clicks_median','_clicks_std']
num_date_features  = [ '_clicks_max_click_cnt', '_clicks_min_click_cnt','_clicks_len','_clicks_mean','_clicks_median','_clicks_std']
num_features = ['click_times_total'] +\
                [f'date{i}'  for i in num_date_features] + \
                [f'wday{i}'  for i in num_date_features] + \
                [f'month{i}'  for i in num_date_features] + \
                 [f'product_id{i}'  for i in num_normal_features] + \
                 [f'product_category{i}'  for i in num_normal_features] + \
                [f'industry{i}'  for i in num_normal_features] + \
                [f'advertiser_id{i}'  for i in num_normal_features]

#print(num_features)

c_features = ['industry_clicks_max_click','industry_clicks_min_click',
              'advertiser_id_clicks_max_click','advertiser_id_clicks_min_click',
              'product_id_clicks_max_click','product_id_clicks_min_click',
              'product_category_clicks_max_click','product_category_clicks_min_click',
             ]
features= num_features + c_features
topN = 3
def forfor(a): 
    return [item for sublist in a for item in sublist] 
features= ['active_days','click_times_total'] +  \
            [f"industry_{i}" for i in range(16)] + \
            [f"advertiser_id_{i}" for i in range(32)]  +\
            forfor([[f'industry_top{i}_{j}'  for j in range(16)]  for i in range(topN)]) + \
            forfor([[f'advertiser_id_top{i}_{j}'  for j in range(32)]  for i in range(topN)]) +\
            'active_days_max,active_days_min,active_days_mean,active_days_std,week_active_days_max,week_active_days_min,week_active_days_mean,week_active_days_std,month_acitve_days_max,month_acitve_days_min,month_acitve_days_mean,month_acitve_days_std'.split(',')
#print(features)
print(len(features))

drop_features= ['active_days_std',  'week_active_days_std' ,'month_acitve_days_std'] 
#             [f"industry_{i}" for i in range(16)] + \
#             [f"advertiser_id_{i}" for i in range(32)]  +\
#             forfor([[f'industry_top{i}_{j}'  for j in range(16)]  for i in range(topN)]) + \
#             forfor([[f'advertiser_id_top{i}_{j}'  for j in range(32)]  for i in range(topN)]) + \
            
final_train_x_df = final_train_x_df.drop(drop_features,axis=1)
final_valid_x_df = final_valid_x_df.drop(drop_features,axis=1)
#print(final_train_x_df)
final_train_y_one_hot_df =  final_train_y_df.values.reshape([-1,1])
enc = OneHotEncoder()
enc.fit(final_train_y_one_hot_df)

final_train_y_one_hot_df  = enc.transform(final_train_y_one_hot_df).toarray()
print(final_train_y_one_hot_df.shape)


final_valid_y_one_hot_df =  final_valid_y_df.values.reshape([-1,1])
enc = OneHotEncoder()
enc.fit(final_valid_y_one_hot_df)

final_valid_y_one_hot_df  = enc.transform(final_valid_y_one_hot_df).toarray()
print(final_valid_y_one_hot_df.shape)


from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
print(final_train_x_df)
final_train_x_df = scaler.fit_transform(
    final_train_x_df.values.astype(np.float32).reshape(-1, 1)).reshape(-1, 203)
final_valid_x_df = scaler.transform(
    final_valid_x_df.values.astype(np.float32).reshape(-1, 1)).reshape(-1,203)




        active_days  click_times_total  industry_0  industry_1  industry_2  industry_3  industry_4  industry_5  industry_6  industry_7  industry_8  industry_9  industry_10  industry_11  industry_12  industry_13  industry_14  industry_15  industry_top0_0  industry_top0_1  \
0                10                 14   -2.407525   -2.480743    0.258189    0.068892    2.275061    0.609698   -1.124533   -2.170345   -3.801786   -1.069965    -1.431871    -0.779060    -2.454330     3.554383    -3.526832     2.944196        -3.168944        -9.633119   
1                28                 46    0.528324    0.173733    0.413423   -2.342237    2.927688    2.095639    4.499999   -2.768379   -0.016797    0.830495    -3.164093     2.597350    -2.405309     4.605047    -3.062250    -0.915257        -1.735877         2.013226   
2                23                 30   -3.704870   -3.906977    0.328102   -1.694905   -0.775484    2.218845   -2.914866   -2.651409    0.200202    2.303957     0.117288    -0.

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(720000, 10)
(180000, 10)
        active_days  click_times_total  industry_0  industry_1  industry_2  industry_3  industry_4  industry_5  industry_6  industry_7  industry_8  industry_9  industry_10  industry_11  industry_12  industry_13  industry_14  industry_15  industry_top0_0  industry_top0_1  \
0                10                 14   -2.407525   -2.480743    0.258189    0.068892    2.275061    0.609698   -1.124533   -2.170345   -3.801786   -1.069965    -1.431871    -0.779060    -2.454330     3.554383    -3.526832     2.944196        -3.168944        -9.633119   
1                28                 46    0.528324    0.173733    0.413423   -2.342237    2.927688    2.095639    4.499999   -2.768379   -0.016797    0.830495    -3.164093     2.597350    -2.405309     4.605047    -3.062250    -0.915257        -1.735877         2.013226   
2                23                 30   -3.704870   -3.906977    0.328102   -1.694905   -0.775484    2.218845   -2.914866   -2.651409    0.200202    2.

In [24]:
print(final_train_x_df)
print(final_train_x_df.shape)

[[ 1.7032868e+00  2.4206741e+00 -5.2196366e-01 ...  8.0655247e-01
   4.4785884e-01  6.8698794e-01]
 [ 4.9315295e+00  8.1597719e+00  4.5715240e-03 ...  3.6761017e+00
   8.9165181e-02  1.9274702e+00]
 [ 4.0347953e+00  5.2902231e+00 -7.5463831e-01 ...  2.0619805e+00
   1.3445929e+00  1.7032868e+00]
 ...
 [ 4.5728359e+00  7.2630382e+00  7.1024126e-01 ...  3.6761017e+00
   2.6851201e-01  1.6136132e+00]
 [ 7.2630382e+00  1.1567363e+01  3.9367393e-01 ...  4.3934889e+00
   2.6851201e-01  2.7345309e+00]
 [ 8.1597719e+00  9.7738934e+00 -1.3509930e-02 ...  3.8554485e+00
   2.6000209e+00  3.1380613e+00]]
(720000, 203)


In [26]:
import keras as K
    # 2. 定义模型
init = K.initializers.random_uniform()

simple_adam = K.optimizers.Adam()
model = K.models.Sequential()
#model.add(K.layers.normalization.BatchNormalization(axis=-1))
model.add(K.layers.Dense(units=512, input_dim=203,kernel_initializer='random_uniform', activation='relu'))
model.add(K.layers.Dense(units=256, kernel_initializer='random_uniform',activation='relu'))
model.add(K.layers.Dense(units=128, kernel_initializer='random_uniform',activation='relu'))
model.add(K.layers.Dense(units=10, kernel_initializer='random_uniform',activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=simple_adam, metrics=['accuracy'])


In [None]:
max_epochs = 10
print("Starting training ")
h = model.fit(final_train_x_df, final_train_y_one_hot_df,validation_data=(final_valid_x_df,final_valid_y_one_hot_df), batch_size=128, epochs=max_epochs, shuffle=True, verbose=1)
print("Training finished \n")

Starting training 
Train on 720000 samples, validate on 180000 samples
Epoch 1/10

In [None]:
y_pred = gbm.predict(final_train_x_df)
for i in range(len(y_pred)):
        max_value=max(y_pred[i])
        for j in range(len(y_pred[i])):
            if max_value==y_pred[i][j]:
                y_pred[i][j]=1
            else:
                y_pred[i][j]=0
                
print(precision_score(one_hoted_y, y_pred,average='micro'))

ret = []
for user_id,age in zip(range(1000000),y_pred):
    ret.append([int(user_id),int(age.tolist().index(1) + 1)])
ret_df = pd.DataFrame(ret,columns=['user_id','predicted_age'])
print(ret_df['predicted_age'].value_counts())

In [None]:

before_one_hot =  final_valid_y_df.values.reshape([-1,1])
print(before_one_hot)
enc = OneHotEncoder()
enc.fit(before_one_hot)

one_hoted_y  = enc.transform(before_one_hot).toarray()
print(one_hoted_y.shape)


In [None]:
y_pred = gbm.predict(final_valid_x_df)
for i in range(len(y_pred)):
        max_value=max(y_pred[i])
        for j in range(len(y_pred[i])):
            if max_value==y_pred[i][j]:
                y_pred[i][j]=1
            else:
                y_pred[i][j]=0
                
precision_score(one_hoted_y, y_pred,average='micro')


In [None]:
ret = []
for user_id,age in zip(range(1000000),y_pred):
    ret.append([int(user_id),int(age.tolist().index(1) + 1)])
ret_df = pd.DataFrame(ret,columns=['user_id','predicted_age'])
print(ret_df['predicted_age'].value_counts())