In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

from time import time
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

from mmoe import MMOE
from evaluation import evaluate_deepctr

# GPU相关设置
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# 设置GPU按需增长
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

Please check the latest version manually on https://pypi.org/project/deepctr/#history


In [2]:
df = pd.read_csv("data/lgb.csv")
df

Unnamed: 0,userid,feedid,date_,device,read_comment,comment,like,play,stay,click_avatar,...,authorid_in_userid_count_prop,userid_bgm_song_id_count,userid_in_bgm_song_id_count_prop,bgm_song_id_in_userid_count_prop,userid_bgm_singer_id_count,userid_in_bgm_singer_id_count_prop,bgm_singer_id_in_userid_count_prop,videoplayseconds_in_userid_mean,videoplayseconds_in_authorid_mean,feedid_in_authorid_nunique
0,8,71474,1,1,0.0,0.0,1.0,500.0,5366.0,0.0,...,0.007664,1,0.002487,0.003832,1,0.002487,0.003832,12642.308,10255.0550,5
1,8,73916,1,1,0.0,0.0,0.0,250.0,1533.0,0.0,...,0.003832,117,0.000030,0.448200,117,0.000030,0.448200,12642.308,16000.0000,1
2,8,50282,1,1,0.0,0.0,0.0,750.0,1302.0,0.0,...,0.003832,117,0.000030,0.448200,117,0.000030,0.448200,12642.308,28736.4940,24
3,8,11391,1,1,0.0,0.0,1.0,3750.0,5191.0,0.0,...,0.011500,1,0.003311,0.003832,1,0.003311,0.003832,12642.308,6875.7370,17
4,8,27349,1,1,0.0,0.0,0.0,250.0,800.0,0.0,...,0.007664,1,0.000875,0.003832,1,0.000875,0.003832,12642.308,17253.5780,19
5,8,30287,1,1,0.0,0.0,0.0,0.0,1496.0,0.0,...,0.011500,1,0.001279,0.003832,1,0.000403,0.003832,12642.308,20497.5840,38
6,8,1151,1,1,0.0,0.0,0.0,250.0,976.0,0.0,...,0.003832,1,0.006622,0.003832,1,0.006622,0.003832,12642.308,10398.5160,142
7,8,69745,1,1,0.0,0.0,0.0,250.0,817.0,0.0,...,0.003832,1,0.002500,0.003832,1,0.002500,0.003832,12642.308,12257.1390,64
8,8,22451,1,1,0.0,0.0,0.0,0.0,2014.0,0.0,...,0.003832,1,0.001920,0.003832,1,0.000607,0.003832,12642.308,13573.3990,10
9,8,97911,1,1,0.0,0.0,0.0,250.0,861.0,0.0,...,0.007664,1,0.000889,0.003832,1,0.000889,0.003832,12642.308,12580.0610,40


In [3]:
data = df[~df['read_comment'].isna()].reset_index(drop=True)
test = df[df['read_comment'].isna()].reset_index(drop=True)

In [4]:
play_cols = ['is_finish', 'play_times', 'play', 'stay']
y_list = ['read_comment', 'like', 'click_avatar', 'forward', 'favorite', 'comment', 'follow']
cols = [f for f in data.columns if f not in ['date_'] + play_cols + y_list]

In [5]:
target = ["read_comment", "like", "click_avatar", "forward"]
sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
dense_features = [f for f in cols if f not in sparse_features]

In [6]:
data[dense_features] = data[dense_features].fillna(0, )
test[dense_features] = test[dense_features].fillna(0, )

data[dense_features] = np.log(data[dense_features] + 1.0)
test[dense_features] = np.log(test[dense_features] + 1.0)

In [7]:
train = data[data['date_'] < 14].reset_index(drop=True)
valid = data[data['date_'] == 14].reset_index(drop=True)

In [10]:
feed_embedding = pd.read_csv("feed_embedding.csv")
feed_embedding['feed_embedding'] = feed_embedding['feed_embedding'].apply(
    lambda x: list(map(float, x.strip().split())))
feed_embedding = np.array(feed_embedding['feed_embedding'].values.tolist())

In [11]:
pretrained_feed_embedding_initializer = tf.initializers.identity(feed_embedding)

In [None]:
val = valid
epochs = 4
batch_size = 512
embedding_dim = 128
fixlen_feature_columns = [SparseFeat('feedid', vocabulary_size=data['feedid'].max() + 1, embedding_dim=512,
                                     embeddings_initializer=pretrained_feed_embedding_initializer)] + [
                             SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=embedding_dim)
                             for feat in sparse_features if feat is not 'feedid'] + [DenseFeat(feat, 1) for feat in
                                                                                     dense_features]

dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(dnn_feature_columns)

# 3.generate input data for model
train_model_input = {name: data[name] for name in feature_names}
val_model_input = {name: val[name] for name in feature_names}
userid_list = val['userid'].astype(str).tolist()
test_model_input = {name: test[name] for name in feature_names}

train_labels = [data[y].values for y in target]
val_labels = [val[y].values for y in target]

# 4.Define Model,train,predict and evaluate
train_model = MMOE(dnn_feature_columns, num_tasks=4, expert_dim=8, dnn_hidden_units=(128, 128),
                   tasks=['binary', 'binary', 'binary', 'binary'])
train_model.compile("adagrad", loss='binary_crossentropy')
# print(train_model.summary())
for epoch in range(epochs):
    history = train_model.fit(train_model_input, train_labels,
                              batch_size=batch_size, epochs=1, verbose=1)

    val_pred_ans = train_model.predict(val_model_input, batch_size=batch_size * 4)
    evaluate_deepctr(val_labels, val_pred_ans, userid_list, target)

t1 = time()
pred_ans = train_model.predict(test_model_input, batch_size=batch_size * 20)
t2 = time()
print('4个目标行为%d条样本预测耗时（毫秒）：%.3f' % (len(test), (t2 - t1) * 1000.0))
ts = (t2 - t1) * 1000.0 / len(test) * 2000.0
print('4个目标行为2000条样本平均预测耗时（毫秒）：%.3f' % ts)

# 5.生成提交文件
for i, action in enumerate(target):
    test[action] = pred_ans[i]
test[['userid', 'feedid'] + target].to_csv('result.csv', index=None, float_format='%.6f')
print('to_csv ok')


{'read_comment': 0.6825774984008062, 'like': 0.6503710386494936, 'click_avatar': 0.7436147564109667, 'forward': 0.7438085912818702}
Weighted uAUC:  0.691246
{'read_comment': 0.7085825307357007, 'like': 0.667640404520166, 'click_avatar': 0.7675483792447118, 'forward': 0.7751675779445854}
Weighted uAUC:  0.714752