# 导入包

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# 数据地址

In [2]:
phase = 0
nrows = None
train_path = '../../data/underexpose_train'  
test_path = '../../data/underexpose_test' 

# 读取数据

In [3]:
click_train = pd.read_csv(
                        train_path + '/underexpose_train_click-{phase}.csv'.format(phase=phase)
                        ,header=None
                        ,nrows=nrows
                        ,names=['user_id', 'item_id', 'time']
                        ,sep=','
                        ,dtype={'user_id':np.str,'item_id':np.str,'time':np.float}
                        ) 

In [4]:
click_test = pd.read_csv(
                        test_path + '/underexpose_test_click-{phase}/underexpose_test_click-{phase}.csv'.format(phase=phase)
                        ,header=None
                        ,nrows=nrows
                        ,names=['user_id', 'item_id', 'time']
                        ,sep=','
                        ,dtype={'user_id':np.str,'item_id':np.str,'time':np.float}
                        )

# 合并数据

In [5]:
click_all = click_train.append(click_test)

# 处理数据 

In [6]:
set_item_id = set(click_all['item_id'])
dict_item_id_map = dict(zip(set_item_id,range(1,len(set_item_id)+1)))
click_all['item_id_map'] = click_all['item_id'].map(dict_item_id_map)

# 按时间排序

In [7]:
click_all = click_all.sort_values('time')

In [8]:
click_all.head()

Unnamed: 0,user_id,item_id,time,item_id_map
89202,20937,18599,0.98374,31836
247,25129,3852,0.98374,24011
97167,12136,28195,0.98374,11962
9827,22528,48057,0.98374,16243
188473,23813,85991,0.98374,24975


In [9]:
max_item_id, min_item_id = click_all['item_id_map'].max(), click_all['item_id_map'].min()
max_item_id, min_item_id, len(set_item_id)

(40776, 1, 40776)

# 时间归一化

In [10]:
click_all['time'] = (click_all['time'] - np.min(click_all['time'])) / (np.max(click_all['time']) - np.min(click_all['time']))

In [11]:
click_all.columns

Index(['user_id', 'item_id', 'time', 'item_id_map'], dtype='object')

# 聚合成一列

In [12]:
click_all_gb = click_all.groupby(['user_id']).agg({'item_id_map':lambda x:list(x), 'time':lambda x:list(x)}).reset_index()

In [13]:
click_all_gb.head()

Unnamed: 0,user_id,item_id_map,time
0,1,"[38553, 4746, 4977, 1673, 30860, 14976, 29329,...","[0.008894984779548962, 0.07786149982061645, 0...."
1,10,"[12476, 37327, 27652, 571, 28754, 15324, 10207...","[0.08402780189097342, 0.08881095408378337, 0.2..."
2,100,"[31950, 17557, 14981, 23451, 5671, 8651, 24658...","[0.10338321932372357, 0.15522529717377634, 0.3..."
3,10000,"[2698, 26905, 30914]","[0.08657418660304175, 0.20507945877707037, 0.5..."
4,10002,"[10232, 2684, 34467, 27480, 2701, 20326, 35051...","[0.46173767607676885, 0.46441138002507715, 0.4..."


In [14]:
click_all[click_all['user_id']=='1']

Unnamed: 0,user_id,item_id,time,item_id_map
19829,1,78142,0.008895,38553
236132,1,26646,0.077861,4746
20480,1,89568,0.108965,4977
19709,1,76240,0.141049,1673
108033,1,87533,0.228535,30860
56362,1,78380,0.228631,14976
159250,1,85492,0.622863,29329
20968,1,97795,0.630401,12251
111177,1,18522,0.673606,880
3123,1,47611,0.674045,20815


# 训练和预测数据打标

In [15]:
user_test = set(click_test['user_id'])
user_train = set(click_all['user_id']) - user_test

In [16]:
click_all_gb['flag'] = click_all_gb['user_id'].map(lambda x: 'test' if x in user_test else 'train')

In [17]:
click_all_gb.head()

Unnamed: 0,user_id,item_id_map,time,flag
0,1,"[38553, 4746, 4977, 1673, 30860, 14976, 29329,...","[0.008894984779548962, 0.07786149982061645, 0....",train
1,10,"[12476, 37327, 27652, 571, 28754, 15324, 10207...","[0.08402780189097342, 0.08881095408378337, 0.2...",train
2,100,"[31950, 17557, 14981, 23451, 5671, 8651, 24658...","[0.10338321932372357, 0.15522529717377634, 0.3...",train
3,10000,"[2698, 26905, 30914]","[0.08657418660304175, 0.20507945877707037, 0.5...",train
4,10002,"[10232, 2684, 34467, 27480, 2701, 20326, 35051...","[0.46173767607676885, 0.46441138002507715, 0.4...",train


# 获取训练和预测数据集合

In [18]:
def get_data(click_all_gb, list_label_loc=[0,1,2], seq_length=10):
    list_seq_feature_item = []
    list_seq_feature_time = []
    list_label = []
    list_user = []
    
    # 0 1 2
    for loc in list_label_loc:
        for i, row in tqdm(click_all_gb.iterrows()):
            user_id, list_item_id, list_time, flag = row['user_id'], row['item_id_map'][::-1], row['time'][::-1], row['flag']
            
            if (flag == 'test') and (loc == list_label_loc[0]):
                list_label.append(-1)
                list_user.append(user_id)
                seq_feature_item_test = list_item_id[0:loc+seq_length]
                len_seq_feature_item_test = len(seq_feature_item_test)
                if len_seq_feature_item_test < seq_length:
                    seq_feature_item_test += [0] * (seq_length - len_seq_feature_item_test) 
                    
                seq_feature_time_test = list_time[0:loc+seq_length]
                len_seq_feature_time_test = len(seq_feature_time_test)
                if len_seq_feature_time_test < seq_length:
                    seq_feature_time_test += [0] * (seq_length - len_seq_feature_time_test) 
            
                assert len(seq_feature_item_test) == seq_length
                assert len(seq_feature_time_test) == seq_length
                
                list_seq_feature_item.append(seq_feature_item_test)
                list_seq_feature_time.append(seq_feature_time_test)
            
            
            if len(list_item_id) >= loc + 2:
                list_label.append(list_item_id[loc])
                list_user.append(user_id)

                seq_feature_item = list_item_id[loc+1:loc+seq_length+1] 
                len_seq_feature_item = len(seq_feature_item)
                if len_seq_feature_item < seq_length:
                    seq_feature_item += [0] * (seq_length - len_seq_feature_item) 
                
                seq_feature_time = list_time[loc+1:loc+seq_length+1] 
                len_seq_feature_time = len(seq_feature_time)
                if len_seq_feature_time < seq_length:
                    seq_feature_time += [0] * (seq_length - len_seq_feature_time) 
                 
                assert len(seq_feature_item) == seq_length
                assert len(seq_feature_time) == seq_length
                
                list_seq_feature_item.append(seq_feature_item)
                list_seq_feature_time.append(seq_feature_time)
    
    df_train_test_label = pd.DataFrame()
    df_train_test_label['seq_feature_item'] = list_seq_feature_item
    df_train_test_label['seq_feature_time'] = list_seq_feature_time
    df_train_test_label['label'] = list_label
    df_train_test_label['user_id'] = list_user
    
        
    return df_train_test_label

In [19]:
train_test_data = get_data(click_all_gb=click_all_gb,list_label_loc=[0],seq_length=10)

18505it [00:03, 5640.15it/s]


In [20]:
click_all_gb[click_all_gb['user_id']=='10007']

Unnamed: 0,user_id,item_id_map,time,flag
8,10007,"[9329, 25743, 24599, 20695]","[0.4001817192724083, 0.4461613250459523, 0.608...",train


In [21]:
# click_all_gb

In [22]:
train_test_data[train_test_data['user_id']=='10007']

Unnamed: 0,seq_feature_item,seq_feature_time,label,user_id
8,"[24599, 25743, 9329, 0, 0, 0, 0, 0, 0, 0]","[0.6084817759881395, 0.4461613250459523, 0.400...",20695,10007


# 查看数据

In [23]:
train_test_data[train_test_data['label']==-1].head()

Unnamed: 0,seq_feature_item,seq_feature_time,label,user_id
10,"[38145, 13095, 20793, 31285, 0, 0, 0, 0, 0, 0]","[0.9681528293808541, 0.9474113684510539, 0.947...",-1,1001
26,"[7285, 21109, 23274, 13316, 37409, 28767, 0, 0...","[0.8957602694540879, 0.8607619477526436, 0.435...",-1,10032
37,"[16201, 24312, 25696, 10175, 5972, 14039, 2776...","[0.6106346285175493, 0.6102989687147685, 0.610...",-1,10043
43,"[34262, 15116, 26607, 35304, 34453, 39907, 383...","[0.8909684364042963, 0.8909192448814004, 0.890...",-1,10054
51,"[23876, 34830, 28834, 16109, 22998, 19554, 400...","[0.6541893815762587, 0.6430547356964651, 0.642...",-1,10065


In [24]:
click_all_gb[click_all_gb['user_id']=='1001']

Unnamed: 0,user_id,item_id_map,time,flag
10,1001,"[31285, 20793, 13095, 38145]","[0.4442544301310222, 0.9471943470260036, 0.947...",test


In [25]:
click_all_gb[click_all_gb['user_id']=='10032']

Unnamed: 0,user_id,item_id_map,time,flag
25,10032,"[28767, 37409, 13316, 23274, 21109, 7285]","[0.43361459309918204, 0.43391552947352713, 0.4...",test


In [26]:
train_test_data[train_test_data['user_id']=='10032']

Unnamed: 0,seq_feature_item,seq_feature_time,label,user_id
26,"[7285, 21109, 23274, 13316, 37409, 28767, 0, 0...","[0.8957602694540879, 0.8607619477526436, 0.435...",-1,10032
27,"[21109, 23274, 13316, 37409, 28767, 0, 0, 0, 0...","[0.8607619477526436, 0.4353825943034243, 0.433...",7285,10032


# 深度建模

# 全分类 双塔 Rank

# 全分类

In [31]:
import tensorflow as tf

In [27]:
class RnnClassificationModel:
    def __init__(self, 
                 seq_length,
                 vocab_size,
                 embedding_dim,
                 output_class_num
                ):
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.output_class_num = output_class_num
        
        
    def model(self):
        # 输入
        self.inputs_item_seq = tf.keras.Input(shape=(self.seq_length,))
        self.inputs_time_seq = tf.keras.Input(shape=(self.seq_length,))
        
        # embedding 化
        self.cat_embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size, 
                                                       output_dim=self.embedding_dim, 
                                                       input_length=self.seq_length)(self.inputs_item_seq)
        
        list_float_embedding = []
        split_ = tf.split(self.inputs_time_seq,num_or_size_splits=self.seq_length,axis=-1)
        dense_ = tf.keras.layers.Dense(units=self.embedding_dim, activation=tf.nn.relu)
        for s in split_:
            float_embedding_ = tf.expand_dims(dense_(s),1)
            list_float_embedding.append(float_embedding_)
        self.float_embedding = tf.concat(list_float_embedding,axis=1)

        
        # 简单 concat
        self.concat = tf.keras.layers.concatenate([self.cat_embedding,self.float_embedding], axis=-1)
        
        # lstm 网络
        self.lstm = tf.keras.layers.LSTM(units=64)(self.concat)
        
        # fc 网路
        self.dense = tf.keras.layers.Dense(units=64, activation=tf.nn.relu)(self.lstm)
        
        # softmax
        self.outputs = tf.keras.layers.Dense(units=self.output_class_num, activation=tf.nn.softmax)(self.dense)
        
#         print(self.inputs_item_seq,self.inputs_time_seq,self.outputs)
        
        self.model = tf.keras.Model(inputs=[self.inputs_item_seq,self.inputs_time_seq], outputs=self.outputs)
    
        return self.model

# 模型参数

In [28]:
embedding_dim = 16
vocab_size = max_item_id
seq_length = 10
output_class_num = max_item_id

# 全分类模型

In [29]:
rnn_classification_model = RnnClassificationModel(seq_length=seq_length,
                 vocab_size=vocab_size,
                 embedding_dim=embedding_dim,
                 output_class_num=output_class_num)

In [32]:
model = rnn_classification_model.model()

# 模型结构查看

In [37]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
tf_op_layer_split (TensorFlowOp [(None, 1), (None, 1 0           input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 16)           32          tf_op_layer_split[0][0]          
                                                                 tf_op_layer_split[0][1]          
                                                                 tf_op_layer_split[0][2]          
                                                                 tf_op_layer_split[0][3]      

# 模型loss 评测 优化

In [38]:
def top_50(y_true, y_pred):
    return tf.keras.metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=50)
def logloss(y_true, y_pred):
    return tf.keras.metrics.sparse_categorical_crossentropy(y_true, y_pred)
def accuracy(y_true, y_pred):
    return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

In [39]:
model.compile(loss="sparse_categorical_crossentropy",optimizer=tf.keras.optimizers.RMSprop(),metrics=[logloss,accuracy,top_50])

# 模型输入数据

In [40]:
train_seq_mat1 = np.array(list(train_test_data[train_test_data['label']!=-1]['seq_feature_item']))
train_seq_mat2 = np.array(list(train_test_data[train_test_data['label']!=-1]['seq_feature_time']))
train_y = np.array(list(train_test_data[train_test_data['label']!=-1]['label']))

# 模型训练

In [42]:
model_fit = model.fit([train_seq_mat1,train_seq_mat2],train_y,batch_size=128,epochs=10,
                      validation_split=0.2)

Train on 14804 samples, validate on 3701 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [81]:
train_test_data.head()

Unnamed: 0,seq_feature_item,seq_feature_time,label,user_id
0,"[32530, 33844, 8333, 8441, 36729, 3209, 18135,...","[0.7062513744688279, 0.6754545875429329, 0.674...",22156,1
1,"[935, 2959, 5630, 7634, 35576, 18212, 18813, 1...","[0.9475357940677906, 0.9474142620695385, 0.945...",4678,10
2,"[40373, 21953, 25097, 8830, 25706, 26347, 2598...","[0.8525122400087298, 0.8484380244684021, 0.847...",35976,100
3,"[35056, 27145, 0, 0, 0, 0, 0, 0, 0, 0]","[0.20507945877707037, 0.08657418660304175, 0, ...",35129,10000
4,"[32430, 29174, 24411, 4714, 22536, 22565, 6081...","[0.9466156232277368, 0.4690440640291869, 0.468...",18311,10002


# 模型预测

In [43]:
test_seq_mat1 = np.array(list(train_test_data[train_test_data['label']==-1]['seq_feature_item']))
test_seq_mat2 = np.array(list(train_test_data[train_test_data['label']==-1]['seq_feature_time']))
# test_y = np.array(list(train_test_data[train_test_data['label']==-1]['label']))

In [45]:
pred = model.predict([test_seq_mat1,test_seq_mat2])

# 双塔

In [46]:
class CustomLayer(tf.keras.layers.Layer):
    def __init__(self,num_uids):
        super(CustomLayer, self).__init__(trainable=True,dtype=tf.int64)
        self.num_uids = num_uids

    def build(self,input_shape):
        self.params = tf.Variable(tf.random.normal((num_uids, 9)), trainable=True)
        self.built=True

    def call(self, input_uid,input_shared_features):
        param = tf.gather_nd(self.params, input_uid)
        combined = tf.concat([param, input_shared_features], axis=-1)
        return combined

    def get_config(self):
        config = super(CustomLayer, self).get_config()
        config.update({'num_uids': self.num_uids})
        return config

In [47]:
class TwoTowersModel:
    def __init__(self, 
                 seq_length,
                 vocab_size,
                 embedding_dim
                ):
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        
    def model(self):
        # 输入
        # 用户序列特征
        self.inputs_item_seq = tf.keras.Input(shape=(self.seq_length,),dtype=tf.int32)
        self.inputs_time_seq = tf.keras.Input(shape=(self.seq_length,),dtype=tf.float32)
        
        # 用户下一个item
        self.inputs_item_next = tf.keras.Input(shape=(1,),dtype=tf.int32)
        
        # seq embedding 化
#         param = tf.Variable(tf.zeros([self.vocab_size,self.embedding_dim]))
#         self.cat_embedding = tf.nn.embedding_lookup(param, self.inputs_item_seq)
        
        # item embeeding 化
#         self.item_embedding = tf.nn.embedding_lookup(param, self.inputs_item_next)
        
        self.embedding_ = tf.keras.layers.Embedding(input_dim=self.vocab_size, 
                                                       output_dim=self.embedding_dim
                                                       )
        self.cat_embedding = self.embedding_(self.inputs_item_seq)
        self.item_embedding = tf.squeeze(self.embedding_(self.inputs_item_next),axis=1)

    
        # float embedding化
        list_float_embedding = []
        split_ = tf.split(self.inputs_time_seq,num_or_size_splits=self.seq_length,axis=-1)
        dense_ = tf.keras.layers.Dense(units=self.embedding_dim, activation=tf.nn.relu)
        for s in split_:
            float_embedding_ = tf.expand_dims(dense_(s),1)
            list_float_embedding.append(float_embedding_)
        self.float_embedding = tf.concat(list_float_embedding,axis=1)

#         print(self.cat_embedding,self.float_embedding)
        # 简单 concat
        self.concat = tf.keras.layers.concatenate([self.cat_embedding,self.float_embedding], axis=-1)
        
#         print(self.concat)
#         self.concat = tf.Tensor(self.concat,dtype=tf.float32)
        # lstm 网络
        self.lstm = tf.keras.layers.LSTM(units=64)(self.concat)
        
        # 用户 fc 网路
        self.dense_user = tf.keras.layers.Dense(units=64, activation=tf.nn.relu)(self.lstm)
        
        # 商品 fc 网络
        self.dense_item = tf.keras.layers.Dense(units=64, activation=tf.nn.relu)(self.item_embedding)

        # a * b
        self.similar = tf.reduce_sum(tf.multiply(self.dense_user, self.dense_item),axis=-1)
        
        # softmax
        self.outputs = tf.nn.sigmoid(self.similar)
        
        # 输出
        self.model_user = tf.keras.Model(inputs=[self.inputs_item_seq,self.inputs_time_seq], outputs=self.dense_user)
        self.model_item = tf.keras.Model(inputs=[self.inputs_item_next], outputs=self.dense_item)
        
        self.model = tf.keras.Model(inputs=[self.inputs_item_seq,self.inputs_time_seq,self.inputs_item_next], outputs=self.outputs)
    
        return self.model, self.model_user, self.model_item

# 双塔模型

In [48]:
two_towers_model = TwoTowersModel(seq_length=seq_length,
                 vocab_size=vocab_size,
                 embedding_dim=embedding_dim)

In [49]:
model2, model2_user, model2_item = two_towers_model.model()

# 查看 商品item 模型结构

In [50]:
model2_item.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
embedding_1 (Embedding)      multiple                  652416    
_________________________________________________________________
tf_op_layer_Squeeze (TensorF [(None, 16)]              0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)                1088      
Total params: 653,504
Trainable params: 653,504
Non-trainable params: 0
_________________________________________________________________


# 查看 用户user 模型结构

In [52]:
model2_user.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
tf_op_layer_split_1 (TensorFlow [(None, 1), (None, 1 0           input_4[0][0]                    
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 16)           32          tf_op_layer_split_1[0][0]        
                                                                 tf_op_layer_split_1[0][1]        
                                                                 tf_op_layer_split_1[0][2]        
                                                                 tf_op_layer_split_1[0][3]  

# 查看整体模型结构

In [53]:
model2.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
tf_op_layer_split_1 (TensorFlow [(None, 1), (None, 1 0           input_4[0][0]                    
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 16)           32          tf_op_layer_split_1[0][0]        
                                                                 tf_op_layer_split_1[0][1]        
                                                                 tf_op_layer_split_1[0][2]        
                                                                 tf_op_layer_split_1[0][3]  

# 模型 loss 评测 优化

In [54]:
model2.compile(loss=tf.keras.losses.binary_crossentropy,optimizer=tf.keras.optimizers.RMSprop(),metrics=[tf.keras.metrics.binary_crossentropy,tf.keras.metrics.binary_accuracy])

# 迭代器

In [64]:
def generator(X1,X2,y,batch_size=128):
    # seq_feature_item	seq_feature_time	label
    length = len(list(y))
    index = 0
    while True:
        index += 1
        if index > 1000:
            break
        
        sample_index1 = np.random.randint(length, size=batch_size)
        X1_1 = X1[sample_index1]
        X2_1 = X2[sample_index1]
        y_1 = y[sample_index1]
        
        sample_index2 = np.random.randint(length, size=batch_size)
        y_0 = y[sample_index2]
        
        label_1_0 = np.array([1] * batch_size + [1 if i==j else 0 for i,j in zip(y_1,y_0)]) 
        
        yield [np.vstack([X1_1,X1_1]), np.vstack([X2_1,X2_1]), np.hstack([y_1,y_0])], label_1_0

# 测试迭代器

In [58]:
data=next(generator(train_seq_mat1,train_seq_mat2,train_y,batch_size=128))

In [59]:
data

([array([[28317,  1206, 35574, ...,  6019,   642, 39609],
         [15055, 11327, 38786, ..., 17432, 15830,  4605],
         [ 9569, 23903,  6016, ...,  9926, 16662,  4215],
         ...,
         [33592, 25060, 28384, ...,     0,     0,     0],
         [35266, 19392, 14210, ...,     0,     0,     0],
         [21604, 30239, 15968, ...,     0,     0,     0]]),
  array([[0.62669132, 0.62631804, 0.46359827, ..., 0.46310346, 0.46307453,
          0.4630398 ],
         [0.9843571 , 0.98363658, 0.98315914, ..., 0.95427793, 0.9542432 ,
          0.95396542],
         [0.90602394, 0.90579534, 0.85741982, ..., 0.37876026, 0.31506592,
          0.31484021],
         ...,
         [0.46348542, 0.46268968, 0.46133546, ..., 0.        , 0.        ,
          0.        ],
         [0.65940079, 0.47213734, 0.34166696, ..., 0.        , 0.        ,
          0.        ],
         [0.6989392 , 0.6973535 , 0.21630381, ..., 0.        , 0.        ,
          0.        ]]),
  array([27170,  2747, 12380, 31

# 测试模型训练

In [60]:
model2.fit([data[0][0],data[0][1],data[0][2]],data[1])

Train on 256 samples


<tensorflow.python.keras.callbacks.History at 0x142f749e8>

# 模型训练

In [65]:
model2.fit_generator(generator(train_seq_mat1,train_seq_mat2,train_y,batch_size=128),epochs=1)

Instructions for updating:
Please use Model.fit, which supports generators.
  ...
    to  
  ['...']


<tensorflow.python.keras.callbacks.History at 0x143bf7d68>

# 用户user 模型预测

In [69]:
test_seq_mat1 = np.array(list(train_test_data[train_test_data['label']==-1]['seq_feature_item']))
test_seq_mat2 = np.array(list(train_test_data[train_test_data['label']==-1]['seq_feature_time']))

In [68]:
user_embedding = model2_user.predict([test_seq_mat1,test_seq_mat2])

# 查看用户user embedding 

In [75]:
user_embedding

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.18476534],
       [0.        , 0.        , 0.05551466, ..., 0.02946857, 0.32896775,
        0.0164078 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.2463957 ,
        0.05669748],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.15368362,
        0.07387834],
       [0.02807279, 0.        , 0.        , ..., 0.04799657, 0.33363134,
        0.02776098],
       [0.        , 0.56821394, 0.8854616 , ..., 0.        , 0.        ,
        0.38548687]], dtype=float32)

# 商品item 模型预测

In [72]:
item_mat = np.array(list(train_test_data[train_test_data['label']!=-1]['label']))

In [73]:
item_embedding = model2_item.predict([item_mat])

# 查看商品item embedding

In [76]:
item_embedding

array([[0.        , 0.        , 0.        , ..., 0.        , 0.0324979 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.09356035,
        0.        ],
       [0.        , 0.05674259, 0.        , ..., 0.        , 0.03220441,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.07289328,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.03572592,
        0.        ],
       [0.01450562, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

# Rrank 模型

In [77]:
class RankModel:
    def __init__(self, 
                 seq_length,
                 vocab_size,
                 embedding_dim
                ):
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        
    def model(self):
        # 输入
        # 用户序列特征
        self.inputs_item_seq = tf.keras.Input(shape=(self.seq_length,),dtype=tf.int32)
        self.inputs_time_seq = tf.keras.Input(shape=(self.seq_length,),dtype=tf.float32)
        
        # 用户下一个item
        self.inputs_item_next = tf.keras.Input(shape=(1,),dtype=tf.int32)
        
        # seq embedding 化
#         param = tf.Variable(tf.zeros([self.vocab_size,self.embedding_dim]))
#         self.cat_embedding = tf.nn.embedding_lookup(param, self.inputs_item_seq)
        
        # item embeeding 化
#         self.item_embedding = tf.nn.embedding_lookup(param, self.inputs_item_next)
        
        self.embedding_ = tf.keras.layers.Embedding(input_dim=self.vocab_size, 
                                                       output_dim=self.embedding_dim
                                                       )
        self.cat_embedding = self.embedding_(self.inputs_item_seq)
        self.item_embedding = tf.squeeze(self.embedding_(self.inputs_item_next),axis=1)

    
        # float embedding化
        list_float_embedding = []
        split_ = tf.split(self.inputs_time_seq,num_or_size_splits=self.seq_length,axis=-1)
        dense_ = tf.keras.layers.Dense(units=self.embedding_dim, activation=tf.nn.relu)
        for s in split_:
            float_embedding_ = tf.expand_dims(dense_(s),1)
            list_float_embedding.append(float_embedding_)
        self.float_embedding = tf.concat(list_float_embedding,axis=1)

#         print(self.cat_embedding,self.float_embedding)
        # 简单 concat
        self.concat = tf.keras.layers.concatenate([self.cat_embedding,self.float_embedding],axis=-1)
        
#         print(self.concat)
#         self.concat = tf.Tensor(self.concat,dtype=tf.float32)
        # lstm 网络
        self.lstm = tf.keras.layers.LSTM(units=64)(self.concat)
        
        # 用户 fc 网路
        self.dense_user = tf.keras.layers.Dense(units=64, activation=tf.nn.relu)(self.lstm)
        
        # 商品 fc 网络
        self.dense_item = tf.keras.layers.Dense(units=64, activation=tf.nn.relu)(self.item_embedding)
        
        # 合并网络
        self.dense = tf.keras.layers.concatenate([self.dense_user,self.dense_item],axis=-1)
            
        # softmax
        self.outputs = tf.keras.layers.Dense(units=1, activation=tf.nn.sigmoid)(self.dense)
        
        
        # 输出
        # self.model_user = tf.keras.Model(inputs=[self.inputs_item_seq,self.inputs_time_seq], outputs=self.dense_user)
        # self.model_item = tf.keras.Model(inputs=[self.inputs_item_next], outputs=self.dense_item)
        
        self.model = tf.keras.Model(inputs=[self.inputs_item_seq,self.inputs_time_seq,self.inputs_item_next], outputs=self.outputs)
    
        return self.model

# 排序模型

In [78]:
rank_model = RankModel(seq_length=seq_length,
                 vocab_size=vocab_size,
                 embedding_dim=embedding_dim)

In [79]:
model3 = rank_model.model()

# 模型结构查看

In [80]:
model3.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
tf_op_layer_split_2 (TensorFlow [(None, 1), (None, 1 0           input_7[0][0]                    
__________________________________________________________________________________________________
dense_6 (Dense)                 (None, 16)           32          tf_op_layer_split_2[0][0]        
                                                                 tf_op_layer_split_2[0][1]        
                                                                 tf_op_layer_split_2[0][2]        
                                                                 tf_op_layer_split_2[0][3]  

# 赠送大家 封装的基础结构

# 特征配置文件

In [82]:
class ConfigFeatures:
    def __init__(self,
                features_name,
                group_name,
                features_type,
                features_length,
                data_type,
                vocab_size):
        self.features_name = features_name
        self.group_name = group_name
        self.features_type = features_type
        self.features_length = int(features_length)
        self.data_type = data_type
        self.vocab_size = int(vocab_size) + 1
        self.deal_type()
    
    def deal_type(self):
        type_map = {
                    'int8':tf.int8,
                    'int16':tf.int16,
                    'int32':tf.int32,
                    'int64':tf.int64,
                    'float16':tf.float16,
                    'float32':tf.float32,
                    'float64':tf.float64,
                    'double':tf.double
                    }
        self.data_type = type_map[self.data_type]

# 数据输入文件

In [83]:
class Input:
    def __init__(self,
                 config_features
                ):
        self.config_features = config_features
        
    def __call__(self):
        tf_input = tf.keras.Input(shape=self.config_features.features_length, 
                       name='input_'+self.config_features.features_name, 
                       dtype=self.config_features.data_type)
        return tf_input

# Embedding Layer

In [84]:
class DenseEmbeeding:
    def __init__(self,
                output_dim,
                activation=None, 
                use_bias=True,
                trainable=True
                ):
        self.output_dim = output_dim
        self.activation = activation
        self.use_bias = use_bias
        self.trainable = trainable
    def __call__(self, input_layer):
        output_layer = tf.keras.layers.Dense(
                            units=self.output_dim,
                            activation=self.activation,
                            use_bias=self.use_bias,
                            trainable=self.trainable,
                            )(input_layer)
        return output_layer

In [85]:
class CategoryEmbeeding:
    def __init__(self,
                output_dim,
                vocab_size,
                embeddings_initializer='uniform',
                trainable=True
                ):
        self.output_dim = output_dim
        self.vocab_size = vocab_size
        self.embeddings_initializer = embeddings_initializer
        self.trainable = trainable
    def __call__(self, input_layer):
        output_layer = tf.keras.layers.Embedding(
                        input_dim=self.vocab_size, 
                        output_dim=self.output_dim,
                        embeddings_initializer=self.embeddings_initializer,
                        trainable=self.trainable,
                        )(input_layer)
        output_layer = tf.squeeze(output_layer,axis=1)
        return output_layer

In [86]:
class DenseSequenceEmbeeding:
    def __init__(self,
                output_dim,
                activation=None, 
                use_bias=True,
                trainable=True
                ):
        self.output_dim = output_dim
        self.activation = activation
        self.use_bias = use_bias
        self.trainable = trainable
    def __call__(self, input_layer):
        input_layer = tf.expand_dims(input_layer,axis=-1)
        output_layer = tf.keras.layers.Dense(
                            units=self.output_dim,
                            activation=self.activation,
                            use_bias=self.use_bias,
                            trainable=self.trainable,
                            )(input_layer)
        return output_layer

In [87]:
class CategorySequenceEmbeeding:
    def __init__(self,
                output_dim,
                vocab_size,
                embeddings_initializer='uniform',
                trainable=True
                ):
        self.output_dim = output_dim
        self.vocab_size = vocab_size
        self.embeddings_initializer = embeddings_initializer
        self.trainable = trainable
    def __call__(self, input_layer):
        output_layer = tf.keras.layers.Embedding(
                        input_dim=self.vocab_size, 
                        output_dim=self.output_dim,
                        embeddings_initializer=self.embeddings_initializer,
                        trainable=self.trainable,
                        )(input_layer)
        return output_layer

In [88]:
class CategorySequenceEmbeeding_same:
    def __init__(self,
                output_dim,
                vocab_size,
                embeddings_initializer='uniform',
                trainable=True
                ):
        self.output_dim = output_dim
        self.vocab_size = vocab_size
        self.embeddings_initializer = embeddings_initializer
        self.trainable = trainable
    def __call__(self, input_layer):
        output_layer = tf.keras.layers.Embedding(
                        input_dim=self.vocab_size, 
                        output_dim=self.output_dim,
                        embeddings_initializer=self.embeddings_initializer,
                        trainable=self.trainable,
                        )(input_layer)
        return output_layer

# FmLayer

In [89]:
class FmLayer:
    def __init__(self,
                dense_use=True, 
                dense_units=8, 
                activation=tf.nn.relu,
                use_bias=True,
                trainable=True,
                reduce_sum=True
                ):
        self.dense_use = dense_use
        self.dense_units = dense_units
        self.activation = activation
        self.reduce_sum = reduce_sum
        self.use_bias = use_bias
        self.trainable = trainable
        
    def __call__(self, list_input_layer):

        list_interaction = []
        for input_layer_i in list_input_layer:
            for input_layer_j in list_input_layer:
                
                assert len(input_layer_i.get_shape().as_list()) == 2
                assert len(input_layer_j.get_shape().as_list()) == 2
                
                interaction_layer = input_layer_i * input_layer_j
                if self.dense_use:
                    interaction_layer = tf.keras.layers.Dense(
                            units=self.dense_units,
                            activation=self.activation,
                            use_bias=self.use_bias,
                            trainable=self.trainable,
                            )(interaction_layer)
                if self.reduce_sum:
                    interaction_layer = tf.reduce_sum(interaction_layer,axis=-1)
                if len(interaction_layer.get_shape().as_list()) == 1:
                    interaction_layer = tf.expand_dims(interaction_layer,axis=-1)
                    
                list_interaction.append(interaction_layer)
        return tf.concat(list_interaction,axis=-1)

# DenseLayer

In [90]:
class DenseLayer:
    def __init__(self,
                list_dense_units,
                activation=tf.nn.relu,
                use_bias=True,
                trainable=True
                ):
        self.list_dense_units = list_dense_units
        self.activation = activation
        self.use_bias = use_bias
        self.trainable = trainable
    
    def __call__(self, list_input_layer):
        output_layer = tf.concat(list_input_layer,axis=-1)
        
        assert len(output_layer.get_shape().as_list()) == 2
        for units in self.list_dense_units:
            output_layer = tf.keras.layers.Dense(
                            units=units,
                            activation=self.activation,
                            use_bias=self.use_bias,
                            trainable=self.trainable,
                            )(output_layer)
        return output_layer

# AddMultiDenseLayer

In [91]:
class AddMultiDenseLayer:
    def __init__(self,
                activation=tf.nn.relu,
                use_bias=True,
                trainable=True
                ):
        self.activation = activation
        self.use_bias = use_bias
        self.trainable = trainable 
        
    def __call__(self, input_layer):
        input_layer = input_layer
        assert len(input_layer.get_shape().as_list()) == 2
        
        layer_add = input_layer + tf.keras.layers.Dense(
                            units=input_layer.get_shape().as_list()[-1],
                            activation=self.activation,
                            use_bias=self.use_bias,
                            trainable=self.trainable,
                            )(input_layer)
        
        layer_multi = input_layer * tf.keras.layers.Dense(
                            units=input_layer.get_shape().as_list()[-1],
                            activation=self.activation,
                            use_bias=self.use_bias,
                            trainable=self.trainable,
                            )(input_layer)
        output_layer = tf.concat([layer_add,layer_multi])
        
        output_layer = tf.keras.layers.Dense(
                            units=input_layer.get_shape().as_list()[-1],
                            activation=self.activation,
                            use_bias=self.use_bias,
                            trainable=self.trainable,
                            )(output_layer)
        return output_layer, input_layer + output_layer


# DinLayer

In [92]:
class DinLayer:
    def __init__(self,
                activation=tf.nn.relu,
                use_bias=True,
                trainable=True,
                use_dense=True
                ):
        self.activation = activation
        self.use_bias = use_bias
        self.trainable = trainable
        self.use_dense = use_dense
    def __call__(self, 
                 list_input_sequence_layer,
                 list_input_layer
                ):
        output_sequence_layer = tf.concat(list_input_sequence_layer,axis=-1)
        _, sequence_length, units = output_sequence_layer.get_shape().as_list()
        
        
        output_layer = tf.concat(list_input_layer,axis=-1)
        output_layer = tf.keras.layers.Dense(
                            units=units,
                            activation=self.activation,
                            use_bias=self.use_bias,
                            trainable=self.trainable,
                            )(output_layer)
        
        
        list_layer = []
        for layer_ in tf.split(output_sequence_layer,sequence_length,axis=1):
            
            layer_ = tf.squeeze(layer_,axis=1)
            layer = layer_
            if self.use_dense:
                layer_ = tf.keras.layers.Dense(
                            units=units,
                            activation=self.activation,
                            use_bias=self.use_bias,
                            trainable=self.trainable,
                            )(layer_)
                
            dot_ = output_layer * layer_
            concat_ = tf.concat([layer_, output_layer, dot_],axis=-1)
            
            active_units_ = tf.keras.layers.Dense(
                            units=1,
                            activation=self.activation,
                            use_bias=self.use_bias,
                            trainable=self.trainable,
                            )(concat_)
            layer = tf.expand_dims(layer * active_units_,-1)
            list_layer.append(layer)
            
        output_layer = tf.reduce_sum(tf.concat(list_layer,axis=-1),axis=-1)
        
        return output_layer

# DinSameLayer

In [93]:
class DinSameLayer:
    def __init__(self,
                activation=tf.nn.relu,
                use_bias=True,
                trainable=True,
                use_dense=True
                ):
        self.activation = activation
        self.use_bias = use_bias
        self.trainable = trainable
        self.use_dense = use_dense
    def __call__(self, 
                 list_input_sequence_layer,
                 list_input_layer
                ):
        output_sequence_layer = tf.concat(list_input_sequence_layer,axis=-1)
        _, sequence_length, units = output_sequence_layer.get_shape().as_list()
        
        
        output_layer = tf.concat(list_input_layer,axis=-1)
        output_layer = tf.keras.layers.Dense(
                            units=units,
                            activation=self.activation,
                            use_bias=self.use_bias,
                            trainable=self.trainable,
                            )(output_layer)
        
        
        list_layer = []
        dense_layer_1 = tf.keras.layers.Dense(
                            units=units,
                            activation=self.activation,
                            use_bias=self.use_bias,
                            trainable=self.trainable,
                            )
        dense_layer_2 = tf.keras.layers.Dense(
                            units=1,
                            activation=self.activation,
                            use_bias=self.use_bias,
                            trainable=self.trainable,
                            )
        for layer_ in tf.split(output_sequence_layer,sequence_length,axis=1):
            
            layer_ = tf.squeeze(layer_,axis=1)
            layer = layer_
            if self.use_dense:
                layer_ = dense_layer_1(layer_)
                
            dot_ = output_layer * layer_
            concat_ = tf.concat([layer_, output_layer, dot_],axis=-1)
            
            active_units_ = dense_layer_2(concat_)
            layer = tf.expand_dims(layer * active_units_,-1)
            list_layer.append(layer)
            
        output_layer = tf.reduce_sum(tf.concat(list_layer,axis=-1),axis=-1)
        
        return output_layer