In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sys
import time
import random

In [2]:
def to_df(file_path):
    with open(file_path, "r")as f:
        i, df = 0, {}
        for line in f:
            df[i] = eval(line)
            i += 1
        # 使用字典里面在键作为索引
        df = pd.DataFrame.from_dict(df, orient="index")
        return df

In [3]:
review_df = to_df("./reviews_Electronics_5.json")

In [10]:
meta_df = to_df('./meta_Electronics.json') 

In [5]:
review_df.head(5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010"
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",I've had mine for a year and here's what we go...,1.0,"Major issues, only excuses for support",1317254400,"09 29, 2011"


In [11]:
meta_df['categories'].head(5)

0    [[Electronics, Computers & Accessories, Cables...
1    [[Electronics, Computers & Accessories, Cables...
2    [[Electronics, Computers & Accessories, PDAs, ...
3    [[Electronics, Accessories & Supplies, Audio &...
4    [[Electronics, GPS & Navigation, Vehicle GPS, ...
Name: categories, dtype: object

In [12]:
# 选取在review数据出现在goods_id
meta_df = meta_df[meta_df['asin'].isin(review_df['asin'].unique())]
meta_df = meta_df.reset_index(drop=True)

In [13]:
# 重新考量ID
review_df = review_df[["reviewerID","asin","unixReviewTime"]]
meta_df = meta_df[["asin", "categories"]]
#  目前分类都按照最后一类来分
meta_df["categories"] = meta_df["categories"].map(lambda x: x[-1][-1])

In [14]:
# 定义一个有序的字段和索引的映射函数
def build_map(df, col_name):
    key = sorted(df[col_name].unique().tolist())
    m  = dict(zip(key , range(len(key))))
    df[col_name] = df[col_name].map(lambda x: m[x])
    return m, key

In [15]:
asin_map, asin_key = build_map(meta_df,"asin")
cate_map, cate_key = build_map(meta_df, "categories")
rev_map, rev_key = build_map(review_df, "reviewerID")

In [16]:
user_count, item_count, cate_count, example_count = len(rev_map),len(asin_map),len(cate_map),review_df.shape[0]

In [17]:
# 统计用户量商品量分类量和样本数
print('user_count: %d\titem_count: %d\tcate_count: %d       example_count: %d' %(user_count, item_count, cate_count, example_count))

user_count: 19168	item_count: 971	cate_count: 241       example_count: 24187


In [18]:
meta_df = meta_df.sort_values("asin").reset_index(drop=True)
# 统一转化成下标
review_df["asin"] = review_df["asin"].map(lambda x: asin_map[x])
review_df = review_df.sort_values(["reviewerID", "unixReviewTime"]).reset_index(drop=True)
review_df = review_df[["reviewerID","asin","unixReviewTime"]]

In [19]:
review_df.head(5)

Unnamed: 0,reviewerID,asin,unixReviewTime
0,0,75,1385337600
1,1,890,1358035200
2,2,643,1361750400
3,3,168,1390003200
4,4,533,1350086400


In [21]:
meta_df.head(5)

Unnamed: 0,asin,categories
0,0,217
1,1,47
2,2,167
3,3,206
4,4,210


In [22]:
cate_list = np.array([meta_df["categories"][i] for i in range(len(asin_map))],dtype=np.int32)

In [32]:
review_df.groupby("reviewerID").head()

Unnamed: 0,reviewerID,asin,unixReviewTime
0,0,75,1385337600
1,1,890,1358035200
2,2,643,1361750400
3,3,168,1390003200
4,4,533,1350086400
...,...,...,...
24182,19164,439,1381104000
24183,19164,596,1381708800
24184,19165,359,1282435200
24185,19166,364,1184889600


In [36]:
train_set, test_set = [], []
num=0
for reviewerID, hist in review_df.groupby("reviewerID"):
    # 分类购买物品的物品asin码
    pos_list = hist["asin"].tolist()
    # 生成负样本函数。
    def gen_neg():
        neg = pos_list[0]
        while neg in pos_list:
            neg = random.randint(0, item_count - 1)
        return neg
    # 负样本
    neg_list = [gen_neg() for i in range(len(pos_list))]
    # 生成测试集和训练集
    for i in range(1, len(pos_list)):
        # 这里为什么不取闭区间
        hist = pos_list[:i]
        # 如果是倒数第二个元素的索引
        if i == len(pos_list) -1:
            lable = (pos_list[i], neg_list[i])
            test_set.append((reviewerID, hist, lable))
        else:
            train_set.append((reviewerID, hist, pos_list[i], 1))
            train_set.append((reviewerID, hist, neg_list[i], 0))

In [27]:
random.shuffle(train_set)
random.shuffle(test_set)

In [37]:
test_set

[(6, [223], (346, 605)),
 (9, [865, 666], (257, 323)),
 (28, [371, 268], (497, 375)),
 (31, [803, 84], (896, 522)),
 (38, [453], (724, 93)),
 (41, [411], (912, 302)),
 (46, [303], (771, 385)),
 (56, [218, 248], (881, 635)),
 (63, [583], (596, 264)),
 (64, [411], (236, 23)),
 (71, [471], (706, 380)),
 (73, [596], (706, 749)),
 (81, [599, 854, 853, 856, 860], (841, 878)),
 (86, [359], (439, 719)),
 (90, [380, 583, 586], (596, 844)),
 (98, [380], (443, 617)),
 (99, [706, 235], (471, 574)),
 (107, [625, 408], (742, 80)),
 (109, [453], (103, 934)),
 (110, [759], (956, 928)),
 (117, [450], (748, 442)),
 (121, [714, 71], (51, 637)),
 (135, [845], (835, 649)),
 (142, [26], (668, 945)),
 (147, [442], (402, 258)),
 (167, [359], (596, 956)),
 (183, [144, 953], (726, 804)),
 (191, [643], (648, 518)),
 (192, [381], (179, 256)),
 (194, [647], (652, 460)),
 (200, [287], (421, 553)),
 (201, [149], (861, 283)),
 (204, [272], (164, 863)),
 (205, [202, 800, 512, 189, 963], (335, 723)),
 (206, [554], (371

In [38]:
train_set

[(9, [865], 666, 1),
 (9, [865], 289, 0),
 (28, [371], 268, 1),
 (28, [371], 335, 0),
 (31, [803], 84, 1),
 (31, [803], 461, 0),
 (56, [218], 248, 1),
 (56, [218], 336, 0),
 (81, [599], 854, 1),
 (81, [599], 2, 0),
 (81, [599, 854], 853, 1),
 (81, [599, 854], 729, 0),
 (81, [599, 854, 853], 856, 1),
 (81, [599, 854, 853], 264, 0),
 (81, [599, 854, 853, 856], 860, 1),
 (81, [599, 854, 853, 856], 904, 0),
 (90, [380], 583, 1),
 (90, [380], 455, 0),
 (90, [380, 583], 586, 1),
 (90, [380, 583], 367, 0),
 (99, [706], 235, 1),
 (99, [706], 265, 0),
 (107, [625], 408, 1),
 (107, [625], 789, 0),
 (121, [714], 71, 1),
 (121, [714], 252, 0),
 (183, [144], 953, 1),
 (183, [144], 849, 0),
 (205, [202], 800, 1),
 (205, [202], 51, 0),
 (205, [202, 800], 512, 1),
 (205, [202, 800], 487, 0),
 (205, [202, 800, 512], 189, 1),
 (205, [202, 800, 512], 922, 0),
 (205, [202, 800, 512, 189], 963, 1),
 (205, [202, 800, 512, 189], 238, 0),
 (210, [60], 216, 1),
 (210, [60], 10, 0),
 (210, [60, 216], 87, 1),
 (

# 构建训练需要的样本数据

In [39]:
# 训练集数据
class DataInput(object):
    def __init__(self,data, batch_size):
        # len(data), batch_size= 2608764, 32
        self.batch_size = batch_size
        self.data = data
        self.epoch_size = len(self.data) // self.batch_size
        # 计算迭代次数
        self.epoch_size + 1 if self.epoch_size * self.batch_size <len(self.data) else self.epoch_size
        self.idx = 0
    def __iter__(self):
        return self
    def __next__(self):
        if self.idx == self.epoch_size:
            raise StopIteration
        start, end = self.idx*self.batch_size, min((self.idx+1)*self.batch_size, len(self.data))
        b_data = self.data[start: end]
        self.idx += 1
        user_id, item_id, y, sample_len = [],[],[],[]
        for i in b_data:
            user_id.append(i[0])
            item_id.append(i[2])
            y.append(i[3])
            sample_len.append(len(i[1]))
        # 获取最长列表
        max_sl = max(sample_len)
        # 获取兴趣矩阵(以最长的为主) [用户个数，最长兴趣列表长度]
        hist_i = np.zeros([len(b_data),max_sl], np.int64)
        # 填充兴趣列表，并且用0填充的方法
        k = 0
        for l in b_data:
            for j in range(len(l[1])):
                hist_i[k][j] = l[1][j]
            k += 1
        return self.idx, (user_id, item_id, y, hist_i, sample_len)

In [40]:
# user_count item_count cate_count cate_list
random.seed(1234)
np.random.seed(1234)
tf.set_random_seed(1234)

predict_users_num = 1000
predict_batch_size = 32
predict_ads_num = 100

train_batch_size, test_barch_size = 32, 512

# Dice

In [42]:
def dice(_x, axis=-1, epsilon=0.000000001, name=''):
    with tf.variable_scope(name_or_scope='', reuse=tf.AUTO_REUSE):
        alphas = tf.get_variable('alpha' + name, _x.get_shape()[-1],
                                 initializer=tf.constant_initializer(0.0),
                                 dtype=tf.float32)
        beta = tf.get_variable('beta' + name, _x.get_shape()[-1],
                               initializer=tf.constant_initializer(0.0),
                               dtype=tf.float32)
    input_shape = list(_x.get_shape())

    reduction_axes = list(range(len(input_shape)))
    del reduction_axes[axis]
    broadcast_shape = [1] * len(input_shape)
    broadcast_shape[axis] = input_shape[axis]
    mean = tf.reduce_mean(_x, axis=reduction_axes)
    brodcast_mean = tf.reshape(mean, broadcast_shape)
    std = tf.reduce_mean(tf.square(_x - brodcast_mean) + epsilon, axis=reduction_axes)
    std = tf.sqrt(std)
    brodcast_std = tf.reshape(std, broadcast_shape)
    x_normed = (_x - brodcast_mean) / (brodcast_std + epsilon)
    x_p = tf.sigmoid(beta * x_normed)

    return alphas * (1.0 - x_p) * _x + x_p * _x

# attention

In [43]:
def attention(queries, keys, keys_length):
    '''
      queries:     [B, H]   item_emb
      keys:        [B, T, H]   hist_emb
      keys_length: [B]    sample_len
    '''
    # shape:  0 ,data: 128
    queries_hidden_units = queries.get_shape().as_list()[-1]
    # shape: [32,,T*128] ,data: (item, W)
    queries = tf.tile(queries, [1, tf.shape(keys)[1]])
    # shape: [32, T, 128], data:(item, W)
    queries = tf.reshape(queries, [-1, tf.shape(keys)[1], queries_hidden_units])
    # shape: [32, T, 128+128+128+128]
    din_all = tf.concat([queries, keys, queries - keys, queries * keys], axis=-1)
    d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att', reuse=tf.AUTO_REUSE)
    d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att', reuse=tf.AUTO_REUSE)
    d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att', reuse=tf.AUTO_REUSE)
    d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(keys)[1]])
    # shape: [32,1,T]
    outputs = d_layer_3_all
    key_masks = tf.sequence_mask(keys_length, tf.shape(keys)[1])  # [32, T]
    # shape: [32,1,T]
    key_masks = tf.expand_dims(key_masks, 1)
    # 这里的初始化使用了数值很小的数而不是0，shape: [32,1,T]
    paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
    # shape: [32, 1, T]
    outputs = tf.where(key_masks, outputs, paddings)
    # Scale  [32,1,T]/ sqrt(128) (这里进行标准化)
    outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
    outputs = tf.nn.softmax(outputs)  # [B, 1, T]
    # Weighted sum  加权平均 三维矩阵相乘，相乘发生在后两维，即 B * (( 1 * T ) * ( T * H ))
    # [32, 1, T]*[32, T, H] = [32, 1, H]
    outputs = tf.matmul(outputs, keys)  # [B, 1, H]

    return outputs

# DIN

In [44]:
class Model(object):
    # 传入用户id列表，物品id列表，不感兴趣的item_id, [0,1]标签，用户兴趣二维列表，各个用户兴趣个数列表，学习率
    def __init__(self,user_count,item_count,cate_count,cate_list):
        self.u = tf.placeholder(tf.int32, [None, ])# user_id
        self.i = tf.placeholder(tf.int32, [None, ])# item_id
        self.j = tf.placeholder(tf.int32, [None, ])# not_like_id
        self.y = tf.placeholder(tf.float32, [None, ]) # lable
        self.hist_i = tf.placeholder(tf.int32, [None,None])# [Batch, like] 
        self.sl = tf.placeholder(tf.int32,[None, ]) # sample len list
        self.lr = tf.placeholder(tf.float64, []) # learning rate
        # 定义神经网络的层数
        hidden_units = 128
        with tf.variable_scope("wight",reuse=tf.AUTO_REUSE):
        # 定义变量 用户层数是128层，物品层数是64层，类别变量128层
            item_emb_w = tf.get_variable("item_emb_w", [item_count, hidden_units // 2])
            item_b = tf.get_variable("item_b", [item_count], initializer=tf.constant_initializer(0.0))
            cate_emb_w = tf.get_variable("cate_emb_w", [cate_count, hidden_units // 2])
        cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64)
        ic = tf.gather(cate_list,self.i)
        i_b = tf.gather(item_b, self.i)
        # 对H层进行拼接
        item_emb = tf.concat(
            values=[tf.nn.embedding_lookup(item_emb_w, self.i), tf.nn.embedding_lookup(cate_emb_w,ic)],
            axis=1
        )
        hc = tf.gather(cate_list,self.hist_i)
        # 对H层进行拼接
        hist_emb = tf.concat(
            values=[tf.nn.embedding_lookup(item_emb_w, self.hist_i),tf.nn.embedding_lookup(cate_emb_w,hc)],
            axis = 2
        )
        hist = attention(item_emb,hist_emb,self.sl)
        hist = tf.layers.batch_normalization(inputs=hist)
        hist = tf.reshape(hist, [-1, hidden_units])
        hist = tf.layers.dense(hist, hidden_units)
        user_emb = hist
        base_i = tf.concat([user_emb, item_emb], axis = -1)
        base_i = tf.layers.batch_normalization(inputs=base_i, name="b1",reuse=tf.AUTO_REUSE)
        d_layer_1_i = tf.layers.dense(base_i, 80, activation=None, name="f1",reuse=tf.AUTO_REUSE)
        d_layer_1_i = dice(d_layer_1_i, name="dice_1")
        d_layer_2_i = tf.layers.dense(base_i, 40, activation=None, name="f2",reuse=tf.AUTO_REUSE)
        d_layer_2_i = dice(d_layer_2_i, name = "dice_2")
        d_layer_3_i = tf.layers.dense(base_i, 1, activation=None, name="f3",reuse=tf.AUTO_REUSE)
        # 特征平铺
        d_layer_3_i = tf.reshape(d_layer_3_i,[-1])
        self.y_p = i_b + d_layer_3_i
        # 定义一轮的epoch的步数
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        # 定义epoch的步数
        self.global_epoch_step = tf.Variable(0,trainable=False, name="global_epoch_step")
        self.global_epoch_step_op = tf.assign(self.global_epoch_step, 1)
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
            logits = self.y_p,
            labels = self.y
            )
        )
        
        trainable_params = tf.trainable_variables()
        self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
        gradients = tf.gradients(self.loss, trainable_params)
        # 对梯度进行修剪
        clip_gradients, _ = tf.clip_by_global_norm(gradients, 5)
        self.train_op = self.opt.apply_gradients(
            zip(clip_gradients, trainable_params),global_step=self.global_step
        )
        
    def train(self, sess, item, lr):
        loss, _ = sess.run([self.loss, self.train_op], feed_dict={
            self.u: item[0],
            self.i: item[1],
            self.y: item[2],
            self.hist_i: item[3],
            self.sl: item[4],
            self.lr: lr,
        })
        return loss


# Train

In [47]:
gpu_options = tf.GPUOptions(allow_growth=True)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    model = Model(user_count,item_count,cate_count,cate_list)
    # 初始化全局变量和局部变量
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    
    # writer = tf.summary.FileWriter('pic/', graph=sess.graph)
    # 需要添加eval()评估函数
    sys.stdout.flush() # 定时输出但是在windows下作用不大
    lr = 1.0
    start_time = time.time()
    for _ in range(50):
        random.shuffle(train_set)
        # 每次迭代epoch次数
        epoch_size = round(len(train_set) / train_batch_size)
        loss_sum = 0.0
        for idx, item in DataInput(train_set, train_batch_size):
            loss = model.train(sess, item, lr)
            loss_sum += loss
            if model.global_step.eval()%1000 == 0:
                print("Epoch:  %d, Global_step:  %d, Train_loss: %.4f"%(
                    model.global_epoch_step.eval(),
                    model.global_step.eval(),
                    loss_sum / 1000
                ))
                sys.stdout.flush()
                loss_sum = 0.0
            if model.global_step.eval() % 336000 == 0:
                lr = 0.0

Epoch:  0, Global_step:  1000, Train_loss: 0.0041
Epoch:  0, Global_step:  2000, Train_loss: 0.0076
Epoch:  0, Global_step:  3000, Train_loss: 0.0107
Epoch:  0, Global_step:  4000, Train_loss: 0.0146
Epoch:  0, Global_step:  5000, Train_loss: 0.0179
Epoch:  0, Global_step:  6000, Train_loss: 0.0217
