In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sys
import time
import random

In [3]:
def to_df(file_path):
    with open(file_path, "r")as f:
        i, df = 0, {}
        for line in f:
            df[i] = eval(line)
            i += 1
        # 使用字典里面在键作为索引
        df = pd.DataFrame.from_dict(df, orient="index")
        return df

In [4]:
review_df = to_df("../raw_data/reviews_Electronics_5.json")

In [5]:
meta_df = to_df('../raw_data/meta_Electronics.json')

In [6]:
review_df.head(5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010"
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",I've had mine for a year and here's what we go...,1.0,"Major issues, only excuses for support",1317254400,"09 29, 2011"


In [7]:
meta_df.head(5)

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand
0,132793040,http://ecx.images-amazon.com/images/I/31JIPhp%...,The Kelby Training DVD Mastering Blend Modes i...,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Mastering Blend Modes in A...,,,,
1,321732944,http://ecx.images-amazon.com/images/I/31uogm6Y...,,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Adobe Photoshop CS5 Crash ...,,,,
2,439886341,http://ecx.images-amazon.com/images/I/51k0qa8f...,Digital Organizer and Messenger,"[[Electronics, Computers & Accessories, PDAs, ...",Digital Organizer and Messenger,8.15,{'Electronics': 144944},"{'also_viewed': ['0545016266', 'B009ECM8QY', '...",
3,511189877,http://ecx.images-amazon.com/images/I/41HaAhbv...,The CLIKR-5 UR5U-8780L remote control is desig...,"[[Electronics, Accessories & Supplies, Audio &...",CLIKR-5 Time Warner Cable Remote Control UR5U-...,23.36,,"{'also_viewed': ['B001KC08A4', 'B00KUL8O0W', '...",
4,528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[[Electronics, GPS & Navigation, Vehicle GPS, ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...",


In [8]:
# 选取在review数据出现在goods_id
meta_df = meta_df[meta_df['asin'].isin(review_df['asin'].unique())]
meta_df = meta_df.reset_index(drop=True)

In [9]:
# 重新考量ID
review_df = review_df[["reviewerID","asin","unixReviewTime"]]
meta_df = meta_df[["asin", "categories"]]
#  目前分类都按照最后一类来分
meta_df["categories"] = meta_df["categories"].map(lambda x: x[-1][-1])

In [10]:
# 定义一个有序的字段和索引的映射函数
def build_map(df, col_name):
    key = sorted(df[col_name].unique().tolist())
    m  = dict(zip(key , range(len(key))))
    df[col_name] = df[col_name].map(lambda x: m[x])
    return m, key

In [11]:
asin_map, asin_key = build_map(meta_df,"asin")
cate_map, cate_key = build_map(meta_df, "categories")
rev_map, rev_key = build_map(review_df, "reviewerID")

In [12]:
user_count, item_count, cate_count, example_count = len(rev_map),len(asin_map),len(cate_map),review_df.shape[0]

In [13]:
# 统计用户量商品量分类量和样本数
print('user_count: %d\titem_count: %d\tcate_count: %d       example_count: %d' %(user_count, item_count, cate_count, example_count))

user_count: 192403	item_count: 63001	cate_count: 801       example_count: 1689188


In [14]:
meta_df = meta_df.sort_values("asin").reset_index(drop=True)
# 统一转化成下标
review_df["asin"] = review_df["asin"].map(lambda x: asin_map[x])
review_df = review_df.sort_values(["reviewerID", "unixReviewTime"]).reset_index(drop=True)
review_df = review_df[["reviewerID","asin","unixReviewTime"]]

In [15]:
review_df.head(5)

Unnamed: 0,reviewerID,asin,unixReviewTime
0,0,13179,1400457600
1,0,17993,1400457600
2,0,28326,1400457600
3,0,29247,1400457600
4,0,62275,1400457600


In [16]:
meta_df.head(5)

Unnamed: 0,asin,categories
0,0,738
1,1,157
2,2,571
3,3,707
4,4,714


In [18]:
cate_list = np.array([meta_df["categories"][i] for i in range(len(asin_map))],dtype=np.int32)

In [19]:
train_set, test_set = [], []
for reviewerID, hist in review_df.groupby("reviewerID"):
    # 分类购买物品的物品asin码
    pos_list = hist["asin"].tolist()
    # 生成负样本函数。
    def gen_neg():
        neg = pos_list[0]
        while neg in pos_list:
            neg = random.randint(0, item_count - 1)
        return neg
    # 负样本
    neg_list = [gen_neg() for i in range(len(pos_list))]
    # 生成测试集和训练集
    for i in range(1, len(pos_list)):
        # 这里为什么不取闭区间
        hist = pos_list[:i]
        # 如果是倒数第二个元素的索引
        if i == len(pos_list) -1:
            lable = (pos_list[i], neg_list[i])
            test_set.append((reviewerID, hist, lable))
        else:
            train_set.append((reviewerID, hist, pos_list[i], 1))
            train_set.append((reviewerID, hist, neg_list[i], 0))

In [20]:
random.shuffle(train_set)
random.shuffle(test_set)

# 构建训练需要的样本数据

In [22]:
# 训练集数据
class DataInput(object):
    def __init__(self,data, batch_size):
        # len(data), batch_size= 2608764, 32
        self.batch_size = batch_size
        self.data = data
        self.epoch_size = len(self.data) // self.batch_size
        # 计算迭代次数
        self.epoch_size + 1 if self.epoch_size * self.batch_size <len(self.data) else self.epoch_size
        self.idx = 0
    def __iter__(self):
        return self
    def __next__(self):
        if self.idx == self.epoch_size:
            raise StopIteration
        start, end = self.idx*self.batch_size, min((self.idx+1)*self.batch_size, len(self.data))
        b_data = self.data[start: end]
        self.idx += 1
        user_id, item_id, y, sample_len = [],[],[],[]
        for i in b_data:
            user_id.append(i[0])
            item_id.append(i[2])
            y.append(i[3])
            sample_len.append(len(i[1]))
        # 获取最长列表
        max_sl = max(sample_len)
        # 获取兴趣矩阵(以最长的为主) [用户个数，最长兴趣列表长度]
        hist_i = np.zeros([len(b_data),max_sl], np.int64)
        # 填充兴趣列表，并且用0填充的方法
        k = 0
        for l in b_data:
            for j in range(len(l[1])):
                hist_i[k][j] = l[1][j]
            k += 1
        return self.idx, (user_id, item_id, y, hist_i, sample_len)

In [23]:
# user_count item_count cate_count cate_list
random.seed(1234)
np.random.seed(1234)
tf.set_random_seed(1234)

predict_users_num = 1000
predict_batch_size = 32
predict_ads_num = 100

train_batch_size, test_barch_size = 32, 512

# 定义baselines模型

In [24]:
class Model(object):
    # 传入用户id列表，物品id列表，不感兴趣的item_id, [0,1]标签，用户兴趣二维列表，各个用户兴趣个数列表，学习率
    def __init__(self,user_count,item_count,cate_count,cate_list):
        self.u = tf.placeholder(tf.int32, [None, ])# user_id
        self.i = tf.placeholder(tf.int32, [None, ])# item_id
        self.j = tf.placeholder(tf.int32, [None, ])# not_like_id
        self.y = tf.placeholder(tf.float32, [None, ]) # lable
        self.hist_i = tf.placeholder(tf.int32, [None,None])# [Batch, like] 
        self.sl = tf.placeholder(tf.int32,[None, ]) # sample len list
        self.lr = tf.placeholder(tf.float64, []) # learning rate
        # 定义神经网络的层数
        hidden_units = 128
        with tf.variable_scope("wight",reuse=tf.AUTO_REUSE):
        # 定义变量 用户层数是128层，物品层数是64层，类别变量128层
            item_emb_w = tf.get_variable("item_emb_w", [item_count, hidden_units // 2])
            item_b = tf.get_variable("item_b", [item_count], initializer=tf.constant_initializer(0.0))
            cate_emb_w = tf.get_variable("cate_emb_w", [cate_count, hidden_units // 2])
        cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64)
        ic = tf.gather(cate_list,self.i)
        i_b = tf.gather(item_b, self.i)
        # 对H层进行拼接
        item_emb = tf.concat(
            values=[tf.nn.embedding_lookup(item_emb_w, self.i), tf.nn.embedding_lookup(cate_emb_w,ic)],
            axis=1
        )
        hc = tf.gather(cate_list,self.hist_i)
        # 对H层进行拼接
        hist_emb = tf.concat(
            values=[tf.nn.embedding_lookup(item_emb_w, self.hist_i),tf.nn.embedding_lookup(cate_emb_w,hc)],
            axis = 2
        )
        # 对未购买的物品对应的W进行归0
        mask = tf.sequence_mask(self.sl, tf.shape(hist_emb)[1], dtype=tf.float32)
        mask = tf.tile(tf.expand_dims(mask, -1), [1, 1, tf.shape(hist_emb)[2]])
        hist_emb *= mask
        hist = hist_emb
        # 对每个商品进行拼接
        hist = tf.reduce_sum(hist, 1)
        # Sum pooling
        hist = tf.div(hist , tf.cast(tf.tile(tf.expand_dims(self.sl,1),[1,128]), tf.float32))
        # 归一化以及前向传播
        hist = tf.layers.batch_normalization(inputs=hist)
        hist = tf.reshape(hist, [-1, hidden_units])
        hist = tf.layers.dense(hist, hidden_units)
        user_emb = hist
        base_i = tf.concat([user_emb, item_emb], axis = -1)
        base_i = tf.layers.batch_normalization(inputs=base_i, name="b1",reuse=tf.AUTO_REUSE)
        d_layer_1_i = tf.layers.dense(base_i, 80, activation=tf.nn.sigmoid, name="f1",reuse=tf.AUTO_REUSE)
        d_layer_2_i = tf.layers.dense(base_i, 40, activation=tf.nn.sigmoid, name="f2",reuse=tf.AUTO_REUSE)
        d_layer_3_i = tf.layers.dense(base_i, 1, activation=None, name="f3",reuse=tf.AUTO_REUSE)
        # 特征平铺
        d_layer_3_i = tf.reshape(d_layer_3_i,[-1])
        self.y_p = i_b + d_layer_3_i
        # 定义一轮的epoch的步数
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        # 定义epoch的步数
        self.global_epoch_step = tf.Variable(0,trainable=False, name="global_epoch_step")
        self.global_epoch_step_op = tf.assign(self.global_epoch_step, 1)
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
            logits = self.y_p,
            labels = self.y
            )
        )
        
        trainable_params = tf.trainable_variables()
        self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
        gradients = tf.gradients(self.loss, trainable_params)
        # 对梯度进行修剪
        clip_gradients, _ = tf.clip_by_global_norm(gradients, 5)
        self.train_op = self.opt.apply_gradients(
            zip(clip_gradients, trainable_params),global_step=self.global_step
        )
        
    def train(self, sess, item, lr):
        loss, _ = sess.run([self.loss, self.train_op], feed_dict={
            self.u: item[0],
            self.i: item[1],
            self.y: item[2],
            self.hist_i: item[3],
            self.sl: item[4],
            self.lr: lr,
        })
        return loss


In [None]:
gpu_options = tf.GPUOptions(allow_growth=True)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    model = Model(user_count,item_count,cate_count,cate_list)
    # 初始化全局变量和局部变量
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    # 需要添加eval()评估函数
    sys.stdout.flush() # 定时输出但是在windows下作用不大
    lr = 1.0
    start_time = time.time()
    for _ in range(50):
        random.shuffle(train_set)
        # 每次迭代epoch次数
        epoch_size = round(len(train_set) / train_batch_size)
        loss_sum = 0.0
        for idx, item in DataInput(train_set, train_batch_size):
            loss = model.train(sess, item, lr)
            loss_sum += loss
            if model.global_step.eval()%1000 == 0:
                print("Epoch:  %d, Global_step:  %d, Train_loss: %.4f"%(
                    model.global_epoch_step.eval(),
                    model.global_step.eval(),
                    loss_sum / 1000
                ))
                sys.stdout.flush()
                loss_sum = 0.0
            if model.global_step.eval() % 336000 == 0:
                lr = 0.0

Epoch:  0, Global_step:  1000, Train_loss: 0.6857
Epoch:  0, Global_step:  2000, Train_loss: 0.6717
Epoch:  0, Global_step:  3000, Train_loss: 0.6661
Epoch:  0, Global_step:  4000, Train_loss: 0.6526
Epoch:  0, Global_step:  5000, Train_loss: 0.6318
Epoch:  0, Global_step:  6000, Train_loss: 0.6086
Epoch:  0, Global_step:  7000, Train_loss: 0.5954
Epoch:  0, Global_step:  8000, Train_loss: 0.5869
Epoch:  0, Global_step:  9000, Train_loss: 0.5798
Epoch:  0, Global_step:  10000, Train_loss: 0.5803
Epoch:  0, Global_step:  11000, Train_loss: 0.5757
Epoch:  0, Global_step:  12000, Train_loss: 0.5760
Epoch:  0, Global_step:  13000, Train_loss: 0.5729
Epoch:  0, Global_step:  14000, Train_loss: 0.5727
Epoch:  0, Global_step:  15000, Train_loss: 0.5709
Epoch:  0, Global_step:  16000, Train_loss: 0.5645
Epoch:  0, Global_step:  17000, Train_loss: 0.5637
Epoch:  0, Global_step:  18000, Train_loss: 0.5660
Epoch:  0, Global_step:  19000, Train_loss: 0.5635
Epoch:  0, Global_step:  20000, Train_lo