# Data Processing

In [None]:
import numpy as np

csv = np.loadtxt('./ratings_Video_Games.csv', dtype={'names': ('user', 'item', 'rating', 'time'),
                                                        'formats': ('S14', 'S10', 'f', 'i')}, delimiter=',')

In [25]:
csv = np.array(csv)

In [32]:
print(csv.shape, csv[0])

(1324753,) (b'AB9S9279OZ3QO', b'0078764343', 5., 1373155200)


In [117]:
from collections import defaultdict

class StringMap:
    def __init__(self):
        self.mi = {}
        self.ms = {}
        self.counter = 0
        
    def s2i(self, s):
        if not s in self.ms:
            self.mi[self.counter] = s
            self.ms[s] = self.counter
            self.counter += 1
        return self.ms[s]
    
    def i2s(self, i):
        return self.mi[i]

def csv_to_time_series(csv):
    user_count = defaultdict(int)
    item_count = defaultdict(int)
    for i in range(csv.shape[0]):
        line = csv[i]
        user_count[line[0]] += 1
        item_count[line[1]] += 1

    user = StringMap()
    item = StringMap()

    useritem = {}
    for i in range(csv.shape[0]):
        line = csv[i]
        userid = user.s2i(line[0])
        itemid = item.s2i(line[1])
        ts = line[3]
        if userid not in useritem:
            useritem[userid] = []
        useritem[userid].append((ts, itemid))
        
    return user, item, useritem
    

In [120]:
def time_series_to_triple(user_map, item_map, user_item):
    test_set = []
    validation_set = []
    training_set = []
    for x in useritem:
        useritem[x].sort()
        if len(useritem[x]) < 4:
            continue
        test_set.append((x, useritem[x][-2][1], useritem[x][-1][1]))
        validation_set.append((x, useritem[x][-3][1], useritem[x][-2][1]))
        for i in range(len(useritem[x]) - 3):
            training_set.append((x, useritem[x][i][1], useritem[x][i+1][1]))

    final_user_map = StringMap()
    final_item_map = StringMap()

    def finale_map(old_user_map, old_item_map, user_map, item_map, data_set):
        result = []
        for u, i, j in data_set:
            userid = user_map.s2i(old_user_map.i2s(u))
            iid = item_map.s2i(old_item_map.i2s(i))
            jid = item_map.s2i(old_item_map.i2s(j))
            result.append((userid, iid, jid))
        return result

    training_data = np.array(finale_map(user_map, item_map, final_user_map, final_item_map, training_set))
    validation_data = np.array(finale_map(user_map, item_map, final_user_map, final_item_map, validation_set))
    test_data = np.array(finale_map(user_map, item_map, final_user_map, final_item_map, test_set))
    return final_user_map, final_item_map, training_data, validation_data, test_data
        

In [121]:
user_map, item_map, user_item = csv_to_time_series(csv)
user_map, item_map, training_data, validation_data, test_data = time_series_to_triple(user_map, item_map, user_item)


In [122]:
print(training_data)

[[    0     0     1]
 [    0     1     2]
 [    0     2     3]
 ...
 [47916 29293 32176]
 [47916 32176 32166]
 [47917 32418 32419]]


In [123]:
n_item = item_map.counter
n_user = user_map.counter
n_record = training_data.shape[0]
j_neg = np.random.randint(0, n_item, (n_record, 1))
training_data = np.concatenate((training_data, j_neg), axis=1)
np.random.shuffle(training_data)
print(training_data)
print(n_record, n_item, n_user)

[[  297  3412  3413 26185]
 [ 7503  4548  4425  9067]
 [36825  9021   409 35279]
 ...
 [  699  3020  4064 11014]
 [ 8290  5790  6989 21421]
 [23269 10402  2845 19084]]
223813 36303 47918


In [124]:
import sys
print(sys.version)
import tensorflow as tf
print(tf.__version__)
from tqdm import tnrange

3.6.4 |Anaconda, Inc.| (default, Mar 12 2018, 20:05:31) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
1.5.0


# Trans-E Model for Recommendation

In [125]:
class TransE(object):
    def __init__(self, n_item, n_user, embedding_dim, batch_size, beta, learning_rate):
        self.n_item = n_item
        self.n_user = n_user
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.beta = beta
        self.learning_rate = learning_rate
        
        self.global_step = tf.Variable(initial_value=0, trainable=False, name='global_step')
        self.quadruple = tf.placeholder(tf.int32, [None, 4])
        
        with tf.variable_scope("embedding"):
            self.item_embedding = tf.get_variable(name = "item_embedding", shape = [self.n_item, self.embedding_dim], initializer = tf.contrib.layers.xavier_initializer(uniform = False))
            self.user_embedding = tf.get_variable(name = "user_embedding", shape = [self.n_user, self.embedding_dim], initializer = tf.zeros_initializer())
            self.item_bias = tf.get_variable(name = "item_bias", shape = [self.n_item], initializer = tf.zeros_initializer())

    def build_graph(self):
        # Normalize item embeddings into \Omega space
        with tf.name_scope('normalization'):
            self.item_embedding = tf.nn.l2_normalize(self.item_embedding, dim=1)
            
        with tf.name_scope('training'):
            dist_positive, dist_negative, bias_positive, bias_negative = self.inference(self.quadruple)
            self.loss = self.loss_function(dist_positive, dist_negative, bias_positive, bias_negative)
            
            #tf.summary.scalar(name=self.loss.op.name, tensor=self.loss)
            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)
            #self.merge = tf.summary.merge_all()
            
    def loss_function(self, dist_positive, dist_negative, bias_positive, bias_negative):
        with tf.name_scope('loss_function'):
            # L2 distance
            prob_positive = bias_positive - tf.reduce_sum(dist_positive ** 2, axis=1)
            prob_negative = bias_negative - tf.reduce_sum(dist_negative ** 2, axis=1)
            
            # L2 regularization
            regularizer = tf.nn.l2_loss(self.item_embedding) + tf.nn.l2_loss(self.user_embedding)
            
            # S-BPR loss
            loss = -tf.reduce_sum(tf.log(tf.nn.sigmoid(prob_positive - prob_negative)), name='SBPR_loss') + self.beta * regularizer
        return loss
    
    # Modify this method to apply Trans-E
    def inference(self, quadruple):
        # quadruple (user, i, j, j')
        with tf.name_scope('embedding_lookup'):
            i = tf.nn.embedding_lookup(self.item_embedding, quadruple[:, 1])
            j = tf.nn.embedding_lookup(self.item_embedding, quadruple[:, 2])
            j_neg = tf.nn.embedding_lookup(self.item_embedding, quadruple[:, 3])
            u = tf.nn.embedding_lookup(self.user_embedding, quadruple[:, 0])
            
            bias_positive = tf.nn.embedding_lookup(self.item_bias, quadruple[:, 2])
            bias_negative = tf.nn.embedding_lookup(self.item_bias, quadruple[:, 3])
            
        with tf.name_scope('link'):
            dist_positive = i + u - j # -> 0
            dist_negative = i + u - j_neg # -> +oo
        return dist_positive, dist_negative, bias_positive, bias_negative
    
    def train(self, training_data, n_epochs, session, summary_writer):
        for epoch in tnrange(n_epochs):
            epoch_loss = 0
            n_batch = training_data.shape[0] // self.batch_size
            t = tnrange(n_batch, leave=False)
            for i in t:
                quadruple = training_data[i : i+self.batch_size]
                batch_loss, _ = session.run(fetches=[self.loss, self.train_op],
                                                     feed_dict={self.quadruple: quadruple})
                # summary_writer.add_summary(summary, global_step=self.global_step.eval(session=session))
                epoch_loss += batch_loss
                t.set_description("%.3f" % batch_loss)
                t.refresh()

In [130]:
tf.reset_default_graph()
model = TransE(n_item=n_item,
       n_user=n_user,
       batch_size=50,
       learning_rate=1e-5,
       beta=1e-7,
       embedding_dim=20)
model.build_graph()


In [132]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
model.train(training_data, 10, sess, None)


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4476), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4476), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4476), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4476), HTML(value='')))

KeyboardInterrupt: 

# Trans-H Model

In [135]:
class TransH(object):
    def __init__(self, n_item, n_user, embedding_dim, batch_size, beta, learning_rate):
        self.n_item = n_item
        self.n_user = n_user
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.beta = beta
        self.learning_rate = learning_rate
        
        self.global_step = tf.Variable(initial_value=0, trainable=False, name='global_step')
        self.quadruple = tf.placeholder(tf.int32, [None, 4])
        
        with tf.variable_scope("embedding"):
            self.item_embedding = tf.get_variable(name = "item_embedding", shape = [self.n_item, self.embedding_dim], initializer = tf.contrib.layers.xavier_initializer(uniform = False))
            self.user_embedding = tf.get_variable(name = "user_embedding", shape = [self.n_user, self.embedding_dim], initializer = tf.contrib.layers.xavier_initializer(uniform = False))
            self.normal_vector = tf.get_variable(name = "normal_vector", shape = [self.n_user, self.embedding_dim], initializer = tf.contrib.layers.xavier_initializer(uniform = False))
            self.item_bias = tf.get_variable(name = "item_bias", shape = [self.n_item], initializer = tf.zeros_initializer())

    def build_graph(self):
        # Normalize item embeddings into \Omega space
        with tf.name_scope('normalization'):
            self.item_embedding = tf.nn.l2_normalize(self.item_embedding, dim=1)
            self.normal_vector = tf.nn.l2_normalize(self.normal_vector, dim=1)
            
        with tf.name_scope('training'):
            dist_positive, dist_negative, bias_positive, bias_negative = self.inference(self.quadruple)
            self.loss = self.loss_function(dist_positive, dist_negative, bias_positive, bias_negative)
            
            #tf.summary.scalar(name=self.loss.op.name, tensor=self.loss)
            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)
            #self.merge = tf.summary.merge_all()
            
    def loss_function(self, dist_positive, dist_negative, bias_positive, bias_negative):
        with tf.name_scope('loss_function'):
            # L2 distance
            prob_positive = bias_positive - tf.reduce_sum(dist_positive ** 2, axis=1)
            prob_negative = bias_negative - tf.reduce_sum(dist_negative ** 2, axis=1)
            
            # L2 regularization
            regularizer = tf.nn.l2_loss(self.item_embedding) + tf.nn.l2_loss(self.user_embedding)
            
            # S-BPR loss
            loss = -tf.reduce_sum(tf.log(tf.nn.sigmoid(prob_positive - prob_negative)), name='SBPR_loss') + self.beta * regularizer
        return loss
    
    # Modify this method to apply Trans-E
    def inference(self, quadruple):
        # quadruple (user, i, j, j')
        def projection(embedded, norm):
            return embedded - tf.reduce_sum(embedded * norm, 1, keepdims = True) * norm
        
        with tf.name_scope('embedding_lookup'):
            i = tf.nn.embedding_lookup(self.item_embedding, quadruple[:, 1])
            j = tf.nn.embedding_lookup(self.item_embedding, quadruple[:, 2])
            j_neg = tf.nn.embedding_lookup(self.item_embedding, quadruple[:, 3])
            u = tf.nn.embedding_lookup(self.user_embedding, quadruple[:, 0])
            norm = tf.nn.embedding_lookup(self.normal_vector, quadruple[:, 0])
            
            i, j, j_neg = projection(i, norm), projection(j, norm), projection(j_neg, norm)
            
            bias_positive = tf.nn.embedding_lookup(self.item_bias, quadruple[:, 2])
            bias_negative = tf.nn.embedding_lookup(self.item_bias, quadruple[:, 3])
            
        with tf.name_scope('link'):
            dist_positive = i + u - j # -> 0
            dist_negative = i + u - j_neg # -> +oo
        return dist_positive, dist_negative, bias_positive, bias_negative
    
    def train(self, training_data, n_epochs, session, summary_writer):
        for epoch in tnrange(n_epochs):
            epoch_loss = 0
            n_batch = training_data.shape[0] // self.batch_size
            t = tnrange(n_batch, leave=False)
            for i in t:
                quadruple = training_data[i : i+self.batch_size]
                batch_loss, _ = session.run(fetches=[self.loss, self.train_op],
                                                     feed_dict={self.quadruple: quadruple})
                # summary_writer.add_summary(summary, global_step=self.global_step.eval(session=session))
                epoch_loss += batch_loss
                t.set_description("%.3f" % batch_loss)
                t.refresh()

In [136]:
tf.reset_default_graph()
model = TransH(n_item=n_item,
       n_user=n_user,
       batch_size=50,
       learning_rate=1e-5,
       beta=1e-7,
       embedding_dim=20)
model.build_graph()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
model.train(training_data, 10, sess, None)


Instructions for updating:
keep_dims is deprecated, use keepdims instead


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4476), HTML(value='')))

KeyboardInterrupt: 

# Trans-R Model