In this notebook I will concentrate solely on the model. The training process will be described in a separate notebook

In [1]:
import numpy as np
import tensorflow as tf
import os
import sys
import argparse
import multiprocessing
import heapq
import random as rd
import scipy.sparse as sp

from utility.load_data import Data
from tqdm import tqdm
from time import time

import pdb

os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

I will try to go through their code as it is in their release, so for now, no eager execution

In [2]:
# tf.enable_eager_execution()

In [3]:
# --- Paths / dataset selection ---------------------------------------------
weights_path = ''
data_path = 'Data/'
dataset = 'gowalla'

# --- Pre-training ------------------------------------------------------------
pretrain = 0
pretrain_data = None    # `_init_weights` falls back to Xavier init when this is None

# --- Run settings ------------------------------------------------------------
verbosedt = 1
epoch = 500
batch_size = 1024
lr = 0.01
gpu_id = 0
save_flag = 0
test_flag = 'part'
report = 0
Ks = [20, 40, 60, 80, 100]

# --- Model -------------------------------------------------------------------
emb_dim = 64
layer_size = [64]               # one entry per propagation layer
regs = [1e-5, 1e-5, 1e-2]       # regs[0] is used below as the embedding-decay factor
model_type = 'ngcf'
adj_type = 'norm'
alg_type = 'ngcf'

# --- Dropout -----------------------------------------------------------------
# NOTE: `node_dropout` and `mess_dropout` are rebound to TF placeholders in a later cell.
node_dropout_flag = 0
node_dropout = [0.1]
mess_dropout = [0.1]

In [4]:
# Build the dataset wrapper from the configured location instead of a hard-coded
# path, so changing `data_path` / `dataset` in the config cell actually takes effect
# (with the defaults this is still 'Data/gowalla').
data_generator = Data(path=data_path + dataset, batch_size=batch_size)

# Convenience aliases for the dataset statistics.
USR_NUM, ITEM_NUM = data_generator.n_users, data_generator.n_items
N_TRAIN, N_TEST = data_generator.n_train, data_generator.n_test
BATCH_SIZE = data_generator.batch_size

n_users=29858, n_items=40981
n_interactions=1027370
n_train=810128, n_test=217242, sparsity=0.00084


In [5]:
# Expose only the configured GPU to CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

# Model configuration; the adjacency matrix is added under 'norm_adj' in a later cell.
config = {
    'n_users': data_generator.n_users,
    'n_items': data_generator.n_items,
}

In [6]:
# Fetch the three adjacency-matrix variants exposed by the data generator.
# NOTE: the name `norm_adj` is rebound further down to config['norm_adj'],
# which is built from `mean_adj`, not from the `norm_adj` returned here.
plain_adj, norm_adj, mean_adj = data_generator.get_adj_mat()

already load adj matrix (70839, 70839) 0.22672343254089355


Here we use their default configuration: `Each decay factor between two connected nodes is set as 1/(out degree of the node) and each node is also assigned with 1 for self-connections`

In [7]:
# Add the identity to `mean_adj` so that every node gets a self-connection of weight 1.
# Despite the 'norm_adj' key, the matrix stored here is derived from `mean_adj`.
config['norm_adj'] = mean_adj + sp.eye(mean_adj.shape[0])

In [8]:
# Unpack the configuration into the module-level names the helper functions read.
n_users = config['n_users']
n_items = config['n_items']

# Number of row blocks the adjacency matrix is split into.
n_fold = 100
norm_adj = config['norm_adj']
n_nonzero_elems = norm_adj.count_nonzero()

# One output dimension per propagation layer.
weight_size = layer_size
n_layers = len(weight_size)

# Compose the model identifier. Guarded so that re-running this cell does not
# append the suffix a second time (a bare `+=` is not idempotent).
model_name_suffix = '_%s_%s_l%d' % (adj_type, alg_type, n_layers)
if not model_type.endswith(model_name_suffix):
    model_type += model_name_suffix

# Coefficient applied to the embedding L2 term in `create_bpr_loss`.
decay = regs[0]

In [9]:
# Display the composed model identifier.
model_type

'ngcf_norm_ngcf_l1'

Let's initialise the weights. You will notice that they do not consider the weights of the "graph layers" when loading pre-trained weights...I do not know why that is.

In [10]:
def _init_weights():
    """Create all trainable variables of the model and return them in a dict.

    Reads the module-level names `pretrain_data`, `n_users`, `n_items`,
    `emb_dim`, `weight_size` and `n_layers`.

    Keys of the returned dict:
      * 'user_embedding', 'item_embedding' -- taken from `pretrain_data` when it
        is not None, otherwise Xavier-initialised.
      * 'W_<tag>_<k>', 'b_<tag>_<k>' for tag in ('gc', 'bi', 'mlp') and every
        layer k -- always Xavier-initialised (biases included); pre-trained
        values are never loaded for these.

    Returns:
        dict mapping variable name -> tf.Variable.
    """
    initializer = tf.contrib.layers.xavier_initializer()
    all_weights = {}

    if pretrain_data is not None:
        all_weights['user_embedding'] = tf.Variable(
            initial_value=pretrain_data['user_embed'], trainable=True,
            name='user_embedding', dtype=tf.float32)
        all_weights['item_embedding'] = tf.Variable(
            initial_value=pretrain_data['item_embed'], trainable=True,
            name='item_embedding', dtype=tf.float32)
        print('using pretrained initialization')
    else:
        all_weights['user_embedding'] = tf.Variable(
            initializer([n_users, emb_dim]), name='user_embedding')
        all_weights['item_embedding'] = tf.Variable(
            initializer([n_items, emb_dim]), name='item_embedding')
        print('using xavier initialization')

    # layer_dims[k] -> layer_dims[k + 1] are the input/output widths of layer k;
    # position 0 is the embedding size.
    layer_dims = [emb_dim] + weight_size

    for k in range(n_layers):
        in_dim, out_dim = layer_dims[k], layer_dims[k + 1]
        # Same creation order as before: W then b, for gc, bi, mlp.
        for tag in ('gc', 'bi', 'mlp'):
            w_name = 'W_%s_%d' % (tag, k)
            b_name = 'b_%s_%d' % (tag, k)
            all_weights[w_name] = tf.Variable(initializer([in_dim, out_dim]), name=w_name)
            all_weights[b_name] = tf.Variable(initializer([1, out_dim]), name=b_name)

    return all_weights

In [11]:
# Batch inputs: 1-D integer vectors of arbitrary length.
users = tf.placeholder(tf.int32, shape=(None,))
pos_items = tf.placeholder(tf.int32, shape=(None,))
neg_items = tf.placeholder(tf.int32, shape=(None,))

# Dropout inputs, both 1-D float vectors that the cells below index per layer:
#   * node dropout    -- applied to the sparse adjacency folds; it is the more
#                        expensive of the two, so it is only enabled when
#                        `node_dropout_flag` is set.
#   * message dropout -- applied to the output of the propagation layer.
# NOTE: these rebind `node_dropout` / `mess_dropout`, which the config cell
# defined as plain Python lists.
node_dropout = tf.placeholder(tf.float32, shape=(None,))
mess_dropout = tf.placeholder(tf.float32, shape=(None,))

In [12]:
# Inspect the node-dropout placeholder (a symbolic tensor, not a value).
node_dropout

<tf.Tensor 'Placeholder_3:0' shape=(?,) dtype=float32>

In [13]:
# Create the embedding tables and the per-layer W/b variables.
weights = _init_weights()

W0807 13:56:29.461400 140613546653440 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



using xavier initialization


In [14]:
# List the names of the variables that were created.
weights.keys()

dict_keys(['user_embedding', 'item_embedding', 'W_gc_0', 'b_gc_0', 'W_bi_0', 'b_bi_0', 'W_mlp_0', 'b_mlp_0'])

In [15]:
# Print the static shape of the embedding tables and of each layer-0 weight matrix.
for weight_name in ('user_embedding', 'item_embedding', 'W_gc_0', 'W_bi_0', 'W_mlp_0'):
    print(weights[weight_name].shape)

(29858, 64)
(40981, 64)
(64, 64)
(64, 64)
(64, 64)


In [16]:
def _convert_sp_mat_to_sp_tensor(X):
    """Convert a SciPy sparse matrix into a float32 `tf.SparseTensor`.

    Args:
        X: a SciPy sparse matrix (anything providing `.tocoo()`).

    Returns:
        tf.SparseTensor with the same shape as `X`, one value per stored entry
        of `X`, and an (n_stored, 2) index array of (row, col) pairs.
    """
    coo = X.tocoo().astype(np.float32)
    # Plain int64 ndarray of (row, col) pairs. This replaces
    # `np.mat([...]).transpose()`: `np.mat` is removed in NumPy 2.0 and yields an
    # `np.matrix`, and tf.SparseTensor stores its indices as int64.
    indices = np.column_stack((coo.row, coo.col)).astype(np.int64)
    return tf.SparseTensor(indices, coo.data, coo.shape)

In [17]:
# Unrolled version of `_convert_sp_mat_to_sp_tensor` on a small random matrix.
X = np.random.randint(1, 5, (5, 3))
# Build a real CSR matrix: the variable was named `X_csr` but was created with
# `sp.csc_matrix`, whose COO conversion lists entries in column-major order,
# whereas tf.SparseTensor indices are conventionally sorted row-major.
X_csr = sp.csr_matrix(X)
X_coo = X_csr.tocoo().astype(np.float32)
# (row, col) pairs as a plain int64 ndarray (`np.mat` is removed in NumPy 2.0).
indices = np.column_stack((X_coo.row, X_coo.col)).astype(np.int64)
res = tf.SparseTensor(indices, X_coo.data, X_coo.shape)

In [18]:
# Inspect the resulting SparseTensor object.
res

<tensorflow.python.framework.sparse_tensor.SparseTensor at 0x7fe29463bfd0>

In [19]:
def _dropout_sparse(X, keep_prob, n_nonzero_elems):
    """Apply dropout to the stored values of a sparse tensor.

    Each stored value of `X` is kept with probability `keep_prob`; the kept
    values are rescaled by 1 / keep_prob.

    Args:
        X: tf.SparseTensor to apply dropout to.
        keep_prob: probability of keeping a value (float or scalar tensor).
        n_nonzero_elems: length of the keep/drop mask; it is passed to
            `tf.sparse_retain`, so it has to match the number of values in `X`.

    Returns:
        The retained entries of `X` multiplied by 1 / keep_prob.
    """
    # keep_prob + U[0, 1) lies in [keep_prob, 1 + keep_prob); its floor is 1
    # with probability keep_prob and 0 otherwise.
    shifted_noise = keep_prob + tf.random_uniform([n_nonzero_elems])
    keep_mask = tf.cast(tf.floor(shifted_noise), dtype=tf.bool)

    retained = tf.sparse_retain(X, keep_mask)
    rescale = tf.math.divide(1., keep_prob)
    return retained * rescale

Let's go step by step to understand what the `_dropout_sparse` function does. Let's create a sparse tensor, which is the input that `_dropout_sparse` takes

In [20]:
# 10x10 matrix of zeros with ones at (up to) five random positions;
# repeated coordinates simply land on the same cell.
X = np.zeros((10, 10))

coord = []
for _ in range(5):
    row = np.random.randint(10)
    col = np.random.randint(10)
    coord.append((row, col))

for row, col in coord:
    X[row, col] = 1

In [21]:
# Show the toy matrix.
X

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])

In [22]:
X_sp_mtx = sp.csr_matrix(X)
# Number of *stored* entries: this is the length of the SparseTensor's value
# vector, which is what the dropout mask has to match. `count_nonzero()` would
# skip explicitly stored zeros. NOTE: this rebinds the `n_nonzero_elems` that
# was computed earlier for the full adjacency matrix.
n_nonzero_elems = X_sp_mtx.nnz
X_sp_tsr = _convert_sp_mat_to_sp_tensor(X_sp_mtx)

We now create a tensor with values uniformly randomly distributed between 0 and 1 to which we have added `1 - node_dropout[0] -> the keep_prob` 

In [23]:
# One noise sample per stored entry, shifted by keep_prob = 1 - node_dropout[0].
noise_shape = [n_nonzero_elems]
random_tensor = (1 - node_dropout[0]) + tf.random_uniform(noise_shape)

And now we create a mask with one entry per non-zero element of the sparse matrix (i.e. per connection between two nodes), dropping each of them according to the `node_dropout` param.

In [24]:
# floor(keep_prob + U[0, 1)) is 1 with probability keep_prob and 0 otherwise;
# the cast turns that into one keep/drop flag per stored entry.
dropout_mask = tf.cast(tf.floor(random_tensor), dtype=tf.bool)

And if you run this in eager execution (just don't use `placeholders`) you will see that dropout_mask contains booleans that indicate, for each non-zero element, whether we keep it (`True`) or drop it (`False`)

In [27]:
# In graph mode this only shows the symbolic tensor (shape and dtype), not its values.
dropout_mask

<tf.Tensor 'Cast:0' shape=(5,) dtype=bool>

In [29]:
# Keep only the stored entries whose mask flag is True.
pre_out = tf.sparse_retain(X_sp_tsr, dropout_mask)

W0807 13:58:25.327770 140613546653440 deprecation.py:323] From /home/ubuntu/anaconda3/envs/ngcf/lib/python3.6/site-packages/tensorflow/python/ops/sparse_ops.py:1719: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Let's now split the adjacency matrix into blocks of rows so that it is tractable; nothing major here

In [31]:
def _split_A_hat(X):
    """Split the adjacency matrix row-wise into `n_fold` sparse tensors.

    Reads the module-level `n_users`, `n_items` and `n_fold`. Every fold holds
    (n_users + n_items) // n_fold consecutive rows, except the last one, which
    also takes the remaining rows.

    Args:
        X: SciPy sparse matrix supporting row slicing.

    Returns:
        list of `n_fold` tf.SparseTensor row blocks, in row order.
    """
    n_rows = n_users + n_items
    rows_per_fold = n_rows // n_fold
    last_fold = n_fold - 1

    folds = []
    for fold_idx in range(n_fold):
        lo = fold_idx * rows_per_fold
        hi = n_rows if fold_idx == last_fold else lo + rows_per_fold
        folds.append(_convert_sp_mat_to_sp_tensor(X[lo:hi]))
    return folds

In [32]:
def _split_A_hat_node_dropout(X):
    """Split the adjacency matrix row-wise into `n_fold` sparse tensors with dropout.

    Same row partition as `_split_A_hat` (the last fold takes the remaining
    rows). Each fold is then passed through `_dropout_sparse` with
    keep_prob = 1 - node_dropout[0], which drops individual stored entries.

    Reads the module-level `n_users`, `n_items`, `n_fold` and `node_dropout`.

    Args:
        X: SciPy sparse matrix supporting row slicing.

    Returns:
        list of `n_fold` tf.SparseTensor row blocks, in row order.
    """
    A_fold_hat = []

    n_rows = n_users + n_items
    fold_len = n_rows // n_fold
    for i_fold in range(n_fold):
        start = i_fold * fold_len
        if i_fold == n_fold - 1:
            end = n_rows
        else:
            end = (i_fold + 1) * fold_len

        # Slice once and reuse it for both the tensor and the entry count.
        fold = X[start:end]
        temp = _convert_sp_mat_to_sp_tensor(fold)
        # The dropout mask needs one flag per *stored* entry of the SparseTensor.
        # `.nnz` is exactly that count, whereas `count_nonzero()` skips explicitly
        # stored zeros and would then produce a mask of the wrong length.
        n_stored = fold.nnz
        A_fold_hat.append(_dropout_sparse(temp, 1 - node_dropout[0], n_stored))

    return A_fold_hat

In [33]:
# Node dropout is switched off in this walkthrough (flag is 0).
node_dropout_flag

0

In [34]:
# Build the list of adjacency row blocks, with node dropout only when the flag is set.
A_fold_hat = (_split_A_hat_node_dropout if node_dropout_flag else _split_A_hat)(norm_adj)

In [35]:
# One sparse tensor per fold.
len(A_fold_hat)

100

In [36]:
# Each fold keeps all columns and only a block of rows.
A_fold_hat[0].shape

TensorShape([Dimension(708), Dimension(70839)])

Here they build embeddings based on, or thinking in ego-networks, which, quoting directly [this page](http://www.analytictech.com/networks/egonet.htm): Ego networks consist of a focal node ("ego") and the nodes to whom ego is directly connected to (these are called "alters") plus the ties, if any, among the alters.

In [37]:
# Stack the user table on top of the item table: rows [0, n_users) are users,
# rows [n_users, n_users + n_items) are items.
ego_embeddings = tf.concat([weights['user_embedding'], weights['item_embedding']], axis=0)

In [38]:
# (n_users + n_items, emb_dim)
ego_embeddings.shape

TensorShape([Dimension(70839), Dimension(64)])

For this notebook we only use 1 layer. This is, in their Figure 2 there will be only one `Embedding Propagation Layer`

In [40]:
# Number of propagation layers (length of `weight_size`).
n_layers

1

`A_fold_hat[0]` is a subset of the adjacency matrix, with dimensions `TensorShape([Dimension(708), Dimension(70839)])`. This matrix has non-zero elements in those locations where there are node connections. For example, the first row will correspond to the 1st user that interacted with the 1st 127 items (0 to 126). 
`row_0 (1st element is the user_id) = 0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126`

In [41]:
# Read row 0 straight from the sparse matrix. `norm_adj.todense()` would first
# materialise the full (n_users + n_items)^2 dense matrix just to look at one row.
row_0 = norm_adj.tocsr()[0]
row_0.sort_indices()  # make sure the column indices come out in ascending order
# (row_indices, col_indices) of the non-zero entries, same layout as np.where.
nz_idx = row_0.nonzero()

In [42]:
# Column indices of the non-zero entries in row 0.
nz_idx[1]

array([    0, 29858, 29859, 29860, 29861, 29862, 29863, 29864, 29865,
       29866, 29867, 29868, 29869, 29870, 29871, 29872, 29873, 29874,
       29875, 29876, 29877, 29878, 29879, 29880, 29881, 29882, 29883,
       29884, 29885, 29886, 29887, 29888, 29889, 29890, 29891, 29892,
       29893, 29894, 29895, 29896, 29897, 29898, 29899, 29900, 29901,
       29902, 29903, 29904, 29905, 29906, 29907, 29908, 29909, 29910,
       29911, 29912, 29913, 29914, 29915, 29916, 29917, 29918, 29919,
       29920, 29921, 29922, 29923, 29924, 29925, 29926, 29927, 29928,
       29929, 29930, 29931, 29932, 29933, 29934, 29935, 29936, 29937,
       29938, 29939, 29940, 29941, 29942, 29943, 29944, 29945, 29946,
       29947, 29948, 29949, 29950, 29951, 29952, 29953, 29954, 29955,
       29956, 29957, 29958, 29959, 29960, 29961, 29962, 29963, 29964,
       29965, 29966, 29967, 29968, 29969, 29970, 29971, 29972, 29973,
       29974, 29975, 29976, 29977, 29978, 29979, 29980, 29981, 29982,
       29983, 29984]

Remember we added a diagonal to account for node self-connections. We can see that the location of the 1st non-zero element, apart from that in the diagonal (1,1), is 29859 when counting from 1 (i.e. column index 29858 = `n_users` when counting from 0, which is the column of item 0), so all good. When multiplying that row by the tensor `ego_embeddings` you will get the summation of the embeddings of the user 0 plus the weighted embeddings (remember the matrix is normalised) corresponding to items from 0-126. 

(1,70839) x (70839 x 64) = (1,64)

Let's have a look

In [43]:
# Multiply the first row block of the adjacency matrix by the embedding table.
temp = tf.sparse_tensor_dense_matmul(A_fold_hat[0], ego_embeddings)

In [44]:
# (rows in the fold, emb_dim)
temp.shape

TensorShape([Dimension(708), Dimension(64)])

Now we move down the adjacency matrix to row index 29858; remember that from there onwards it is filled with `R.T`

In [47]:
# First item row of the adjacency matrix (row index n_users, i.e. 29858 for gowalla).
# Read it from the sparse matrix directly rather than densifying the whole
# (n_users + n_items)^2 matrix with `todense()`.
first_item_row = norm_adj.tocsr()[n_users]
first_item_row.sort_indices()  # ascending column order
# (row_indices, col_indices) of the non-zero entries, same layout as np.where.
nz_idx_2 = first_item_row.nonzero()

In [48]:
# Column indices of the non-zero entries in that row.
nz_idx_2[1]

array([    0,   140,   241,  1056,  1445,  2539,  3216,  3403,  5216,
        9443, 24111, 26313, 29184, 29858])

These are the users that interacted with item 0, so when we multiply the corresponding A_fold_hat with  ego_embeddings we get the summation of the embeddings for those users plus the embedding for item 0

In [49]:
# One (fold rows x emb_dim) product per adjacency row block, in fold order.
temp_embed = [
    tf.sparse_tensor_dense_matmul(fold, ego_embeddings)
    for fold in A_fold_hat[:n_fold]
]

In [50]:
# Stitch the per-fold products back together along the row axis.
side_embeddings = tf.concat(temp_embed, 0)

In [51]:
# Same shape as `ego_embeddings`.
side_embeddings.shape

TensorShape([Dimension(70839), Dimension(64)])

The 1st row of this matrix contains the weighted sum of all the item embeddings that user_id 0 interacted with plus the embeddings of that user, while row # n_users+1 contains the weighted sum of all user embeddings that interacted with item_id 0 plus the embeddings of that item.  

At this stage we have multiplied all embeddings by the adjacency matrix 

In [52]:
# Layer index used by the cells below (there is a single propagation layer).
k=0

In [53]:
# (input width, output width) of the layer-0 weight matrix.
weights['W_gc_0'].shape

TensorShape([Dimension(64), Dimension(64)])

This below corresponds to the 1st term in their expression (3). 

In [54]:
# Linear transform (W_gc, b_gc) of the aggregated embeddings, followed by LeakyReLU.
# b_gc has shape (1, out_dim) and is broadcast over the rows.
sum_embeddings = tf.nn.leaky_relu(
    tf.matmul(side_embeddings, weights['W_gc_%d' % k]) + weights['b_gc_%d' % k])

In [55]:
# (n_users + n_items, out_dim)
sum_embeddings.shape

TensorShape([Dimension(70839), Dimension(64)])

Remember, ego_embeddings is simply the concatenation (over rows) of the user and item embeddings and  side_embeddings contains the weighted sum of the item embeddings that a certain user interacted with (plus the embeddings of that user) and the weighted sum of the user embeddings that an item "interacted" with (plus the embeddings of that item). 

We now move to the second term of their expression (3). They call `bi_embeddings` to the element-wise multiplication of the two tensors (and then we apply the same operation as before). In their own words: "we additionally
encode the interaction between ei and eu into the message being passed via ei ⊙ eu , where ⊙ denotes the element-wise product. This makes the message dependent on the affinity between ei and eu , e.g., passing more messages from the similar items. This not only increases the model representation ability, but also boosts the performance for recommendation "

In [56]:
# bi messages of neighbors.
bi_embeddings = tf.multiply(ego_embeddings, side_embeddings)
# transformed bi messages of neighbors.
bi_embeddings = tf.nn.leaky_relu(
    tf.matmul(bi_embeddings, weights['W_bi_%d' % k]) + weights['b_bi_%d' % k])

In [57]:
# (n_users + n_items, out_dim)
bi_embeddings.shape

TensorShape([Dimension(70839), Dimension(64)])

Their expression (3)

In [59]:
# Output of the propagation layer: sum of the two transformed terms.
# NOTE: this rebinds `ego_embeddings`; from here on the name no longer refers to
# the initial concatenation of the user and item embedding tables.
ego_embeddings = sum_embeddings + bi_embeddings

In [60]:
# message dropout.
ego_embeddings = tf.nn.dropout(ego_embeddings, 1 - mess_dropout[k])

W0807 14:23:39.308569 140613546653440 deprecation.py:506] From <ipython-input-60-c7abd9f97775>:2: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Here they add an extra normalization that I can't find in the paper, since the Laplacian norm that they refer to is already applied when we built the Adjacency Matrix...

In [63]:
# normalize the distribution of embeddings.
norm_embeddings = tf.math.l2_normalize(ego_embeddings, axis=1)

In [66]:
# Normalisation does not change the shape.
norm_embeddings.shape

TensorShape([Dimension(70839), Dimension(64)])

In the paper they show how they concatenate the output from all the `Embedding Propagation Layer` + the so called `ego_embeddings`. This would be done recursively if n_layers would be > 1 

In [67]:
# The concatenation is meant to hold the *initial* (layer-0) embeddings followed
# by the normalised output of every propagation layer. By this point
# `ego_embeddings` has been overwritten with the layer-1 output, so the layer-0
# embeddings are rebuilt from the embedding tables instead of reusing that name.
layer0_embeddings = tf.concat([weights['user_embedding'], weights['item_embedding']], axis=0)
all_embeddings = [layer0_embeddings] + [norm_embeddings]

In [68]:
# Concatenate the list entries along the feature axis.
# NOTE: `all_embeddings` is rebound from a Python list to a single tensor.
all_embeddings = tf.concat(all_embeddings, 1)

In [69]:
# The feature width is the sum of the widths of the concatenated pieces.
all_embeddings.shape

TensorShape([Dimension(70839), Dimension(128)])

In [70]:
# Undo the row-wise stacking: the first n_users rows are users, the remaining n_items rows are items.
u_g_embeddings, i_g_embeddings = tf.split(all_embeddings, [n_users, n_items], 0)

In [72]:
# Two separate statements: joining the calls with a comma built the tuple
# (None, None) from print's return values and echoed it as the cell output.
print(u_g_embeddings.shape)
print(i_g_embeddings.shape)

(29858, 128)
(40981, 128)


(None, None)

And that's it, that is the model. Nonetheless, and even though we will refer to this function in the corresponding training notebook, let's describe here the pairwise Bayesian Personalized Ranking (BPR) loss as implemented in their paper. This loss considers the relative order between observed and unobserved user-item interactions. Specifically, BPR assumes that the observed interactions, which are more reflective of a user’s preferences, should be assigned higher prediction values than unobserved ones. Let's see it in code.

In [73]:
def create_bpr_loss(users, pos_items, neg_items):
    """Pairwise BPR loss on a batch of user / positive-item / negative-item embeddings.

    Reads the module-level `batch_size` and `decay`.

    Args:
        users: 2-D tensor of user embeddings, one row per sample.
        pos_items: tensor of the same shape with the positive-item embeddings.
        neg_items: tensor of the same shape with the negative-item embeddings.

    Returns:
        (mf_loss, emb_loss, reg_loss):
          * mf_loss  -- mean over the batch of -log(sigmoid(pos_score - neg_score)).
          * emb_loss -- decay * (L2 loss of the three inputs) / batch_size. Note that
            this divides by the configured global `batch_size`, not by the number
            of rows actually present in the inputs.
          * reg_loss -- constant 0.0 of shape [1]; it contributes nothing to the
            loss but gives the summed loss shape (1,).
    """
    # Score of a (user, item) pair is the inner product of their embeddings.
    pos_scores = tf.reduce_sum(tf.multiply(users, pos_items), axis=1)
    neg_scores = tf.reduce_sum(tf.multiply(users, neg_items), axis=1)

    # Regularisation term on the embeddings used in this batch.
    regularizer = tf.nn.l2_loss(users) + tf.nn.l2_loss(pos_items) + tf.nn.l2_loss(neg_items)
    regularizer = regularizer / batch_size
    emb_loss = decay * regularizer

    # First term of their expression 11. -log(sigmoid(x)) == softplus(-x); the
    # softplus form is numerically stable, whereas log(sigmoid(x)) becomes
    # log(0) = -inf once sigmoid underflows for strongly negative x.
    mf_loss = tf.reduce_mean(tf.nn.softplus(-(pos_scores - neg_scores)))

    # Zero-valued third term, kept so the function still returns three losses.
    reg_loss = tf.constant(0.0, tf.float32, [1])

    return mf_loss, emb_loss, reg_loss

In [74]:
# Gather the embedding rows for the users / positive items / negative items of the batch.
# NOTE(review): `u_g_embeddings` is rebound from the full user table to the
# batch lookup, so this cell is not idempotent -- re-running it would index into
# the previous lookup result rather than the full table.
u_g_embeddings = tf.nn.embedding_lookup(tf.identity(u_g_embeddings), users)
pos_i_g_embeddings = tf.nn.embedding_lookup(i_g_embeddings, pos_items)
neg_i_g_embeddings = tf.nn.embedding_lookup(i_g_embeddings, neg_items)

In [75]:
# Build the three loss terms from the batch embeddings.
mf_loss, emb_loss, reg_loss = create_bpr_loss(u_g_embeddings,pos_i_g_embeddings,neg_i_g_embeddings)
# Total loss; it has shape (1,) because `reg_loss` is created with shape [1].
loss = mf_loss + emb_loss + reg_loss
# Adam training op minimising the total loss with learning rate `lr`.
opt = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

In [76]:
# Symbolic loss tensor (no value until it is run in a session).
loss

<tf.Tensor 'add_8:0' shape=(1,) dtype=float32>