In [1]:
import numpy as np
import random as rd
import scipy.sparse as sp
import pdb

from pathlib import Path
from time import time
from tqdm import tqdm

from joblib import Parallel, delayed

In [2]:
# Mini-batch size used when sampling users, and location of the gowalla data.
batch_size = 32
path = Path("/home/ubuntu/projects/neural_graph_cf/Data/gowalla")

# Interaction files; the cells below parse each line as "<uid> <item> <item> ...".
train_file, test_file = (path.joinpath(fname) for fname in ('train.txt', 'test.txt'))

Users and items are numbered from 0 to (n_users-1) and (n_items-1) respectively, so let's count them.

In [3]:
# Get the number of users and items, plus the number of train/test interactions.
n_users, n_items = 0, 0
n_train, n_test = 0, 0

# user ids in the order they appear in the training file
exist_users = []
with open(train_file) as f:
    for line in f:
        line = line.strip('\n')
        # lines read from a file keep their newline, so emptiness can only be
        # checked after stripping it; blank lines are skipped
        if not line:
            continue
        tokens = line.split(' ')
        # first element is the user_id, then items
        uid = int(tokens[0])
        items = [int(i) for i in tokens[1:]]
        exist_users.append(uid)
        n_users = max(n_users, uid)
        # a user without items must not break max()
        n_items = max(n_items, max(items, default=n_items))
        n_train += len(items)

# same as before but for testing (only items and interactions are counted)
with open(test_file) as f:
    for line in f:
        line = line.strip('\n')
        if not line:
            continue
        try:
            items = [int(i) for i in line.split(' ')[1:]]
        except ValueError:
            # unparsable line (e.g. a user id followed by a trailing space)
            continue
        n_items = max(n_items, max(items, default=n_items))
        n_test += len(items)

# ids start at 0, so the counts are "largest id + 1"
n_items += 1
n_users += 1

In [4]:
# number of items, then number of users (largest id seen + 1 in each case)
print(n_items, n_users)

40981 29858


We will build the interactions/ratings matrix

In [6]:
# Interaction matrix R (n_users x n_items) and per-user train/test item lists.
R = sp.dok_matrix((n_users, n_items), dtype=np.float32)
train_set, test_set = {}, {}

with open(train_file) as f_train:
    for line in f_train:
        line = line.strip('\n')
        # skip blank lines instead of failing on int('')
        if not line:
            continue
        # first token is the user id, the remaining ones are item ids
        uid, *train_items = (int(tok) for tok in line.split(' '))
        # simply 1 if user interacted with item, otherwise, 0.
        for i in train_items:
            R[uid, i] = 1.
        train_set[uid] = train_items

with open(test_file) as f_test:
    for line in f_test:
        try:
            uid, *test_items = (int(tok) for tok in line.strip('\n').split(' '))
        except ValueError:
            # blank or unparsable line (e.g. a user id followed by a trailing space)
            continue
        test_set[uid] = test_items

In [7]:
# summary of the sparse interaction matrix (shape, dtype, stored elements)
R

<29858x40981 sparse matrix of type '<class 'numpy.float32'>'
	with 810128 stored elements in Dictionary Of Keys format>

In [9]:
# first ten train items and first ten test items of user 0
print(train_set[0][:10])
print(test_set[0][:10])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[7580, 3730, 5983, 5990, 7608, 1213, 6017, 7510, 7513, 8343]


They use a number of different adjacency matrices, see [here](https://github.com/xiangwang1223/neural_graph_collaborative_filtering): 

In [11]:
def normalized_adj_single(adj):
    """Row-normalise a sparse adjacency matrix, i.e. compute ``D^-1 A``.

    Every row of ``adj`` is divided by its row sum (the out-degree of the
    node). Rows whose sum is 0 (nodes without connections) are left as zeros.

    Parameters
    ----------
    adj : scipy sparse matrix
        Adjacency matrix in any sparse format. Integer matrices are accepted;
        their normalised version is floating point.

    Returns
    -------
    The row-normalised matrix in COO format.
    """
    # rowsum = out-degree of the node
    rowsum = np.asarray(adj.sum(axis=1)).ravel()
    # keep the floating precision of the input; integer degrees need a float
    # container because their reciprocals are fractional
    inv_dtype = rowsum.dtype if np.issubdtype(rowsum.dtype, np.floating) else np.float64
    # invert only the non-zero degrees: this yields 0 for isolated nodes
    # without ever dividing by zero (no RuntimeWarning, no inf clean-up)
    d_inv = np.zeros(rowsum.shape, dtype=inv_dtype)
    connected = rowsum != 0
    d_inv[connected] = 1.0 / rowsum[connected]
    # sparse diagonal matrix with the normalizing factors in the diagonal
    d_mat_inv = sp.diags(d_inv)
    # dot product resulting in a row-normalised version of the input matrix
    norm_adj = d_mat_inv.dot(adj)
    return norm_adj.tocoo()

The following function is used to check the expression 8 in their paper

In [12]:
def check_adj_if_equal(adj):
    """Dense reference computation of the row-normalised adjacency ``D^-1 A``.

    Meant for checking the sparse normalisation on small matrices: the whole
    matrix is densified, so memory grows quadratically with the node count.

    Parameters
    ----------
    adj : scipy sparse matrix
        Adjacency matrix.

    Returns
    -------
    numpy.ndarray
        Dense matrix where every row of ``adj`` is divided by its row sum.
        Rows that sum to 0 stay all-zero, the same convention used by
        ``normalized_adj_single``, instead of turning into NaN.
    """
    dense_A = np.array(adj.todense())
    degree = np.sum(dense_A, axis=1, keepdims=False)
    # reciprocal of the non-zero degrees only; isolated nodes get a factor of 0
    inv_dtype = degree.dtype if np.issubdtype(degree.dtype, np.floating) else np.float64
    d_inv = np.zeros(degree.shape, dtype=inv_dtype)
    connected = degree != 0
    d_inv[connected] = 1.0 / degree[connected]
    # scaling row i by d_inv[i] equals diag(d_inv) @ dense_A without having to
    # materialise a second dense square matrix
    return d_inv[:, np.newaxis] * dense_A

In [13]:
# Square adjacency matrix over users followed by items (their A matrix in
# expression 8):
#     [[ 0,   R ],
#      [ R.T, 0 ]]
n_nodes = n_users + n_items
interactions = R.tolil()

# blocks are assigned in LIL format, then the result is converted back to DOK
adj_mat = sp.dok_matrix((n_nodes, n_nodes), dtype=np.float32).tolil()
adj_mat[:n_users, n_users:] = interactions
adj_mat[n_users:, :n_users] = interactions.T
adj_mat = adj_mat.todok()

In [14]:
# summary of the (n_users + n_items) x (n_users + n_items) adjacency matrix
adj_mat

<70839x70839 sparse matrix of type '<class 'numpy.float32'>'
	with 1620256 stored elements in Dictionary Of Keys format>

Along with the "normal" adjacency matrix, we generate two additional ones:

`norm_adj_mat`: each decay factor between two connected nodes is set as 1/(out degree of the node + self-connection)
`mean_adj_mat`: each decay factor between two connected nodes is set as 1/(out degree of the node)

Eventually a fourth one will also be used, which will be
`mean_adj_mat + sp.eye(mean_adj_mat.shape[0])`: each decay factor between two connected nodes is set as 1/(out degree of the node) and each node is also assigned 1 for its self-connection.

In [15]:
# mean_adj_mat: row-normalised adjacency matrix
mean_adj_mat = normalized_adj_single(adj_mat)

# norm_adj_mat: same normalisation after adding a self-connection to every node
self_connections = sp.eye(adj_mat.shape[0])
norm_adj_mat = normalized_adj_single(adj_mat + self_connections)

Let's have a look at the 1st row and search for non-zero elements

In [20]:
# (row indices, column indices) of the non-zero entries in the first row of
# the adjacency matrix, i.e. the row of user 0
uid0_nonzero = np.where(adj_mat[0].todense())

In [21]:
# row indices are all 0 (a single row was sliced); column indices start at n_users
uid0_nonzero

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([29858, 29859, 29860, 29861, 29862, 29863, 29864, 29865, 29866,
        29867, 29868, 29869, 29870, 29871, 29872, 29873, 29874, 29875,
        29876, 29877, 29878, 29879, 29880, 29881, 29882, 29883, 29884,
        29885, 29886, 29887, 29888, 29889, 29890, 29891, 29892, 29893,
        29894, 29895, 29896, 29897, 29898, 29899, 29900, 29901, 29902,
        29903, 29904, 29905, 29906, 29907, 29908, 29909, 29910, 29911,
        29912, 29913, 29914, 29915, 29916, 29917, 29918, 29919, 29920,
        29921, 29922, 29923, 29924, 29925, 29926, 29927, 29928, 29929,
 

Let's check the training data for the 1st user (id=0)

In [19]:
# every training item of user 0
print(train_set[0])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126]


In [23]:
# sanity check: user 0 has as many non-zero adjacency entries as training items
len(uid0_nonzero[0]) == len(train_set[0])

True

We see that the 1st non-zero element is at position 29858, which is equal to the number of users. From there onwards the non-zero elements are ordered consecutively through the next 126 positions. Note that if we included self-connections, the elements in the diagonal would also be different from 0. 

Let's now create "negative pools", simply collections of 100 items that users never interacted with

In [25]:
# "Negative pools": for every user, 100 items drawn (with replacement, the
# default of np.random.choice) among the items absent from the user's train set.
all_items = set(range(n_items))  # loop-invariant: built once, not once per user
neg_pools = {}
for u in tqdm(train_set.keys()):
    neg_items = list(all_items - set(train_set[u]))
    pools = np.random.choice(neg_items, 100)
    neg_pools[u] = pools

100%|██████████| 29858/29858 [02:40<00:00, 185.52it/s]


The following functions sample positive and negative items either directly from the dataset, or from the previously generated "negative pools"

In [26]:
def sample_pos_items_for_u(u, num):
    """Sample ``num`` distinct items that user ``u`` interacted with in training.

    Items are drawn uniformly from ``train_set[u]`` by rejection sampling:
    an item already present in the batch is discarded and a new one is drawn.

    Parameters
    ----------
    u : key of ``train_set`` (a user id)
    num : int
        Number of distinct positive items to return.

    Returns
    -------
    list
        ``num`` distinct items taken from ``train_set[u]``.

    Raises
    ------
    ValueError
        If ``num`` is negative or larger than the number of distinct items of
        the user; the rejection loop could never terminate in those cases.
    """
    pos_items = train_set[u]
    n_distinct = len(set(pos_items))
    if not 0 <= num <= n_distinct:
        raise ValueError(
            'cannot sample %s distinct positive items for user %s: only %d available'
            % (num, u, n_distinct))

    n_pos_items = len(pos_items)
    pos_batch = []
    while len(pos_batch) < num:
        pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0]
        pos_i_id = pos_items[pos_id]

        # reject duplicates so the batch only holds distinct items
        if pos_i_id not in pos_batch:
            pos_batch.append(pos_i_id)
    return pos_batch

In [27]:
def sample_neg_items_for_u(u, num):
    """Sample ``num`` distinct items that user ``u`` never interacted with.

    Item ids are drawn uniformly from ``[0, n_items)`` by rejection sampling:
    ids found in ``train_set[u]`` or already drawn are discarded.

    Parameters
    ----------
    u : key of ``train_set`` (a user id)
    num : int
        Number of distinct negative items to return.

    Returns
    -------
    list
        ``num`` distinct item ids, none of them in ``train_set[u]``.

    Raises
    ------
    ValueError
        If ``num`` is negative or larger than the number of items the user has
        not interacted with; the rejection loop could never terminate then.
    """
    # set membership avoids re-scanning the user's item list on every draw
    seen = set(train_set[u])
    n_available = n_items - sum(1 for i in seen if 0 <= i < n_items)
    if not 0 <= num <= n_available:
        raise ValueError(
            'cannot sample %s distinct negative items for user %s: only %d available'
            % (num, u, n_available))

    neg_items = []
    while len(neg_items) < num:
        neg_id = np.random.randint(low=0, high=n_items, size=1)[0]
        if neg_id not in seen and neg_id not in neg_items:
            neg_items.append(neg_id)
    return neg_items

In [28]:
def sample_neg_items_for_u_from_pools(u, num):
    """Sample ``num`` entries, without replacement, from the negative pool of ``u``.

    The reference implementation removes ``train_set[u]`` from the pool before
    sampling. That step is redundant here: the pools are built as
        neg_items = list(set(range(n_items)) - set(train_set[u]))
        pools = np.random.choice(neg_items, 100)
        neg_pools[u] = pools
    so no training item of ``u`` can ever be in ``neg_pools[u]``.

    Parameters
    ----------
    u : key of ``neg_pools`` (a user id)
    num : int
        Number of entries to draw; must not exceed the pool size.

    Returns
    -------
    list
        ``num`` entries of ``neg_pools[u]`` taken from distinct positions.
    """
    # random.sample only accepts sequences and the pools are numpy arrays,
    # hence the conversion to a list
    return rd.sample(list(neg_pools[u]), num)

In [29]:
# I'd say something wild need to happen for this first condition to be true...
if batch_size <= n_users:
    users = rd.sample(exist_users, batch_size)
else:
    # I prefer:
    # users = np.random.choice(exist_users, batch_size, replace=False)
    # This allows for user repetition. Is still ok, since likely it will 
    # appear with different items after sample_pos_items_for_u(u, 1), but 
    # still...Maybe is intentional
    users = [rd.choice(exist_users) for _ in range(batch_size)]

pos_items, neg_items = [], []
for u in users:
    pos_items += sample_pos_items_for_u(u, 1)
    neg_items += sample_neg_items_for_u(u, 1)

In [33]:
# one statement per print: a tuple of print() calls makes the cell also
# display a meaningless (None, None, None) result
print(pos_items)
print(neg_items)
print(users)

[32095, 13531, 38977, 6661, 26459, 12706, 23297, 40753, 15854, 372, 28526, 6562, 3211, 30712, 10119, 19004, 21719, 8052, 26495, 26569, 18559, 15043, 7666, 7383, 11197, 16875, 22241, 32487, 1619, 26453, 36401, 4924]
[18323, 23035, 13776, 2253, 14696, 2403, 35035, 18789, 39980, 21736, 38653, 37400, 14521, 3717, 21929, 10842, 4253, 26933, 26023, 9167, 28712, 7036, 38847, 17432, 3442, 6357, 37294, 40265, 36642, 13861, 30338, 22653]
[22996, 18198, 28724, 6748, 20207, 3565, 18808, 22092, 20675, 22479, 9179, 4620, 977, 29645, 8052, 4787, 12676, 23904, 12416, 23144, 8764, 4733, 6967, 17445, 4683, 12457, 13287, 26701, 11972, 21529, 24212, 17115]


(None, None, None)

In [34]:
# the three lists must all hold batch_size elements; printed one per statement
# so the cell does not also display a (None, None, None) result
print(len(pos_items))
print(len(neg_items))
print(len(users))

32
32
32


(None, None, None)

In [31]:
# number of train / test interactions counted while parsing the two files
n_train, n_test

(810128, 217242)

And that is about it for us, because the functions below will not be used in this repo. They correspond to their study of the effect of sparsity. Have a look at their section 4.3.2, Performance Comparison w.r.t. Interaction Sparsity Levels: ".... In particular, based on interaction number per user, we divide the test set into four groups, each of which has the same total interactions..."

In [52]:
def create_sparsity_split():
    """Split the test users into groups of similar total interaction volume.

    Users in ``test_set`` are ordered by their number of interactions
    (train + test items). Walking from the sparsest users upwards, a group is
    closed as soon as its accumulated interactions reach 25% of
    ``n_train + n_test``; users left over at the end form a final group.

    Returns
    -------
    split_uids : list of lists
        User ids of every group, sparsest group first.
    split_state : list of str
        One summary line per group (also printed while splitting).
    """
    # dictionary where the keys are the number of interactions and the values
    # are the users that have that number of interactions
    user_n_iid = dict()
    for uid in test_set.keys():
        # number of "interactions" = train items + test items of the user
        n_iids = len(train_set[uid]) + len(test_set[uid])
        user_n_iid.setdefault(n_iids, []).append(uid)

    # total number of interactions in the dataset, and the size of a quarter
    n_total = n_train + n_test
    threshold = 0.25 * n_total
    # interactions not yet assigned to a group
    n_count = n_total

    split_uids, split_state = [], []
    temp = []     # users of the group being filled
    n_rates = 0   # interactions accumulated by the group being filled

    def _close_group(n_iids):
        # store the current group together with its summary line
        split_uids.append(temp)
        state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' % (n_iids, len(temp), n_rates)
        split_state.append(state)
        print(state)

    last_idx = len(user_n_iid) - 1
    for idx, n_iids in enumerate(sorted(user_n_iid)):
        group = user_n_iid[n_iids]
        temp += group
        # n_iids     -> a certain number of interactions (e.g. 10 ratings)
        # len(group) -> number of users with exactly that many interactions
        n_rates += n_iids * len(group)
        n_count -= n_iids * len(group)

        # when the group has reached 25% of the total number of interactions,
        # close it (remember we loop over sorted(user_n_iid))
        if n_rates >= threshold:
            _close_group(n_iids)
            temp = []
            n_rates = 0

        # the leftover users form the last group; when the group above was
        # closed in this same iteration nothing is left, and no empty group
        # must be emitted
        if (idx == last_idx or n_count == 0) and temp:
            _close_group(n_iids)
    return split_uids, split_state

In [53]:
def get_sparsity_split():
    """Load the sparsity split cached in ``path``, creating it when needed.

    The cache file ``sparsity.split`` alternates lines: a summary line of a
    group followed by the space-separated user ids of that group. If the file
    cannot be read or parsed, the split is computed with
    ``create_sparsity_split`` and written to the file.

    Returns
    -------
    split_uids : list of lists
        User ids of every group.
    split_state : list of str
        Summary line of every group.
    """
    # `path` may be a str or a pathlib.Path; a Path cannot be concatenated
    # with `+`, so the file name is joined through pathlib
    split_file = Path(path) / 'sparsity.split'
    try:
        split_uids, split_state = [], []
        with open(split_file, 'r') as f:
            lines = f.readlines()

        for idx, line in enumerate(lines):
            if idx % 2 == 0:
                split_state.append(line.strip())
                print(line.strip())
            else:
                # split() without arguments also copes with an empty group
                split_uids.append([int(uid) for uid in line.split()])
        print('get sparsity split.')

    except (OSError, ValueError):
        # missing/unreadable file or unparsable content: rebuild the cache
        split_uids, split_state = create_sparsity_split()
        with open(split_file, 'w') as f:
            for state, uids in zip(split_state, split_uids):
                f.write(state + '\n')
                f.write(' '.join(str(uid) for uid in uids) + '\n')
        print('create sparsity split.')

    return split_uids, split_state