## The Metrics

Before we move on to the model and how it is trained and tested, let's quickly go through the metrics that we will use here. The first part of the code below is either a direct copy/paste from the original [repo](https://github.com/xiangwang1223/neural_graph_collaborative_filtering) or a minor adaptation. When this is not the case I will explain the corresponding details. Therefore, **all credit to the authors** (Xiang Wang, Xiangnan He, Meng Wang, Fuli Feng and Tat-Seng Chua).

In [1]:
import numpy as np
import heapq

from sklearn.metrics import roc_auc_score

In [6]:
# Simulated ranked list of 20 recommendations: each entry is 1 (relevant)
# with probability 0.3 and 0 otherwise.
r = np.random.choice(2, 20, p=[0.7, 0.3])
# cut-off used by the @k metrics below
k = 10
# number of interactions, used as the denominator of recall_at_k
n_inter = 20

In [7]:
r

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1])

In [8]:
def recall_at_k(r, k, n_inter):
    """recall @ k
    Parameters:
    ----------
    r: Iterable
        binary iterable (nonzero is relevant), ordered by predicted rank.
    k: Int
        number of recommendations to consider
    n_inter: Int
        number of interactions (the denominator of the recall)
    Returns:
    ----------
    recall @ k: sum of the first k entries of r divided by n_inter
    """
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype is the replacement and gives the same result on older releases.
    top_k = np.asarray(r, dtype=float)[:k]
    return np.sum(top_k) / n_inter

In [9]:
recall_at_k(r, k, n_inter)

0.15

In [10]:
def precision_at_k(r, k):
    """precision @ k
    Parameters:
    ----------
    r: Iterable
        binary iterable (nonzero is relevant), ordered by predicted rank.
    k: Int
        number of recommendations to consider; must be >= 1
    Returns:
    ----------
    Precision @ k: mean of the first k entries of r (of all entries when r
    holds fewer than k)
    """
    assert k >= 1
    head = np.asarray(r)[:k]
    return head.mean()

In [11]:
precision_at_k(r, k)

0.3

In [12]:
def dcg_at_k(r, k, method=1):
    """ discounted cumulative gain (dcg) @ k
    Parameters:
    ----------
    r: Iterable
        Relevance is positive real values. If binary, nonzero is relevant.
    k: Int
        number of recommendations to consider
    method: Int
        one of 0 or 1. Simply, different dcg implementations:
        0 leaves the first two positions undiscounted (weights 1, 1,
        1/log2(3), ...); 1 discounts from the second position on
        (weights 1, 1/log2(3), 1/log2(4), ...).
    Returns:
    ----------
    dcg @ k (0. when there is nothing to score)
    Raises:
    ----------
    ValueError if method is not 0 or 1 and r[:k] is non-empty
    """
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype is the replacement and gives the same result on older releases.
    rel = np.asarray(r, dtype=float)[:k]
    if not rel.size:
        return 0.
    if method == 0:
        return rel[0] + np.sum(rel[1:] / np.log2(np.arange(2, rel.size + 1)))
    if method == 1:
        return np.sum(rel / np.log2(np.arange(2, rel.size + 2)))
    raise ValueError('method must be 0 or 1.')

In [13]:
dcg_at_k(r, k)

1.1737365524159569

In [14]:
dcg_at_k(r, k, method=0)

1.3175293653079347

In [15]:
def ndcg_at_k(r, k, method=1):
    """ Normalized discounted cumulative gain @ k
    Parameters:
    ----------
    r: Iterable
        relevance values ordered by predicted rank
    k: Int
        number of recommendations to consider
    method: Int
        forwarded to dcg_at_k
    Returns:
    ----------
    dcg @ k of r divided by dcg @ k of r sorted in descending order
    (the ideal ranking), or 0. when the ideal dcg is zero
    """
    ideal = dcg_at_k(sorted(r, reverse=True), k, method)
    return dcg_at_k(r, k, method) / ideal if ideal else 0.

In [16]:
ndcg_at_k(r, k)

0.35517551357284516

In [17]:
def hit_at_k(r, k):
    """hit ratio @ k
    Parameters:
    ----------
    r: Iterable
        binary iterable (nonzero is relevant), ordered by predicted rank.
    k: Int
        number of recommendations to consider
    Returns:
    ----------
    1. if the first k entries of r sum to more than zero, else 0.
    """
    top_k = np.array(r)[:k]
    return 1. if top_k.sum() > 0 else 0.

In [18]:
hit_at_k(r,k)

1.0

In [19]:
def get_auc(item_score, user_pos_test):
    """Wrap up around sklearn's roc_auc_score
    Parameters:
    ----------
    item_score: Dict
        Dict. keys are item_ids, values are predictions
    user_pos_test: List
        List with the items that the user actually interacted with
    Returns:
    ----------
    res: Float
        roc_auc_score, or 0. if roc_auc_score raises
    """
    # A set gives O(1) membership tests instead of scanning the list of
    # positives once per item.
    positives = set(user_pos_test)
    # Highest score first; a single descending sort replaces sort + reverse.
    ranked = sorted(item_score.items(), key=lambda kv: kv[1], reverse=True)
    labels = [1 if item in positives else 0 for item, _ in ranked]
    scores = [score for _, score in ranked]

    # Deliberate best-effort: any failure inside roc_auc_score (for instance
    # an undefined AUC) is reported as 0. rather than aborting the evaluation.
    try:
        return roc_auc_score(labels, scores)
    except Exception:
        return 0.

Let's build the inputs of the function:

In [20]:
# for example...let's assume 100 items in total
# item_score maps each item id (0..99) to a random score in [0, 1)
item_score = {k:v for k,v in zip(np.arange(100), np.random.rand(100))}
# 20 distinct item ids sampled as the user's positive test items
user_pos_test = np.random.choice(100, 20, replace=False)

In [21]:
item_id = 1
item_score[item_id]

0.020429522439006087

In [22]:
print(user_pos_test)

[90 26 58 86 93 70 91  8 96 50 38 33 66 99 41 20 74 28 22 49]


In [23]:
get_auc(item_score, user_pos_test)

0.51125

In [24]:
def auc(true, pred):
    """Simple wrap up around sklearn's roc_auc_score

    Returns roc_auc_score(true, pred), or 0. if that call raises.
    """
    try:
        return roc_auc_score(true, pred)
    except Exception:
        return 0.


def ranklist_by_sorted(user_pos_test, test_items, rating, Ks):
    """
    Returns a binary list, where relevance is nonzero, based on a ranked list
    with the n largest scores. Also returns the AUC
    Parameters:
    ----------
    user_pos_test: List
        List with the items that the user actually interacted with
    test_items: List
        List with the ids of all items in the test dataset
    rating: List
        Scores indexed by item id: the score of item i is rating[i], so it
        must cover every id in test_items
    Ks: List
        the k values in @k; the ranked list has max(Ks) entries
    Returns:
    ----------
    r: binary list where nonzero is relevant, ordered from highest to
        lowest score
    auc: testing roc_auc_score computed over all test_items
    """
    item_score = {i: rating[i] for i in test_items}

    K_max = max(Ks)
    K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get)

    # A set gives O(1) membership tests instead of scanning the positives
    # once per ranked item.
    positives = set(user_pos_test)
    r = [1 if i in positives else 0 for i in K_max_item_score]

    # Named auc_score so the local does not shadow the auc() helper.
    auc_score = get_auc(item_score, user_pos_test)
    return r, auc_score

In [25]:
# The ranklist_* functions look scores up as rating[item_id]; here the ids are
# 0..99 in order, so list position and item id coincide.
test_items, rating = list(item_score.keys()), list(item_score.values())
# cut-offs to evaluate; the ranked list is built for max(Ks)
Ks = [5,10]

In [26]:
ranklist_by_sorted(user_pos_test, test_items, rating, Ks)

([0, 0, 0, 0, 0, 1, 0, 0, 1, 0], 0.51125)

In [27]:
def ranklist_by_heapq(user_pos_test, test_items, rating, Ks):
    """
    Returns a binary list, where relevance is nonzero, based on a ranked list
    with the n largest scores. For consistency with ranklist_by_sorted, also
    returns auc=0 (since auc does not make sense within a mini batch)
    Parameters:
    ----------
    user_pos_test: List
        List with the items that the user actually interacted with
    test_items: List
        List with the ids of all items in the test dataset
    rating: List
        Scores indexed by item id: the score of item i is rating[i], so it
        must cover every id in test_items
    Ks: List
        the k values in @k; the ranked list has max(Ks) entries
    Returns:
    ----------
    r: binary list where nonzero is relevant, ordered from highest to
        lowest score
    auc: always 0.
    """
    item_score = {i: rating[i] for i in test_items}

    K_max = max(Ks)
    K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get)

    # A set gives O(1) membership tests instead of scanning the positives
    # once per ranked item.
    positives = set(user_pos_test)
    r = [1 if i in positives else 0 for i in K_max_item_score]
    return r, 0.

In [28]:
ranklist_by_heapq(user_pos_test, test_items, rating, Ks)

([0, 0, 0, 0, 0, 1, 0, 0, 1, 0], 0.0)

And finally, putting it all together:

In [29]:
def get_performance(user_pos_test, r, auc, Ks):
    """Compute every @k metric for one user at each cut-off in Ks
    Parameters:
    ----------
    user_pos_test: List
        List with the items that the user actually interacted with
    r: List
        binary list where nonzero is relevant
    auc: Float
        sklearn's roc_auc_score; passed through unchanged
    Ks: List
        the k in @k
    Returns:
    ----------
    dictionary of metrics: 'recall', 'precision', 'ndcg' and 'hit_ratio' are
    arrays with one value per k in Ks; 'auc' is the value received
    """
    # one (precision, recall, ndcg, hit) row per cut-off
    per_k = [
        (
            precision_at_k(r, K),
            recall_at_k(r, K, len(user_pos_test)),
            ndcg_at_k(r, K),
            hit_at_k(r, K),
        )
        for K in Ks
    ]

    precision = np.array([row[0] for row in per_k])
    recall = np.array([row[1] for row in per_k])
    ndcg = np.array([row[2] for row in per_k])
    hit_ratio = np.array([row[3] for row in per_k])

    return {'recall': recall, 'precision': precision,
            'ndcg': ndcg, 'hit_ratio': hit_ratio, 'auc': auc}

In [30]:
r, auc = ranklist_by_sorted(user_pos_test, test_items, rating, Ks)

In [31]:
get_performance(user_pos_test, r, auc, Ks)

{'recall': array([0. , 0.1]),
 'precision': array([0. , 0.2]),
 'ndcg': array([0.        , 0.40298313]),
 'hit_ratio': array([0., 1.]),
 'auc': 0.51125}

# GPU test

Before we leave the metrics behind, let's pause for one second and have a look at the functions above. They all provide scores/metrics for one user. This means that they will have to run in a loop or be distributed over the cores of the machine where we run the algorithm. Given that the algorithm will run on a GPU, maybe we could take advantage of it and write an evaluation function that uses tensors and can therefore run on the GPU.

In [32]:
import torch
import scipy.sparse as sp

In [33]:
# True when PyTorch can see a CUDA device
use_cuda = torch.cuda.is_available()

# sizes of the toy problem: users, items and embedding dimension
n_users = 100
n_items = 200
n_embed = 12
# cut-offs for the @k metrics
Ks=[5,10]

Let's create a small, fake dataset to illustrate the use of this testing method.

In [34]:
# user and item embeddings
# random values in [0, 1); torch.from_numpy keeps NumPy's float64 dtype
user_emb = torch.from_numpy(np.random.rand(n_users, n_embed))
item_emb = torch.from_numpy(np.random.rand(n_items, n_embed))

In [35]:
# Train Ratings Matrix
def randbin(r,c,p):
    """Random binary matrix of shape (r, c).

    Each entry is drawn independently: 0 with probability p and 1 with
    probability 1 - p.
    """
    probs = [p, 1 - p]
    shape = (r, c)
    return np.random.choice([0, 1], size=shape, p=probs)
R_tr = randbin(n_users, n_items, 0.8)

In [36]:
# Test Rating Matrix
# candidate test cells are the ones NOT used for training
temp_mtx = 1 - R_tr
# (row, col) indexes of the candidate cells
temp_idx = np.where(temp_mtx)
# setting the testing size as, for example, one fifth of the number of
# training interactions
test_fr = np.where(R_tr)[0].size//5
# choosing candidate positions at random, without replacement
R_te_idx = np.random.choice(temp_idx[0].size, test_fr, replace=False)
i,j = temp_idx[0][R_te_idx], temp_idx[1][R_te_idx]
# setting them to 1
R_te = np.zeros((n_users, n_items))
R_te[i,j] = 1

When we run the "real thing" `R_tr` and `R_te` will be sparse matrices

In [39]:
# replace the dense matrices with float64 scipy CSR sparse matrices
R_tr = sp.csr_matrix(R_tr, dtype='float64')
R_te = sp.csr_matrix(R_te, dtype='float64')

They will be large, so we need to split them in folds and we will then run a loop over the `n_folds`

In [40]:
def split_mtx(X, n_folds=10):
    """
    Split a matrix/Tensor into n_folds consecutive row blocks.

    Every fold holds X.shape[0] // n_folds rows, except the last one, which
    also takes whatever rows remain. Returns the list of row slices.
    """
    n_rows = X.shape[0]
    step = n_rows // n_folds
    # fold i starts at i * step; the final boundary is the end of X
    bounds = [fold * step for fold in range(n_folds)] + [n_rows]
    return [X[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]

We will need another helper to make the code more readable

In [42]:
def ndcg_at_k_gpu(pred_items, test_items, test_indices, k):
    """
    Batched ndcg @ k, one value per row (user).
    Parameters:
    ----------
    pred_items: Tensor dim(fold_size, n_items)
        binary tensor with 1s in those locations corresponding to the predicted item interactions
    test_items: Tensor dim(fold_size, n_items)
        binary tensor with 1s in locations corresponding to the real test interactions
    test_indices: Tensor dim(fold_size, max(Ks))
        tensor with the location of the topk predicted items, best first
    k: int
        cut-off; must not exceed the number of columns of test_indices
    Returns:
    ----------
    ndcg: Tensor dim(fold_size,)
        ndcg @ k per row; rows with no hit among the ranked items get 0
    """
    # r[u, j] is 1 when the j-th ranked item of user u is a real test interaction
    r = (test_items * pred_items).gather(1, test_indices)
    # Build the discount on the same device as the inputs instead of forcing
    # CUDA, so the function also works on CPU-only machines.
    f = torch.from_numpy(np.log2(np.arange(2, k + 2))).float().to(r.device)
    dcg = (r[:, :k] / f).sum(1)
    # ideal ordering: all hits of the ranked list first
    dcg_max = (torch.sort(r, dim=1, descending=True)[0][:, :k] / f).sum(1)
    ndcg = dcg / dcg_max
    # 0/0 happens when a row has no hits at all
    ndcg[torch.isnan(ndcg)] = 0
    return ndcg

We will go into detail about what's going on inside that function just below, so keep reading...

And this is the testing function

In [43]:
def test_GPU(u_emb, i_emb, Rtr, Rte, Ks):
    """
    Evaluate precision, recall, ndcg and hit ratio @ k for all users at once,
    using tensors (on the GPU when one is available, otherwise on the CPU).
    Parameters:
    ----------
    u_emb: Tensor dim(n_users, n_embed)
        user embeddings
    i_emb: Tensor dim(n_items, n_embed)
        item embeddings
    Rtr: scipy sparse matrix dim(n_users, n_items)
        training interactions (nonzero means interaction)
    Rte: scipy sparse matrix dim(n_users, n_items)
        test interactions (nonzero means interaction)
    Ks: List
        the k values in @k
    Returns:
    ----------
    dictionary with keys 'precision', 'recall', 'ndcg' and 'hit_ratio'; each
    value is a list with one 0-dim tensor per k in Ks holding the metric
    averaged over all users
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    i_emb = i_emb.to(device)
    k_max = max(Ks)

    ue_folds = split_mtx(u_emb)
    tr_folds = split_mtx(Rtr)
    te_folds = split_mtx(Rte)

    # Plain dicts keyed by k: no dependency on collections.defaultdict,
    # which the notebook never imports.
    fold_prec = {k: [] for k in Ks}
    fold_rec = {k: [] for k in Ks}
    fold_ndcg = {k: [] for k in Ks}
    fold_hr = {k: [] for k in Ks}

    for ue_f, tr_f, te_f in zip(ue_folds, tr_folds, te_folds):

        scores = torch.mm(ue_f.to(device), i_emb.t())
        test_items = torch.from_numpy(te_f.toarray()).float().to(device)
        train_mask = torch.from_numpy(tr_f.toarray()).to(device) != 0
        # Exclude train items from the ranking. Multiplying their scores by 0
        # would still rank them above any item with a negative score, so they
        # are pushed to -inf instead.
        scores = scores.masked_fill(train_mask, float('-inf'))
        _, test_indices = torch.topk(scores, dim=1, k=k_max)
        pred_items = torch.zeros_like(scores, dtype=torch.float)
        # value= writes a scalar; a 0-dim tensor is not a valid src
        pred_items.scatter_(dim=1, index=test_indices, value=1.0)
        n_test = test_items.sum(1)

        for k in Ks:
            topk_preds = torch.zeros_like(pred_items)
            topk_preds.scatter_(dim=1, index=test_indices[:, :k], value=1.0)

            TP = (test_items * topk_preds).sum(1)
            prec = TP / k
            # users without test interactions get recall 0 instead of NaN,
            # which would otherwise turn the overall mean into NaN
            rec = torch.where(n_test > 0, TP / n_test, torch.zeros_like(TP))
            hit_r = (TP > 0).float()
            ndcg = ndcg_at_k_gpu(pred_items, test_items, test_indices, k)

            fold_prec[k].append(prec)
            fold_rec[k].append(rec)
            fold_ndcg[k].append(ndcg)
            fold_hr[k].append(hit_r)

    result = {'precision': [], 'recall': [], 'ndcg': [], 'hit_ratio': []}
    for k in Ks:
        result['precision'].append(torch.cat(fold_prec[k]).mean())
        result['recall'].append(torch.cat(fold_rec[k]).mean())
        result['ndcg'].append(torch.cat(fold_ndcg[k]).mean())
        result['hit_ratio'].append(torch.cat(fold_hr[k]).mean())
    return result

Let's have a look at what we are doing there.

First, when running the real "thing", the rating matrices will be large, so we split them into folds

In [45]:
# the same row-wise split that test_GPU performs
ue_folds = split_mtx(user_emb)
tr_folds = split_mtx(R_tr)
te_folds = split_mtx(R_te)
# keep only the first fold for the walkthrough
ue_f, tr_f, te_f = ue_folds[0], tr_folds[0], te_folds[0]

Per fold, we do the following:

In [50]:
# run on the GPU when there is one, otherwise on the CPU
device = torch.device('cuda' if use_cuda else 'cpu')
# scores are simply the matrix multiplication between user and item embeddings
scores = torch.mm(ue_f, item_emb.t()).float().to(device)
# test_items is a binary tensor containing REAL interactions
test_items = torch.from_numpy(te_f.toarray()).float().to(device)
# train_mask is True wherever the user already interacted with the item in train
train_mask = torch.from_numpy(tr_f.toarray()).to(device) != 0
# We only need to rank non train items: train items are pushed to -inf
# (multiplying them by 0 would still rank them above negative scores)
scores = scores.masked_fill(train_mask, float('-inf'))
# test_indices is a tensor containing the topk indices, per row, in scores
_, test_indices = torch.topk(scores, dim=1, k=max(Ks))
# pred_items is a binary tensor of dim (fold_size, n_items) with 1s in those locations where
# we have predicted an interaction (value= writes a scalar; a 0-dim tensor is not a valid src)
pred_items = torch.zeros_like(scores).float()
pred_items.scatter_(dim=1, index=test_indices, value=1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

And now, for each value of k, we do:

In [53]:
# topk_preds is a binary tensor of dim (fold_size, n_items) with 1s in those locations where
# we have predicted the topk interactions (k is the current cut-off)
topk_preds = torch.zeros_like(scores).float()
# value= writes a scalar on whatever device topk_preds lives on; a 0-dim CPU
# tensor passed as src does not match a CUDA destination
topk_preds.scatter_(dim=1, index=test_indices[:, :k], value=1.0)

# True positives
TP = (test_items * topk_preds).sum(1)
# precision as defined by Xiang Wang et al: np.mean(np.asarray(r)[:k])
prec = TP/k
# recall as defined by Xiang Wang et al: np.sum(np.asfarray(r)[:k]) / all_pos_num
rec = TP/test_items.sum(1)
# hit ratio = 1 if np.sum(np.array(r)[:k]) > 0 else 0
hit_r = (TP > 0).float()
# ndcg as defined by Xiang Wang et al
ndcg = ndcg_at_k_gpu(pred_items, test_items, test_indices, k)

Let's just finish by having a look into `ndcg_at_k_gpu`

In [54]:
# r is a binary tensor of dim (fold_size, max(Ks)): r[u, j] is 1 when the j-th
# ranked item of user u is a real test interaction
r = (test_items * pred_items).gather(1, test_indices)
# simply the denominator in their expression: np.sum(r / np.log2(np.arange(2, r.size + 2))),
# created on the same device as r rather than forcing CUDA
f = torch.from_numpy(np.log2(np.arange(2, k+2))).float().to(r.device)
# the tensor equivalent to  np.sum(r / np.log2(np.arange(2, r.size + 2)))
dcg = (r[:, :k]/f).sum(1)
# ideal dcg: the same hits sorted to the front of the ranking
dcg_max = (torch.sort(r, dim=1, descending=True)[0][:, :k]/f).sum(1)
ndcg = dcg/dcg_max
# rows without any hit give 0/0; report them as 0
ndcg[torch.isnan(ndcg)] = 0