## The Metrics

Before we move on to the model, and how it is trained and tested, let's quickly go through the metrics that we will use here. The first part of the code below is either a direct copy/paste from the original [repo](https://github.com/xiangwang1223/neural_graph_collaborative_filtering) or a minor adaptation. When this is not the case I will explain the corresponding details. Therefore, **all credit goes to the authors**.

In [12]:
import numpy as np
import heapq

from sklearn.metrics import roc_auc_score

In [13]:
r = np.random.randint(2, size=20)
k = 10
n_inter = 20

In [14]:
r

array([0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1])

In [15]:
def recall_at_k(r, k, n_inter):
    """recall @ k
    Parameters:
    ----------
    r: Iterable
        binary iterable (nonzero is relevant), ordered by rank.
    k: Int
        number of recommendations to consider
    n_inter: Int
        number of interactions (the denominator of the recall). If it is 0
        the division is carried out by numpy and yields nan/inf with a
        RuntimeWarning.
    Returns:
    ----------
    recall @ k
    """
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype is the equivalent call and works on every NumPy version.
    r = np.asarray(r, dtype=float)[:k]
    return np.sum(r) / n_inter

In [16]:
recall_at_k(r, k, n_inter)

0.3

In [17]:
def precision_at_k(r, k):
    """precision @ k
    Parameters:
    ----------
    r: Iterable
        binary iterable (nonzero is relevant), ordered by rank.
    k: Int
        number of recommendations to consider. Must be >= 1.
    Returns:
    ----------
    Precision @ k, i.e. the mean of the first k entries of r
    """
    assert k >= 1
    top_k = np.asarray(r)[:k]
    return top_k.mean()

In [18]:
precision_at_k(r, k)

0.6

In [19]:
def dcg_at_k(r, k, method=1):
    """ discounted cumulative gain (dcg) @ k
    Parameters:
    ----------
    r: Iterable
        Relevance is positive real values. If binary, nonzero is relevant.
    k: Int
        number of recommendations to consider
    method: Int
        one of 0 or 1. Simply, different dcg implementations:
        0 leaves the first two positions undiscounted (weights 1, 1,
        1/log2(3), ...), 1 discounts from the second position on
        (weights 1, 1/log2(3), 1/log2(4), ...).
    Returns:
    ----------
    dcg @ k (0. when there are no scores to consider)
    Raises:
    ----------
    ValueError if r is not empty and method is neither 0 nor 1
    """
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype is the equivalent call and works on every NumPy version.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.

In [20]:
dcg_at_k(r, k)

2.25667559290693

In [21]:
dcg_at_k(r, k, method=0)

2.736711950964459

In [22]:
def ndcg_at_k(r, k, method=1):
    """ Normalized discounted cumulative gain @ k

    The dcg @ k of r divided by the dcg @ k of r sorted in decreasing order
    (the ideal ordering of the scores contained in r). Returns 0. when that
    ideal dcg is zero. Parameters are the same as in dcg_at_k.
    """
    ideal_dcg = dcg_at_k(sorted(r, reverse=True), k, method)
    if ideal_dcg:
        return dcg_at_k(r, k, method) / ideal_dcg
    return 0.

In [23]:
ndcg_at_k(r, k)

0.49667571720465364

In [24]:
def hit_at_k(r, k):
    """hit ratio @ k
    Parameters:
    ----------
    r: Iterable
        binary iterable (nonzero is relevant), ordered by rank.
    k: Int
        number of recommendations to consider
    Returns:
    ----------
    1. if the sum of the first k entries of r is positive, 0. otherwise
    """
    n_hits = np.sum(np.array(r)[:k])
    return 1. if n_hits > 0 else 0.

In [25]:
hit_at_k(r,k)

1.0

In [26]:
def get_auc(item_score, user_pos_test):
    """Wrap up around sklearn's roc_auc_score
    Parameters:
    ----------
    item_score: Dict
        Dict. keys are item_ids, values are predictions
    user_pos_test: List
        List with the (hashable) items that the user actually interacted with
    Returns:
    ----------
    res: Float
        roc_auc_score, or 0. if roc_auc_score raises
    """
    item_score = sorted(item_score.items(), key=lambda kv: kv[1])
    item_score.reverse()
    item_id = [x[0] for x in item_score]
    score = [x[1] for x in item_score]

    # Build the lookup once: a set makes each membership test O(1) instead of
    # scanning the whole list of positives for every scored item.
    positives = set(user_pos_test)
    r = [1 if i in positives else 0 for i in item_id]

    # Deliberate best effort: any failure in roc_auc_score (e.g. labels with
    # a single class) is reported as an AUC of 0.
    try:
        res = roc_auc_score(r, score)
    except Exception:
        res = 0.

    return res

Let's build the function inputs

In [27]:
# for example...let's assume 100 items in total
# item_score maps each item id (0..99) to a random score in [0, 1)
item_score = {k:v for k,v in zip(np.arange(100), np.random.rand(100))}
# 20 distinct item ids, drawn at random, play the role of the items the user
# interacted with
user_pos_test = np.random.choice(100, 20, replace=False)

In [28]:
get_auc(item_score, user_pos_test)

0.5856250000000001

In [29]:
def auc(true, pred):
    """Simple wrap up around sklearn's roc_auc_score

    Returns roc_auc_score(true, pred), or 0. if that call raises.
    """
    try:
        return roc_auc_score(true, pred)
    except Exception:
        return 0.


def ranklist_by_sorted(user_pos_test, test_items, rating, Ks):
    """
    Returns a binary list, where relevance is nonzero, based on a ranked list
    with the max(Ks) largest scores. Also returns the AUC
    Parameters:
    ----------
    user_pos_test: List
        List with the items that the user actually interacted with
    test_items: List
        List with the ids of the items to rank
    rating: List
        Scores indexed by item id: rating[i] is read for every i in
        test_items
    Ks: List
        the k values in @k. Only max(Ks) is used here
    Returns:
    ----------
    r: binary list where nonzero is relevant, ordered by decreasing score
    auc: result of get_auc over all the items in test_items
    """
    item_score = {i: rating[i] for i in test_items}

    # ids of the max(Ks) best scored items, best first
    top_items = heapq.nlargest(max(Ks), item_score, key=item_score.get)
    r = [1 if i in user_pos_test else 0 for i in top_items]

    return r, get_auc(item_score, user_pos_test)

In [30]:
test_items, rating = list(item_score.keys()), list(item_score.values())
Ks = [5,10]

In [31]:
ranklist_by_sorted(user_pos_test, test_items, rating, Ks)

([1, 1, 0, 0, 0, 1, 0, 0, 0, 0], 0.5856250000000001)

In [32]:
def ranklist_by_heapq(user_pos_test, test_items, rating, Ks):
    """
    Returns a binary list, where relevance is nonzero, based on a ranked list
    with the max(Ks) largest scores. For consistency with ranklist_by_sorted,
    also returns auc=0 (since auc does not make sense within a mini batch)
    Parameters:
    ----------
    user_pos_test: List
        List with the items that the user actually interacted with
    test_items: List
        List with the ids of the items to rank
    rating: List
        Scores indexed by item id: rating[i] is read for every i in
        test_items
    Ks: List
        the k values in @k. Only max(Ks) is used here
    Returns:
    ----------
    r: binary list where nonzero is relevant, ordered by decreasing score
    auc: always 0.
    """
    item_score = {i: rating[i] for i in test_items}

    # ids of the max(Ks) best scored items, best first
    top_items = heapq.nlargest(max(Ks), item_score, key=item_score.get)
    r = [1 if i in user_pos_test else 0 for i in top_items]

    return r, 0.

In [33]:
ranklist_by_heapq(user_pos_test, test_items, rating, Ks)

([1, 1, 0, 0, 0, 1, 0, 0, 0, 0], 0.0)

And finally, putting it all together:

In [34]:
def get_performance(user_pos_test, r, auc, Ks):
    """wrap up around all other previous functions
    Parameters:
    ----------
    user_pos_test: List
        List with the items that the user actually interacted with. Only its
        length is used, as the number of interactions in recall_at_k
    r: List
        binary list where nonzero is relevant
    auc: Float
        sklearn's roc_auc_score. Passed through to the output unchanged
    Ks: List
        the k in @k
    Returns:
    ----------
    dictionary of metrics. 'recall', 'precision', 'ndcg' and 'hit_ratio' are
    arrays with one value per K in Ks; 'auc' is the input auc
    """
    n_inter = len(user_pos_test)
    per_k = {'recall': [], 'precision': [], 'ndcg': [], 'hit_ratio': []}

    for K in Ks:
        per_k['precision'].append(precision_at_k(r, K))
        per_k['recall'].append(recall_at_k(r, K, n_inter))
        per_k['ndcg'].append(ndcg_at_k(r, K))
        per_k['hit_ratio'].append(hit_at_k(r, K))

    performance = {name: np.array(values) for name, values in per_k.items()}
    performance['auc'] = auc
    return performance

In [35]:
r, auc = ranklist_by_sorted(user_pos_test, test_items, rating, Ks)

In [36]:
get_performance(user_pos_test, r, auc, Ks)

{'recall': array([0.1 , 0.15]),
 'precision': array([0.4, 0.3]),
 'ndcg': array([0.76536064, 0.93252109]),
 'hit_ratio': array([1., 1.]),
 'auc': 0.5856250000000001}

### GPU test

Before we leave the metrics behind, let's pause for one second and have a look at the functions above. They all provide scores/metrics for one user. This means that they will have to run in a loop or be distributed over the cores of the machine where we run the algorithm. Given the fact that the algorithm will run on a GPU, maybe we could take advantage of it and write an evaluation function that runs on the GPU. 

The code below is taken mostly from [here](https://github.com/sh0416/bpr/blob/master/train.py), adapted to the fact that here our rating matrix is too large to fit in memory once converted to dense (i.e. we cannot run lines like: `test_pred_mask = 1 - (train_w)` in that code).

In [38]:
import torch
import scipy.sparse as sp

In [39]:
use_cuda = torch.cuda.is_available()

n_users = 100
n_items = 200
n_embed = 12
Ks=[5,10]

Let's create a small, fake dataset to illustrate the use of this testing method

In [40]:
# user and item embeddings
user_emb = torch.from_numpy(np.random.rand(n_users, n_embed))
item_emb = torch.from_numpy(np.random.rand(n_items, n_embed))

In [41]:
# Train Ratings Matrix
def randbin(r,c,p):
    """Random binary matrix of shape (r, c) where p is the probability of a 0
    (and 1-p the probability of a 1)
    """
    shape = (r, c)
    probabilities = [p, 1 - p]
    return np.random.choice([0, 1], size=shape, p=probabilities)
R_tr = randbin(n_users, n_items, 0.8)

In [42]:
# Test Rating Matrix
# candidate positions: 1 wherever there is NO training interaction
temp_mtx = 1 - R_tr
# (row, col) indexes of those candidate positions
temp_idx = np.where(temp_mtx)
# setting the testing size as, for example, a fifth of the number of
# training interactions
test_fr = np.where(R_tr)[0].size//5
# choosing test_fr of the candidate positions at random, without repetition
R_te_idx = np.random.choice(temp_idx[0].size, test_fr, replace=False)
i,j = temp_idx[0][R_te_idx], temp_idx[1][R_te_idx]
# setting them to 1 in an otherwise empty matrix, so that test and training
# interactions never overlap
R_te = np.zeros((n_users, n_items))
R_te[i,j] = 1

When we run the "real thing" `R_tr` and `R_te` will be sparse matrices

In [43]:
R_tr = sp.csr_matrix(R_tr, dtype='float64')
R_te = sp.csr_matrix(R_te, dtype='float64')

In [44]:
def split_mtx(X, n_folds=10):
    """
    Split a matrix/Tensor into n_folds consecutive row blocks

    Every fold has X.shape[0]//n_folds rows except the last one, which also
    takes the remaining rows. Returns the list of folds.
    """
    n_rows = X.shape[0]
    fold_len = n_rows // n_folds
    starts = [i * fold_len for i in range(n_folds)]
    # each fold ends where the next one starts; the last one runs to the end
    ends = starts[1:] + [n_rows]
    return [X[start:end] for start, end in zip(starts, ends)]

And this is the testing function

In [45]:
def precision_and_recall_k(user_emb, item_emb, R_tr, R_te, Ks):
    """
    Compute precision and recall using tensors

    Users are processed in folds (see split_mtx) so that only one fold of the
    rating matrices is converted to dense at a time. All the tensors built
    here live on the device of the embeddings.

    Parameters:
    ----------
    user_emb: Tensor
        user embeddings of shape (n_users, n_emb)
    item_emb: Tensor
        item embeddings of shape (n_items, n_emb)
    R_tr: scipy.sp matrix
        training ratings shape (n_users, n_items)
    R_te: scipy.sp matrix
        testing ratings shape (n_users, n_items)
    Ks: List
        k order of recommendations (the k in precision@k)

    Returns:
    ----------
    precision, recall: Dict, Dict
        Dictionaries where keys are the Ks. precision[k] is a float: the
        number of hits in the top k divided by (n_users * k). recall[k] is a
        0-dim Tensor: the mean recall@k over the users that have at least one
        test interaction (nan if no user has any).
    """
    # splits into n_folds
    tr_folds = split_mtx(R_tr)
    te_folds = split_mtx(R_te)
    ue_folds = split_mtx(user_emb)

    n_users = user_emb.shape[0]
    k_max = max(Ks)
    fold_hits = {k: [] for k in Ks}
    fold_rec = {k: [] for k in Ks}

    for ue_fold, tr_fold, te_fold in zip(ue_folds, tr_folds, te_folds):

        # score for all items, per user.
        result = torch.sigmoid(torch.mm(ue_fold, item_emb.t()))
        # this mask contains what is not training (negatives+testing). It is
        # created on the device/dtype of the scores so the products below
        # never mix devices.
        test_pred_mask = torch.from_numpy(1 - tr_fold.toarray()).to(
            device=result.device, dtype=result.dtype)
        # this mask contains only the true testing items
        test_true_mask = torch.from_numpy(te_fold.toarray()).to(
            device=result.device, dtype=result.dtype)
        test_pred = test_pred_mask * result
        test_true = test_true_mask * result

        # number of test items per user. Users without test items have an
        # undefined recall (0/0) and are left out of the average.
        n_test = test_true_mask.sum(dim=1)
        has_test = n_test > 0

        _, test_indices = torch.topk(test_pred, dim=1, k=k_max)
        for k in Ks:
            topk_mask = torch.zeros_like(test_pred)
            # this will create a mask with 1 located in positions
            # test_indices[:, :k]. The scalar `value` form is used because
            # `src` must be a tensor with the dtype of topk_mask and as many
            # dimensions as the index.
            topk_mask.scatter_(dim=1, index=test_indices[:, :k], value=1.0)
            # matrix with the actual predictions in positions test_indices[:, :k]
            test_pred_topk = topk_mask * test_pred
            # a hit is a non-zero top-k score that is also a test score
            acc_result = (test_pred_topk != 0) & (test_pred_topk == test_true)
            hits = acc_result.to(result.dtype).sum(dim=1)
            fold_hits[k].append(hits.sum())
            fold_rec[k].append(hits[has_test] / n_test[has_test])

    precision, recall = {}, {}
    for k in Ks:
        # summed with torch (not numpy) so that it also works with CUDA tensors
        total_hits = torch.stack(fold_hits[k]).sum().item()
        denominator = n_users * k
        precision[k] = total_hits / denominator if denominator else 0.
        recall[k] = torch.cat(fold_rec[k]).mean()
    return precision,recall

Let's have a look at what happens inside that function

In [46]:
tr_folds = split_mtx(R_tr)
te_folds = split_mtx(R_te)
ue_folds = split_mtx(user_emb)

In [47]:
tr_folds[0]

<10x200 sparse matrix of type '<class 'numpy.float64'>'
	with 395 stored elements in Compressed Sparse Row format>

In [48]:
print(len(ue_folds), ue_folds[0].shape)

10 torch.Size([10, 12])


Now we have 10 folds/partitions of the rating and user embedding matrices and we are ready to loop. Let's go through one iteration of the loop

In [49]:
tr_fold, te_fold, ue_fold = tr_folds[0], te_folds[0], ue_folds[0]

The authors explain that they want to make all scores between 0 and 1, using a sigmoid. Below are the scores for all items, for the N users in the corresponding fold

In [50]:
result = torch.sigmoid(torch.mm(ue_fold, item_emb.t()))
print(result.shape)

torch.Size([10, 200])


In [51]:
# mask with 1 for all items that are NOT in training -> test+negatives.
# Only this fold of the sparse training matrix is converted to dense
test_pred_mask = torch.from_numpy(1 - tr_fold.todense())
# mask with 1 for test items (is a dense copy of this fold of R_te)
test_true_mask = torch.from_numpy(te_fold.todense())

In [52]:
# matrix with scores for all items that are NOT in training -> test+negatives
test_pred = test_pred_mask * result
# matrix with scores for "true" test items 
test_true = test_true_mask * result

In [53]:
print(test_pred.shape, test_true.shape)

torch.Size([10, 200]) torch.Size([10, 200])


Let's find the locations of the top K recommended items

In [54]:
_, test_indices = torch.topk(test_pred, dim=1, k=max(Ks))

In [55]:
test_indices

tensor([[ 96, 185, 120,  22,  90, 159, 166, 175,  50, 115],
        [ 70, 111,  86,   8, 147, 184, 119,  93, 144, 152],
        [185,  70, 111,  96, 119,  95,   8, 120, 175, 168],
        [119, 111,  86,  37, 185, 166, 175, 168, 150,  95],
        [ 86, 119,  37,  70,  96,  95, 185, 115,  90,  35],
        [111,  70, 119, 166,  96, 168, 130, 175, 159,  86],
        [119,  86,  70,  96, 185,  37,  95, 150,  62,   3],
        [119,  86,  70,  96, 111,  37, 185,   8, 168,  12],
        [119,  70,  86,  95, 111,  37, 154, 150, 185,  12],
        [119,  96,  86, 185, 111, 148, 154, 120,  37, 168]])

let's assume k=5

In [56]:
k=5
topk_mask = torch.zeros_like(test_pred)
# write 1.0 at the positions of the top k items of every row. The scalar
# `value` form is used because `src` must be a tensor with the dtype of
# topk_mask and as many dimensions as the index.
topk_mask.scatter_(dim=1, index=test_indices[:, :k], value=1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

In [57]:
topk_mask[0,].nonzero()

tensor([[ 22],
        [ 90],
        [ 96],
        [120],
        [185]])

Effectively, the nonzero locations (22, 90, 96, ...) in the first row correspond to the top 5 items in the first row of `test_indices`. Let's get these locations from the `test_pred` tensor and compute the precision and recall (or almost)  

In [58]:
test_pred_topk = topk_mask * test_pred 

In [59]:
# True where a top-k recommendation is a real test item: the entry survived
# the top-k mask (it is non-zero, so the item is not in training) and its
# score equals the one kept in test_true (so the item is in testing, i.e. it
# is not a negative)
acc_result = (test_pred_topk != 0) & (test_pred_topk == test_true)

We save the necessary information per fold, which will be used to calculate precision and recall for the whole dataset

In [60]:
pr_k = acc_result.sum().float() / (user_emb.shape[0] * k)
rec_k = (acc_result.float().sum(dim=1) / test_true_mask.float().sum(dim=1))

In [61]:
pr_k

tensor(0.0040)

In [62]:
rec_k

tensor([0.0000, 0.0000, 0.0909, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1429,
        0.0000])

In [63]:
fold_prec, fold_rec = {}, {}
try:
    fold_prec[k].append(pr_k)
    fold_rec[k].append(rec_k)
except KeyError:
    fold_prec[k] = [pr_k]
    fold_rec[k] = [rec_k]

In [64]:
print(fold_prec)
print(fold_rec)

{5: [tensor(0.0040)]}
{5: [tensor([0.0000, 0.0000, 0.0909, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1429,
        0.0000])]}


Finally (remember, this would run inside a loop over Ks)

In [65]:
precision, recall = {}, {}
precision[k] = np.sum(fold_prec[k])
recall[k] = torch.cat(fold_rec[k]).mean()

In [66]:
precision

{5: 0.004}

In [67]:
recall

{5: tensor(0.0234)}

Let's run the whole thing

In [68]:
precision, recall = precision_and_recall_k(user_emb, item_emb, R_tr, R_te, Ks=[5, 10])

In [69]:
print(precision, recall)

{5: 0.038, 10: 0.043999996} {5: tensor(0.0233), 10: tensor(0.0492)}


Note that in the final version of the code, I will rename `precision_and_recall_k` to `test_GPU`, as "opposed" to the test function of the [Wang Xiang et al](https://arxiv.org/pdf/1905.08108.pdf) paper, which I will refer to as `test_CPU`