### init environment

In [383]:
# import torch
import numpy as np
from functools import partial

In [384]:
good_count = 20        # общее кол-во товаров
user_count = 100         # кол-во пользователей для которых генерим данные
recommend_count = 10    # кол-во генерируемых рекомендаций для каждого пользователя
price_range = (199, 2490)

# генерируем проверочные данные для заданий
rnd = np.random.default_rng(11)
goods = np.arange(good_count) + 50      # список id существующих товаров
prices = np.round(rnd.random(good_count) * (max(price_range) - min(price_range)) + min(price_range), 2)     # цены товаров

recommends = []         # списки рекомендаций для пользователей
rec_prices = []         # списки цен рекомендованных товаров
boughts = []            # список покупок
b_prices = []           # список цен покупок
for _ in range(user_count):
    indexes = rnd.choice(goods.size, size=recommend_count, replace=False)    # индексы рекомендаций
    recommends.append(goods[indexes])
    rec_prices.append(prices[indexes])

    boughts_count = rnd.integers(good_count) + 1
    indexes = rnd.choice(goods.size, size=boughts_count, replace=False)
    boughts.append(goods[indexes])
    b_prices.append(prices[indexes])

### 1. hit rate at k

In [385]:
def hit_rate(recommended_list, bought_list):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    flags = np.isin(bought_list, recommended_list)
    hit_rate = (flags.sum() > 0) * 1
    return hit_rate


def hit_rate_at_k(recommended_list, bought_list, k=5):
    """ Hit rate@k = (был ли хотя бы 1 релевантный товар среди топ-k рекомендованных) """
    # с использованием numpy
    flags = np.isin(bought_list, recommended_list[:k])
    return (flags.sum() > 0) * 1

    # без использования numpy
    # return (len(set(bought_list) & set(recommended_list[:k])) > 0) * 1

In [386]:
# check
user = 0
val = hit_rate_at_k(recommends[user], boughts[user], 5)
print(f'Hit rate@k value (k=5): {val}')

Hit rate@k value (k=5): 1


In [387]:
# также можно в hit_rate() как recommended_list передавать нужный slice:
user = 0
val = hit_rate(recommends[user][:5], boughts[user])
print(f'Hit rate@k value (k=5): {val}')

Hit rate@k value (k=5): 1


### money precision at k

In [388]:
def money_precision_at_k(recommended_list, bought_list, prices_recommended, k=5):
    """ Доля дохода по рекомендованным объектам
    :param recommended_list - список id рекомендаций
    :param bought_list - список id покупок
    :param prices_recommended - список цен для рекомендаций
    """
    flags = np.isin(recommended_list[:k], bought_list)
    prices = np.array(prices_recommended[:k])
    return flags @ prices / prices.sum()

In [389]:
# check
user = 10
money_precision_at_k(recommends[user], boughts[user], rec_prices[user], k=5)

0.6854211885678256

In [390]:
recommends[user], boughts[user], rec_prices[user]

(array([53, 58, 67, 68, 51, 62, 63, 61, 57, 59]),
 array([58, 66, 63, 50, 57, 51, 61, 55, 56, 67, 59, 62, 52, 64, 65, 54, 60]),
 array([ 264.73, 2371.62, 1372.87, 2070.14, 1342.85, 1717.57,  829.73,
        1370.59,  496.31, 1623.74]))

### recall at k

In [391]:
def recall_at_k(recommended_list, bought_list, k=5):
    """ Recall on top k items """
    flags = np.isin(bought_list, recommended_list[:k])
    return flags.sum() / len(bought_list)

In [392]:
# check
user = 10
recall_at_k(recommends[user], boughts[user], k=5)

0.17647058823529413

### money recall at k

In [393]:
def money_recall_at_k(recommended_list, bought_list, prices_recommended, prices_bought, k=5):
    """ Доля дохода по релевантным рекомендованным объектам
    :param recommended_list - список id рекомендаций
    :param bought_list - список id покупок
    :param prices_recommended - список цен для рекомендаций
    :param prices_bought - список цен покупок
    """
    flags = np.isin(recommended_list[:k], bought_list)      # get recommend to bought matches
    prices = np.array(prices_recommended[:k])               # get prices of recommended items
    return flags @ prices / np.sum(prices_bought)

In [394]:
# check
user = 10
money_recall_at_k(recommends[user], boughts[user], rec_prices[user], b_prices[user], k=5)

0.23424252219100425

### map at k

In [395]:
def precision_at_k(recommended_list, bought_list, k=5):
    flags = np.isin(bought_list, recommended_list[:k])
    return flags.sum() / k

def ap_k(recommended_list, bought_list, k=5):
    # переработано
    flags = np.isin(recommended_list, bought_list)
    if sum(flags) == 0:
        return 0

    sum_ = 0
    for i in range(0, k-1):
        if flags[i]:
            sum_ += precision_at_k(recommended_list, bought_list, k=i+1)
    result = sum_ / sum(flags)
    return result

    # func = partial(precision_at_k, recommended_list, bought_list)
    # rel_items = np.arange(1, k + 1)[flags[:k]]                  # получаем номера релевантных объектов
    # return np.sum(list(map(func, rel_items))) / flags.sum()     # считаем avg precision@k для этих объектов

In [396]:
# v1
def map_k_v1(recommended_list, bought_list, k=5, u=1):
    """ Среднее AP@k по u пользователям """
    apk = []
    for user in range(u):
        apk.append(ap_k(recommended_list[user], bought_list[user]))
    
    return np.mean(apk)

In [397]:
# v2
def map_k_v2(recommended_list, bought_list, k=5, u=1):
    """ Среднее AP@k по u пользователям """
    func = partial(ap_k, k=k)
    apk = list(map(func, recommended_list[:u], bought_list[:u]))
    return np.mean(apk)

In [398]:
%%time
# check
map_k_v1(recommends, boughts, u=50)

CPU times: user 20.4 ms, sys: 4.6 ms, total: 25 ms
Wall time: 33.2 ms


0.29224801587301585

In [399]:
%%time
# check
map_k_v2(recommends, boughts, u=50)

CPU times: user 25.4 ms, sys: 0 ns, total: 25.4 ms
Wall time: 32 ms


0.29224801587301585

### mean reciprocal rank

Mean Reciprocal Rank

- Считаем для первых k рекоммендаций
- Найти ранк первого релевантного предсказания $k_u$
- Посчитать reciprocal rank = $\frac{1}{k_u}$

$$MRR = mean(\frac{1}{k_u})$$

In [400]:
def reciprocal_rank(recommended_list, bought_list, n=1, k=5):    
    """ обратный ранг n релевантных рекомендаций среди первых k рекомендаций
    (при n=1 оно как раз должно работать как сказано в задании)
    :param recommended_list - список рекомендаций
    :param bought_list - список покупок
    :param n - учитывать первые n релевантных рекомендаций
    :param k - искать релевантные среди первых k рекомендаций
    """
    flags = np.isin(recommended_list[:k], bought_list)
    ranks = np.arange(1, k + 1)[flags][:n]      # ранги первых n рекомендаций из первых k. равен 0 если рекомендация нерелевантна
    ideal_ranks = np.arange(1, n + 1)
    return (1 / ranks).sum() / (1 / ideal_ranks).sum() if flags.any() else 0

In [401]:
# check
user = 10
reciprocal_rank(recommends[user], boughts[user], n=5)

0.4525547445255474

In [402]:
recommends[user], boughts[user]

(array([53, 58, 67, 68, 51, 62, 63, 61, 57, 59]),
 array([58, 66, 63, 50, 57, 51, 61, 55, 56, 67, 59, 62, 52, 64, 65, 54, 60]))

In [403]:
def mean_reciprocal_rank(recommended_list, bought_list, k=5):
    """ Среднеобратный ранг """
    ranks = []
    for data in zip(recommended_list, bought_list):
        ranks.append(reciprocal_rank(*data, k))
    return np.mean(ranks)

In [404]:
# check
mean_reciprocal_rank(recommends, boughts, k=5)

0.549051094890511

### NDCG@k

Normalized discounted cumulative gain

$$DCG = \frac{1}{|r|} \sum_u{\frac{[bought fact]}{discount(i)}}$$  

$discount(i) = 1$ if $i <= 2$,   
$discount(i) = log_2(i)$ if $i > 2$


(!) Считаем для первых k рекоммендаций   
(!) - существуют вариации с другими $discount(i)$  
i - ранк рекомендованного товара  
|r| - кол-во рекомендованных товаров 

$$NDCG = \frac{DCG}{ideal DCG}$$

In [551]:
N = 5

rnd = np.random.default_rng(7)
# ys_true = torch.randint(0, 5, (N, ))
# ys_pred = torch.rand(N)
ys_true = rnd.integers(5, size=N)
ys_pred = rnd.random(size=N)


In [552]:
ys_true, ys_pred

(array([4, 3, 3, 4, 2]),
 array([0.22520719, 0.30016628, 0.87355345, 0.0052653 , 0.82122842]))

In [514]:
# def compute_gain(y_value: float, gain_scheme: str) -> float:
#     """ Cumulative gain """
#     if gain_scheme == "exp2":
#         gain = 2 ** y_value - 1
#     elif gain_scheme == "const":
#         gain = y_value
#     else:
#         raise ValueError(f"{gain_scheme} method not supported, only exp2 and const.")
#     return float(gain)

In [524]:
def cumulative_gain(y_true, y_pred, gain_scheme: str = 'const', k=3) -> float:
    """ Cumulative gain at k """
    argsort = np.argsort(y_pred)[:-k - 1:-1]        # @k descending sort indexes    
    if gain_scheme == "exp2":
        gain = 2 ** (y_true[argsort] * y_pred[argsort]) - 1
    elif gain_scheme == "const":
        gain = y_true[argsort] * y_pred[argsort]
    else:
        raise ValueError(f"{gain_scheme} method not supported, only exp2 and const.")
    return gain

In [553]:
cumulative_gain(ys_true, ys_pred, 'exp2', k=3)

array([5.15031514, 2.12197035, 0.86671134])

In [537]:
def dcg(ys_true, ys_pred, gain_scheme: str = 'const', k=3) -> float:
    """ Discounted cumulative gain at K """
    cg_pred = cumulative_gain(ys_true, ys_pred, gain_scheme, k)

    ret = 0
    for idx, cg in enumerate(cg_pred, 1):
        ret += cg / np.log2(idx + 1)
        
    return ret

In [559]:
# def dcg(ys_true, ys_pred, gain_scheme: str = 'const', k=3) -> float:
#     """ Discounted cumulative gain at K """
#     argsort = np.argsort(ys_pred)[:-k - 1:-1]   # sort @k
#     ys_true_sorted = ys_true[argsort]
#     ret = 0
#     for st, sp in zip(ys_true_sorted, ys_pred[argsort]):
#         # st - sorted true, sp - sorted pred
#         gain = compute_gain(st * sp, gain_scheme)
#         ret += gain / np.log2(k + 1)
        
#     return ret

In [512]:
# def dcg(ys_true, ys_pred, gain_scheme: str = 'const', k=3) -> float:
#     """ Discounted cumulative gain at K """
#     argsort = np.argsort(ys_pred)[:-k - 1:-1]   # sort @k
#     ys_true_sorted = ys_true[argsort]
#     ret = 0
#     for idx, (st, sp) in enumerate(zip(ys_true_sorted, ys_pred[argsort]), 1):
#         # st - sorted true, sp - sorted pred
#         gain = compute_gain(st * sp, gain_scheme)
#         ret += gain / (np.log2(idx + 1))
        
#     return ret

In [560]:
dcg(ys_true, ys_pred, 'exp2', k=3)

IndexError: invalid index to scalar variable.

In [557]:
dcg(ys_true, ys_pred, 'const', k=3)

4.107184650827857

In [558]:
k = 3
(1 / np.log2(np.arange(1, k + 1) + 1)).sum()

2.1309297535714578

In [528]:
# ideal dcg ?
dcg(np.array([1, 1, 1]), np.array([1, 1, 1]), k=3)

2.1309297535714578

In [None]:
def ndcg(ys_true, ys_pred, gain_scheme: str = 'const', k=3) -> float:
    """ Normalized Discounted Cumulative Gain at K """
    pred_dcg = dcg(ys_true, ys_pred, gain_scheme)
    ideal_dcg = 1 / np.log2(range(k) + 1)

    return pred_dcg / ideal_dcg

In [414]:
# def ndcg(ys_true: torch.Tensor, ys_pred: torch.Tensor, gain_scheme: str = 'const') -> float:
#     pred_dcg = dcg(ys_true, ys_pred, gain_scheme)
#     # your code ideal_dcg = 
    
#     ndcg = pred_dcg / ideal_dcg
#     return ndcg