In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
# from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [2]:
# Load the transaction log; the relative path is resolved against the current working directory.
data = pd.read_csv('retail_train.csv')
# Show the first two rows as a quick look at the columns.
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [None]:
# Number of trailing weeks held out for evaluation.
test_size_weeks = 3

# Last week number that still belongs to the training period.
# Splitting with `<=` / `>` around this boundary makes the test set cover the
# week numbers in (max - test_size_weeks, max], i.e. `test_size_weeks` of them.
# The former `<` / `>=` pair also moved week `max - test_size_weeks` into the
# test set, holding out one week more than requested.
last_train_week = data['week_no'].max() - test_size_weeks

data_train = data[data['week_no'] <= last_train_week]
data_test = data[data['week_no'] > last_train_week]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекомендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямо пропорционально популярности. Вес = log(sales_sum товара)

In [49]:
# One row per user, holding the array of distinct item ids that user bought.
# NOTE(review): this is built from the full `data`, not from `data_test` —
# confirm that is intended.
result = (
    data
    .groupby('user_id')['item_id']
    .unique()
    .rename('actual')
    .reset_index()
)
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[825123, 831447, 840361, 845307, 852014, 85498..."
1,2,"[854852, 930118, 1077555, 1098066, 5567388, 55..."


In [57]:
def weighted_random_recommendation(item_weight, n=5, random_state=None):
    """Sample ``n`` distinct items with probability given by their normalised weight.

    Parameters
    ----------
    item_weight : pandas.DataFrame
        Must contain an ``item_id`` column and a ``norm_weight`` column. The
        latter is passed to the sampler as the probability vector, so it has
        to be non-negative and sum to 1.
    n : int, default 5
        Number of items to draw; sampling is done without replacement.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        draw uses NumPy's global random state, exactly as before.

    Returns
    -------
    numpy.ndarray
        Array with the ``n`` sampled item ids.
    """
    # Pass the columns as arrays instead of building Python lists of the whole
    # catalogue on every call (this function is called once per user).
    item_ids = item_weight['item_id'].to_numpy()
    probabilities = item_weight['norm_weight'].to_numpy()

    # The module-level functions and RandomState share the same `choice` signature.
    sampler = np.random if random_state is None else np.random.RandomState(random_state)

    recs = sampler.choice(item_ids, size=n, replace=False, p=probabilities)

    return recs

In [56]:
# Per-item sampling weights: log10 of the total quantity sold, normalised to sum to 1.
# Zero totals are replaced by 1 first so that log10 stays finite; note that a
# total of 1 therefore yields weight 0.
top_sales = (
    data
    .groupby('item_id')['quantity']
    .sum()
    .reset_index()
    .assign(
        quantity=lambda sales: sales['quantity'].replace(0, 1),
        weight=lambda sales: np.log10(sales['quantity']),
        norm_weight=lambda sales: sales['weight'] / sales['weight'].sum(),
    )
)
top_sales.head()

Unnamed: 0,item_id,quantity,weight,norm_weight
0,25671,6,0.778151,1.2e-05
1,26081,1,0.0,0.0
2,26093,1,0.0,0.0
3,26190,1,0.0,0.0
4,26355,2,0.30103,5e-06


In [58]:
%%time

# Draw a separate 5-item weighted sample for every row of `result`.
# The lambda ignores its argument (the user id): every user is sampled from
# the same `top_sales` weights.
result['weighted_random'] = result['user_id'].apply(lambda x: weighted_random_recommendation(top_sales, n=5))

Wall time: 35.1 s


Unnamed: 0,user_id,actual,weighted_random
0,1,"[825123, 831447, 840361, 845307, 852014, 85498...","[8068373, 1076744, 1134606, 979022, 1134214]"
1,2,"[854852, 930118, 1077555, 1098066, 5567388, 55...","[872146, 998112, 881216, 1102295, 8090956]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [2]:
# Load the stored predictions table (semicolon-separated CSV, path relative to the working directory).
old_result = pd.read_csv('predictions_basic.csv', sep=';')
# Show the first two rows.
old_result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted_random,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[820603, 1102628, 9827351, 10356075, 957233]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6773108, 887633, 898448, 856432, 1030868]","[6666, 1082185, 981760, 1127831, 995242]","[1082185, 6666, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 6666, 1098066]","[1082185, 995242, 1029743, 840361, 904360]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6424447, 13115942, 15627461, 1096740, 6772768]","[6534178, 6533889, 1029743, 6534166, 1082185]","[13095241, 5571690, 1106286, 895332, 1036921]","[6666, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 6666, 826249]","[1082185, 981760, 1098066, 826249, 6666]","[1082185, 1098066, 6534178, 826249, 1127831]"


In [3]:
from ast import literal_eval

# Every column after the first is parsed with `literal_eval`, turning the
# string form stored in the CSV back into Python objects.
# Only string cells are parsed: values that are already Python objects are
# left untouched, so the cell can be re-run without raising.
columns = old_result.columns
for column in columns[1:]:
    old_result[column] = old_result[column].apply(
        lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
    )

In [57]:
def precision_at_k(recommended_list, bought_list, k=5):
    """Share of the first ``k`` recommendations that the user actually bought.

    Parameters
    ----------
    recommended_list : sequence
        Recommended item ids, best first; only the first ``k`` are used.
    bought_list : sequence
        Item ids the user bought.
    k : int, default 5
        Cut-off position.

    Returns
    -------
    float
        Number of relevant recommendations divided by the number of
        recommendations considered; ``0.0`` when there are none.
    """
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]

    # Nothing recommended: define precision as 0 instead of dividing by zero.
    if len(recommended_list) == 0:
        return 0.0

    # Flag the *recommendations* that were bought. Flagging bought items instead
    # would count every duplicate in `bought_list` and could push the value above 1.
    flags = np.isin(recommended_list, bought_list)
    return flags.sum() / len(recommended_list)

def recall_at_k(recommended_list, bought_list, k=5):
    """Share of the bought items that appear among the first ``k`` recommendations.

    Parameters
    ----------
    recommended_list : sequence
        Recommended item ids, best first; only the first ``k`` are used.
    bought_list : sequence
        Item ids the user bought.
    k : int, default 5
        Cut-off position.

    Returns
    -------
    float
        Number of bought items found in the top ``k`` divided by the number
        of bought items; ``0.0`` when ``bought_list`` is empty.
    """
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]

    # No purchases: define recall as 0 instead of dividing by zero.
    if len(bought_list) == 0:
        return 0.0

    flags = np.isin(bought_list, recommended_list)
    return flags.sum() / len(bought_list)

def reciprocal_rank(recommended_list, bought_list, k=1):
    """Mean reciprocal rank of the first relevant recommendation over all users.

    Parameters
    ----------
    recommended_list : sequence of sequences
        One list of recommended item ids per user, best first.
    bought_list : sequence of sequences
        One list of bought item ids per user, in the same user order.
    k : int, default 1
        Only the first ``k`` recommendations of each user are inspected.

    Returns
    -------
    float or int
        Sum of ``1 / rank`` of the first hit of every user that has one,
        divided by the total number of users; ``0`` when no user has a hit.
    """
    # Keep the per-user lists as plain sequences. Wrapping ragged nested lists
    # in np.array raises on recent NumPy versions.
    recommended_list = list(recommended_list)
    bought_list = list(bought_list)

    amount_user = len(bought_list)
    rr = []
    for i in range(amount_user):
        relevant_indexes = np.nonzero(np.isin(recommended_list[i][:k], bought_list[i]))[0]
        if len(relevant_indexes) != 0:
            # Positions are 0-based, ranks are 1-based.
            rr.append(1 / (relevant_indexes[0] + 1))

    # Users without a hit add nothing to the sum but still count in the denominator.
    if len(rr) == 0:
        return 0

    return sum(rr) / amount_user

def ndcg_at_k(recommended_list, bought_list, k=5):
    """Normalised discounted cumulative gain of the first ``k`` recommendations.

    The discount for 0-based position ``p`` is ``p + 1`` for the first two
    positions and ``log2(p + 1)`` afterwards, i.e. 1, 2, log2(3), log2(4), ...
    The ideal DCG used for normalisation treats every inspected position as
    relevant.

    Parameters
    ----------
    recommended_list : sequence
        Recommended item ids, best first; only the first ``k`` are used.
    bought_list : sequence
        Item ids the user bought.
    k : int, default 5
        Cut-off position.

    Returns
    -------
    float or int
        DCG divided by the ideal DCG; ``0`` when nothing is recommended.
    """
    top_k = np.array(recommended_list)[:k]
    purchased = np.array(bought_list)
    n_recs = len(top_k)

    if n_recs == 0:
        return 0

    hits = np.isin(top_k, purchased)
    discounts = [pos + 1 if pos < 2 else math.log2(pos + 1) for pos in range(n_recs)]

    dcg = sum(np.divide(hits, discounts)) / n_recs
    ideal_dcg = sum(np.divide(1, discounts)) / n_recs
    return dcg / ideal_dcg

def ap_k(recommended_list, bought_list, k=5):
    """Average precision over the relevant positions among the first ``k`` recommendations.

    For every position in the top ``k`` that holds a bought item, precision is
    computed up to and including that position; the result is the mean of
    those values.

    Parameters
    ----------
    recommended_list : sequence
        Recommended item ids, best first; only the first ``k`` are used.
    bought_list : sequence
        Item ids the user bought.
    k : int, default 5
        Cut-off position.

    Returns
    -------
    float or int
        Mean of the per-hit precisions; ``0`` when the top ``k`` holds no
        bought item.
    """
    purchased = np.array(bought_list)
    top_k = np.array(recommended_list)[:k]

    # 0-based positions of the recommendations that were bought.
    hit_positions = np.isin(top_k, purchased).nonzero()[0]
    n_hits = len(hit_positions)
    if n_hits == 0:
        return 0

    precision_total = sum(
        precision_at_k(top_k, purchased, k=position + 1) for position in hit_positions
    )
    return precision_total / n_hits

def map_k(recommended_list, bought_list, k=5):
    """Mean of ``ap_k`` over all users.

    Parameters
    ----------
    recommended_list : indexable of sequences
        Per-user recommendation lists, indexed by 0..n_users-1.
    bought_list : indexable of sequences
        Per-user bought-item lists, indexed the same way.
    k : int, default 5
        Cut-off position forwarded to ``ap_k``.

    Returns
    -------
    float
        Sum of the per-user average precisions divided by the number of users.
    """
    amount_user = len(bought_list)

    ap_total = 0
    for user_pos in np.arange(amount_user):
        ap_total += ap_k(recommended_list[user_pos], bought_list[user_pos], k)

    return ap_total / amount_user

In [5]:
# Mean Precision@5 per prediction column, compared against the `actual` column.
print('Precision at 5:')
for name_col in old_result.columns[1:]:
    per_user_precision = old_result.apply(
        lambda row: precision_at_k(row[name_col], row['actual']), axis=1
    )
    mean_precision = round(per_user_precision.mean(), 4)
    print(f"{mean_precision:<8}:{name_col}")

Precision at 5:
1.0     :actual
0.0003  :random_recommendation
0.1552  :popular_recommendation
0.0024  :weighted_random
0.1368  :itemitem
0.1329  :cosine
0.139   :tfidf
0.2199  :own_purchases


  


In [6]:
# Mean Recall@5 per prediction column, compared against the `actual` column.
print('Recall at 5:')
for name_col in old_result.columns[1:]:
    per_user_recall = old_result.apply(
        lambda row: recall_at_k(row[name_col], row['actual']), axis=1
    )
    mean_recall = round(per_user_recall.mean(), 4)
    print(f"{mean_recall:<8}:{name_col}")

Recall at 5:
0.271   :actual
0.0     :random_recommendation
0.025   :popular_recommendation
0.0002  :weighted_random
0.0157  :itemitem
0.0148  :cosine
0.0154  :tfidf
0.0289  :own_purchases


In [7]:
# Mean reciprocal rank within the top 5, per prediction column.
print('MRR at 5:')
for name_col in old_result.columns[1:]:
    mrr = reciprocal_rank(old_result[name_col], old_result['actual'], 5)
    print(f"{mrr:<8.4f}:{name_col}")

MRR at 5:
1.0000  :actual
0.0008  :random_recommendation
0.2860  :popular_recommendation
0.0057  :weighted_random
0.2136  :itemitem
0.3955  :cosine
0.4013  :tfidf
0.4519  :own_purchases


In [56]:
# Mean NDCG@5 per prediction column, compared against the `actual` column.
print('NDCG at 5:')
for name_col in old_result.columns[1:]:
    per_user_ndcg = old_result.apply(
        lambda row: ndcg_at_k(row[name_col], row['actual']), axis=1
    )
    mean_ndcg = per_user_ndcg.mean()
    print(f"{mean_ndcg:<8.4f}:{name_col}")

NDCG at 5:
1.0000  :actual
0.0003  :random_recommendation
0.1596  :popular_recommendation
0.0026  :weighted_random
0.1161  :itemitem
0.1647  :cosine
0.1707  :tfidf
0.2431  :own_purchases


In [58]:
# Mean average precision within the top 5, per prediction column.
print('MAP at 5:')
for name_col in old_result.columns[1:]:
    mean_ap = map_k(old_result[name_col], old_result['actual'], 5)
    print(f"{mean_ap:<8.4f}:{name_col}")

MAP at 5:
1.0000  :actual
0.0008  :random_recommendation
0.2568  :popular_recommendation
0.0056  :weighted_random
0.2210  :itemitem
0.3743  :cosine
0.3816  :tfidf
0.4181  :own_purchases
