# Практическое задание

## Урок 2. Бейзлайны и детерминированные алгоритмы item-item

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
# from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [2]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [4]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    # your_code
    items = np.array(items_weights['item_id'])
    weights = np.array(items_weights['weight'])
    recs = np.random.choice(items, size=n, p=weights, replace=False)
    
    return recs.tolist()

In [5]:
%%time

items = data_train.groupby('item_id')['sales_value'].sum().reset_index() # сумма продажи сгруппированных items
sales_value_sum = data_train['sales_value'].sum() # общая сумму продаж
items['weight'] = items['sales_value'] / sales_value_sum # вес item в зависимости от объема продаж
result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items, n=5))

result.head(3)


CPU times: total: 3.02 s
Wall time: 3.02 s


Unnamed: 0,user_id,actual,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[819982, 7443137, 838396, 12263090, 911974]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[894511, 865174, 1124142, 12731714, 944466]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[973861, 12384736, 924006, 12810464, 999625]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5, Precision@3, Recall@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [6]:
result = pd.read_csv('predictions_basic.csv')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [7]:
# преобразование в массивы

# функция для преобразования в массив actual (bought_list)

def bought_list_to_array(bought_list): #actual
    bought_list = bought_list.strip('[]\n').split(' ')
    bought_list = list(bought_list)
    return np.array(bought_list)

# функция для преобразования в массив recommended_list

def recommended_list_to_array(recommended_list): 
    recommended_list = recommended_list.strip('[]\n').split(' ')
    recommended_list = list(recommended_list)
    return np.array(recommended_list)

In [8]:
# функция для расчета Precision@3 и Precision@5 (передать значение k)

def precision_at_k(recommended_list, bought_list, k):
    
    bought_list = bought_list_to_array(bought_list)
    recommended_list = recommended_list_to_array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(recommended_list, bought_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

In [9]:
# функция для расчета Recall@5


def recall_at_k(recommended_list, bought_list, k=5):
    
    bought_list = bought_list_to_array(bought_list)
    recommended_list = recommended_list_to_array(recommended_list)
    
    recommended_list = recommended_list[:k]
        
    flags = np.isin(recommended_list, bought_list) #!!! сначала лист рекомендаций 
    
    recall = flags.sum() / len(bought_list)
   
    return recall

In [10]:
list_algs = ['random_recommendation', 'popular_recommendation', 'itemitem', 'cosine', 'tfidf', 'own_purchases']
list_metric = ['Precision@3', 'Precision@5', 'Recall@5']

list_prec_3 = []
list_prec_5 = []
list_rec_5 = []

for alg in list_algs:
    alg_pak_prec_3 = result.apply(lambda row: precision_at_k(row[alg], row['actual'], 3), axis=1).mean()
    alg_pak_prec_5 = result.apply(lambda row: precision_at_k(row[alg], row['actual'], 5), axis=1).mean()
    alg_pak_rec_5 = result.apply(lambda row: recall_at_k(row[alg], row['actual']), axis=1).mean()
    
    list_prec_3.append(alg_pak_prec_3)
    list_prec_5.append(alg_pak_prec_5)
    list_rec_5.append(alg_pak_rec_5)
    
intrm_dict = {'algorithm':list_algs, 'Precision@3':list_prec_3, 'Precision@5':list_prec_5, 'Recall@5':list_rec_5} 

metric_pivot = pd.DataFrame(intrm_dict).sort_values(by='Precision@5', ascending=False)
metric_pivot 

Unnamed: 0,algorithm,Precision@3,Precision@5,Recall@5
1,popular_recommendation,0.0,0.061508,0.003365
5,own_purchases,0.0,0.028893,0.003967
2,itemitem,0.0,0.006856,0.000659
4,tfidf,0.0,0.005975,0.000274
3,cosine,0.0,0.005779,0.000278
0,random_recommendation,0.0,0.000196,7e-06


Вывод: Лучшее качество по метрике Precision@5 харакатерно для алгоритма popular_recommendation, по метрике Recall@5 - для алгоритма own_purchases	