In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

# import warnings
# warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('./retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [4]:
def weighted_random_recommendation(popular, n=5):
    items = np.array(popular['item_id'])
    probability = np.array(popular['probability'])
    
    recs = np.random.choice(items, size=n, replace=False, p=probability)
    
    return recs.tolist()

In [5]:
%%time
popular = data_train.groupby('item_id')['sales_value'].sum().reset_index()

#как правило эта проблема решается через log(1 + x), в Numpy для этого есть специальная функция np.log1p()
#это также поможет обойти ситуацию с нулевыми продажами

popular['log'] = np.log1p(popular['sales_value'])
sum_ = popular.log.sum()
print(sum_)

#probability -- это условная вероятность сэмплирования товара
popular['probability'] = popular['log']/sum_.sum()

#сумма по всем товарам д.б. равна единице, проверим
popular.probability.sum()

popular

241205.8574273651
CPU times: total: 156 ms
Wall time: 135 ms


Unnamed: 0,item_id,sales_value,log,probability
0,25671,20.94,3.088311,0.000013
1,26081,0.99,0.688135,0.000003
2,26093,1.59,0.951658,0.000004
3,26190,1.54,0.932164,0.000004
4,26355,1.98,1.091923,0.000005
...,...,...,...,...
86860,17381856,0.00,0.000000,0.000000
86861,17382205,7.99,2.196113,0.000009
86862,17383227,4.49,1.702928,0.000007
86863,17827644,2.50,1.252763,0.000005


In [6]:
weighted_random_recommendation(popular,5)

[7027871, 12781829, 1052134, 1120815, 993913]

### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [7]:
result = pd.read_csv('./predictions_basic.csv')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [8]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2042 entries, 0 to 2041
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   user_id                 2042 non-null   int64 
 1   actual                  2042 non-null   object
 2   random_recommendation   2042 non-null   object
 3   popular_recommendation  2042 non-null   object
 4   itemitem                2042 non-null   object
 5   cosine                  2042 non-null   object
 6   tfidf                   2042 non-null   object
 7   own_purchases           2042 non-null   object
dtypes: int64(1), object(7)
memory usage: 127.8+ KB


In [9]:
result['actual'] = ''.join(' '.join(l) for l in result['actual'].str.lstrip().str.split())
res = []
for i in range(len(result['actual'])):
    res.append(list(map(int, result['actual'][i].split(']')[0].split('[')[1].split(' ')[1:])))
result['actual'] =  pd.DataFrame({'actual':res})

res = []
for i in range(len(result['random_recommendation'])):
    res.append(list(map(int, result['random_recommendation'][i].split(']')[0].split('[')[1].split(',')[1:])))
result['random_recommendation'] =  pd.DataFrame({'random_recommendation':res})

res = []
for i in range(len(result['popular_recommendation'])):
    res.append(list(map(int, result['popular_recommendation'][i].split(']')[0].split('[')[1].split(',')[1:])))
result['popular_recommendation'] =  pd.DataFrame({'popular_recommendation':res})

res = []
for i in range(len(result['itemitem'])):
    res.append(list(map(int, result['itemitem'][i].split(']')[0].split('[')[1].split(',')[1:])))
result['itemitem'] =  pd.DataFrame({'itemitem':res})

res = []
for i in range(len(result['cosine'])):
    res.append(list(map(int, result['cosine'][i].split(']')[0].split('[')[1].split(',')[1:])))
result['cosine'] =  pd.DataFrame({'cosine':res})

res = []
for i in range(len(result['tfidf'])):
    res.append(list(map(int, result['tfidf'][i].split(']')[0].split('[')[1].split(',')[1:])))
result['tfidf'] =  pd.DataFrame({'tfidf':res})
    
res = []
for i in range(len(result['own_purchases'])):
    res.append(list(map(int, result['own_purchases'][i].split(']')[0].split('[')[1].split(',')[1:])))
result['own_purchases'] =  pd.DataFrame({'own_purchases':res})
                          
# for name in result.columns:
#     if  name != ('user_id' and 'actual'):
#         res = []
#         for i in range(len(result[name])):
#             res.append(list(map(int, result[name][i].split(']')[0].split('[')[1].split(',')[1:])))
#         result[name] =  pd.DataFrame({name:res})

In [10]:
def precision_at_k(recommended_list, bought_list, k):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
   
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    return precision


def recall_at_k(recommended_list, bought_list, k=5):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    recall = flags.sum() / len(bought_list)
    
    return recall

In [11]:
precision_at_5 = {}
precision_at_3 = {}
recall_at_5 = {}

In [12]:
precision_at_5['random_recommendation'] = result.apply(lambda row: precision_at_k(row['random_recommendation'], 
                                                                                  row['actual'], 5), axis=1).mean()
precision_at_5['popular_recommendation'] = result.apply(lambda row: precision_at_k(row['popular_recommendation'], 
                                                                                   row['actual'], 5), axis=1).mean()
precision_at_5['itemitem'] = result.apply(lambda row: precision_at_k(row['itemitem'], 
                                                                     row['actual'], 5), axis=1).mean()
precision_at_5['cosine'] = result.apply(lambda row: precision_at_k(row['cosine'], 
                                                                   row['actual'], 5), axis=1).mean()
precision_at_5['tfidf'] = result.apply(lambda row: precision_at_k(row['tfidf'], 
                                                                  row['actual'], 5), axis=1).mean()
precision_at_5['own_purchases'] = result.apply(lambda row: precision_at_k(row['own_purchases'], 
                                                                          row['actual'], 5), axis=1).mean()
max(precision_at_5, key=precision_at_5.get)

  precision = flags.sum() / len(recommended_list)


'own_purchases'

In [13]:
precision_at_3['random_recommendation'] = result.apply(lambda row: precision_at_k(row['random_recommendation'], 
                                                                                  row['actual'], 3), axis=1).mean()
precision_at_3['popular_recommendation'] = result.apply(lambda row: precision_at_k(row['popular_recommendation'], 
                                                                                   row['actual'], 3), axis=1).mean()
precision_at_3['itemitem'] = result.apply(lambda row: precision_at_k(row['itemitem'], 
                                                                     row['actual'], 3), axis=1).mean()
precision_at_3['cosine'] = result.apply(lambda row: precision_at_k(row['cosine'], 
                                                                   row['actual'], 3), axis=1).mean()
precision_at_3['tfidf'] = result.apply(lambda row: precision_at_k(row['tfidf'], 
                                                                  row['actual'], 3), axis=1).mean()
precision_at_3['own_purchases'] = result.apply(lambda row: precision_at_k(row['own_purchases'], 
                                                                          row['actual'], 5), axis=1).mean()
max(precision_at_3, key=precision_at_3.get)

  precision = flags.sum() / len(recommended_list)


'own_purchases'

In [14]:
recall_at_5['random_recommendation'] = result.apply(lambda row: recall_at_k(row['random_recommendation'], 
                                                                                  row['actual'], 5), axis=1).mean()
recall_at_5['popular_recommendation'] = result.apply(lambda row: recall_at_k(row['popular_recommendation'], 
                                                                                   row['actual'], 5), axis=1).mean()
recall_at_5['itemitem'] = result.apply(lambda row: recall_at_k(row['itemitem'], 
                                                                     row['actual'], 5), axis=1).mean()
recall_at_5['cosine'] = result.apply(lambda row: recall_at_k(row['cosine'], 
                                                                   row['actual'], 5), axis=1).mean()
recall_at_5['tfidf'] = result.apply(lambda row: recall_at_k(row['tfidf'], 
                                                                  row['actual'], 5), axis=1).mean()
recall_at_5['own_purchases'] = result.apply(lambda row: recall_at_k(row['own_purchases'], 
                                                                          row['actual'], 5), axis=1).mean()
max(recall_at_5, key=recall_at_5.get)

'popular_recommendation'

In [15]:
precision_at_5, precision_at_3, recall_at_5 

({'random_recommendation': 0.0006121449559255632,
  'popular_recommendation': 0.25,
  'itemitem': 0.15291380999020568,
  'cosine': 0.1928256611165524,
  'tfidf': 0.20004897159647406,
  'own_purchases': 0.2503687315634218},
 {'random_recommendation': 0.0004897159647404506,
  'popular_recommendation': 0.0,
  'itemitem': 0.1638916095331367,
  'cosine': 0.19637610186091894,
  'tfidf': 0.20323212536728538,
  'own_purchases': 0.2503687315634218},
 {'random_recommendation': 3.400805310697574e-05,
  'popular_recommendation': 0.013888888888889221,
  'itemitem': 0.0084952116661227,
  'cosine': 0.010712536728697562,
  'tfidf': 0.011113831755359885,
  'own_purchases': 0.01385488083578223})

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [16]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [17]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [18]:
# Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
user_item_matrix.shape

(2499, 5001)

In [20]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [21]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [None]:
%%time

model = ItemItemRecommender(K=1, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)

  0%|          | 0/2499 [00:00<?, ?it/s]

# Почему то здесь умирает ядро :(

In [None]:
[id_to_itemid[rec[0]] for rec in recs]

In [None]:
%%time

result['K1'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[999999], 
                                    recalculate_user=False)])

In [None]:
%%time

model = ItemItemRecommender(K=2, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)

In [None]:
[id_to_itemid[rec[0]] for rec in recs]

In [None]:
%%time

result['K2'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[999999], 
                                    recalculate_user=False)])

In [None]:
result.apply(lambda row: precision_at_k(row['K1'], row['actual'], 5), axis=1).mean()

In [None]:
result.apply(lambda row: precision_at_k(row['K2'], row['actual'], 5), axis=1).mean()