# Домашнее задание №3 по теме "Коллаборативная фильтрация".

В видеоуроке мы рассмотрели базовое применение ALS, а также ALS со взвешиванием матрицы взаимодействий по схемам TF-IDF и BM25.

В качестве домашнего задания предлагается самому еще раз протестировать данные методы и попробовать улучшить результат, полученный на уроке.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

import sys

In [2]:
# Functions from webinar 1
def precision_at_k(recommended_list, bought_list, k=5):
    """Return precision@k: the share of the top-k recommendations that were bought.

    Args:
        recommended_list: Recommended item ids, ordered best first.
        bought_list: Item ids the user actually bought.
        k: Number of leading recommendations to evaluate.

    Returns:
        The number of hits among the first ``k`` recommendations divided by
        the number of recommendations evaluated (fewer than ``k`` when the
        list is shorter). ``0.0`` when there is nothing to evaluate.
    """
    bought_list = np.asarray(bought_list)
    recommended_list = np.asarray(recommended_list)[:k]

    # Nothing recommended: avoid a 0/0 division.
    if recommended_list.size == 0:
        return 0.0

    # Test the recommendations against the purchases (not the other way
    # round) so that repeated ids in bought_list cannot push the score above 1.
    flags = np.isin(recommended_list, bought_list)

    return flags.sum() / len(recommended_list)

def ap_at_k(recommended_list, bought_list, k=5):
    """Return average precision at k for one user's recommendations.

    For every position ``i`` (1-based) within the first ``k`` recommendations
    that holds a bought item, precision@i is accumulated; the total is then
    divided by ``k``.

    Args:
        recommended_list: Recommended item ids, ordered best first.
        bought_list: Item ids the user actually bought.
        k: Cut-off rank. Lists shorter than ``k`` are scored on the positions
            they have; the normaliser stays ``k``.

    Returns:
        The accumulated precision divided by ``k``; ``0.0`` when ``k`` is not
        positive or none of the first ``k`` recommendations was bought.
    """
    if k <= 0:
        return 0.0

    bought_list = np.asarray(bought_list)
    # Only the first k recommendations can contribute, so truncate before
    # flagging; this also keeps the loop inside the bounds of short lists.
    top_k = np.asarray(recommended_list)[:k]

    flags = np.isin(top_k, bought_list)
    if not flags.any():
        return 0.0

    hits = 0
    sum_ = 0.0
    for rank, is_hit in enumerate(flags, start=1):
        if is_hit:
            hits += 1
            # hits / rank is precision@rank: hits so far over positions seen.
            sum_ += hits / rank

    return sum_ / k

# Functions from webinar 3
def get_recommendations(user, model, sparse_user_item, N=5):
    """Return the original item ids recommended by ``model`` for ``user``.

    Uses the module-level lookup dicts ``userid_to_id``, ``itemid_to_id`` and
    ``id_to_itemid`` to translate between original ids and matrix positions.

    Args:
        user: Original user id (a key of ``userid_to_id``).
        model: Fitted model exposing ``recommend``.
        sparse_user_item: Sparse matrix indexed by matrix user position; the
            user's row is passed to the model as ``user_items``.
        N: Number of recommendations requested.

    Returns:
        List of original item ids in the order returned by the model.
    """
    user_row = userid_to_id[user]

    # 999999 is the placeholder id that replaces items outside the top 5000;
    # it is passed to the model as an item to filter out.
    recommended = model.recommend(
        userid=user_row,
        user_items=sparse_user_item[user_row],
        N=N,
        filter_already_liked_items=False,
        filter_items=[itemid_to_id[999999]],
        recalculate_user=True,
    )

    # Only the first element of the returned value is consumed here;
    # presumably it is the array of item positions - confirm against the
    # installed implicit version.
    res = []
    for rec in recommended[0]:
        res.append(id_to_itemid[rec])
    return res

**Загрузка данных:**

In [3]:
# Load the transaction log.
data = pd.read_csv('retail_train.csv')

# Number of weeks subtracted from the last week number to get the split point.
test_size_weeks = 3

# First week number that belongs to the test period; computed once and
# reused for both masks.
test_start_week = data['week_no'].max() - test_size_weeks

# data_train is modified in place later in the notebook (rare item ids are
# overwritten), so take an explicit copy: assigning into a filtered slice of
# `data` triggers pandas' SettingWithCopyWarning.
data_train = data[data['week_no'] < test_start_week].copy()
data_test = data[data['week_no'] >= test_start_week]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [4]:
# One row per test-period user, with the array of distinct item ids that
# user bought in the test period stored in the 'actual' column.
result = (
    data_test
    .groupby('user_id')['item_id']
    .unique()
    .reset_index(name='actual')
)
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


**Обработка Топ-5000 товаров:**

In [5]:
# Total quantity sold per item over the training period.
popularity = (
    data_train
    .groupby('item_id')['quantity']
    .sum()
    .reset_index()
    .rename(columns={'quantity': 'n_sold'})
)

# Ids of the 5000 items with the largest n_sold.
top_5000 = (
    popularity
    .sort_values(by='n_sold', ascending=False)['item_id']
    .iloc[:5000]
    .tolist()
)

# Every item outside the top 5000 is collapsed into the placeholder id 999999.
is_rare_item = ~data_train['item_id'].isin(top_5000)
data_train.loc[is_rare_item, 'item_id'] = 999999

# Users in rows, items in columns; each cell counts the user's purchase rows
# for that item. The float dtype is the matrix type implicit requires.
user_item_matrix = data_train.pivot_table(
    index='user_id',
    columns='item_id',
    values='quantity',
    aggfunc='count',
    fill_value=0,
).astype(float)

# Convert to sparse (CSR) format.
sparse_user_item = csr_matrix(user_item_matrix)

In [6]:
# Original ids, in the order in which they label the matrix rows / columns.
userids, itemids = user_item_matrix.index.values, user_item_matrix.columns.values

# Positional indices 0..n-1 for the rows (users) and columns (items).
matrix_userids, matrix_itemids = np.arange(userids.size), np.arange(itemids.size)

# Original id -> matrix position.
userid_to_id = dict(zip(userids, matrix_userids))
itemid_to_id = dict(zip(itemids, matrix_itemids))

# Matrix position -> original id.
id_to_userid = dict(zip(matrix_userids, userids))
id_to_itemid = dict(zip(matrix_itemids, itemids))

In [7]:
# Scores collected by the experiments below:
# algorithm name -> [mean precision@5, mean AP@5].
evaluate_dict = {}

### 1. Базовое применение Alternating Least Squares

In [8]:
# Hyperparameter grid for plain ALS.
factors_list = [50, 100, 150, 200]
regularization_list = [0.001, 0.005]
iterations_list = [10, 15, 20]

# Number of (factors, regularization, iterations) combinations in the grid;
# shown as the total in the progress read-out.
n_vars = len([
    (f, r, it)
    for f in factors_list
    for r in regularization_list
    for it in iterations_list
])

In [9]:
# Grid search for plain ALS: fit one model per (factors, regularization,
# iterations) combination on the raw interaction counts and score its top-5
# recommendations against the test-period purchases in result['actual'].
i = 1
for f in factors_list:
    for r in regularization_list:
        for it in iterations_list:
            alg_name = f"als(f={f},r={r},it={it})"

            model = AlternatingLeastSquares(
                factors=f,
                regularization=r,
                iterations=it,
                calculate_training_loss=True,
                num_threads=4,
                random_state=42,
            )
            model.fit(sparse_user_item, show_progress=False)

            # Top-5 recommendations for every test user, kept as a new column.
            result[alg_name] = result['user_id'].apply(
                lambda x: get_recommendations(x, model, sparse_user_item, N=5)
            )

            precision_5 = result.apply(
                lambda row: precision_at_k(row[alg_name], row['actual'], k=5),
                axis=1,
            ).mean()
            map_5 = result.apply(
                lambda row: ap_at_k(row[alg_name], row['actual'], k=5),
                axis=1,
            ).mean()

            # Order matters: precision first, then MAP.
            evaluate_dict[alg_name] = [precision_5, map_5]

            # "\r" rewrites the same console line on every step.
            print(f"\rProgress: {i}/{n_vars}", end="", flush=True)
            i += 1

Progress: 24/24

### 2. Alternating Least Squares и TF-IDF взвешивание

In [10]:
# Hyperparameter grid for ALS on the TF-IDF weighted matrix.
factors_list = [100, 150, 200]
regularization_list = [0.01, 0.05]
iterations_list = [5, 10, 15, 20]

# Number of (factors, regularization, iterations) combinations in the grid;
# shown as the total in the progress read-out.
n_vars = len([
    (f, r, it)
    for f in factors_list
    for r in regularization_list
    for it in iterations_list
])

In [11]:
# Re-weight the user-item matrix with implicit's tfidf_weight and convert the
# result to CSR.
# NOTE(review): the weighting is applied to the user-item matrix directly,
# whereas the BM25 cell weights the transposed matrix and transposes back -
# confirm that this difference in orientation is intended.
tfidf_user_item_matrix = tfidf_weight(user_item_matrix).tocsr() 

In [12]:
# Grid search for ALS trained on the TF-IDF weighted matrix: fit one model
# per (factors, regularization, iterations) combination and score its top-5
# recommendations against the test-period purchases in result['actual'].
i = 1
for f in factors_list:
    for r in regularization_list:
        for it in iterations_list:
            alg_name = f"als_tfidf(f={f},r={r},it={it})"

            model = AlternatingLeastSquares(
                factors=f,
                regularization=r,
                iterations=it,
                calculate_training_loss=True,
                num_threads=4,
                random_state=42,
            )
            model.fit(tfidf_user_item_matrix, show_progress=False)

            # NOTE(review): the model is fitted on the weighted matrix but the
            # unweighted sparse_user_item is handed to get_recommendations -
            # confirm this is intended.
            result[alg_name] = result['user_id'].apply(
                lambda x: get_recommendations(x, model, sparse_user_item, N=5)
            )

            precision_5 = result.apply(
                lambda row: precision_at_k(row[alg_name], row['actual'], k=5),
                axis=1,
            ).mean()
            map_5 = result.apply(
                lambda row: ap_at_k(row[alg_name], row['actual'], k=5),
                axis=1,
            ).mean()

            # Order matters: precision first, then MAP.
            evaluate_dict[alg_name] = [precision_5, map_5]

            # "\r" rewrites the same console line on every step.
            print(f"\rProgress: {i}/{n_vars}", end="", flush=True)
            i += 1

Progress: 24/24

### 3. Alternating Least Squares и BM25 взвешивание

In [13]:
# Hyperparameter grid for ALS on the BM25 weighted matrix.
factors_list = [100, 150, 200, 400]
regularization_list = [0.001, 0.005]
iterations_list = [5, 10, 20]

# Number of (factors, regularization, iterations) combinations in the grid;
# shown as the total in the progress read-out.
n_vars = len([
    (f, r, it)
    for f in factors_list
    for r in regularization_list
    for it in iterations_list
])

In [14]:
# BM25 weighting is applied to the transposed (item x user) matrix; the
# result is transposed back to user x item and converted to CSR.
bm25_user_item_matrix = bm25_weight(user_item_matrix.T).T.tocsr()

In [15]:
# Grid search for ALS trained on the BM25 weighted matrix: fit one model per
# (factors, regularization, iterations) combination and score its top-5
# recommendations against the test-period purchases in result['actual'].
i = 1
for f in factors_list:
    for r in regularization_list:
        for it in iterations_list:
            alg_name = f"als_bm25(f={f},r={r},it={it})"

            model = AlternatingLeastSquares(
                factors=f,
                regularization=r,
                iterations=it,
                calculate_training_loss=True,
                num_threads=4,
                random_state=42,
            )
            # bm25_user_item_matrix is already CSR (it is built with
            # .tocsr()), so it is passed as is instead of being re-wrapped
            # in csr_matrix(...).tocsr() on every grid step.
            model.fit(bm25_user_item_matrix, show_progress=False)

            # NOTE(review): the model is fitted on the weighted matrix but the
            # unweighted sparse_user_item is handed to get_recommendations -
            # confirm this is intended.
            result[alg_name] = result['user_id'].apply(
                lambda x: get_recommendations(x, model, sparse_user_item, N=5)
            )

            precision_5 = result.apply(
                lambda row: precision_at_k(row[alg_name], row['actual'], k=5),
                axis=1,
            ).mean()
            map_5 = result.apply(
                lambda row: ap_at_k(row[alg_name], row['actual'], k=5),
                axis=1,
            ).mean()

            # Order matters: precision first, then MAP.
            evaluate_dict[alg_name] = [precision_5, map_5]

            # "\r" rewrites the same console line on every step.
            print(f"\rProgress: {i}/{n_vars}", end="", flush=True)
            i += 1

Progress: 24/24

### 4. Оценка 

In [16]:
# Rows are the two metrics, columns are the evaluated configurations.
evaluate_df = pd.DataFrame(data=evaluate_dict, index=['Precision@5', 'MAP@5'])

# Transposed so that each configuration is a row; show the ten with the
# highest Precision@5.
evaluate_df.transpose().sort_values(by='Precision@5', ascending=False).iloc[:10]

Unnamed: 0,Precision@5,MAP@5
"als_tfidf(f=100,r=0.05,it=5)",0.243095,0.168394
"als_tfidf(f=150,r=0.05,it=5)",0.242507,0.166476
"als_tfidf(f=100,r=0.05,it=20)",0.242409,0.164853
"als_tfidf(f=100,r=0.05,it=15)",0.241626,0.164427
"als_tfidf(f=100,r=0.01,it=10)",0.240059,0.163439
"als_tfidf(f=150,r=0.05,it=10)",0.239961,0.162457
"als_tfidf(f=100,r=0.01,it=5)",0.239471,0.165772
"als_tfidf(f=100,r=0.01,it=15)",0.238883,0.162196
"als_tfidf(f=100,r=0.05,it=10)",0.238883,0.164107
"als_tfidf(f=150,r=0.01,it=5)",0.238002,0.162556


In [17]:
# Same table, ranked by MAP@5: the ten best configurations.
evaluate_df.transpose().sort_values(by='MAP@5', ascending=False).iloc[:10]

Unnamed: 0,Precision@5,MAP@5
"als_tfidf(f=100,r=0.05,it=5)",0.243095,0.168394
"als_tfidf(f=150,r=0.05,it=5)",0.242507,0.166476
"als_tfidf(f=100,r=0.01,it=5)",0.239471,0.165772
"als_tfidf(f=100,r=0.05,it=20)",0.242409,0.164853
"als_tfidf(f=100,r=0.05,it=15)",0.241626,0.164427
"als_tfidf(f=100,r=0.05,it=10)",0.238883,0.164107
"als_tfidf(f=100,r=0.01,it=10)",0.240059,0.163439
"als_bm25(f=150,r=0.001,it=5)",0.236631,0.16256
"als_tfidf(f=150,r=0.01,it=5)",0.238002,0.162556
"als_tfidf(f=150,r=0.05,it=10)",0.239961,0.162457


**Вывод:** Лучшие значения метрик получены при использовании ALS с TF-IDF взвешиванием. Такого результата удалось достичь за счёт уменьшения числа итераций и увеличения коэффициента регуляризации.

---