# Домашнее задание №5 по теме "Поиск похожих товаров и пользователей. Гибридные рекомендательные системы".

## Подбор оптимальных гиперпараметров для LightFM

- Постройте модели с помощью библиотеки LightFM, изменяя следующие параметры
  - функция потерь, регуляризация
  - количество компонент
  - отдельно постройте модели, используя только матрицу взаимодействий и матрицу взаимодействий + признаки (набор признаков может быть различным, например как на вебинаре)
  
- Посчитайте метрики (Precision@5, Recall@5) для разных наборов гиперпараметров и выберете лучший набор


In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Матричная факторизация
import lightfm
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

import warnings
warnings.filterwarnings('ignore')



### Подготовка данных

In [2]:
data = pd.read_csv('../../data/retail_train.csv')
item_features = pd.read_csv('../../data/product.csv')
user_features = pd.read_csv('../../data/hh_demographic.csv')

In [3]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# train test split
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


Фильтрация

In [4]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

In [5]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', 
                                  columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float)
sparse_user_item = csr_matrix(user_item_matrix).tocsr()
user_item_matrix.head(2)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
data_test = data_test[data_test['item_id'].isin(data_train['item_id'].unique())]
test_user_item_matrix = pd.pivot_table(data_test, 
                                  index='user_id', 
                                  columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

test_user_item_matrix = test_user_item_matrix.astype(float)
sparse_user_item_test = csr_matrix(test_user_item_matrix).tocsr()

In [7]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

Подготовка фичей

In [8]:
test_user_feat = pd.DataFrame(test_user_item_matrix.index)
test_user_feat = test_user_feat.merge(user_features, on='user_id', how='left')
test_user_feat.set_index('user_id', inplace=True)

user_feat = pd.DataFrame(user_item_matrix.index)
user_feat = user_feat.merge(user_features, on='user_id', how='left')
user_feat.set_index('user_id', inplace=True)

item_feat = pd.DataFrame(user_item_matrix.columns)
item_feat = item_feat.merge(item_features, on='item_id', how='left')
item_feat.set_index('item_id', inplace=True)

In [9]:
test_user_feat_lightfm = pd.get_dummies(test_user_feat, columns=test_user_feat.columns.tolist())
user_feat_lightfm = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
item_feat_lightfm = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())

In [10]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 914190, 95804..."
1,3,"[851057, 872021, 878302, 879948, 909638, 91320..."


### Построение модели и подбор лучших гиперпараметров

In [11]:
components_list = [5, 10, 15, 30]
user_regularization_list = [0.01, 0.1]
item_regularization_list = [0.01, 0.1]
loss_list = ['bpr', 'warp']
n_vars = len(components_list) * len(user_regularization_list) * len(item_regularization_list) * len(loss_list)

In [12]:
evaluate_dict = {}

In [13]:
i = 1
for n_components in components_list:
    for user_reg in user_regularization_list:
        for item_reg in item_regularization_list:
            for loss_type in loss_list:
                model = LightFM(no_components=n_components,
                                loss=loss_type, 
                                learning_rate=0.05, 
                                item_alpha=item_reg, 
                                user_alpha=user_reg, 
                                random_state=42)
                
                model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
                          sample_weight=coo_matrix(user_item_matrix),
                          user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                          item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                          epochs=15, 
                          num_threads=4) 
                
                alg_name = f"LightFM(comp={n_components},u_r={user_reg},i_r={item_reg},loss={loss_type})"
                
                precision = precision_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean()
                recall = recall_at_k(model, sparse_user_item, 
                                 user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                 k=5).mean()
                evaluate_dict[alg_name] = []
                evaluate_dict[alg_name].append(precision)
                evaluate_dict[alg_name].append(recall)
                
                print(f"\rProgress: {i}/{n_vars}", end="")
                sys.stdout.flush()
                i+=1

Progress: 32/32

### Оценка

In [14]:
evaluate_df = pd.DataFrame(evaluate_dict, index=['Precision@5', 'Recall@5'])
evaluate_df.T.sort_values('Precision@5', ascending=False).head(10)

Unnamed: 0,Precision@5,Recall@5
"LightFM(comp=15,u_r=0.01,i_r=0.01,loss=warp)",0.533894,0.013292
"LightFM(comp=10,u_r=0.1,i_r=0.01,loss=warp)",0.533574,0.013287
"LightFM(comp=30,u_r=0.1,i_r=0.01,loss=warp)",0.533574,0.013287
"LightFM(comp=5,u_r=0.1,i_r=0.01,loss=warp)",0.533574,0.013287
"LightFM(comp=5,u_r=0.1,i_r=0.1,loss=warp)",0.533574,0.013287
"LightFM(comp=15,u_r=0.1,i_r=0.01,loss=warp)",0.533574,0.013287
"LightFM(comp=10,u_r=0.01,i_r=0.01,loss=warp)",0.533173,0.013281
"LightFM(comp=5,u_r=0.01,i_r=0.01,loss=warp)",0.52565,0.013186
"LightFM(comp=30,u_r=0.01,i_r=0.01,loss=warp)",0.52389,0.013021
"LightFM(comp=15,u_r=0.1,i_r=0.1,loss=warp)",0.523249,0.013038


In [15]:
evaluate_df.T.sort_values('Recall@5', ascending=False).head(10)

Unnamed: 0,Precision@5,Recall@5
"LightFM(comp=10,u_r=0.1,i_r=0.1,loss=bpr)",0.378471,0.022281
"LightFM(comp=15,u_r=0.01,i_r=0.1,loss=bpr)",0.37391,0.022133
"LightFM(comp=5,u_r=0.01,i_r=0.1,loss=bpr)",0.371829,0.022088
"LightFM(comp=10,u_r=0.01,i_r=0.1,loss=bpr)",0.362545,0.021909
"LightFM(comp=5,u_r=0.1,i_r=0.01,loss=bpr)",0.361265,0.021895
"LightFM(comp=5,u_r=0.01,i_r=0.01,loss=bpr)",0.361185,0.021894
"LightFM(comp=10,u_r=0.01,i_r=0.01,loss=bpr)",0.361185,0.021894
"LightFM(comp=30,u_r=0.01,i_r=0.01,loss=bpr)",0.361185,0.021894
"LightFM(comp=15,u_r=0.01,i_r=0.01,loss=bpr)",0.361185,0.021894
"LightFM(comp=5,u_r=0.1,i_r=0.1,loss=bpr)",0.361185,0.021894


**Вывод:** Recall довольно на низком уровне, поэтому лучше всего выбрать модель LightFM(comp=15,u_r=0.01,i_r=0.01,loss=warp), которая показала наиболее высокий Precision.

---