In [98]:
import pandas as pd
import numpy as np
import implicit
from tqdm import tqdm
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

In [99]:
# Load the interaction log and order it chronologically (ascending timestamp),
# so that the positional split performed later is a split in time.
data = (
    pd.read_csv('data/dataset.csv')
    .sort_values(by='timestamp')
)

In [100]:
# Positional hold-out on the time-sorted frame: the first 80000 rows are used
# for training, every later row is kept for evaluation.
train, test = data.iloc[:80000], data.iloc[80000:]

In [101]:
# Preview the first rows of the training split (the earliest interactions).
train.head()

Unnamed: 0,user_id,item_id,rating,timestamp
217,259,255,4,874724710
83968,259,286,4,874724727
43030,259,298,4,874724754
21399,259,185,4,874724781
82658,259,173,4,874724843


In [102]:
# Preview the first rows of the held-out test split.
test.head()

Unnamed: 0,user_id,item_id,rating,timestamp
1346,3,245,1,889237247
27978,3,355,3,889237247
1260,3,335,1,889237269
38673,3,322,3,889237269
3761,3,323,2,889237269


In [103]:
def average_precision(actual, recommended, k=30):
    """Average precision of the first ``k`` recommendations, divided by ``k``.

    Walks the ranked list from the top; every time a relevant item is found at
    rank ``r`` the running precision ``hits / r`` is added to the sum.

    Args:
        actual: Container of relevant item ids; only membership (``in``) is
            used. Pass a ``set`` for constant-time lookups.
        recommended: Ranked iterable of hashable item ids, best first. It is
            materialised with ``list()`` so ranks are always positional, even
            for a pandas Series whose index labels are not ``0..n-1``.
        k: Cut-off rank. Lists shorter than ``k`` are treated as if the
            missing positions were misses.

    Returns:
        float: ``sum(precision at each hit) / k``; ``0.0`` when ``k <= 0``.

    Note:
        Each relevant item is credited at most once. A repeated item still
        occupies its rank but is scored as a miss, so a list such as
        ``[x] * k`` can no longer inflate the metric.
    """
    if k <= 0:
        # Nothing to evaluate; also avoids dividing by zero below.
        return 0.0

    ranked = list(recommended)[:k]
    ap_sum = 0.0
    hits = 0
    credited = set()  # relevant items that have already been counted as a hit
    for rank, product_id in enumerate(ranked, start=1):
        if product_id is None or product_id in credited:
            continue
        if product_id in actual:
            credited.add(product_id)
            hits += 1
            ap_sum += hits / rank
    return ap_sum / k


def normalized_average_precision(actual, recommended, k=30):
    """Average precision of ``recommended`` relative to an ideal ranking.

    Args:
        actual: Iterable of relevant item ids; it is de-duplicated into a set.
        recommended: Ranked recommendations, best first.
        k: Cut-off rank forwarded to ``average_precision``.

    Returns:
        ``0.0`` when there are no relevant items; otherwise the average
        precision of ``recommended`` divided by the average precision of a
        list made of (at most ``k``) relevant items.
    """
    relevant = set(actual)
    if not relevant:
        return 0.0

    achieved = average_precision(relevant, recommended, k=k)
    # Best case: a ranking that consists solely of relevant items.
    best_ranking = list(relevant)[:k]
    ideal = average_precision(relevant, best_ranking, k=k)
    return achieved / ideal

In [104]:
def recommend(user):
    """Baseline recommender: return the same five item ids for every user.

    Args:
        user: User identifier; accepted for interface compatibility, unused.

    Returns:
        list: A new list ``[288, 1, 286, 121, 174]`` on every call.
    """
    baseline_items = [288, 1, 286, 121, 174]
    return baseline_items

In [105]:
# Baseline evaluation: score the constant recommendation list for every user
# that appears in the test split.
scores = []
for user in tqdm(test['user_id'].unique()):
    is_current_user = test['user_id'] == user
    actual = test.loc[is_current_user, 'item_id'].tolist()
    recommended = recommend(user)
    scores.append(normalized_average_precision(actual, recommended))

# Mean normalized AP@30 over the test users (shown as the cell output).
np.mean(scores)

100%|██████████| 301/301 [00:00<00:00, 2204.29it/s]


0.03566965142495101

In [106]:
# Task: train a model so that the metric exceeds 0.1

In [107]:
# Wide user x item rating table built from the training split; user/item pairs
# without a rating become 0.
users_items_pivot_matrix_df = (
    train
    .pivot(index='user_id', columns='item_id', values='rating')
    .fillna(0)
)
# Append an all-zero row labelled -1.
# NOTE(review): presumably a placeholder "unknown user" - confirm the intent.
users_items_pivot_matrix_df.loc[-1] = [0.0] * len(users_items_pivot_matrix_df.columns)
users_items_pivot_matrix_df.iloc[:10]

item_id,1,2,3,4,5,6,7,8,9,10,...,1660,1662,1664,1671,1672,1675,1676,1677,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5.0,3.0,4.0,3.0,0.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [108]:
# Dense NumPy view of the pivot table (rows = users, columns = items).
users_items_pivot_matrix = users_items_pivot_matrix_df.to_numpy()
users_items_pivot_matrix[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [109]:
class ALSRecommender:
    """Top-N item recommender built on ``implicit``'s ALS matrix factorisation.

    The model is trained once, in the constructor, from a dense user x item
    pivot table (index = user ids, columns = item ids, values = ratings).
    Users that are not present in the pivot index receive a precomputed
    fallback list (``cold_start``).
    """

    MODEL_NAME = 'ALS'

    def __init__(self, users_items_pivot_matrix_df):
        """Fit the ALS model and precompute the cold-start recommendations.

        Args:
            users_items_pivot_matrix_df: DataFrame whose index holds user ids
                and whose columns hold item ids. Both are assumed to be
                unique, which ``DataFrame.pivot`` output guarantees.
        """
        self.users_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix_df.values)
        self.items, self.users = users_items_pivot_matrix_df.columns, users_items_pivot_matrix_df.index
        self.als_model = implicit.als.AlternatingLeastSquares(factors=17, random_state=1337)
        # NOTE(review): fitting on the transposed (item x user) matrix and
        # unpacking (item, score) pairs from recommend() looks like the
        # pre-0.5 `implicit` API - confirm against the installed version.
        self.als_model.fit(csr_matrix(self.users_items_pivot_sparse_matrix.T))
        self.cold_start = self._recommend_cold_start()

    def get_model_name(self):
        """Return the short model identifier (``MODEL_NAME``)."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=None, topn=30):
        """Return up to ``topn`` recommendations for ``user_id``.

        Args:
            user_id: User label as found in the pivot index. Unknown users get
                the cold-start list with a strength of ``0.0``.
            items_to_ignore: Optional iterable of item ids (column labels, the
                same values that appear in the returned ``item_id`` column)
                that must not be recommended. Ids unknown to the model are
                skipped.
            topn: Maximum number of rows to return.

        Returns:
            pandas.DataFrame with columns ``item_id`` and ``recStrength``,
            ordered by descending ``recStrength``. It is empty when the model
            has nothing to recommend.
        """
        ignored_ids = list(items_to_ignore) if items_to_ignore is not None else []

        if user_id not in self.users:
            ignored = set(ignored_ids)
            fallback = [item for item in self.cold_start if item not in ignored][:max(topn, 0)]
            return pd.DataFrame({'item_id': fallback, 'recStrength': [0.0] * len(fallback)})

        # Index lookup instead of scanning a freshly built list on every call.
        user_position = self.users.get_loc(user_id)
        raw = list(self.als_model.recommend(user_position,
                                            self.users_items_pivot_sparse_matrix,
                                            # the model filters by column position, not by label
                                            filter_items=self._item_positions(ignored_ids),
                                            N=topn))
        if not raw:
            # zip(*[]) cannot be unpacked into two names; return an empty frame instead.
            return pd.DataFrame({'item_id': [], 'recStrength': []})

        ids, strengths = zip(*raw)
        recommendations_df = pd.DataFrame(data={'item_id': self.items[list(ids)], 'recStrength': strengths})
        return recommendations_df.sort_values('recStrength', ascending=False).head(topn)

    def _item_positions(self, item_ids):
        """Translate item id labels into column positions, dropping unknown ids."""
        if not item_ids:
            return []
        return [int(position) for position in self.items.get_indexer(item_ids) if position >= 0]

    def _recommend_cold_start(self, topn=30):
        """Rank items by their summed recommendation strength over all known users.

        Every known user is asked for as many recommendations as there are
        items; the per-item strengths are accumulated and the ``topn`` items
        with the highest totals are returned as a list of item ids.
        """
        item_scores = {item: 0.0 for item in self.items}
        # Request as many rows as there are items (a count, not the largest item label).
        n_items = len(self.items)
        for user_id in self.users:
            recommended = self.recommend_items(user_id, topn=n_items)
            for item, strength in zip(recommended['item_id'], recommended['recStrength']):
                item_scores[item] += strength
        return sorted(item_scores, key=item_scores.get, reverse=True)[:topn]
    
# Build the recommender from the training pivot table; the constructor fits the
# ALS model and precomputes the cold-start list.
als_recommender_model = ALSRecommender(users_items_pivot_matrix_df)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [110]:
# Evaluate the ALS recommender: one normalized AP@30 score per user in the test split.
scores = []
for user in tqdm(test['user_id'].unique()):
    actual = list(test[test['user_id'] == user]['item_id'])
    # Hand the metric a plain list so it ranks by position; indexing the
    # returned Series with `recommended[i]` would look items up by index label.
    recommended = als_recommender_model.recommend_items(user)['item_id'].tolist()
#   Caveat: a metric that does not check `recommended` for repeated items can be
#   gamed - per the original note, uncommenting the line below pushed the score above 0.35.
#     recommended = [recommended[0]] * 30

    scores.append(normalized_average_precision(actual, recommended))

np.mean(scores)

100%|██████████| 301/301 [00:00<00:00, 309.22it/s]


0.1348753642851096