In [145]:
import scipy.stats as sps
import numpy as np

import pandas as pd
import math
import random
import sklearn

from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import scipy

Данные, которые подаются на вход базовому алгоритму состоят из:
 - множества всех товаров на данной итерации,
 - множества всех пользователей,
 - таблица F формы (c_len, w_len) с фидбеком пользователей для товаров. фидбек $\in [0, 1]$


Данные генерируем синтетическим образом:
cчитаем, что пользователи имеют одну фичу и она распределена нормально:  $c_i
\sim \cal{N} (1.65,\ 0.2^2)$ . Товары задаются одной фичей, которая распределена равномерно: $w_i \sim \cal {U}(1, 2) $.
Фидбек будем генерировать так: $F_{c,w} = C[a_c + t_w + \varepsilon_{c, w}]$. Где
 $a_c \sim \cal{N}(0.6, 0.2^2) $ - центр оценки пользователя $c$, $t_w \sim \cal N(0,0.2^2)$ - полезность товара,  $\varepsilon \sim \cal N(0, 0.05^2)$ - случайный шум. C(x) - это функция, которая округляет фидбэк к отрезку $[0,1]$: \\
 если x > 1: C(x) =  1 \\
 если x < 0: C(x) = 0 \\
 иначе: C(x) = x


In [146]:
c_size = 100
w_size = 50

c_mean = 1.65
c_sigma = 0.2
c_distr = sps.norm(c_mean, c_sigma)
c_sample = c_distr.rvs(size=c_size)

w_params = [0, 1]
w_distr = sps.uniform(*w_params)
w_sample = w_distr.rvs(size=w_size)


In [147]:
centr = sps.norm(0.6, 0.2).rvs(size=c_size)
usefulness = sps.norm(0, 0.2).rvs(size=w_size)
epsilon = sps.norm(0, 0.05).rvs(size=(c_size, w_size))


In [148]:
Feedback = (epsilon + usefulness) + centr[:, np.newaxis]


Релизуем базовый алгоритм TopPop. Он не зависит от конкретного пользователя и рекомендует ему n товаров, с самым большим рейтингом.

In [149]:
def TopPop(X, n):
    # X: (c_len, w_len)
    indices = np.argpartition(np.mean(X, axis=0), n)[:n]
    # return value: nd.array indicies of recommended items, probability for every customer of buiyng recommended items.
    return indices, X[:, indices]

Ткт я решил все-так использовать MovieLens... Но можно попробовать и на синтетических потом
MovieLens 100K:
ContentBasedRecommender - рекоммендует товары, наиболее похожие на те, с которыми пользователь взаимодействовал до этого


In [150]:

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
                    encoding='latin-1')

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
                      encoding='latin-1')

m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5),
                     encoding='latin-1')


# users_interactions_count_df = ratings.groupby(['user_id', 'movie_id']).size().groupby('user_id').size()
# users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 2].reset_index()[['user_id']]
# interactions_from_selected_users_df = ratings.merge(users_with_enough_interactions_df, 
#                how = 'right',
#                left_on = 'user_id',
#                right_on = 'user_id')
# interactions_from_selected_users_df

Разобъем обучающую и тренеровочную выборки относительно времени:

In [151]:
# ratings = ratings.sort_values(by="unix_timestamp", ascending=True)
# ratings

In [152]:

ratings_train, ratings_test = train_test_split(ratings, test_size=0.1, shuffle=True)

ratings_indexed = ratings.set_index('user_id')
ratings_train_indexed = ratings_train.set_index('user_id')
ratings_test_indexed = ratings_test.set_index('user_id')

In [153]:
ratings_train["user_id"]

76952    924
90530    916
90343    907
58920    357
22691    115
        ... 
43143    390
53423    500
43765    450
68136    485
37211    399
Name: user_id, Length: 90000, dtype: int64

In [165]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 200


def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = interactions_df.loc[person_id]['movie_id']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])


class ModelEvaluator:
    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, ratings_indexed)
        all_items = set(movies['movie_id'])
        non_interacted_items = sorted(all_items - interacted_items)

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):
        try:
            index = next(i for i, c in enumerate(recommended_items) if c == item_id)
        except:
            index = -1
        hit = int(index in range(0, topn))
        return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = ratings_test_indexed.loc[person_id]
        if type(interacted_values_testset['movie_id']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['movie_id'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['movie_id'])])
        interacted_items_count_testset = len(person_interacted_items_testset)

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id,
                                               items_to_ignore=get_items_interacted(person_id,
                                                                                    ratings_train_indexed),
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id,
                                                                               sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS,
                                                                               seed=item_id % (2 ** 32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['movie_id'].isin(items_to_filter_recs)]
            valid_recs = valid_recs_df['movie_id'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count': hits_at_5_count,
                          'hits@10_count': hits_at_10_count,
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(ratings_test_indexed.index.unique().values)):
            person_metrics = self.evaluate_model_for_user(model, person_id)
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
            .sort_values('interacted_count', ascending=False)

        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(
            detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(
            detailed_results_df['interacted_count'].sum())

        global_metrics = {'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}
        return global_metrics, detailed_results_df


model_evaluator = ModelEvaluator()

In [166]:
item_ids = movies["movie_id"].tolist()

In [167]:

vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000)

tfidf_matrix = vectorizer.fit_transform(movies['title'])
tfidf_feature_names = vectorizer.get_feature_names_out()
tfidf_matrix

<1682x137 sparse matrix of type '<class 'numpy.float64'>'
	with 3479 stored elements in Compressed Sparse Row format>

In [168]:


def get_item_profile(item_id):
    idx = item_ids.index(item_id)
    item_profile = tfidf_matrix[idx:idx+1]
    return item_profile


def get_item_profiles(ids):
    item_profiles_list = [get_item_profile(x) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles


def build_users_profile(person_id, interactions_indexed_df):
    interactions_person_df = interactions_indexed_df.loc[person_id]
    # if hasattr(interactions_person_df['movie_id'], '__iter__'):
    user_item_profiles = get_item_profiles(interactions_person_df['movie_id'])
    # else:
    #     user_item_profiles = get_item_profiles([interactions_person_df['movie_id']])



    user_item_ratings = np.array(interactions_person_df['rating']).reshape(-1, 1)
    #Weighted average of item profiles by the interactions rating
    user_item_ratings_weighted_avg = np.sum(user_item_profiles.multiply(user_item_ratings), axis=0) / np.sum(
        user_item_ratings)
    user_profile_norm = sklearn.preprocessing.normalize(np.asarray(user_item_ratings_weighted_avg))
    return user_profile_norm


def build_users_profiles():
    interactions_indexed_df = ratings_train.set_index('user_id')
    user_profiles = {}
    for person_id in interactions_indexed_df.index.unique():
        user_profiles[person_id] = build_users_profile(person_id, interactions_indexed_df)
    return user_profiles


user_profiles = build_users_profiles()
len(user_profiles)

943

In [169]:


class ContentBasedRecommender:
    def __init__(self, items_df=None):
        self.item_ids = item_ids
        self.items_df = items_df

    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        #Computes the cosine similarity between the user profile and all item profiles
        cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)
        #Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        #Sort the similar items by similarity
        similar_items = sorted([(item_ids[i], cosine_similarities[0, i]) for i in similar_indices], key=lambda x: -x[1])
        return similar_items

    def recommend_items(self, customer_id, items_to_ignore=[], topn=10, verbose=False):
        similar_items = self._get_similar_items_to_user_profile(customer_id)
        #Ignores items the user has already interacted
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))

        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['movie_id', 'rating']).head(topn)

        return recommendations_df


content_based_recommender_model = ContentBasedRecommender(movies)

In [170]:
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model)
print('\nGlobal metrics:', cb_global_metrics)
cb_detailed_results_df.head(10)

929 users processed

Global metrics: {'recall@5': 0.0456, 'recall@10': 0.0888}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
27,10,15,69,0.144928,0.217391,405
81,1,3,64,0.015625,0.046875,655
205,2,7,60,0.033333,0.116667,537
143,0,0,59,0.0,0.0,13
144,2,5,55,0.036364,0.090909,279
84,0,0,53,0.0,0.0,276
475,2,2,48,0.041667,0.041667,450
170,3,4,47,0.06383,0.085106,846
61,0,2,46,0.0,0.043478,201
51,2,3,45,0.044444,0.066667,416


In [159]:

users_items_pivot_matrix_df = ratings_train.pivot(index='user_id', 
                                                          columns='movie_id', 
                                                          values='rating').fillna(0)
users_items_pivot_matrix = users_items_pivot_matrix_df.values
users_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix)

users_ids = list(users_items_pivot_matrix_df.index)

#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 30
#Performs matrix factorization of the original user item matrix
#U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k = NUMBER_OF_FACTORS_MF)

In [160]:
U.shape, sigma.shape, Vt.shape

((943, 30), (30,), (30, 1665))

In [161]:
all_user_predicted_ratings = np.dot(np.dot(U, np.diag(sigma)), Vt) 

In [162]:
all_user_predicted_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (all_user_predicted_ratings.max() - all_user_predicted_ratings.min())

cf_preds_df = pd.DataFrame(all_user_predicted_ratings_norm, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()

In [163]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'rating'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['movie_id'].isin(items_to_ignore)] \
                               .sort_values('rating', ascending = False) \
                               .head(topn)

        # if verbose:
        #     if self.items_df is None:
        #         raise Exception('"items_df" is required in verbose mode')
        # 
        #     recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
        #                                                   left_on = 'movie_id', 
        #                                                   right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]


        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df, movies)

In [164]:
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:', cb_global_metrics)
cb_detailed_results_df.head(10)

929 users processed

Global metrics: {'recall@5': 0.4499, 'recall@10': 0.588}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
27,25,31,69,0.362319,0.449275,405
81,18,28,64,0.28125,0.4375,655
205,31,38,60,0.516667,0.633333,537
143,11,15,59,0.186441,0.254237,13
144,14,18,55,0.254545,0.327273,279
84,26,32,53,0.490566,0.603774,276
475,16,21,48,0.333333,0.4375,450
170,25,29,47,0.531915,0.617021,846
61,16,20,46,0.347826,0.434783,201
51,15,18,45,0.333333,0.4,416
