## Задание 3. Не использую готовые решения, реализовать матричное разложение BPR на implicit данных

In [3]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from tqdm.notebook import tqdm, trange
import time    # to be used in loop iterations

In [4]:
ratings = pd.read_csv('ds/ratings.dat', delimiter='::', header=None,
                      names=['user_id', 'movie_id', 'rating', 'timestamp'],
                      usecols=['user_id', 'movie_id', 'rating'], engine='python')
ratings.head(10)

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
5,1,1197,3
6,1,1287,5
7,1,2804,5
8,1,594,4
9,1,919,4


In [5]:
movie_info = pd.read_csv('ds/movies.dat', delimiter='::', header=None,
                         names=['movie_id', 'name', 'category'], engine='python')

In [6]:
implicit_ratings = ratings.loc[(ratings['rating'] >= 4)]
implicit_ratings.head(10)

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
3,1,3408,4
4,1,2355,5
6,1,1287,5
7,1,2804,5
8,1,594,4
9,1,919,4
10,1,595,5
11,1,938,4
12,1,2398,4


In [53]:
users = implicit_ratings["user_id"]
movies = implicit_ratings["movie_id"]
user_item = sparse.coo_matrix((np.ones_like(users), (users, movies)))
user_item_t_csr = user_item.T.tocsr()
user_item_csr = user_item.tocsr()

array([   1,   48,  150, ..., 3735, 3751, 3819], dtype=int32)

In [160]:
class ImplicitBPRMatrixFactorizer:
    def __init__(self, sparse_data, ratings, n_features):
        self.user_size, self.movie_size = sparse_data.shape
        self.ratings = ratings
        self.sparse_data = sparse_data
        self.n_features = n_features
        self.ds = []

        for ui in self.ratings['user_id'].unique():
            for mi in self.sparse_data[ui, :].indices:
                    self.ds.append((ui, mi))

    def fit(self, learning_rate=0.05, lambda_v=0.0001, max_iter=7):
        W = sparse.csr_matrix(np.random.uniform(0, 1 / np.sqrt(self.n_features), (self.user_size, self.n_features)))
        H = sparse.csr_matrix(np.random.uniform(0, 1 / np.sqrt(self.n_features), (self.movie_size, self.n_features)))

        for _ in trange(max_iter):
            np.random.shuffle(self.ds)
            for iter in trange(len(self.ds)):
                u, i = self.ds[iter]
                j = np.random.randint(1, self.movie_size)
                while self.sparse_data[u, j] != 0:
                    j = np.random.randint(1, self.movie_size)

                wu = W[u].toarray().squeeze()
                hi = H[i].toarray().squeeze()
                hj = H[j].toarray().squeeze()

                x_uij = wu.dot(hi) - wu.dot(hj)

                e = np.exp(-x_uij)
                coeff = e / (1 + e)

                W[u] += learning_rate * (coeff * (H[i] - H[j])) + lambda_v * W[u]
                H[i] += learning_rate * (coeff * (W[u])) + lambda_v * H[i]
                H[j] += learning_rate * (coeff * (-W[u])) + lambda_v * H[j]
        self.U = W.toarray()
        self.V = H.toarray()
        
    def predict(self, user_id, movie_id):
        return self.U[user_id, :].dot(self.V[movie_id, :].T)

In [161]:
bpr = ImplicitBPRMatrixFactorizer(user_item_csr, ratings, 64)

In [162]:
bpr.fit()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=575281.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=575281.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=575281.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=575281.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=575281.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=575281.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=575281.0), HTML(value='')))





In [163]:
def get_similars(movie_id, model):
    movie_matrix = model.V[movie_id]
    movies = []
    
    for i, j in enumerate(model.V):
        dist = np.linalg.norm(j - movie_matrix)
        movies.append((i, dist))
    sorted_movies = list(sorted(movies, key=lambda v: v[1]))
    similars = []
    for x in sorted_movies:
        search = movie_info[movie_info["movie_id"] == x[0]]
        movie_name = search["name"].to_string()
        if len(search) > 0:
            similars.append(movie_name)
    return similars

In [164]:
def get_user_history(user_id):
    return [movie_info[movie_info["movie_id"] == x]["name"].to_string() 
        for x in ratings[ratings["user_id"] == user_id]["movie_id"]]

In [165]:
def get_recommendations(user_id, model, remove_watched=True):
    
    predictions = []
    for movie_id in range(model.V.shape[0]):
        predictions.append((movie_id, model.predict(user_id, movie_id)))
    predictions = list(reversed(sorted(predictions, key=lambda v: v[1])))
    recommendations = []

    for x in predictions:
         recommendations.append(movie_info[movie_info["movie_id"] == x[0]]["name"].to_string())
    
    if remove_watched:
        to_remove = set(get_user_history(user_id))
        filtered_recommendations = [r for r in recommendations if r not in to_remove]
        return filtered_recommendations
    
    return recommendations    

In [166]:
get_similars(1, bpr)[:20]

['0    Toy Story (1995)',
 '547    Nightmare Before Christmas, The (1993)',
 '2918    Who Framed Roger Rabbit? (1988)',
 '584    Aladdin (1992)',
 '2692    Iron Giant, The (1999)',
 '3327    Muppet Movie, The (1979)',
 '2012    Little Mermaid, The (1989)',
 '3045    Toy Story 2 (1999)',
 '1262    Fantasia (1940)',
 '1058    Willy Wonka and the Chocolate Factory (1971)',
 '436    Dave (1993)',
 "3184    Wayne's World (1992)",
 '2011    Lady and the Tramp (1955)',
 '360    Lion King, The (1994)',
 '591    Beauty and the Beast (1991)',
 '592    Pinocchio (1940)',
 '1943    Back to the Future Part III (1990)',
 '3090    Fantasia 2000 (1999)',
 '2105    Beetlejuice (1988)',
 "2286    Bug's Life, A (1998)"]

In [167]:
get_recommendations(4, bpr, remove_watched=True)[:20]

['2789    American Beauty (1999)',
 '2502    Matrix, The (1999)',
 '585    Terminator 2: Judgment Day (1991)',
 '589    Silence of the Lambs, The (1991)',
 '2693    Sixth Sense, The (1999)',
 '847    Godfather, The (1972)',
 '108    Braveheart (1995)',
 '1250    Back to the Future (1985)',
 '315    Shawshank Redemption, The (1994)',
 '1179    Princess Bride, The (1987)',
 '2327    Shakespeare in Love (1998)',
 '604    Fargo (1996)',
 "523    Schindler's List (1993)",
 '1575    L.A. Confidential (1997)',
 '453    Fugitive, The (1993)',
 '537    Blade Runner (1982)',
 '1539    Men in Black (1997)',
 '49    Usual Suspects, The (1995)',
 '1182    Aliens (1986)',
 '293    Pulp Fiction (1994)']

In [168]:
get_user_history(4)

['3399    Hustler, The (1961)',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)',
 '2882    Fistful of Dollars, A (1964)',
 '1196    Alien (1979)',
 '1023    Die Hard (1988)',
 '257    Star Wars: Episode IV - A New Hope (1977)',
 '1959    Saving Private Ryan (1998)',
 '476    Jurassic Park (1993)',
 '1178    Star Wars: Episode V - The Empire Strikes Back...',
 '1180    Raiders of the Lost Ark (1981)',
 '1885    Rocky (1976)',
 '1081    E.T. the Extra-Terrestrial (1982)',
 '3349    Thelma & Louise (1991)',
 '3633    Mad Max (1979)',
 '2297    King Kong (1933)',
 '1366    Jaws (1975)',
 '3458    Predator (1987)',
 '1183    Good, The Bad and The Ugly, The (1966)',
 '2623    Run Lola Run (Lola rennt) (1998)',
 '2878    Goldfinger (1964)',
 '1220    Terminator, The (1984)']