## Задание 2. Не использую готовые решения, реализовать матричное разложение используя ALS на implicit данных

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from tqdm.notebook import tqdm, trange
import time    # to be used in loop iterations

In [2]:
ratings = pd.read_csv('ds/ratings.dat', delimiter='::', header=None,
                      names=['user_id', 'movie_id', 'rating', 'timestamp'],
                      usecols=['user_id', 'movie_id', 'rating'], engine='python')
ratings.head(10)

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
5,1,1197,3
6,1,1287,5
7,1,2804,5
8,1,594,4
9,1,919,4


In [3]:
movie_info = pd.read_csv('ds/movies.dat', delimiter='::', header=None,
                         names=['movie_id', 'name', 'category'], engine='python')

In [4]:
implicit_ratings = ratings.loc[(ratings['rating'] >= 4)]
implicit_ratings.head(10)

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
3,1,3408,4
4,1,2355,5
6,1,1287,5
7,1,2804,5
8,1,594,4
9,1,919,4
10,1,595,5
11,1,938,4
12,1,2398,4


In [5]:
users = implicit_ratings["user_id"]
movies = implicit_ratings["movie_id"]
user_item = sparse.coo_matrix((np.ones_like(users), (users, movies)))
user_item_t_csr = user_item.T.tocsr()
user_item_csr = user_item.tocsr()

In [8]:
class ImplicitALSMatrixFactorizer:
    def __init__(self, sparse_data, n_features=64, alpha=40):
        self.user_size, self.movie_size = sparse_data.shape
        self.sparse_data = sparse_data
        self.n_features = n_features
        self.alpha = alpha

    def fit(self, lambda_v=0.1, max_iter=10):
        confidence = self.sparse_data * self.alpha
        X = sparse.csr_matrix(np.random.normal(size=(self.user_size, self.n_features)))
        Y = sparse.csr_matrix(np.random.normal(size=(self.movie_size, self.n_features)))
        X_I = sparse.eye(self.user_size)
        Y_I = sparse.eye(self.movie_size)

        I = sparse.eye(self.n_features)
        lI = lambda_v * I

        for i in trange(max_iter):
            yty = Y.T.dot(Y)
            xtx = X.T.dot(X)
            for u in range(self.user_size):
                u_row = confidence[u, :].toarray()
                p_u = u_row.copy()
                p_u[p_u != 0] = 1.0

                CuI = sparse.diags(u_row, [0])
                Cu = CuI + Y_I

                yT_CuI_y = Y.T.dot(CuI).dot(Y)
                yT_Cu_pu = Y.T.dot(Cu).dot(p_u.T)
                X[u] = spsolve(yty + yT_CuI_y + lI, yT_Cu_pu)
            for m in range(self.movie_size):
                m_row = confidence[:, m].T.toarray()
                p_i = m_row.copy()
                p_i[p_i != 0] = 1.0

                CiI = sparse.diags(m_row, [0])
                Ci = CiI + X_I

                xT_CiI_x = X.T.dot(CiI).dot(X)
                xT_Ci_pi = X.T.dot(Ci).dot(p_i.T)
                Y[m] = spsolve(xtx + xT_CiI_x + lI, xT_Ci_pi)
        self.X = X
        self.Y = Y

    def predict(self, user_id, movie_id):
        return self.X[user_id, :].dot(self.Y[movie_id, :].T)

In [9]:
als = ImplicitALSMatrixFactorizer(user_item_csr)
als.fit()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [34]:
print(als.user_size, als.movie_size)

6041 3953


In [26]:
def get_similars(movie_id, model):
    movie_id -= 1
    movie_matrix = model.X.toarray()[movie_id]
    movies = []
    
    for i, j in enumerate(model.X.toarray()):
        dist = np.linalg.norm(j - movie_matrix)
        movies.append((i, dist))
    sorted_movies = list(sorted(movies, key=lambda v: v[1]))
    similars = []
    for x in sorted_movies:
        search = movie_info[movie_info["movie_id"] - 1 == x[0]]
        movie_name = search["name"].to_string()
        if len(search) > 0:
            similars.append(movie_name)
    return similars

In [43]:
def get_recommendations(user_id, model):
    user_id -= 1
    
    predictions = []
    for movie_id in range(model.Y.toarray().shape[0]):
        predictions.append((movie_id, model.predict(user_id, movie_id)))
    predictions = list(reversed(sorted(predictions, key=lambda v: v[1])))
    recommendations = []

#     print(predictions[:10])
    for x in predictions:
         recommendations.append(movie_info[movie_info["movie_id"] - 1 == x[0]]["name"].to_string())
    
#     print(recommendations[:10])
#     to_remove = set(get_user_history(user_id + 1))
#     filtered_recommendations = [r for r in recommendations if r not in to_remove]
    
    return recommendations    

In [44]:
get_recommendations(4, als)[:20]

6041


['431    Coneheads (1993)',
 '301    Roommates (1995)',
 '70    Fair Game (1995)',
 '169    Jeffrey (1995)',
 '648    Und keiner weint mir nach (1996)',
 '549    Tombstone (1993)',
 '2337    Romancing the Stone (1984)',
 '2667    Brighton Beach Memoirs (1986)',
 '439    Endless Summer 2, The (1994)',
 '1036    Looking for Richard (1996)',
 '1574    Peacemaker, The (1997)',
 '3207    Gun Shy (2000)',
 '2631    South Park: Bigger, Longer and Uncut (1999)',
 '3077    Deuce Bigalow: Male Gigolo (1999)',
 '2402    Crocodile Dundee II (1988)',
 '365    Mrs. Parker and the Vicious Circle (1994)',
 '1702    Shooting Fish (1997)',
 'Series([], )',
 "207    White Man's Burden (1995)",
 '1678    Horse Whisperer, The (1998)']