## Задание 1. Не использую готовые решения, реализовать SVD разложение используя SGD на explicit данных

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from tqdm.notebook import tqdm, trange
import time    # to be used in loop iterations

In [2]:
ratings = pd.read_csv('ds/ratings.dat', delimiter='::', header=None,
                      names=['user_id', 'movie_id', 'rating', 'timestamp'],
                      usecols=['user_id', 'movie_id', 'rating'], engine='python')

In [3]:
movie_info = pd.read_csv('ds/movies.dat', delimiter='::', header=None,
                         names=['movie_id', 'name', 'category'], engine='python')

In [4]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [5]:
movie_info

Unnamed: 0,movie_id,name,category
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [9]:
class SGDMatrixFactorizer:
    def __init__(self, ratings, n_features):
        self.n_features = n_features
        self.ratings = ratings
        self.np_ratings = ratings.to_numpy()
        self.users_shape = max(ratings['user_id'])
        self.movies_shape = max(ratings['movie_id'])
        self.non_zero = len(ratings)
        self.initialize_data()
        
        print(self.users_shape, self.movies_shape)

    def initialize_data(self):
        self.U = np.random.uniform(0, 1 / np.sqrt(self.n_features), (self.users_shape, self.n_features))
        self.V = np.random.uniform(0, 1 / np.sqrt(self.n_features), (self.movies_shape, self.n_features))
        self.bias_u = np.zeros(self.users_shape)
        self.bias_v = np.zeros(self.movies_shape)
        self.global_bias = np.mean(self.ratings['rating'])
        print(self.global_bias)

    def predict(self, user_id, movie_id):
        return self.U[user_id, :].dot(self.V[movie_id, :].T) \
               + self.bias_u[user_id] \
               + self.bias_v[movie_id] \
               + self.global_bias

    def mse(self):
        err = 0
        for i in range(self.non_zero):
            user_id = self.np_ratings[i][0] - 1
            movie_id = self.np_ratings[i][1] - 1
            rating = self.np_ratings[i][2]
            err += (self.predict(user_id, movie_id) - rating) ** 2
        return err / self.non_zero


    def fit(self, learning_rate=0.1, max_iter=100):
        self.Q = self.mse()
        print("Start MSE:", self.Q)
        for iter in trange(max_iter):
            self.ratings.sample(frac=1)
            self.np_ratings = ratings.to_numpy()
            for rand_i in range(self.non_zero):
#                 rand_i = np.random.randint(0, self.non_zero)
                user_id = self.np_ratings[rand_i][0] - 1
                movie_id = self.np_ratings[rand_i][1] - 1
                rating = self.np_ratings[rand_i][2]

                error = self.predict(user_id, movie_id) - rating
                reg_param = 0.01

                self.U[user_id] -= learning_rate * (error * self.V[movie_id] + reg_param * self.U[user_id])
                self.V[movie_id] -= learning_rate * (error * self.U[user_id] + reg_param * self.V[movie_id])
                self.bias_u[user_id] -= learning_rate * (error + reg_param * self.bias_u[user_id])
                self.bias_v[movie_id] -= learning_rate * (error + reg_param * self.bias_v[movie_id])
            if iter % 1 == 0:
                self.Q = self.mse()
                print("Iteration", iter + 1,"| MSE:", self.Q)

In [10]:
mf = SGDMatrixFactorizer(ratings, 64)

3.581564453029317
6040 3952


Пробовал делать больше итераций (100), при этом MSE был ~0.23, но на мой взгляд произошло переобучение, и например 4 юзеру постоянно стали рекомендовать романтику/драмму, которым он ставил более хорошие оценки. А также get_similars выдавал менее релевантный результат.

In [11]:
mf.fit(0.01, 10)

Start MSE: 1.3111098153638818


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))

Iteration 1 | MSE: 0.850210275605962
Iteration 2 | MSE: 0.807843779757853
Iteration 3 | MSE: 0.7840635657109228
Iteration 4 | MSE: 0.7537741934179489
Iteration 5 | MSE: 0.7165927247637814
Iteration 6 | MSE: 0.6748620158153115
Iteration 7 | MSE: 0.6308259150484516
Iteration 8 | MSE: 0.5872639199842375
Iteration 9 | MSE: 0.5465082443146726
Iteration 10 | MSE: 0.5100397609933957



In [12]:
def get_similars(movie_id, model):
    movie_id -= 1
    movie_matrix = model.V[movie_id]
    movies = []
    
    for i, j in enumerate(model.V):
        dist = np.linalg.norm(j - movie_matrix)
        movies.append((i, dist))
    sorted_movies = list(sorted(movies, key=lambda v: v[1]))
    similars = []
    for x in sorted_movies:
        search = movie_info[movie_info["movie_id"] - 1 == x[0]]
        movie_name = search["name"].to_string()
        if len(search) > 0:
            similars.append(movie_name)
    return similars

In [13]:
def get_user_history(user_id):
    return [movie_info[movie_info["movie_id"] == x]["name"].to_string() 
        for x in ratings[ratings["user_id"] == user_id]["movie_id"]]

Из рекомендаций выбрасываю фильмы, которые юзер уже посмотрел

In [14]:
def get_recommendations(user_id, model):
    user_id -= 1
    
    predictions = []
    for movie_id in range(model.V.shape[0]):
        predictions.append((movie_id, model.predict(user_id, movie_id)))
    predictions = list(reversed(sorted(predictions, key=lambda v: v[1])))
    recommendations = []

#     print(predictions[:10])
    for x in predictions:
         recommendations.append(movie_info[movie_info["movie_id"] - 1 == x[0]]["name"].to_string())
    
#     print(recommendations[:10])
    to_remove = set(get_user_history(user_id + 1))
    filtered_recommendations = [r for r in recommendations if r not in to_remove]
    
    return filtered_recommendations    

In [15]:
get_similars(1, mf)[:10]

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 '1029    That Thing You Do! (1996)',
 "2286    Bug's Life, A (1998)",
 '922    Father of the Bride (1950)',
 '2692    Iron Giant, The (1999)',
 '3522    Mr. Mom (1983)',
 '1050    Aladdin and the King of Thieves (1996)',
 '2254    Cruise, The (1998)',
 '1838    Mulan (1998)']

In [16]:
get_recommendations(4, mf)[:20]

['900    Casablanca (1942)',
 '1189    To Kill a Mockingbird (1962)',
 '1950    Seven Samurai (The Magnificent Seven) (Shichin...',
 '2836    Sanjuro (1962)',
 '740    Dr. Strangelove or: How I Learned to Stop Worr...',
 '315    Shawshank Redemption, The (1994)',
 '3238    City Lights (1931)',
 '847    Godfather, The (1972)',
 '1186    Lawrence of Arabia (1962)',
 "523    Schindler's List (1993)",
 '589    Silence of the Lambs, The (1991)',
 "1176    One Flew Over the Cuckoo's Nest (1975)",
 '2134    Shadow of a Doubt (1943)',
 '2953    General, The (1927)',
 '1185    12 Angry Men (1957)',
 '892    Rear Window (1954)',
 '901    Maltese Falcon, The (1941)',
 '1242    Great Escape, The (1963)',
 '1876    On the Waterfront (1954)',
 '3026    Grapes of Wrath, The (1940)']