In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
ratings = pd.read_csv('ratings.csv')

In [5]:
def train_test_split_user(ratings, test_size=0.2):
    train_list = []
    test_list = []

    # Get all unique users
    for user_id in ratings['userId'].unique():
        # Filter the dataset to include only rows corresponding to curr user
        user_data = ratings[ratings['userId'] == user_id]
        # Check if user has rated more than 5 movies to meaningfully split data into train and test
        # Ex. user with 10 ratings -> 8 train, 2 test, but user with 3 ratings -> 2 train, 1 test
            # In the latter case, we would not have enough data to train the model
        # If user has rated more than 5 movies, split the data into train and test
        # Else, include all data in train
        if len(user_data) >= 5:
            train_data, test_data = train_test_split(user_data, test_size=test_size, random_state=42)
            train_list.append(train_data)
            test_list.append(test_data)
        else:
            train_list.append(user_data)
    
    # Combine training and testing data for all users into train and test
    train = pd.concat(train_list)
    test = pd.concat(test_list)
    return train, test

train_data, test_data = train_test_split_user(ratings)

# We don't need timestamp column
train_data = train_data.drop(columns=['timestamp'])
test_data = test_data.drop(columns=['timestamp'])

In [6]:
train_data

Unnamed: 0,userId,movieId,rating
55,1,1031,5.0
230,1,4006,4.0
69,1,1197,5.0
168,1,2596,5.0
109,1,1777,4.0
...,...,...,...
100629,610,103219,3.5
100664,610,107436,3.0
100828,610,163981,3.5
100394,610,70946,5.0


In [7]:
test_data

Unnamed: 0,userId,movieId,rating
219,1,3578,5.0
66,1,1127,4.0
9,1,157,5.0
170,1,2617,2.0
15,1,260,5.0
...,...,...,...
99733,610,3552,3.0
99830,610,4437,5.0
99820,610,4310,2.5
100531,610,90600,3.5


In [99]:
class MF():

    def __init__(self, R, K, alpha, beta, iterations):
        self.R = R
        self.num_users, self.num_movies = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_movies, self.K))

        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_movies)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_movies)
            if self.R[i, j] > 0
        ]

        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mae = self.mae()
            if (i+1) % 10 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, mae))
            

        print("Root MSE: ", self.rmse())

    def mae(self):
        xs, ys = self.R.nonzero() 
        predicted = self.full_matrix()  
        error = 0
        
        for x, y in zip(xs, ys):
            error += abs(self.R[x, y] - predicted[x, y])  
        
        return error / len(xs)

        
    
    def rmse(self):
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)

        return np.sqrt(error/len(xs))

    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    def full_matrix(self):
        return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.P.dot(self.Q.T)


In [110]:
def test_model(mf_model, test_data, user_to_idx, movie_to_idx):
    squared_error = 0
    absolute_error = 0
    n = len(test_data)
    unfound_movies = 0

    for _, row in test_data.iterrows():
        uid = int(row['userId'])
        mid = int(row['movieId'])

        if mid not in movie_to_idx:
            unfound_movies += 1
            continue 
        
        user_id = int(user_to_idx[uid])
        movie_id = int(movie_to_idx[mid])
        actual_rating = row['rating']
        
        predicted_rating = mf_model.get_rating(user_id, movie_id)
        
        squared_error += (actual_rating - predicted_rating) ** 2
        
        absolute_error += abs(actual_rating - predicted_rating)
    
    rmse = np.sqrt(squared_error / n)
    mae = absolute_error / n

    print(f"{unfound_movies} movies not found")
    return rmse, mae


In [105]:
user_ids = train_data['userId'].unique()  
movie_ids = train_data['movieId'].unique()  


user_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

R = np.zeros((len(user_ids), len(movie_ids)))

for row in train_data.itertuples():
    user_idx = user_to_idx[row.userId]
    movie_idx = movie_to_idx[row.movieId]
    R[user_idx, movie_idx] = row.rating

mf = MF(R, K=50, alpha=0.01, beta=0.05, iterations=800)

mf.train()

Iteration: 10 ; error = 0.6784
Iteration: 20 ; error = 0.6024
Iteration: 30 ; error = 0.4435
Iteration: 40 ; error = 0.3039
Iteration: 50 ; error = 0.2202
Iteration: 60 ; error = 0.1721
Iteration: 70 ; error = 0.1437
Iteration: 80 ; error = 0.1254
Iteration: 90 ; error = 0.1137
Iteration: 100 ; error = 0.1058
Iteration: 110 ; error = 0.1000
Iteration: 120 ; error = 0.0961
Iteration: 130 ; error = 0.0927
Iteration: 140 ; error = 0.0904
Iteration: 150 ; error = 0.0884
Iteration: 160 ; error = 0.0870
Iteration: 170 ; error = 0.0856
Iteration: 180 ; error = 0.0846
Iteration: 190 ; error = 0.0838
Iteration: 200 ; error = 0.0831
Iteration: 210 ; error = 0.0823
Iteration: 220 ; error = 0.0817
Iteration: 230 ; error = 0.0812
Iteration: 240 ; error = 0.0810
Iteration: 250 ; error = 0.0804
Iteration: 260 ; error = 0.0804
Iteration: 270 ; error = 0.0800
Iteration: 280 ; error = 0.0797
Iteration: 290 ; error = 0.0793
Iteration: 300 ; error = 0.0790
Iteration: 310 ; error = 0.0788
Iteration: 320 ; 

In [109]:
rmse, mae = test_model(mf, test_data, user_to_idx, movie_to_idx)
print("Test RMSE:", rmse)
print("Test MAE:", rmse)

827 movies not found
Test RMSE: 0.8456058226452476
Test MAE: 0.8456058226452476
