# Data selection and preprocessing:

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
ratings = pd.read_csv('ratings.csv')


In [3]:
def train_test_split_user(ratings, test_size=0.2):
    train_list = []
    test_list = []

    # Get all unique users
    for user_id in ratings['userId'].unique():
        # Filter the dataset to include only rows corresponding to curr user
        user_data = ratings[ratings['userId'] == user_id]
        # Check if user has rated more than 5 movies to meaningfully split data into train and test
        # Ex. user with 10 ratings -> 8 train, 2 test, but user with 3 ratings -> 2 train, 1 test
            # In the latter case, we would not have enough data to train the model
        # If user has rated more than 5 movies, split the data into train and test
        # Else, include all data in train
        if len(user_data) >= 5:
            train_data, test_data = train_test_split(user_data, test_size=test_size, random_state=42)
            train_list.append(train_data)
            test_list.append(test_data)
        else:
            train_list.append(user_data)
    
    # Combine training and testing data for all users into train and test
    train = pd.concat(train_list)
    test = pd.concat(test_list)
    return train, test

train_data, test_data = train_test_split_user(ratings)
print(train_data.head())
# We don't need timestamp column
train_data = train_data.drop(columns=['timestamp'])
test_data = test_data.drop(columns=['timestamp'])

     userId  movieId  rating  timestamp
55        1     1031     5.0  964982653
230       1     4006     4.0  964982903
69        1     1197     5.0  964981872
168       1     2596     5.0  964981144
109       1     1777     4.0  964981230


In [4]:
def recommend_movies(preds_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # UserID starts at 1, not 0
    sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False) # UserID starts at 1
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.userId == (userID)]
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )

    print ('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print ('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies_df[~movies_df['movieId'].isin(user_full['movieId'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations


In [40]:
#testing to predict ratings
known_movies = set(train_data['movieId'])
print(len)
for _, row in test_data.iterrows():
    # print(int(row['userId']), int(row['movieId']))
    if int(row['movieId']) not in known_movies:
        print(int(row['movieId']))

In [14]:
class MF():

    def __init__(self, R, K, alpha, beta, iterations):
        self.R = R
        self.num_users, self.num_movies = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_movies, self.K))

        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_movies)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_movies)
            if self.R[i, j] > 0
        ]

        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            print("Iteration: %d ; mae = %.4f ; rmse = %.4f" % (i+1, self.mae(), self.rmse()))
          
    def mae(self):
        xs, ys = self.R.nonzero() 
        predicted = self.full_matrix()  
        error = 0
        
        for x, y in zip(xs, ys):
            error += abs(self.R[x, y] - predicted[x, y])  
        
        return error / len(xs)
    
    def rmse(self):
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)

        return np.sqrt(error/len(xs))

    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            P_i = self.P[i, :].copy()
            Q_j = self.Q[j, :].copy()

            self.P[i, :] += self.alpha * (e * Q_j - self.beta * P_i)
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * Q_j)

    def get_rating(self, i, j):
        m_b = self.b_i[j] if j < len(self.b_i) else 0
        l_i = self.Q[j, :].T if j < len(self.Q) else np.mean(self.Q, axis=0) 
        prediction = self.b + self.b_u[i] + m_b + self.P[i, :].dot(l_i)
        return prediction

    def full_matrix(self):
        return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.P.dot(self.Q.T)


In [15]:
user_ids = train_data['userId'].unique()  
movie_ids = train_data['movieId'].unique()  

user_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

R = np.zeros((len(user_ids), len(movie_ids)))

for row in train_data.itertuples():
    user_idx = user_to_idx[row.userId]
    movie_idx = movie_to_idx[row.movieId]
    R[user_idx, movie_idx] = row.rating

mf = MF(R, K=50, alpha=0.01, beta=0.05, iterations=300)
# Iteration: 178 ; mae = 0.1979 ; rmse = 0.2732

mf.train()

Iteration: 1 ; mae = 0.7002 ; rmse = 0.8990
Iteration: 2 ; mae = 0.6769 ; rmse = 0.8752
Iteration: 3 ; mae = 0.6655 ; rmse = 0.8621
Iteration: 4 ; mae = 0.6588 ; rmse = 0.8528
Iteration: 5 ; mae = 0.6514 ; rmse = 0.8457
Iteration: 6 ; mae = 0.6478 ; rmse = 0.8402
Iteration: 7 ; mae = 0.6436 ; rmse = 0.8352
Iteration: 8 ; mae = 0.6399 ; rmse = 0.8310
Iteration: 9 ; mae = 0.6370 ; rmse = 0.8272
Iteration: 10 ; mae = 0.6348 ; rmse = 0.8238
Iteration: 11 ; mae = 0.6312 ; rmse = 0.8206
Iteration: 12 ; mae = 0.6292 ; rmse = 0.8173
Iteration: 13 ; mae = 0.6271 ; rmse = 0.8139
Iteration: 14 ; mae = 0.6241 ; rmse = 0.8105
Iteration: 15 ; mae = 0.6209 ; rmse = 0.8069
Iteration: 16 ; mae = 0.6183 ; rmse = 0.8029
Iteration: 17 ; mae = 0.6150 ; rmse = 0.7982
Iteration: 18 ; mae = 0.6110 ; rmse = 0.7931
Iteration: 19 ; mae = 0.6069 ; rmse = 0.7870
Iteration: 20 ; mae = 0.6015 ; rmse = 0.7801
Iteration: 21 ; mae = 0.5954 ; rmse = 0.7719
Iteration: 22 ; mae = 0.5893 ; rmse = 0.7626
Iteration: 23 ; mae

KeyboardInterrupt: 

In [20]:
def test_model(mf_model, test_data, user_to_idx, movie_to_idx):
    squared_error = 0
    absolute_error = 0
    n = len(test_data)
    unfound_movies = 0

    for _, row in test_data.iterrows():
        uid = int(row['userId'])
        mid = int(row['movieId'])

        if mid not in movie_to_idx:
            unfound_movies += 1
             
        
        user_id = int(user_to_idx[uid])
        movie_id = int(movie_to_idx[mid]) if mid in movie_to_idx else len(movie_to_idx)
        actual_rating = row['rating']
        
        predicted_rating = mf_model.get_rating(user_id, movie_id)
        
        squared_error += (actual_rating - predicted_rating) ** 2
        
        absolute_error += abs(actual_rating - predicted_rating)
    
    rmse = np.sqrt(squared_error / n)
    mae = absolute_error / n

    print(f"{unfound_movies} movies not found")
    return rmse, mae

In [21]:
rmse, mae = test_model(mf, test_data, user_to_idx, movie_to_idx)
print("Test RMSE:", rmse)
print("Test MAE:", mae)

827 movies not found
Test RMSE: 0.8702347595584478
Test MAE: 0.6662584160400875


In [None]:
def unwatched_movies(ratings):
    all_movie_ids = set(ratings['movieId'])
    user_to_unwatched_movies = {}

    for user_id in ratings['userId'].unique():
        user = ratings[ratings['userId'] == user_id]
        watched_movies = set(user['movieId'])
        unwatched_movies = all_movie_ids - watched_movies
        user_to_unwatched_movies[user_id] = unwatched_movies

    return user_to_unwatched_movies

def recommend_movies(mf_model, user_to_idx, movie_to_idx, n=10):
    unwatched = unwatched_movies(train_data)
    recommendations = {}

    for user_id in user_to_idx.keys():
        # print(f'\ruser_{user_id}', end='')
        user_idx = user_to_idx[user_id]
        predictions = mf_model.full_matrix()[user_idx]
        recommended_movie_idxs = np.argsort(predictions)[::-1]
        recommended_movies = [k for k, v in movie_to_idx.items() if v in recommended_movie_idxs and k in unwatched[user_id]][:n]
        recommendations[user_id] = recommended_movies
        print(recommended_movies)

    return recommendations

top_10_recommendations = recommend_movies(mf, user_to_idx, movie_to_idx, n=10)

[99114, 318, 115713, 77455, 80489, 71535, 86345, 333, 6874, 8798]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1275, 1445, 3450]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1445, 3450, 2000]
[1031, 4006, 2596, 1777, 2048, 1224, 1275, 1445, 3450, 2000]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1275, 1445, 3450]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1275, 1445, 3450]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1275, 1445, 3450]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1275, 1445, 3450]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1275, 1445, 3450]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1275, 1445, 3450]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1275, 1445, 3450]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1275, 1445, 3450]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1275, 1445, 3450]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1275, 1445, 3450]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1275, 1445, 3450]
[1031, 4006, 1197, 2596, 1777, 2048, 1224, 1275, 1445, 3450]
[1031, 4006, 1197, 