In [1]:
import pickle
import numpy as np
import pandas as pd
import zipfile
from collections import defaultdict
from surprise import Dataset, SVD, Reader
from surprise.model_selection import cross_validate

In [2]:
# Source: Modified from https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py
def get_top_n_for_user(predictions, user_id, n=10):
    """
    Pick the n highest-scored items predicted for a single user.

    Args:
        predictions (list of Prediction objects): Five-field records
            (uid, iid, true rating, estimate, details), as returned by the
            test method of an algorithm.
        user_id (str or int): The user whose recommendations are wanted. It is
            compared with ``==`` against each record's uid.
        n (int): Maximum number of recommendations to return. Default is 10.

    Returns:
        list or str: ``[(item_id, estimated_rating), ...]`` ordered from the
                     highest to the lowest estimate and cut to at most n
                     entries, or a message string when no record belongs to
                     user_id.
    """
    scored_items = []
    for uid, iid, _true_r, est, _details in predictions:
        # Keep only this user's rows, reduced to (item, estimate).
        if uid == user_id:
            scored_items.append((iid, est))

    # No rows for this user: report it with a message instead of a list.
    if len(scored_items) == 0:
        return f"User ID {user_id} not found in predictions."

    # Highest estimate first; ties keep their original relative order.
    ranked = sorted(scored_items, key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [3]:
# Load the built-in MovieLens 100k ratings. No trainset is built in this cell:
# the trainset that the model is fitted on is built further down, once the
# movie IDs have been converted to TMDB IDs.
data = Dataset.load_builtin("ml-100k")

# ID mapping table; its 'movieId' and 'tmdbId' columns drive that conversion.
links = pd.read_csv("data/links.csv")

In [4]:
# Get the raw training data: (user id, movie id, rating, timestamp) records
raw_trainset = data.raw_ratings

# Build the movieId -> tmdbId lookup once instead of scanning the whole links
# table for every rating. Rows without a TMDB ID are dropped here, so a missing
# value can never end up as the literal item id 'nan'. If a movieId occurs
# more than once, its first row with a TMDB ID wins.
# NOTE(review): ml-100k item ids and links.csv 'movieId' may come from
# different MovieLens releases with different numbering - confirm the join.
tmdb_by_movie_id = (
    links.dropna(subset=['tmdbId'])
    .drop_duplicates(subset='movieId', keep='first')
    .set_index('movieId')['tmdbId']
    .to_dict()
)

# Convert training data to use TMDB IDs
ratings_data = []
for uid, mid, rating, timestamp in raw_trainset:
    # Raw movie ids are converted to int so they match the 'movieId' keys
    tmdb_id = tmdb_by_movie_id.get(int(mid))
    # Only add rating if TMDB ID exists
    if tmdb_id is None:
        continue
    ratings_data.append({
        'userID': str(uid),
        # The id is stringified exactly as stored in links (e.g. '947.0' for a
        # float column) so the id format seen by downstream code is unchanged.
        'movieID': str(tmdb_id),
        'rating': rating
    })

# Convert to DataFrame; the columns are pinned so an empty result still has
# the expected schema.
df = pd.DataFrame(ratings_data, columns=['userID', 'movieID', 'rating'])
assert not df.empty, "No rating could be mapped to a TMDB ID; check data/links.csv"

# Create a new Reader object
reader = Reader(rating_scale=(1, 5))

# Create a new Dataset object with TMDB IDs
data = Dataset.load_from_df(df[['userID', 'movieID', 'rating']], reader)

# Build the full trainset
trainset = data.build_full_trainset()

In [5]:

# SVD initialises its factors randomly and trains with SGD, so the seed is
# fixed to make the fitted model (and every prediction derived from it)
# reproducible across kernel restarts.
algo = SVD(random_state=42)
algo.fit(trainset)
# Optional sanity check of model quality (slow, left disabled):
# pd.DataFrame(cross_validate(algo, data, measures=["RMSE", "MAE"], cv=3, verbose=True))

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x30f3ddac0>

In [6]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
# `testset` holds those unseen pairs; `predictions` holds the fitted model's
# estimate for each of them.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [7]:
# Choose the example user explicitly (the user of the last raw rating) rather
# than relying on a loop variable left behind by an earlier cell.
example_user_id = raw_trainset[-1][0]
get_top_n_for_user(predictions, example_user_id, n=10)


[('947.0', 4.933744498750255),
 ('981.0', 4.707889800003292),
 ('982.0', 4.677674757997651),
 ('961.0', 4.674904418394773),
 ('164.0', 4.6640870781456805),
 ('23383.0', 4.661465395570442),
 ('592.0', 4.6562497183370315),
 ('309.0', 4.642546629848182),
 ('845.0', 4.61202504679512),
 ('3529.0', 4.609674128813477)]

In [8]:
# Serialise the full prediction list to the backend folder.
with open("../backend/model.pkl", "wb") as model_file:
    pickle.dump(predictions, model_file)

In [9]:
# Compress the pickle into a deflate zip, stored under the bare name 'model.pkl'.
archive = zipfile.ZipFile('../backend/model.zip', mode='w', compression=zipfile.ZIP_DEFLATED)
with archive:
    archive.write('../backend/model.pkl', arcname='model.pkl')