In [17]:
import pickle
import numpy as np
import pandas as pd
from collections import defaultdict
from surprise import Dataset, SVD, Reader
from surprise.model_selection import cross_validate

In [18]:
# Source: Modified from https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py
def get_top_n_for_user(predictions, user_id, n=10):
    """
    Return the top-N recommendations for a given user from a set of predictions.

    Args:
        predictions (list of Prediction objects): The list of predictions, as returned by the test method of an algorithm.
        user_id (str or int): The user ID for whom to get recommendations.
        n (int): The number of recommendations to return. Default is 10.

    Returns:
        list or str: A sorted list of tuples [(item_id, estimated_rating), ...] of size n for the given user_id,
                     or a message if the user_id is not found.
    """

    # Filter predictions for the given user_id
    user_predictions = [(iid, est) for uid, iid, true_r, est, _ in predictions if uid == user_id]

    # If user_id is not found, return a message
    if not user_predictions:
        return f"User ID {user_id} not found in predictions."

    # Sort by estimated rating in descending order
    user_predictions.sort(key=lambda x: x[1], reverse=True)

    # Return top-N recommendations
    return user_predictions[:n]

In [19]:
# First train an SVD algorithm on the movielens dataset.
data = Dataset.load_builtin("ml-100k")
trainset = data.build_full_trainset()
links = pd.read_csv("data/links.csv")

In [20]:
# Get the raw training data
raw_trainset = data.raw_ratings

# Convert training data to use TMDB IDs
ratings_data = []
for uid, mid, rating, timestamp in raw_trainset:
    # Convert movie ID to int for matching with links DataFrame
    movie_id = int(mid)
    # Get the TMDB ID from links DataFrame
    tmdb_id = links[links['movieId'] == movie_id]['tmdbId'].values
    # Only add rating if TMDB ID exists
    if len(tmdb_id) > 0:
        ratings_data.append({
            'userID': str(uid),
            'movieID': str(tmdb_id[0]),
            'rating': rating
        })

# Convert to DataFrame
df = pd.DataFrame(ratings_data)

# Create a new Reader object
reader = Reader(rating_scale=(1, 5))

# Create a new Dataset object with TMDB IDs
data = Dataset.load_from_df(df[['userID', 'movieID', 'rating']], reader)

# Build the full trainset
trainset = data.build_full_trainset()

In [21]:

algo = SVD()
algo.fit(trainset)
pd.DataFrame(cross_validate(algo, data, measures=["RMSE", "MAE"], cv=3, verbose=True))

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9470  0.9423  0.9457  0.9450  0.0020  
MAE (testset)     0.7478  0.7454  0.7486  0.7473  0.0013  
Fit time          0.21    0.22    0.23    0.22    0.01    
Test time         0.05    0.05    0.05    0.05    0.00    


Unnamed: 0,test_rmse,test_mae,fit_time,test_time
0,0.946965,0.747799,0.207521,0.046198
1,0.942331,0.745446,0.216075,0.046753
2,0.945655,0.748554,0.227183,0.046655


In [22]:
# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [23]:
get_top_n_for_user(predictions, uid, n=10)


[('9073.0', 5),
 ('18069.0', 4.975258858479597),
 ('1413.0', 4.9745731803828095),
 ('19760.0', 4.9742146805654945),
 ('12110.0', 4.9486573017868585),
 ('25557.0', 4.930271174407694),
 ('5967.0', 4.872219657073608),
 ('15730.0', 4.869556051201163),
 ('22588.0', 4.861697457750311),
 ('22586.0', 4.841037495573584)]

In [24]:
with open("model.pkl", "wb") as file:
    pickle.dump(predictions, file)