In [1]:
pip install scikit-surprise

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Cell 1: Import Libraries
import pandas as pd
from surprise import Reader, Dataset, SVD
import joblib

# ---

# Cell 2: Load and Prepare Data
# Read the two source tables from disk.
ratings = pd.read_csv(
    '/data1/students/rupam/Self_Project/Movie_Recommendation/csv/ratings.csv'
)
movies = pd.read_csv(
    '/data1/students/rupam/Self_Project/Movie_Recommendation/csv/movies.csv'
)

# surprise consumes (user, item, rating) triples, so hand it exactly those
# three columns in that order; the Reader declares the rating range 0.5-5.0.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings.loc[:, ['userId', 'movieId', 'rating']], reader)

# ---

# Cell 3: Train the Model
# Matrix factorization via SVD with the library's default hyper-parameters,
# fitted on every rating in `data` (no hold-out split is made here).
model = SVD()
trainset = data.build_full_trainset()
model.fit(trainset)

print("Model training complete.")

# ---

# Cell 4: Create a Recommendation Function (for testing)
def get_recommendations(user_id, n=10):
    """Return titles of the unrated movies with the highest predicted rating.

    Reads the module-level ``ratings`` and ``movies`` dataframes and the
    fitted ``model``.

    Args:
        user_id: Raw user ID as it appears in ``ratings['userId']``.
        n: Maximum number of titles to return. Values <= 0 yield ``[]``.

    Returns:
        A list of movie titles ordered from highest to lowest predicted
        rating. It can be shorter than ``n`` when fewer unrated movies exist
        or when a recommended movie ID has no row in ``movies``.
    """
    if n <= 0:
        return []

    # A set gives O(1) membership tests; scanning an array for every
    # candidate movie would make the filter below quadratic.
    rated_movie_ids = set(ratings.loc[ratings['userId'] == user_id, 'movieId'])

    # Candidates are every movie that appears in the ratings table and that
    # this user has not rated yet.
    unrated_movie_ids = [
        movie_id
        for movie_id in ratings['movieId'].unique()
        if movie_id not in rated_movie_ids
    ]

    # Predict a rating for each candidate and keep the n best.
    predictions = [model.predict(user_id, movie_id) for movie_id in unrated_movie_ids]
    top_n_preds = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:n]
    top_movie_ids = [pred.iid for pred in top_n_preds]

    # Filtering `movies` with isin() yields rows in dataframe order, not in
    # ranking order, so build an ID -> title lookup and walk the ranked IDs.
    matched = movies[movies['movieId'].isin(top_movie_ids)]
    title_by_id = dict(zip(matched['movieId'], matched['title']))

    return [title_by_id[movie_id] for movie_id in top_movie_ids if movie_id in title_by_id]

# Test the function for user 2
# Keep the user ID in one place so the printed label and the actual query
# cannot drift apart.
demo_user_id = 2
print(f"Recommendations for User {demo_user_id}:")
print(get_recommendations(user_id=demo_user_id, n=5))

# ---

# Cell 5: Save the Model and Data for the API
# Three artifacts are written to the current working directory.

# 1. The fitted recommender itself.
joblib.dump(value=model, filename='collaborative_model.pkl')

# 2. The movies table, used to translate movie IDs into titles.
movies.to_pickle(path='collaborative_movies.pkl')

# 3. The ratings table, used to tell which movies a user has already rated.
ratings.to_pickle(path='collaborative_ratings.pkl')

print("\nSaved collaborative model and data files successfully!")

Model training complete.
Recommendations for User 2:
['Usual Suspects, The (1995)', 'Touch of Evil (1958)', 'Evil Dead II (Dead by Dawn) (1987)', 'Boondock Saints, The (2000)', 'Eternal Sunshine of the Spotless Mind (2004)']

Saved collaborative model and data files successfully!
