Collaborative-Based Filtering

In [1]:
!pip install scikit-surprise



In [2]:
!pip install numpy==1.24.4

import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from collections import defaultdict

ratings_df = pd.read_csv('/content/cleaned_ratings.csv')
movies_df = pd.read_csv('/content/cleaned_movies_content.csv')

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

model = SVD()
model.fit(trainset)

predictions = model.test(testset)
print("Model trained — RMSE:", accuracy.rmse(predictions))

RMSE: 0.7972
Model trained — RMSE: 0.7971954271304101


In [3]:
def get_top_n_recommendations(user_id, movies_df, model, ratings_df, n=10):
    rated_movie_ids = ratings_df[ratings_df['userId'] == user_id]['movieId'].tolist()
    all_movie_ids = movies_df['id'].unique()
    unseen_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movie_ids]
    predictions = [model.predict(user_id, movie_id) for movie_id in unseen_movie_ids]
    predictions.sort(key=lambda x: x.est, reverse=True)
    top_n = predictions[:n]
    top_movie_ids = [pred.iid for pred in top_n]
    top_movies = movies_df[movies_df['id'].isin(top_movie_ids)][['id', 'title']]
    top_movies['predicted_rating'] = top_movies['id'].map({pred.iid: pred.est for pred in top_n})
    return top_movies.sort_values(by='predicted_rating', ascending=False).reset_index(drop=True)

user_id = 1
top_recs = get_top_n_recommendations(user_id, movies_df, model, ratings_df, n=10)
print(f"\nTop 10 recommendations for user {user_id}:")
print(top_recs)


Top 10 recommendations for user 1:
      id                         title  predicted_rating
0   2360                  Saving Grace          4.741142
1    214                       Saw III          4.722663
2    111                      Scarface          4.693497
3   2395  Asterix at the Olympic Games          4.692242
4   4979             Windows on Monday          4.687985
5    750               Murder She Said          4.681184
6  27834                            CQ          4.670098
7    608               Men in Black II          4.666895
8  92475       Three Songs About Lenin          4.664905
9  94959                Edward, My Son          4.656701


In [4]:
print("ratings_df exists:", 'ratings_df' in locals())
print("SVD model exists:", 'model' in locals())

ratings_df exists: True
SVD model exists: True


In [5]:
import pickle
from google.colab import files

with open("ratings_df.pkl", "wb") as f:
    pickle.dump(ratings_df, f)
files.download("ratings_df.pkl")

with open("svd_model.pkl", "wb") as f:
    pickle.dump(model, f)
files.download("svd_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>