In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import time

In [2]:
class ItemBasedCF:
    
    def __init__(self, movies_file, ratings_file, sample_size=10000, random_seed=42):
        start_time = time.time()
        self.allowed_movies = {"Godfather, The (1972)"}
        self.movies_df = pd.read_csv(movies_file)
        self.ratings_df = pd.read_csv(ratings_file)
        print("Files read in {:.2f} seconds.".format(time.time() - start_time))
        
        start_time = time.time()
        self.sample_movies(sample_size, random_seed)
        print("Movies sampled in {:.2f} seconds.".format(time.time() - start_time))
        
        start_time = time.time()
        self.user_item_df = self.create_user_item_dataframe()
        print("User-item matrix created in {:.2f} seconds.".format(time.time() - start_time))
        
        start_time = time.time()
        self.similarity_df = self.calculate_similarity()
        print("Similarity matrix calculated in {:.2f} seconds.".format(time.time() - start_time))
    
    def sample_movies(self, sample_size, random_seed):
        np.random.seed(random_seed)
        sampled_movie_ids = np.random.choice(self.movies_df['movieId'].unique(), size=sample_size, replace=False)
        self.movies_df = self.movies_df[self.movies_df['movieId'].isin(sampled_movie_ids)]
        self.ratings_df = self.ratings_df[self.ratings_df['movieId'].isin(sampled_movie_ids)]

    def create_user_item_dataframe(self):
        user_item_df = pd.pivot_table(self.ratings_df, index='userId', columns='movieId', values='rating', fill_value=0)
        return user_item_df

    def calculate_similarity(self):
        movie_matrix = self.user_item_df.transpose()
        similarity_matrix = cosine_similarity(movie_matrix)
        return pd.DataFrame(similarity_matrix, index=movie_matrix.index, columns=movie_matrix.index)

    def recommend_movies(self, user_ratings, top_n=5):
        start_time = time.time()
        new_user_df = pd.DataFrame(user_ratings, index=['rating'], columns=self.user_item_df.columns).fillna(0).transpose()
        
        missing_movies = [title for title in user_ratings if title not in self.movies_df['title'].values]
        if missing_movies:
            print("These movies are not in the dataset:", missing_movies)
            return
        
        scores = self.similarity_df.dot(new_user_df['rating'])
        top_movie_ids = scores.nlargest(top_n).index
        recommended_movies = self.movies_df[self.movies_df['movieId'].isin(top_movie_ids)]['title'].tolist()
        print("Recommendation generated in {:.2f} seconds.".format(time.time() - start_time))
        return recommended_movies

In [3]:
cf_recommender = ItemBasedCF('ml-latest/movies.csv', 'ml-latest/ratings.csv')
movies = {'Bad Company (1995)': 5, 'Djomeh (2000)': 4}
recommendations = cf_recommender.recommend_movies(movies, top_n=5)
print("Recommended Movies:", recommendations)

Files read in 12.93 seconds.
Movies sampled in 1.02 seconds.


  num_cells = num_rows * num_columns


ValueError: negative dimensions are not allowed

In [None]:
df = pd.read_csv('ml-latest/movies.csv')
df
print("original movie genres:")
print()
for movie in movies:
    print(df[df["title"]==movie])
print()
print("recommended movie genres:")
for movie in recommendations:
    print(df[df["title"]==movie])