In [2]:
import ast
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

class MovieRecommender:
    """Genre-based movie recommender with a simple offline evaluation.

    Movies are described by a bag-of-words count vector over their genre
    names. A cosine-distance ``NearestNeighbors`` index is fitted on the
    movies that also appear in the ratings file, and per-movie rating
    statistics (mean, count) are attached for display.
    """

    def __init__(self, movies_file, ratings_file):
        """Load both CSV files and build the genre index.

        Args:
            movies_file: Path (or buffer) of a CSV with at least the columns
                ``id``, ``title`` and ``genres``.
            ratings_file: Path (or buffer) of a CSV with at least the columns
                ``userId``, ``movieId`` and ``rating``.
        """
        # NOTE: this silences warnings for the whole process, not only for
        # this object. Kept for backward compatibility with existing usage.
        warnings.filterwarnings('ignore')
        self.movies_df = pd.read_csv(movies_file)
        self.ratings_df = pd.read_csv(ratings_file)
        self.process_data()

    @staticmethod
    def _parse_genres(raw):
        """Turn the textual form of a list of genre names into one string.

        ``ast.literal_eval`` is used instead of ``eval`` so that cell contents
        coming from the CSV can never execute code; anything that is not a
        plain Python literal raises ``ValueError``/``SyntaxError``.
        """
        return ' '.join(ast.literal_eval(raw))

    def process_data(self):
        """Build genre vectors, rating statistics and the neighbour index.

        Sets ``genres_df``, ``filtered_movies_df``, ``filtered_ratings_df``,
        ``filtered_movies_ratings_merged`` and ``nbrs_filtered``. The
        ``genres`` column of ``movies_df`` is overwritten with the
        space-joined genre string, so this method is not meant to be run
        twice on the same frame.
        """
        self.movies_df['genres'] = self.movies_df['genres'].apply(self._parse_genres)
        cv = CountVectorizer()
        genre_counts = cv.fit_transform(self.movies_df['genres'])
        # Share the movies' index so label-based lookups below stay aligned
        # even if movies_df does not carry a default RangeIndex.
        self.genres_df = pd.DataFrame(
            genre_counts.toarray(),
            columns=cv.get_feature_names_out(),
            index=self.movies_df.index,
        )

        # Keep only movies that have ratings, and ratings that have a movie.
        common_movie_ids = set(self.movies_df['id']).intersection(set(self.ratings_df['movieId']))
        self.filtered_movies_df = self.movies_df[self.movies_df['id'].isin(common_movie_ids)]
        self.filtered_ratings_df = self.ratings_df[self.ratings_df['movieId'].isin(common_movie_ids)]

        filtered_ratings_mean_count = self.filtered_ratings_df.groupby('movieId').agg({'rating': ['mean', 'count']})
        filtered_ratings_mean_count.columns = ['rating_mean', 'rating_count']

        self.filtered_movies_ratings_merged = pd.merge(
            self.filtered_movies_df,
            filtered_ratings_mean_count,
            left_on='id',
            right_index=True,
            how='left',
        )
        # Row i of the fitted matrix corresponds to row i of filtered_movies_df.
        self.nbrs_filtered = NearestNeighbors(n_neighbors=10, metric='cosine').fit(
            self.genres_df.loc[self.filtered_movies_df.index]
        )

    def get_recommendations_for_movie(self, movie_title):
        """Return the nearest genre neighbours of the movie with this title.

        Args:
            movie_title: Exact title to look up among the rated movies. If
                several rows share the title, the first one is used.

        Returns:
            DataFrame with columns ``title``, ``id``, ``rating_mean`` and
            ``rating_count`` for the neighbours returned by the index (the
            queried movie itself can be among them).

        Raises:
            IndexError: If no rated movie has that title.
        """
        matches = self.filtered_movies_df.index[self.filtered_movies_df['title'] == movie_title]
        if len(matches) == 0:
            raise IndexError(f"no rated movie with title {movie_title!r}")
        movie_label = matches[0]
        # `movie_label` is an index label, so look it up with .loc; the
        # one-row DataFrame keeps the feature names the index was fitted with.
        query = self.genres_df.loc[[movie_label]]
        _, neighbour_positions = self.nbrs_filtered.kneighbors(query)
        # kneighbors returns positions within the fitted (filtered) rows.
        return self.filtered_movies_ratings_merged.iloc[neighbour_positions[0]][
            ['title', 'id', 'rating_mean', 'rating_count']
        ]

    def generate_recommendations(self, user_id, n_recommendations=10, random_state=None):
        """Return a random sample of rated movie ids.

        ``user_id`` is accepted but not used: the recommendations are a
        uniform random draw from the rated movies.

        Args:
            user_id: Ignored.
            n_recommendations: Number of ids wanted; capped at the number of
                rated movies so a small catalogue does not raise.
            random_state: Optional seed or ``numpy.random.RandomState``
                forwarded to ``DataFrame.sample`` for reproducible draws.

        Returns:
            List of movie ids.
        """
        sample_size = min(n_recommendations, len(self.filtered_movies_df))
        sampled = self.filtered_movies_df.sample(sample_size, random_state=random_state)
        return sampled['id'].tolist()

    def evaluate(self, random_state=None):
        """Score ``generate_recommendations`` against each user's liked movies.

        A movie counts as liked when the user rated it 4 or higher.
        NOTE(review): liked movies are read from the unfiltered ``ratings_df``,
        so they may include ids that can never be recommended — confirm this
        is intended.

        Args:
            random_state: Optional seed. When given, one ``RandomState`` is
                shared across users so the run is reproducible while users
                still receive different samples.

        Returns:
            Tuple ``(mean precision, mean recall, mean F1)`` over users.
        """
        liked = self.ratings_df[self.ratings_df['rating'] >= 4]
        preferred_movies = liked.groupby('userId')['movieId'].apply(list).to_dict()
        rng = None if random_state is None else np.random.RandomState(random_state)
        user_recommendations = {
            user: self.generate_recommendations(user, random_state=rng)
            for user in preferred_movies
        }
        return self.calculate_evaluation_metrics(user_recommendations, preferred_movies)

    def calculate_evaluation_metrics(self, user_recommendations, preferred_movies):
        """Average per-user precision, recall and F1.

        Args:
            user_recommendations: Mapping of user -> iterable of recommended
                movie ids. Users missing from it count as having no
                recommendations.
            preferred_movies: Mapping of user -> iterable of liked movie ids;
                its keys define the users that are scored.

        Returns:
            Tuple ``(mean precision, mean recall, mean F1)``. Any ratio with
            a zero denominator counts as 0, and an empty ``preferred_movies``
            yields ``(0.0, 0.0, 0.0)`` instead of NaN.
        """
        precisions, recalls, f1_scores = [], [], []
        for user, preferred in preferred_movies.items():
            preferred_set = set(preferred)
            recommended_set = set(user_recommendations.get(user, []))
            tp = len(preferred_set & recommended_set)
            fp = len(recommended_set) - tp
            fn = len(preferred_set) - tp

            precision = tp / (tp + fp) if tp + fp > 0 else 0
            recall = tp / (tp + fn) if tp + fn > 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

            precisions.append(precision)
            recalls.append(recall)
            f1_scores.append(f1)

        if not precisions:
            # np.mean([]) would return NaN; follow the zero convention above.
            return 0.0, 0.0, 0.0

        return np.mean(precisions), np.mean(recalls), np.mean(f1_scores)



In [3]:

# Example usage: build the recommender from the movies and ratings CSV files.
MOVIES_CSV = "./ML_test/preprocessed_movies.csv"
RATINGS_CSV = "./ratings_small.csv"
recommender = MovieRecommender(MOVIES_CSV, RATINGS_CSV)

# Look up the genre neighbours of 'The Dark Knight' and show them.
dark_knight_recommendations = recommender.get_recommendations_for_movie('The Dark Knight')
print("Recommendations for 'The Dark Knight':")
print(dark_knight_recommendations)

# Run the evaluation and report the three averaged metrics.
evaluation_scores = recommender.evaluate()
avg_precision, avg_recall, avg_f1_score = evaluation_scores
print(f"Average Precision: {avg_precision}")
print(f"Average Recall: {avg_recall}")
print(f"Average F1 Score: {avg_f1_score}")

Recommendations for 'The Dark Knight':
                          title     id  rating_mean  rating_count
12621           The Dark Knight    155     3.055556             9
15142               Harry Brown  25941     4.000000             1
7912   I'll Sleep When I'm Dead   3515     3.400000             5
12351                    Hitman   1620     3.375000            20
11895              The Contract   1441     3.375000            28
12683              Street Kings   1266     3.885965            57
4177                   Scarface    111     4.224576           118
6124                  Dark Blue   4911     3.000000             3
3992                Best Seller   4639     2.607143            14
4787          Bangkok Dangerous   3173     2.940000            25
Average Precision: 0.012088094055307173
Average Recall: 0.0016643582479823464
Average F1 Score: 0.0025937599331314768
