In [61]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from annoy import AnnoyIndex
import time
import matplotlib.pyplot as plt

Building a basic ANN CF RecSys

In [62]:
class ANN:
    def __init__(self, data_path, movie_names_path, movies_info_path, num_neighbors=1000):
        self.num_neighbors = num_neighbors
        # Load data
        self.user_item_matrix = pd.read_pickle(data_path).transpose()
        self.movies_names = pd.read_csv(movie_names_path)
        self.movies_info = pd.read_csv(movies_info_path)
        self.factors = self.user_item_matrix.shape[1]
        self.annoy_index = AnnoyIndex(self.factors, 'angular')
        self._build_index()

    def _build_index(self):
        """Add all user vectors to the Annoy index and build the index."""
        for user_id in range(self.user_item_matrix.shape[0]):
            user_vector = self.user_item_matrix.iloc[user_id].fillna(0).tolist()
            self.annoy_index.add_item(user_id, user_vector)
        self.annoy_index.build(10)

    def generate_recommendations(self, user_input, number_of_recommendations):
        """Generate movie recommendations based on user input ratings."""
        query_vector = [0] * self.factors
        all_user_movie_ids = []
        for movie_title, rating in user_input.items():
            if movie_title in self.movies_names['title'].values:
                movie_id = self.movies_names[self.movies_names['title'] == movie_title]['movieId'].iloc[0]
                all_user_movie_ids.append(movie_id)
                movie_idx = self.user_item_matrix.copy().T.index.get_loc(movie_id)
                query_vector[movie_idx] = rating
        
        nearest_neighbors, distances = self.annoy_index.get_nns_by_vector(query_vector, self.num_neighbors, include_distances=True)
        weights = np.exp(-np.array(distances))
        neighbor_ratings = self.user_item_matrix.iloc[nearest_neighbors]

        weighted_means = []
        for column in neighbor_ratings.columns:
            valid_ratings = neighbor_ratings[column].dropna()
            if not valid_ratings.empty:
                weighted_avg = np.average(valid_ratings, weights=weights[:len(valid_ratings)])
                weighted_means.append((column, weighted_avg))
            else:
                weighted_means.append((column, np.nan))

        weighted_means_df = pd.DataFrame(weighted_means, columns=['movieId', 'weighted_mean']).set_index('movieId')
        global_avg_ratings = self.movies_info.set_index('movieId')['avg_rating']
        num_ratings = neighbor_ratings.count()

        combined = weighted_means_df.join(global_avg_ratings).join(num_ratings.rename('num_ratings'))
        combined.dropna(subset=['weighted_mean'], inplace=True)
        combined = combined[combined.index.isin(all_user_movie_ids)==False]
        combined_sorted = combined.sort_values(by=['weighted_mean', 'avg_rating', 'num_ratings'], ascending=[False, False, False])

        recommendations = combined_sorted.head(number_of_recommendations)
        recommendations = recommendations.join(self.movies_names.set_index('movieId'), on='movieId')
        recommendations['rank'] = range(1, number_of_recommendations+1)
        return recommendations

Now for testing...

In [63]:
recommender = ANN('sparse_ratings.pkl', 'ml-latest/movies.csv', 'suitable_movies.csv')

Find the appropriate user tests that we filtered out in "creating_test_users"

In [64]:
df_original = pd.read_pickle('sparse_ratings.pkl')
df_movies_info = pd.read_csv('ml-latest/movies.csv').set_index('movieId')
lst_test_files = []
for i in range(1,5):
    for j in range(7,12):
        lst_test_files.append(f'testing/n{i}_c{j}_full')

In [None]:
lst_overlaps = []
for file_name in lst_test_files:
    df_test = pd.read_pickle(file_name)
    overlaps = {}
    for user_id in df_test.index:
        
        test_movies = df_test['testMovies'][user_id]
        eval_movies = df_test['evalMovies'][user_id]
        test_ratings = [df_original[user_id][test_movies].values[x] for x in range(len(test_movies))]
        eval_ratings = [df_original[user_id][eval_movies].values[x] for x in range(len(eval_movies))]
        test_movies_titles = [df_movies_info[df_movies_info.index==x]['title'].values[0] for x in test_movies]
        eval_movies_titles = [df_movies_info[df_movies_info.index==x]['title'].values[0] for x in eval_movies]

        # for being nice and all but totally useless
        
        #test_df = pd.DataFrame()
        #test_df['title'] = test_movies_titles
        #test_df['rating'] = test_ratings
        #test_df['movieId'] = test_movies
        #eval_df = pd.DataFrame()
        #eval_df['title'] = eval_movies_titles
        #eval_df['rating'] = eval_ratings
        #eval_df['movieId'] = eval_movies
        
        # for every user feed the test movies to the recommender and see what it recommends
        test_dict = {}
        for i in range(len(test_movies)):
            test_dict[test_movies_titles[i]] = test_ratings[i]
        # we only want to see the evaluation on the good ones since recommender is trying to recommend the best ones
        eval_dict = {}
        for i in range(len(eval_movies)):
            if eval_ratings[i]>=4.0:
                eval_dict[eval_movies_titles[i]] = eval_ratings[i]
        # here we only calculate the intersection of recommended movies of the same length and the ones we know the user has rated positively (4.0+)
        recommendations = recommender.generate_recommendations(test_dict, len(eval_dict))
        recommendations_titles = recommendations['title'].values
        print(eval_dict.keys())
        print(recommendations_titles)
        overlaps[user_id] = len(list(set(eval_dict.keys()) & set(recommendations_titles)))/len(eval_dict)
        print(f'overlap - {overlaps[user_id]}')
    lst_overlaps.append(overlaps)
    pd.DataFrame(overlaps.items(), columns=['userId', 'overlap']).to_pickle(f'overlaps_{file_name}')

dict_keys(["Schindler's List (1993)", 'Philadelphia (1993)', 'Quiz Show (1994)', 'Birdcage, The (1996)', 'Happy Gilmore (1996)', 'Home Alone (1990)', 'Fatal Instinct (1993)'])
['Ace Ventura: Pet Detective (1994)' 'Babe (1995)' 'Home Alone (1990)'
 'Miracle on 34th Street (1994)' 'Under Siege 2: Dark Territory (1995)'
 'Robin Hood: Men in Tights (1993)'
 'Ace Ventura: When Nature Calls (1995)']
overlap - 0.14285714285714285
dict_keys(['Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)', 'Whiplash (2014)', 'Big Short, The (2015)', 'Amadeus (1984)', 'There Will Be Blood (2007)', 'Short Term 12 (2013)', 'Au revoir les enfants (1987)', 'Dead Poets Society (1989)', 'Mildred Pierce (1945)', 'Do the Right Thing (1989)', 'Sound of Metal (2019)', 'Rebel Without a Cause (1955)', 'Inside Llewyn Davis (2013)', 'Rounders (1998)', 'Master, The (2012)', 'Logan Lucky (2017)'])
['Arrival (2016)' 'Whiplash (2014)' 'Big Short, The (2015)'
 'Blade Runner 2049 (2017)' 'Social Netwo