### A notebook used to run exhaustive testing (2^12) ANN trainings and testings - to see the average RMSE, MAE, and RMSE StDev of recommendations of each combination (the violin graphs are made of this info)

In [10]:
import pandas as pd
from annoy import AnnoyIndex
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, median_absolute_error
import numpy as np

Building a basic ANN CF RecSys

In [11]:
class ANN:
    def __init__(self, user_item_matrix, movie_names_path, movies_info_path, num_neighbors=50):
        self.num_neighbors = num_neighbors
        self.user_item_matrix = user_item_matrix.replace(0.0,np.nan)
        self.movies_names = pd.read_csv(movie_names_path)
        self.movies_info = pd.read_csv(movies_info_path)
        self.factors = user_item_matrix.shape[1]
        self.annoy_index = AnnoyIndex(self.factors, 'angular')
        self.train()

    def train(self):
        for idx, user_vector in self.user_item_matrix.iterrows():
            vector = user_vector.fillna(0).tolist()
            self.annoy_index.add_item(idx, vector)  # Using DataFrame idx as itemID in Annoy
        self.annoy_index.build(10)

    def test(self, user_input):
        query_vector = [0] * self.factors
        for movie_id, rating in user_input.items():
            if movie_id in self.user_item_matrix.columns:
                movie_idx = self.user_item_matrix.columns.get_loc(movie_id)
                query_vector[movie_idx] = rating

        max_neighbors = min(self.num_neighbors, len(self.user_item_matrix) - 1)
        nearest_neighbors = self.annoy_index.get_nns_by_vector(query_vector, max_neighbors, include_distances=False)

        if not nearest_neighbors:
            return pd.Series()

        neighbor_ratings = self.user_item_matrix.loc[nearest_neighbors].mean(axis=0)
        return neighbor_ratings


In [12]:
def run_experiments(user_item_matrix, all_combinations_movie_ids, until, size, test_size=0.2):
    results = []
    for combination in all_combinations_movie_ids[until-size:until]:
        print(f'Processing combination: {combination}')
        start_time = time.time()
        
        user_id_stay = df_t.loc[:, df_t.columns.isin(combination)].replace(0.0,np.nan).dropna(how='any', axis=0).index.values
        df_tc = df_t.copy()
        df_tc['user_id_stay'] = 0
        df_tc.loc[user_id_stay, 'user_id_stay'] = 1

        train_df, test_df = train_test_split(df_tc, test_size=test_size, stratify=df_tc['user_id_stay'])
        
        
        print(f"training.... - time = {time.time() - start_time}")
        ann = ANN(train_df.drop(columns='user_id_stay'), 'ml-latest/movies.csv', 'suitable_movies.csv')
        #ann.train(train_df)
        
        print(f"predicting.... - time = {time.time() - start_time}")
        rmses, maes, med_aes = [], [], []
        test_df = test_df[test_df.user_id_stay != 0]
        test_df.drop(columns='user_id_stay')
        for user_id in test_df.index:
            # User input ratings for combination movies
            user_row = test_df.loc[user_id]
            #print(user_row)
            user_input = test_df.loc[user_id, combination].replace(0.0,np.nan).dropna().to_dict()
            #print(f'user_input - {user_input}')
            user_test_ = test_df.loc[user_id].replace(0.0,np.nan).dropna().drop('user_id_stay').to_dict()
            
            # Predict ratings for all movies based on the combination
            predicted_ratings = ann.test(user_input)

            # Actual ratings not in the combination for evaluation
            actual_ratings = {movie:float(rating) for movie, rating in user_test_.items() if movie not in user_input.keys()}
            predicted_ratings = predicted_ratings.loc[actual_ratings.keys()] # -> works

            actual_ratings_series = pd.Series(actual_ratings)
            #print(actual_ratings_series)
            predicted_ratings_series = predicted_ratings.loc[actual_ratings_series.index]
            #print(predicted_ratings_series)

            if not predicted_ratings_series.empty:
                valid = ~actual_ratings_series.isna() & ~predicted_ratings_series.isna()  # Filter out NaN values from both Series
                if valid.any():
                    rmse = np.sqrt(np.mean((predicted_ratings_series[valid] - actual_ratings_series[valid]) ** 2))
                    mae = mean_absolute_error(actual_ratings_series[valid], predicted_ratings_series[valid])
                    med_ae = median_absolute_error(actual_ratings_series[valid], predicted_ratings_series[valid])
                    rmses.append(rmse)
                    maes.append(mae)
                    med_aes.append(med_ae)
        print(f"appending results.... - time = {time.time() - start_time}")
        results.append({
            "Combination": combination,
            "Average RMSE": np.mean(rmses) if rmses else None,
            "RMSE StDev": np.std(rmses) if rmses else None,
            "Average MAE": np.mean(maes) if maes else None,
            "Median Absolute Error": np.median(med_aes) if med_aes else None,
            "Processing Time": time.time() - start_time
        })

    results_df = pd.DataFrame(results)
    return results_df

Now for testing...

Find the appropriate user tests that we filtered out in "creating_test_users"

In [13]:
df = pd.read_csv('suitable_movies.csv')
df_movies = pd.read_csv('ml-latest/movies.csv')
df = df.join(df_movies.set_index('movieId'), on='movieId')
top_6_genres_ordered = df.copy().groupby(['suitable_genre']).sum().sort_values(by='no_people_rated', ascending=False).head(6).index.values
amt_overall = df.copy().groupby(['suitable_genre']).sum().sort_values(by='no_people_rated', ascending=False)["no_people_rated"]
print(f'The Top 6 genres ({top_6_genres_ordered}) make up for {amt_overall.head(6).sum()*100/amt_overall.sum()} % of the ratings given')

The Top 6 genres (['Drama' 'Comedy' 'Thriller' 'Horror' 'Sci-Fi' 'Action']) make up for 96.12361676805486 % of the ratings given


Now for each of the genres let's pick movies that we assume our future user in the user study to know - it has to be popular and therefore influenced by the popularity bias but it is better then not knowing the movie and not giving a rating or an irrepresentative rating.

let's see the most rated moveis from each of the most popular categories

In [14]:
df

Unnamed: 0,movieId,userId,rating,timestamp,no_people_rated,avg_rating,suitable_genre,title,genres
0,1203,3738563090,96992.5,29894679700250,22730,4.267158,Drama,12 Angry Men (1957),Drama
1,527,13929219732,357340.5,103671567916204,84232,4.242337,Drama,Schindler's List (1993),Drama|War
2,1193,8118516317,207758.5,61568690477830,49316,4.212801,Drama,One Flew Over the Cuckoo's Nest (1975),Drama
3,750,5650978895,144160.5,42473159316615,34324,4.199991,Comedy,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
4,26082,213092748,5374.0,1897779020524,1282,4.191888,Drama,Harakiri (Seppuku) (1962),Drama
...,...,...,...,...,...,...,...,...,...
1073,1981,243854131,2772.0,1714465547406,1495,1.854181,Horror,Friday the 13th Part VIII: Jason Takes Manhatt...,Horror
1074,1760,552474506,6083.5,3830780656494,3317,1.834037,Comedy,Spice World (1997),Comedy
1075,6482,381293121,4065.5,2992591178695,2292,1.773778,Comedy,Dumb and Dumberer: When Harry Met Lloyd (2003),Comedy
1076,2555,285063817,2896.5,1898360097339,1679,1.725134,Comedy,Baby Geniuses (1999),Comedy


In [15]:
for genre in top_6_genres_ordered:
    print(df[df['suitable_genre'] == genre].sort_values(by="no_people_rated", ascending=False).head(10)[['suitable_genre', 'avg_rating','no_people_rated','title']])

    suitable_genre  avg_rating  no_people_rated  \
1            Drama    4.242337            84232   
2            Drama    4.212801            49316   
473          Drama    3.594554            37698   
147          Drama    3.912192            36557   
375          Drama    3.685078            33648   
50           Drama    4.049496            33265   
125          Drama    3.940875            32558   
83           Drama    3.989178            30402   
36           Drama    4.067927            26190   
227          Drama    3.822344            25738   

                                      title  
1                   Schindler's List (1993)  
2    One Flew Over the Cuckoo's Nest (1975)  
473                             Babe (1995)  
147                         Rain Man (1988)  
375                        Cast Away (2000)  
50                 Full Metal Jacket (1987)  
125               Dead Poets Society (1989)  
83               Requiem for a Dream (2000)  
36                      

Since this study does not have a target audience we really need to make these movies staples of the genre - well known, and not neccessarily well-rated. We will rather go by popularity than by ratings

In [16]:
import itertools

movie_lst = [ "Schindler's List (1993)", "Rain Man (1988)",                         # drama
              "Ace Ventura: Pet Detective (1994)", "Home Alone (1990)",             # comedy
              "Fugitive, The (1993)", "Spotlight (2015)",                           # thriller                
              "Shining, The (1980)", "Get Out (2017)",                              # horror            
              "Interstellar (2014)", "Arrival (2016)",                              # sci-fi
              "Butch Cassidy and the Sundance Kid (1969)", "Jason Bourne (2016)"    # action
]
movie_ids = [df[df['title'] == movie]['movieId'].values[0] for movie in movie_lst]
print(movie_ids)
# Define the length of the array
length = 12

# Generate all combinations of 0s and 1s
all_combinations = list(itertools.product([0, 1], repeat=length))
#print(all_combinations)
all_combinations_movie_ids = []

for combo in all_combinations:
    included = []
    for i in range(length):
        if combo[i] == 1:
            included.append(movie_ids[i])
    all_combinations_movie_ids.append(included)
all_combinations_movie_ids = all_combinations_movie_ids[1:]

[527, 1961, 344, 586, 457, 142488, 1258, 168250, 109487, 164179, 1304, 160438]


So for 6 categories and maximum 2 movies per category (the popular ones) - we have 2^12 train-test cases. This means we will have to train-test with a 80-20 split 2^12 = 4096 times

Granted though that the people who have rated those movies are stratified properly

In [17]:
for i in range(0,len(all_combinations_movie_ids),10):
    print(i)
    print(all_combinations_movie_ids[i])
    df_t = pd.read_pickle("sparse_ratings.pkl").T
    results_df = run_experiments(df_t, all_combinations_movie_ids, i, 10)
    results_df.to_pickle(f'results___{i}')

0
[160438]
10
[109487, 1304, 160438]
Processing combination: [160438]
training.... - time = 4.213114500045776
predicting.... - time = 77.06476426124573
appending results.... - time = 83.42117166519165
Processing combination: [1304]
training.... - time = 7.092653274536133
predicting.... - time = 77.25871896743774
appending results.... - time = 107.58003187179565
Processing combination: [1304, 160438]
training.... - time = 6.424811601638794
predicting.... - time = 86.67781782150269
appending results.... - time = 87.89513063430786
Processing combination: [164179]
training.... - time = 6.690570116043091
predicting.... - time = 85.04344916343689
appending results.... - time = 122.4670844078064
Processing combination: [164179, 160438]
training.... - time = 11.48620057106018
predicting.... - time = 88.28553605079651
appending results.... - time = 92.51370024681091
Processing combination: [164179, 1304]
training.... - time = 9.398784875869751
predicting.... - time = 91.76411986351013
appending