In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy

In [2]:
df = pd.read_pickle('sparse_ratings.pkl')

In [3]:
user_input_example = {"Ace Ventura: Pet Detective (1994)": 4.3,
                      "Interstellar (2014)":3,
                      "Schindler's List (1993)":3.5,
                      "Home Alone (1990)":3.5}

In [4]:
df

userId,1,2,3,4,5,6,7,8,9,10,...,330966,330967,330968,330969,330970,330971,330972,330973,330974,330975
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
527,0.0,5.0,0.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,...,0.0,4.0,0.0,0.0,0.0,4.5,4.0,0.0,4.5,3.5
1193,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,3.0
750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1760,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now to go through this more efficiently - we propose to only look for similarity between users that have rated some movie that also the previous person rated (essentially only taking the movieId that is relevant which might help make our solution space smaller 100-fold, or more...). As a backup - we will see their average rating per genre and then look for an approximate nearest neighbor within those genres

In [5]:
movies_info = pd.read_csv("suitable_movies.csv")

In [6]:
movies_info

Unnamed: 0,movieId,userId,rating,timestamp,no_people_rated,avg_rating,suitable_genre
0,1203,3738563090,96992.5,29894679700250,22730,4.267158,Drama
1,527,13929219732,357340.5,103671567916204,84232,4.242337,Drama
2,1193,8118516317,207758.5,61568690477830,49316,4.212801,Drama
3,750,5650978895,144160.5,42473159316615,34324,4.199991,Comedy
4,26082,213092748,5374.0,1897779020524,1282,4.191888,Drama
...,...,...,...,...,...,...,...
1073,1981,243854131,2772.0,1714465547406,1495,1.854181,Horror
1074,1760,552474506,6083.5,3830780656494,3317,1.834037,Comedy
1075,6482,381293121,4065.5,2992591178695,2292,1.773778,Comedy
1076,2555,285063817,2896.5,1898360097339,1679,1.725134,Comedy


In [7]:
movies_names = pd.read_csv("ml-latest/movies.csv")
movies_names

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
86532,288967,State of Siege: Temple Attack (2021),Action|Drama
86533,288971,Ouija Japan (2021),Action|Horror
86534,288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary
86535,288977,Skinford: Death Sentence (2023),Crime|Thriller


In [8]:
movies_info_ = movies_info.copy().set_index("movieId")
joined = movies_names.copy().join(movies_info_, how="inner", on="movieId", lsuffix="_left", rsuffix="_right")
joined


Unnamed: 0,movieId,title,genres,userId,rating,timestamp,no_people_rated,avg_rating,suitable_genre
4,5,Father of the Bride Part II (1995),Comedy,2603834260,48619.0,15955867010862,15801,3.076957,Comedy
7,8,Tom and Huck (1995),Adventure|Children,260860097,4939.0,1572105851435,1584,3.118056,Adventure
8,9,Sudden Death (1995),Action,744359812,13703.5,4344284015054,4563,3.003178,Action
13,14,Nixon (1995),Drama,1125655409,23771.5,6563433835468,6933,3.428747,Drama
17,18,Four Rooms (1995),Comedy,1109373960,22874.0,7455723682994,6688,3.420156,Comedy
...,...,...,...,...,...,...,...,...,...
69608,223944,The Social Dilemma (2020),Documentary|Drama,174925447,3811.5,1699918539618,1048,3.636927,Drama
70004,225145,Nomadland (2020),Drama,198977021,4321.0,1954385174038,1195,3.615900,Drama
70197,225625,Borat Subsequent Moviefilm (2020),Comedy,233011136,4686.5,2314855294368,1425,3.288772,Comedy
70313,225984,Another Round (2020),Drama,232631047,5237.5,2252471175918,1376,3.806323,Drama


In [9]:
def user_genres(user_input):
    user_genres_ = []
    for key, value in user_input.items():
        movie_id = movies_names[movies_names['title']==key]["movieId"].values[0]
        genre = movies_info[movies_info["movieId"]==movie_id]['suitable_genre'].values[0]
        if genre not in user_genres_: user_genres_.append(genre)
    return user_genres_
print(user_genres(user_input_example))

['Comedy', 'Sci-Fi', 'Drama']


This methods finds all the users that have rated at least one of the movies from what the user provided - we don't care whether the rating is low or high - both give us good information - at the end we will insert the new user in the dataframe so to compute the distance metric reliably. This is however done through an external method so to adhere to the DRY principles.

In [10]:
def movies_only_valid_movies(user_input):
    df_lst = []
    for key,value in user_input.items():
        movie_id = movies_names[movies_names['title']==key]["movieId"].values[0]
        all_user_ratings_specific_movie = df[df.index==movie_id]
        df_lst.append(all_user_ratings_specific_movie)
    df_lst = pd.concat(df_lst)
    #return df_lst
    # can delete the following 2 rows to also have the 0 cols on the specified movies but it saves a lot of similarity computation
    df_lst = df_lst.replace(0.0,np.nan).dropna(axis=1,how="all")
    df_lst = df_lst.replace(np.nan, 0.0)
    return df_lst
print(movies_only_valid_movies(user_input_example))

userId   2       3       4       5       7       9       14      15      \
movieId                                                                   
344         3.0     0.0     0.0     0.0     0.0     3.0     1.0     0.0   
109487      0.0     5.0     4.5     0.0     0.0     0.0     0.0     0.0   
527         5.0     0.0     0.0     4.0     4.0     4.0     0.0     3.0   
586         4.0     0.0     0.0     0.0     0.0     3.0     0.5     0.0   

userId   16      17      ...  330951  330957  330961  330963  330965  330967  \
movieId                  ...                                                   
344         0.0     0.0  ...     1.0     0.0     1.0     4.0     3.0     0.0   
109487      4.5     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
527         0.0     5.0  ...     5.0     3.0     4.0     0.0     0.0     4.0   
586         0.0     0.0  ...     4.0     0.0     4.0     0.0     0.0     0.0   

userId   330971  330972  330974  330975  
movieId                   

This methods finds all the movies that have overlapping genres with what the user provided - we don't care whether the rating is low or high - both give us good information - at the end we will insert the new user in the dataframe so to compute the distance metric reliably. This is however done through an external method so to adhere to the DRY principles.

In [11]:
def movies_only_valid_genres(user_input):
    valid_genres = user_genres(user_input)
    movies_with_same_genres_id = movies_info[movies_info['suitable_genre'].isin(valid_genres)]
    movies_with_same_genres_id.set_index("movieId", inplace=True)
    print(movies_with_same_genres_id)
    movies_with_same_genres_title = movies_names.join(movies_with_same_genres_id, on='movieId', how='inner', lsuffix="l", rsuffix="r")["title"].values
    arbitrary_ratings = {movie_title:0 for movie_title in movies_with_same_genres_title}
    return movies_only_valid_movies(arbitrary_ratings)
test = movies_only_valid_genres(user_input_example)
#print(test)

              userId    rating        timestamp  no_people_rated  avg_rating  \
movieId                                                                        
1203      3738563090   96992.5   29894679700250            22730    4.267158   
527      13929219732  357340.5  103671567916204            84232    4.242337   
1193      8118516317  207758.5   61568690477830            49316    4.212801   
750       5650978895  144160.5   42473159316615            34324    4.199991   
26082      213092748    5374.0    1897779020524             1282    4.191888   
...              ...       ...              ...              ...         ...   
3564       320502095    3680.0    2354590713650             1969    1.868969   
1760       552474506    6083.5    3830780656494             3317    1.834037   
6482       381293121    4065.5    2992591178695             2292    1.773778   
2555       285063817    2896.5    1898360097339             1679    1.725134   
57532      194063825    1903.0    163676

Here we just define a helper method to compute the most similar users in the dataframes

In [12]:
import time
def most_similar_users_cos(df, user_ratings):
    movie_ids = []
    # laplace_constant = .001 - was meant for smoothing - might need it later
    user_ratings_sparse = [0 for _ in range(len(df))]
    df["user_ratings"] = user_ratings_sparse
    for movie_title,rating in user_ratings.items():
        movie_id = movies_names[movies_names['title']==movie_title]["movieId"].values[0]
        movie_ids.append(movie_id)
        # if the user has not provided movies that are relevant all the ratings will be 0 (which is not a valid value since the rating scale is 1-5 but that is ok because then it will say that they are most similar with the people that haven't rated the movies (haven't seen them) as well
        if movie_id in df.index.values:
            df.loc[movie_id, "user_ratings"] = rating
    # now compute the cosine distance - the problem here arises if the user has not provided any ratings - since dividing by a 0 is not allowed. for now - we will only work on the case where it is allowed
    our_user = df["user_ratings"].values
    cos_distances = []
    if np.any(our_user):
        print("Ratings of relevant movies found - computing similarity metrics")
        beginning = time.time()
        for i in range(df.shape[1] - 1):
            other_user = df.iloc[:,i].values
            cos_distance = np.dot(our_user, other_user)/(np.sqrt(np.sum([x*x for x in our_user]) * np.sqrt(np.sum([y*y for y in other_user]))))
            cos_distances.append(cos_distance)
        number_of_users = np.min([1000, len(cos_distances) - 1])
        top_indices = np.argsort(-np.array(cos_distances))[:number_of_users]
        top_indices = np.append(top_indices, -1)
        top_users = df[df.copy().T.index[top_indices]]
        print(f"Found the {number_of_users} most similar users in {time.time()-beginning} seconds")
        print("The following movies were the only ones with not all 0 ratings: ")
        non_zero_movies = []
        for i in range(len(top_users)):
            if np.sum(top_users.iloc[i]) > 0 : non_zero_movies.append(i)
        top_users = top_users.iloc[non_zero_movies]
        return top_users[top_users.index.isin(movie_ids)==False]
    else:
        print("No ratings were provided by the user - not able to compute similarity metrics")
        return None
most_similar_users = most_similar_users_cos(test, user_input_example)
print(most_similar_users)

  df.loc[movie_id, "user_ratings"] = rating


Ratings of relevant movies found - computing similarity metrics
Found the 1000 most similar users in 107.47016835212708 seconds
The following movies were the only ones with not all 0 ratings: 
userId   224578  268189  65580  313810  226716  148974  55685  142798  29723  \
movieId                                                                        
5           0.0     0.0    0.0     0.0     0.0     0.0    0.0     0.0    0.0   
14          0.0     0.0    0.0     0.0     0.0     0.0    0.0     0.0    0.0   
18          0.0     0.0    0.0     0.0     0.0     0.0    0.0     0.0    0.0   
19          0.0     0.0    0.0     0.0     0.0     0.0    0.0     0.0    0.0   
26          0.0     0.0    0.0     0.0     0.0     0.0    0.0     0.0    0.0   
...         ...     ...    ...     ...     ...     ...    ...     ...    ...   
212361      0.0     0.0    0.0     0.0     0.0     0.0    0.0     0.0    0.0   
217655      0.0     0.0    0.0     0.0     0.0     0.0    0.0     0.0    0.0   
223944 

Now that we have the n most similar users - we can see what movies to recommend from them...

In [13]:
def recommend_by_mean(df, N=5):
    mean_ratings = df.replace(0, np.nan).mean(axis=1).dropna()
    recommendations = mean_ratings.sort_values(ascending=False).head(N)
    return recommendations

top_movies_by_mean = recommend_by_mean(most_similar_users)
print("Top movies recommended by mean ratings:")
print(top_movies_by_mean)

Top movies recommended by mean ratings:
movieId
78836    5.0
2860     5.0
8827     5.0
501      5.0
36527    5.0
dtype: float64


For some movies we can see that this doesnt work very well since there is only one rating - maybe let's take the average rating into account as well as a tiebreaker

In [14]:
def recommend_with_tiebreaking(df, global_avg, movies_info, movies_names, N=5):
    # Calculate the mean rating for each movie within the similar users, ignoring zero ratings
    mean_ratings = df.replace(0, np.nan).mean(axis=1)
    # Count the number of non-zero ratings for each movie
    num_ratings = df[df > 0].count(axis=1)
    
    # Create a DataFrame for the combined data
    combined = pd.DataFrame({
        'movieId': mean_ratings.index,
        'weighted_mean': mean_ratings,
        'global_avg': global_avg.reindex(mean_ratings.index),
        'num_ratings': num_ratings
    }).dropna(subset=['weighted_mean'])
    
    combined_sorted = combined.sort_values(by=['weighted_mean', 'global_avg', 'num_ratings'], ascending=[False, False, False])

    recommendations = combined_sorted.head(N)
    recommendations = recommendations.join(movies_names.set_index('movieId'), on=recommendations.index)
    recommendations = recommendations.assign(rank=range(1, N + 1))
    return recommendations

global_avg_ratings = movies_info.set_index('movieId')['avg_rating']
movies_titles = movies_names[['movieId', 'title']]

top_movies_with_tiebreaking = recommend_with_tiebreaking(most_similar_users, global_avg_ratings, movies_info, movies_titles)
print("Top movies recommended with tiebreaking based on global average rating:")
print(top_movies_with_tiebreaking)


Top movies recommended with tiebreaking based on global average rating:
         key_0  movieId  weighted_mean  global_avg  num_ratings  \
movieId                                                           
3134      3134     3134            5.0    4.116348            1   
3089      3089     3089            5.0    4.108164            1   
1228      1228     1228            5.0    4.036668            1   
214        214      214            5.0    4.030982            1   
66371    66371    66371            5.0    3.997815            1   

                                                     title  rank  
movieId                                                           
3134            Grand Illusion (La grande illusion) (1937)     1  
3089     Bicycle Thieves (a.k.a. The Bicycle Thief) (a....     2  
1228                                    Raging Bull (1980)     3  
214                  Before the Rain (Pred dozhdot) (1994)     4  
66371                        Departures (Okuribito) (200