In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy

In [None]:
# Ratings frame; later cells select its rows by movieId and treat each column as one user.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
df = pd.read_pickle('sparse_ratings.pkl')

In [None]:
# Example input: movie title (spelled as in movies.csv) -> rating given by the new user.
user_input_example = {
    "Ace Ventura: Pet Detective (1994)": 4.3,
    "Interstellar (2014)": 3,
    "Schindler's List (1993)": 3.5,
    "Home Alone (1990)": 3.5,
}

In [None]:
# Display the loaded ratings frame.
df

To do this more efficiently, we propose to only compute similarity against users who have rated at least one movie that the input user also rated (essentially keeping only the relevant movieIds, which might shrink our solution space 100-fold or more). As a backup, we will look at their average rating per genre and then search for an approximate nearest neighbor within those genres.

In [None]:
# Per-movie metadata; later cells read its movieId, suitable_genre and avg_rating columns.
movies_info = pd.read_csv("suitable_movies.csv")

In [None]:
# Display the per-movie metadata.
movies_info

In [None]:
# Title lookup table; later cells use its movieId and title columns to map titles to ids.
movies_names = pd.read_csv("ml-latest/movies.csv")
movies_names

In [None]:
# Index the metadata by movieId so it can be matched against the movieId column of the title table.
# set_index and join both return new frames, so the inputs are left untouched.
movies_info_ = movies_info.set_index("movieId")
joined = movies_names.join(
    movies_info_,
    on="movieId",
    how="inner",
    lsuffix="_left",
    rsuffix="_right",
)
joined


In [None]:
def user_genres(user_input):
    """Collect the distinct ``suitable_genre`` values of the movies in ``user_input``.

    Parameters
    ----------
    user_input : dict
        Maps a movie title (spelled as in ``movies_names['title']``) to a
        rating. Only the titles are used here.

    Returns
    -------
    list
        Genres in order of first appearance, without duplicates. A title that
        is not in ``movies_names``, or whose movie has no row in
        ``movies_info``, is skipped instead of raising ``IndexError``.
    """
    user_genres_ = []
    for title in user_input:
        id_matches = movies_names.loc[movies_names["title"] == title, "movieId"]
        if id_matches.empty:
            # Unknown title: there is no movieId to look up.
            continue
        # If a title occurs more than once, the first listed movie wins.
        movie_id = id_matches.iloc[0]
        genre_matches = movies_info.loc[movies_info["movieId"] == movie_id, "suitable_genre"]
        if genre_matches.empty:
            # The movie exists but has no metadata row.
            continue
        genre = genre_matches.iloc[0]
        if genre not in user_genres_:
            user_genres_.append(genre)
    return user_genres_
# Sanity check: genres covered by the example input.
print(user_genres(user_input_example))

This method finds all the users who have rated at least one of the movies the user provided. We don't care whether the rating is low or high - both give us useful information. At the end we will insert the new user into the dataframe so that the distance metric can be computed reliably. This is, however, done through a separate method in order to adhere to the DRY principle.

In [None]:
def movies_only_valid_movies(user_input):
    """Return the rows of ``df`` for the movies named in ``user_input``.

    Only users (columns) with at least one non-zero rating among the selected
    movies are kept, which saves a lot of similarity computation later on.

    Parameters
    ----------
    user_input : dict
        Maps a movie title (spelled as in ``movies_names['title']``) to a
        rating. Only the titles are used here.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` in the order the titles were given. Columns whose
        selected values are all 0 (or NaN) are dropped, and any remaining NaN
        is replaced by 0.0. Titles that are unknown or whose movie is not in
        ``df`` contribute no rows; an empty ``user_input`` yields an empty
        frame instead of raising.
    """
    # Build the title -> movieId lookup once; keeping the first occurrence of
    # a duplicated title matches a "first match wins" lookup.
    first_id_by_title = movies_names.drop_duplicates(subset="title").set_index("title")["movieId"]
    movie_ids = [
        first_id_by_title.loc[title]
        for title in user_input
        if title in first_id_by_title.index
    ]
    # A single label-based selection keeps the input order and avoids one
    # full scan of df per title.
    present_ids = [movie_id for movie_id in movie_ids if movie_id in df.index]
    selected = df.loc[present_ids]
    # Drop users without any rating for the selected movies, then restore 0.0
    # as the "no rating" placeholder.
    selected = selected.replace(0.0, np.nan).dropna(axis=1, how="all")
    return selected.replace(np.nan, 0.0)
# Sanity check: rating rows for the example movies, limited to users who rated any of them.
print(movies_only_valid_movies(user_input_example))

This method finds all the movies whose genre overlaps with the genres of the movies the user provided. We don't care whether the rating is low or high - both give us useful information. At the end we will insert the new user into the dataframe so that the distance metric can be computed reliably. This is, however, done through a separate method in order to adhere to the DRY principle.

In [None]:
def movies_only_valid_genres(user_input):
    """Return the rating rows of every movie sharing a genre with ``user_input``.

    Parameters
    ----------
    user_input : dict
        Maps a movie title to a rating; forwarded to ``user_genres`` to find
        the genres of interest.

    Returns
    -------
    pandas.DataFrame
        Whatever ``movies_only_valid_movies`` returns for the titles of all
        movies whose ``suitable_genre`` is one of the user's genres.
    """
    valid_genres = user_genres(user_input)
    # Filter with isin instead of mutating a filtered slice in place and
    # joining it back; titles come out in movies_names order.
    same_genre_ids = movies_info.loc[movies_info["suitable_genre"].isin(valid_genres), "movieId"]
    same_genre_titles = movies_names.loc[movies_names["movieId"].isin(same_genre_ids), "title"]
    # The rating values are placeholders: only the titles are needed to
    # select the rows.
    arbitrary_ratings = {movie_title: 0 for movie_title in same_genre_titles}
    return movies_only_valid_movies(arbitrary_ratings)
# Genre-filtered ratings for the example input; reused below as the candidate pool for the similarity search.
test = movies_only_valid_genres(user_input_example)
# Printing is disabled here (left commented out).
#print(test)

Here we define a helper method that finds the most similar users in the dataframe.

In [None]:
import time
def most_similar_users_cos(df, user_ratings, n_users=1000):
    """Select the users whose rating vectors are most cosine-similar to a new user.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with movieIds as the index and one column per user; 0 is used
        as the "no rating" placeholder. The frame is not modified.
    user_ratings : dict
        Maps a movie title (spelled as in ``movies_names['title']``) to the
        new user's rating. Unknown titles are ignored.
    n_users : int, default 1000
        Maximum number of similar users to keep.

    Returns
    -------
    pandas.DataFrame or None
        Columns are the selected users ordered from most to least similar,
        followed by a ``"user_ratings"`` column holding the new user's vector.
        Rows are the movies that have a positive rating sum among those
        columns and that the new user has not rated. ``None`` is returned
        when none of the user's movies occur in ``df``, because the cosine
        similarity is undefined for an all-zero vector.
    """
    movie_ids = []
    # Start from floats so that fractional ratings such as 4.3 fit without a
    # dtype change.
    our_user = pd.Series(0.0, index=df.index)
    for movie_title, rating in user_ratings.items():
        id_matches = movies_names.loc[movies_names["title"] == movie_title, "movieId"]
        if id_matches.empty:
            continue
        movie_id = id_matches.iloc[0]
        movie_ids.append(movie_id)
        # Movies missing from df keep the 0 placeholder.
        if movie_id in df.index:
            our_user.loc[movie_id] = rating

    target = our_user.to_numpy()
    if not np.any(target):
        print("No ratings were provided by the user - not able to compute similarity metrics")
        return None

    print("Ratings of relevant movies found - computing similarity metrics")
    beginning = time.time()
    # A leftover "user_ratings" column (e.g. from an earlier run that wrote
    # into the frame) is not a candidate user.
    others = df.drop(columns="user_ratings") if "user_ratings" in df.columns else df
    ratings_matrix = others.to_numpy(dtype=float)
    dot_products = target @ ratings_matrix
    # cos(a, b) = a.b / (|a| * |b|)
    norm_products = np.linalg.norm(target) * np.linalg.norm(ratings_matrix, axis=0)
    similarities = np.zeros(len(dot_products))
    comparable = norm_products > 0
    # Users without any rating keep similarity 0 instead of producing NaN.
    similarities[comparable] = dot_products[comparable] / norm_products[comparable]

    number_of_users = max(0, min(n_users, len(similarities)))
    # A stable sort keeps ties in column order, so the result is reproducible.
    top_indices = np.argsort(-similarities, kind="stable")[:number_of_users]
    top_users = others.iloc[:, top_indices].copy()
    top_users["user_ratings"] = target
    print(f"Found the {number_of_users} most similar users in {time.time()-beginning} seconds")
    print("The following movies were the only ones with not all 0 ratings: ")
    has_any_rating = (top_users.sum(axis=1) > 0).to_numpy()
    already_rated = top_users.index.isin(movie_ids)
    return top_users[has_any_rating & ~already_rated]
# Rank the users of the genre-filtered frame against the example input.
most_similar_users = most_similar_users_cos(test, user_input_example)
print(most_similar_users)

Now that we have the n most similar users, we can see which movies to recommend based on their ratings.

In [None]:
def recommend_by_mean(df, N=5):
    """Rank movies by their mean rating and return the best ``N``.

    Zeros are treated as missing ratings, so they do not pull the mean down;
    movies without any non-zero rating are left out.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per movie, one column per user.
    N : int, default 5
        Number of movies to return.

    Returns
    -------
    pandas.Series
        Mean rating per movie, sorted from highest to lowest.
    """
    rated_only = df.replace(0, np.nan)
    per_movie_mean = rated_only.mean(axis=1)
    ranked = per_movie_mean.dropna().sort_values(ascending=False)
    return ranked.head(N)

# Recommend from the similar users found above, using the default of 5 movies.
top_movies_by_mean = recommend_by_mean(most_similar_users)
print("Top movies recommended by mean ratings:")
print(top_movies_by_mean)

For some movies we can see that this doesn't work very well, since there is only one rating - so let's also take the global average rating into account as a tiebreaker.

In [None]:
def recommend_with_tiebreaking(df, global_avg, movies_info, movies_names, N=5):
    """Recommend the top ``N`` movies, breaking ties in the mean rating.

    Movies are ordered by their mean non-zero rating among the given users,
    then by their global average rating, then by how many of the given users
    rated them (all descending).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per movie (indexed by movieId), one column per user; 0 means
        "no rating".
    global_avg : pandas.Series
        Global average rating indexed by movieId.
    movies_info : object
        Not used; kept so existing calls keep working.
    movies_names : pandas.DataFrame
        Must contain a ``movieId`` column; its remaining columns (e.g.
        ``title``) are attached to the result.
    N : int, default 5
        Maximum number of recommendations.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId``, ``weighted_mean``, ``global_avg``,
        ``num_ratings``, the columns taken from ``movies_names`` and a 1-based
        ``rank``. Fewer than ``N`` rows are returned when fewer movies have a
        rating.
    """
    # Mean rating per movie among the similar users, ignoring zero ratings.
    mean_ratings = df.replace(0, np.nan).mean(axis=1)
    # Number of non-zero ratings per movie.
    num_ratings = df[df > 0].count(axis=1)

    combined = pd.DataFrame({
        'movieId': mean_ratings.index,
        'weighted_mean': mean_ratings,
        'global_avg': global_avg.reindex(mean_ratings.index),
        'num_ratings': num_ratings
    }).dropna(subset=['weighted_mean'])

    combined_sorted = combined.sort_values(
        by=['weighted_mean', 'global_avg', 'num_ratings'],
        ascending=[False, False, False],
    )

    recommendations = combined_sorted.head(N)
    # The frame is indexed by movieId, so a plain index join attaches the
    # titles without adding an extra key column.
    recommendations = recommendations.join(movies_names.set_index('movieId'))
    # Number the rows actually present; there may be fewer than N.
    recommendations = recommendations.assign(rank=range(1, len(recommendations) + 1))
    return recommendations

# Global average rating per movie, indexed by movieId, used as the first tiebreaker.
global_avg_ratings = movies_info.set_index('movieId')['avg_rating']
# Only the id and title columns are needed to label the recommendations.
movies_titles = movies_names[['movieId', 'title']]

top_movies_with_tiebreaking = recommend_with_tiebreaking(most_similar_users, global_avg_ratings, movies_info, movies_titles)
print("Top movies recommended with tiebreaking based on global average rating:")
print(top_movies_with_tiebreaking)
