In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from annoy import AnnoyIndex
from scipy.sparse import csr_matrix, vstack
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import time
import matplotlib.pyplot as plt

In [None]:
# Example query: movie title (exactly as spelled in the movies file) -> rating.
user_input_example = {
    "Ace Ventura: Pet Detective (1994)": 4.3,
    "Interstellar (2014)": 3,
    "Schindler's List (1993)": 3.5,
    "Home Alone (1990)": 3.5,
}

# NOTE(review): not referenced by the cells visible below (the neighbour
# query hard-codes its own count) -- confirm which value is intended.
num_neighbors = 1000

In [None]:
# Movie metadata tables: `movies_info` supplies the `avg_rating` column and
# `movies_names` the title <-> movieId mapping used further down.
movies_info = pd.read_csv("suitable_movies.csv")
movies_names = pd.read_csv("ml-latest/movies.csv")

# NOTE(review): unpickling can execute arbitrary code -- only load
# 'sparse_ratings.pkl' if it comes from a trusted source.
df = pd.read_pickle('sparse_ratings.pkl')

# `df` is indexed by movieId (see the `df.index.get_loc(movie_id)` lookup
# below); transposing gives one row per user -- presumably users, TODO confirm.
user_item_matrix = df.T

In [None]:
# Dimensionality of the index = number of columns in the user-item matrix.
factors = len(user_item_matrix.columns)

# Empty Annoy index using the angular metric over `factors`-dimensional vectors.
annoy_index = AnnoyIndex(factors, 'angular')

In [None]:
# Register every row of the user-item matrix in the index, keyed by its
# positional row number; missing ratings are encoded as 0.
user_count = len(user_item_matrix)
for user_id in range(user_count):
    user_row = user_item_matrix.iloc[user_id]
    user_vector = user_row.fillna(0).tolist()
    annoy_index.add_item(user_id, user_vector)

# Build the forest with 10 trees; no items can be added after this call.
annoy_index.build(10)

In [None]:
# Ratings the query vector is generated from (movie title -> rating).
# NOTE(review): this repeats the definition in the configuration cell near
# the top of the notebook; keep the two in sync or drop one of them.
user_input_example = {
    "Ace Ventura: Pet Detective (1994)": 4.3,
    "Interstellar (2014)": 3,
    "Schindler's List (1993)": 3.5,
    "Home Alone (1990)": 3.5,
}

In [None]:
# Turn the title -> rating dict into a dense query vector laid out like the
# rows stored in the Annoy index (one slot per movie, 0 = not rated).
all_user_movie_ids = []       # movieIds the user rated; excluded from results later
query_vector = [0] * factors

for movie_title, rating in user_input_example.items():
    # Single scan of the title column; unknown titles are silently skipped.
    matching_ids = movies_names.loc[movies_names['title'] == movie_title, 'movieId']
    if matching_ids.empty:
        continue

    # First match wins when a title appears more than once.
    movie_id = matching_ids.iloc[0]
    all_user_movie_ids.append(movie_id)

    # A title can be present in the movies file while its movieId is absent
    # from the ratings matrix; `get_loc` would raise KeyError for it, so such
    # movies are left out of the query vector instead of aborting the cell.
    if movie_id in df.index:
        movie_idx = df.index.get_loc(movie_id)
        query_vector[movie_idx] = rating

In [None]:
# Look up the 100 indexed rows closest to the query vector, together with
# their angular distances (parallel lists, nearest first).
# NOTE(review): the count is hard-coded here although `num_neighbors` is
# defined in the configuration cell -- confirm which value is intended.
nearest_neighbors, distances = annoy_index.get_nns_by_vector(
    query_vector,
    100,
    include_distances=True,
)

In [None]:
# Similarity weight per neighbour: the smaller the distance, the larger the
# weight. `weights[i]` belongs to `nearest_neighbors[i]`.
weights = np.exp(-np.array(distances))

# Ratings of the selected neighbours, in the same order as `weights`
# (one row per neighbour, one column per movie).
neighbor_ratings = user_item_matrix.iloc[nearest_neighbors]

# Weighted mean rating per movie over the neighbours that actually rated it.
# Each rating must be paired with the weight of the neighbour who gave it:
# slicing `weights[:k]` after dropping NaNs would pair ratings with the
# weights of the first k neighbours instead, so a boolean mask is used to
# keep rows and weights aligned.
ratings_matrix = neighbor_ratings.to_numpy(dtype=float)
rated_mask = ~np.isnan(ratings_matrix)
weight_column = weights[:, np.newaxis]

weighted_sums = (np.where(rated_mask, ratings_matrix, 0.0) * weight_column).sum(axis=0)
weight_totals = (rated_mask * weight_column).sum(axis=0)

# Movies that no neighbour rated have a zero weight total and stay NaN.
weighted_avgs = np.full(ratings_matrix.shape[1], np.nan)
np.divide(weighted_sums, weight_totals, out=weighted_avgs, where=weight_totals > 0)

# Kept as a list of (movieId, weighted mean) pairs, as before.
weighted_means = list(zip(neighbor_ratings.columns, weighted_avgs))

weighted_means_df = pd.DataFrame(weighted_means, columns=['movieId', 'weighted_mean']).set_index('movieId')

In [None]:
# Per-movie global average rating, keyed by movieId.
global_avg_ratings = movies_info.set_index('movieId')['avg_rating']

# Number of neighbours with a (non-missing) rating for each movie.
num_ratings = neighbor_ratings.count()

# One row per movie: neighbour-weighted mean, global average, rating count.
# Movies without a weighted mean (no neighbour rated them) are discarded.
combined = (
    weighted_means_df
    .join(global_avg_ratings)
    .join(num_ratings.rename('num_ratings'))
    .dropna(subset=['weighted_mean'])
)

# Never recommend something the user has already rated.
already_rated = combined.index.isin(all_user_movie_ids)
combined = combined[~already_rated]

# Best first: weighted mean, then global average rating, then number of
# neighbour ratings as successive tie-breakers (all descending).
combined_sorted = combined.sort_values(
    by=['weighted_mean', 'avg_rating', 'num_ratings'],
    ascending=False,
)

In [None]:
# Keep the best-scoring candidates and attach the movie metadata
# (title etc.) by joining on movieId.
top_n = 5
recommendations = combined_sorted.head(top_n)
recommendations = recommendations.join(movies_names.set_index('movieId'), on='movieId')

# Rank 1 = best. The range is sized from the frame itself: a fixed
# range(1, 6) raises ValueError whenever the frame does not hold exactly
# five rows (e.g. fewer than five candidates survived the filtering).
recommendations['rank'] = range(1, len(recommendations) + 1)

In [None]:
# Show a heading followed by the recommendation table on the next line.
print(
    "Top movies recommended based on nearest neighbors with weighted ratings:",
    recommendations,
    sep="\n",
)