In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def load_embeddings(npz_path):
    """
    Load IDs and embeddings from the specified .npz file.

    Parameters
    ----------
    npz_path : str or path-like
        Location of an ``.npz`` archive that stores an ``'ids'`` array and a
        ``'features'`` array.

    Returns
    -------
    ids : numpy.ndarray
        The array stored under the ``'ids'`` key.
    embeddings : numpy.ndarray
        The array stored under the ``'features'`` key.
        (Presumably row ``i`` belongs to ``ids[i]`` -- the archive itself is
        not validated here.)

    Raises
    ------
    KeyError
        If the archive has no ``'ids'`` or ``'features'`` entry.
    """
    # SECURITY: allow_pickle=True permits unpickling object arrays, which can run
    # arbitrary code. It is kept for compatibility with archives whose 'ids' are
    # stored as object arrays -- only load files from a trusted source.
    #
    # NpzFile keeps the underlying zip file open until closed; the context manager
    # releases the handle once both arrays have been read into memory.
    with np.load(npz_path, allow_pickle=True) as data:
        ids = data['ids']
        embeddings = data['features']
    return ids, embeddings


def find_similar_movies(query_id, k, ids, embeddings):
    """
    Finds the top k most similar movies to the given movie ID based on cosine similarity.

    Parameters
    ----------
    query_id : scalar
        ID of the movie to search around; compared against ``ids`` with ``==``.
    k : int
        Maximum number of neighbours to return. Fewer are returned when there
        are not ``k`` other movies.
    ids : array-like
        Movie IDs; position ``i`` is used as the row index into ``embeddings``.
    embeddings : array of shape (n_movies, n_features)
        One embedding row per entry in ``ids``.

    Returns
    -------
    top_k_ids : numpy.ndarray
        IDs of the most similar movies, most similar first. The query movie
        itself is never included.
    top_k_scores : numpy.ndarray
        Cosine similarity of each returned movie to the query, same order.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    IndexError
        If ``query_id`` does not occur in ``ids``.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Accept plain lists as well as arrays so that `ids == query_id` is elementwise.
    ids = np.asarray(ids)

    # Find the index of the given movie ID (first occurrence if duplicated)
    matches = np.flatnonzero(ids == query_id)
    if matches.size == 0:
        # IndexError is kept as the exception type for backward compatibility.
        raise IndexError(f"Movie ID {query_id!r} not found in ids")
    query_index = matches[0]

    # Compute cosine similarities between the query movie and all others
    similarities = cosine_similarity(
        embeddings[query_index].reshape(1, -1), embeddings
    ).flatten()

    # Rank all movies from most to least similar, then drop the query by index.
    # Simply skipping the first ranked entry is wrong when another movie ties
    # with the query (duplicate embedding) or when the query is a zero vector:
    # the query is then not guaranteed to be ranked first.
    ranked = np.argsort(similarities)[::-1]
    top_k_indices = ranked[ranked != query_index][:k]

    # Get the corresponding IDs and similarity scores
    top_k_ids = ids[top_k_indices]
    top_k_scores = similarities[top_k_indices]
    return top_k_ids, top_k_scores


def display_similar_movies(query_id, top_k_ids, top_k_scores, metadata):
    """
    Print a report for a query movie followed by its nearest neighbours.

    The report starts with the query's ID, title and genres, then lists each
    similar movie with the same fields plus its similarity score (4 decimals),
    each entry followed by an 80-character dashed rule.

    ``metadata`` maps a movie ID to a ``(title, genres)`` pair; IDs missing
    from it are shown with "Unknown" for both fields. ``top_k_ids`` and
    ``top_k_scores`` are walked in parallel.
    """
    fallback = ("Unknown", "Unknown")
    rule = "-" * 80

    def describe(movie_id):
        # (title, genres) for the ID, or the placeholder pair when absent.
        return metadata.get(movie_id, fallback)

    # Header: the movie the search was run for.
    query_title, query_genres = describe(query_id)
    print(f"Query Movie:\nID: {query_id}\nTitle: {query_title}\nGenres: {query_genres}\n")
    print("Most Similar Movies:")

    # One entry per neighbour, in the order given.
    for sim_id, score in zip(top_k_ids, top_k_scores):
        title, genres = describe(sim_id)
        print(f"ID: {sim_id}\nTitle: {title}\nGenres: {genres}\nSimilarity Score: {score:.4f}\n")
        print(rule)

def load_movie_metadata(csv_path):
    """
    Load movie metadata from a CSV file and return a dictionary mapping movie IDs to titles and genres.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file with at least the columns ``movieId``, ``title`` and ``genres``.

    Returns
    -------
    dict
        ``{movieId: (title, genres)}``. If a ``movieId`` occurs more than once,
        the last row wins.

    Raises
    ------
    KeyError
        If any of the three required columns is missing.
    """
    movies = pd.read_csv(csv_path)
    # Zip the three columns directly instead of DataFrame.iterrows(): iterrows
    # builds a Series per row, which is far slower on a large movie table
    # and yields the same (id -> (title, genres)) pairs.
    metadata = dict(zip(movies['movieId'], zip(movies['title'], movies['genres'])))
    return metadata

In [22]:
# Input files (relative paths). The .npz archive is read by load_embeddings, which
# expects it to contain an 'ids' array and a 'features' array.
embeddings_npz_path = "data/processed/subtitles_bert_large_chunking_features.npz"  # Replace with your own .npz file path
movies_csv_path = "data/ml-20m-psm/movies.csv"  # Movies metadata CSV; must have movieId, title and genres columns

# Load the movie IDs and their embedding matrix
ids, embeddings = load_embeddings(embeddings_npz_path)

# Load the {movieId: (title, genres)} lookup used when printing results
metadata = load_movie_metadata(movies_csv_path)

# Find the top_k most similar movies to the query movie.
# The query ID must be present in `ids`; find_similar_movies raises IndexError otherwise.
query_movie_id = 1  # Replace with the ID of the movie to search around
top_k = 5
similar_movie_ids, similarity_scores = find_similar_movies(query_movie_id, top_k, ids, embeddings)

# Print the query movie followed by its most similar movies and their scores
display_similar_movies(query_movie_id, similar_movie_ids, similarity_scores, metadata)


Query Movie:
ID: 1
Title: Toy Story (1995)
Genres: Adventure|Animation|Children|Comedy|Fantasy

Most Similar Movies:
ID: 3114
Title: Toy Story 2 (1999)
Genres: Adventure|Animation|Children|Comedy|Fantasy
Similarity Score: 0.9940

--------------------------------------------------------------------------------
ID: 43926
Title: Doogal (2006)
Genres: Animation|Children
Similarity Score: 0.9903

--------------------------------------------------------------------------------
ID: 100611
Title: Escape from Planet Earth (2013)
Genres: Adventure|Animation|Comedy|Sci-Fi
Similarity Score: 0.9902

--------------------------------------------------------------------------------
ID: 78499
Title: Toy Story 3 (2010)
Genres: Adventure|Animation|Children|Comedy|Fantasy|IMAX
Similarity Score: 0.9899

--------------------------------------------------------------------------------
ID: 40339
Title: Chicken Little (2005)
Genres: Action|Adventure|Animation|Children|Comedy|Sci-Fi
Similarity Score: 0.9895

--