In [1]:
import pandas as pd
import os
from sentence_transformers import SentenceTransformer, util
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the movie catalogue into a DataFrame
df = pd.read_csv('database.csv')

# Report the number of null entries per column (header line, then the counts)
print("Missing values in each column:", df.isnull().sum(), sep="\n")

Missing values in each column:
name            0
year            0
movie_rated     0
run_length      0
genres          0
release_date    0
rating          0
description     0
dtype: int64


In [51]:
# Re-read the catalogue so the embeddings below are built from the same rows
# that `df` holds when they are looked up later.
df = pd.read_csv('database.csv')

# Sentence-embedding model, kept at module level for later cells
model = SentenceTransformer('all-mpnet-base-v2')

# One embedding per entry of the 'description' column, in DataFrame row order
movie_descriptions = df.loc[:, 'description'].to_list()
movie_embeddings = model.encode(
    movie_descriptions,
    convert_to_tensor=True,
)

# Persist the embeddings to disk and confirm on stdout
torch.save(movie_embeddings, 'movie_embeddings.pt')
print("Movie embeddings saved as 'movie_embeddings.pt'.")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Movie embeddings saved as 'movie_embeddings.pt'.


In [52]:
def recommend_movies(user_prompt, filters=None, top_n=5):
    """Rank movies by cosine similarity between their description and a prompt.

    Relies on three module-level objects defined in earlier cells: ``df`` (the
    movie catalogue), ``movie_embeddings`` (one embedding row per row of
    ``df``, in the same order) and ``model`` (the sentence encoder).

    Parameters
    ----------
    user_prompt : str
        Free-text description of the kind of movie wanted.
    filters : dict, optional
        Optional restrictions applied before ranking. Recognised keys:
        ``'genres'`` (pattern passed to ``str.contains``, case-insensitive),
        ``'year'`` (exact match) and ``'rating'`` (minimum value, inclusive).
        Other keys are ignored.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df`` that satisfy the filters, with an added
        ``'similarity'`` column, sorted by descending similarity. The frame is
        empty (but still has the ``'similarity'`` column) when nothing matches.

    Raises
    ------
    ValueError
        If ``df`` and ``movie_embeddings`` do not have the same number of
        rows, i.e. the embeddings were computed for a different DataFrame.
    """
    # Guard against stale notebook state: embeddings are addressed by row
    # position, so they must correspond one-to-one with the current `df`.
    if len(df) != len(movie_embeddings):
        raise ValueError(
            f"df has {len(df)} rows but movie_embeddings has "
            f"{len(movie_embeddings)}; recompute the embeddings for this DataFrame."
        )

    # Build a single boolean mask over `df` instead of copying the whole frame
    # and re-filtering it step by step.
    mask = pd.Series(True, index=df.index)
    if filters:
        if 'genres' in filters:
            # na=False: rows with a missing genre simply do not match, rather
            # than poisoning the mask with NaN and raising on indexing.
            mask &= df['genres'].str.contains(filters['genres'], case=False, na=False)
        if 'year' in filters:
            mask &= df['year'] == filters['year']
        if 'rating' in filters:
            mask &= df['rating'] >= filters['rating']

    filtered_df = df[mask].copy()

    # Nothing survived the filters: skip the encoder call and return an empty
    # result that still has the expected schema.
    if filtered_df.empty:
        filtered_df['similarity'] = pd.Series(dtype='float64', index=filtered_df.index)
        return filtered_df

    # Row *positions* (not index labels) of the surviving movies; these are
    # what the embedding tensor must be indexed with, whatever index `df` has.
    positions = mask.to_numpy().nonzero()[0].tolist()
    filtered_movie_embeddings = movie_embeddings[positions]

    # Encode the user prompt with the same model used for the descriptions
    prompt_embedding = model.encode(user_prompt, convert_to_tensor=True)

    # cos_sim returns a (1, n_filtered) matrix; take the single row
    similarities = util.cos_sim(prompt_embedding, filtered_movie_embeddings)[0]

    # Attach the scores (same order as filtered_df) and keep the best matches
    filtered_df['similarity'] = similarities.tolist()
    return filtered_df.sort_values(by='similarity', ascending=False).head(top_n)

In [None]:
# Query text: a lightly reworded description of "22 Jump Street"
user_prompt = "A comedy about two undercover cops who go to college to solve a case, but end up questioning their friendship and growing as individuals."

# Restrict candidates by genre, exact release year and minimum rating
filters = dict(genres='Comedy', year=2014, rating=7.0)

# Rank the remaining movies against the prompt
recommended_movies = recommend_movies(user_prompt, filters=filters)

# Show the ranked matches with only the columns of interest
print("Recommended Movies:")
print(recommended_movies.loc[:, ['name', 'year', 'rating', 'description', 'similarity']])

Recommended Movies:
                             name  year  rating  \
619             Pineapple Express  2008     6.9   
15                 22 Jump Street  2014     7.0   
14                 21 Jump Street  2012     7.2   
537                  Men in Black  1997     7.3   
469  Kingsman: The Secret Service  2014     7.7   

                                           description  similarity  
619  A stoner and his dealer are forced to go on th...    0.618382  
15   After making their way through high school (tw...    0.592649  
14   When cops Schmidt and Jenko join the secret Ju...    0.544719  
537  After a police chase with an otherworldly bein...    0.542073  
469  The story of a super-secret spy organization t...    0.514129  
