In [25]:
import pandas as pd
df = pd.read_parquet('movie_data.parquet')

In [26]:
df1 = df[['title','overview','genres','poster_path','vote_average','vote_count']].drop_duplicates().reset_index(drop=True)

df1['overview'] = df1['overview'].fillna('')
df1['poster_path'] = df1['poster_path'].fillna('')

In [27]:
import ast

genres = ast.literal_eval(df1['genres'][0])
genres_name = [genre['name'] for genre in genres]
print(genres_name)

['Animation', 'Comedy', 'Family']


In [28]:
def extract_genre_names(genre_str):
    try:
        genres = ast.literal_eval(genre_str)
        return [g['name'] for g in genres]
    except (ValueError, SyntaxError):
        return []
df1['genres'] = df1['genres'].apply(extract_genre_names)
df1[['title', 'genres']].head()

Unnamed: 0,title,genres
0,Toy Story,"[Animation, Comedy, Family]"
1,Jumanji,"[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Romance, Comedy]"
3,Waiting to Exhale,"[Comedy, Drama, Romance]"
4,Father of the Bride Part II,[Comedy]


In [29]:
df1['genres_str'] = df1['genres'].apply(lambda x: ' '.join(x))
df1['genres_str'] = df1['genres_str'].fillna('').apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

df1['text'] = df1['genres_str'] + ' ' + df1['overview']

In [None]:
import torch
from sentence_transformers import SentenceTransformer, util
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
movie_overviews = df1['text'].tolist()

model = SentenceTransformer('all-mpnet-base-v2')

movie_embeddings = model.encode(movie_overviews, convert_to_tensor=True, show_progress=True)
movie_embeddings_np = movie_embeddings.cpu().numpy()
np.save('movie_embeddings.npy', movie_embeddings_np)

movie_embeddings = torch.tensor(np.load('movie_embeddings.npy')).float().to(device)

In [54]:
def recommend_books(user_prompt, top_k=5):
    query_embedding = model.encode(user_prompt, convert_to_tensor=True).to(device)

    # Compute similarity between query and description
    cos_scores = util.cos_sim(query_embedding, movie_embeddings)[0]

    # Get top k
    top_results = torch.topk(cos_scores, k=len(cos_scores))

    recommended = df1.iloc[top_results.indices.cpu().numpy()].copy()
    recommended['similarity'] = top_results.values.cpu().numpy()
    recommended = recommended.sort_values(by='similarity', ascending=False)
    recommend_books = recommended.drop_duplicates(subset=['title']).reset_index(drop=True)
    recommend_books = recommend_books[recommend_books['vote_count'] >  150]
    return recommend_books[['title','similarity','overview','vote_average','vote_count','poster_path']].head()


In [48]:
prompt = "It’s really bleak and dystopian, set in a totalitarian future where everything is monitored, and people aren't allowed to think freely"

In [55]:
recommend_books(prompt)

Unnamed: 0,title,similarity,overview,vote_average,vote_count,poster_path
0,The Lives of Others,0.577012,A tragic love story set in East Berlin with th...,7.9,977.0,/bzzDAg3fkztvfQB08VBprhs9tVE.jpg
1,Nineteen Eighty-Four,0.566651,George Orwell's novel of a totalitarian future...,6.8,311.0,/asqIqgy3lywRhrVv6WCdcofNWH1.jpg
3,Following,0.54367,"A struggling, unemployed young writer takes to...",7.2,363.0,/uoWnnSlUIWjqUXxfIej3ucAxg7J.jpg
4,May,0.5308,Psychological horror about a lonely young woma...,6.3,153.0,/kUaoxNyLFhZ8tOKFXu4xQnZJYxa.jpg
9,The Girl with the Dragon Tattoo,0.502649,This English-language adaptation of the Swedis...,7.2,2479.0,/voxRWFTtagLiqnJQs9tWQLB0MN.jpg
