# 🎯 Movie Recommendation using Sentence Transformers

This notebook uses semantic embeddings (Sentence Transformers) to recommend similar movies based on metadata such as title and description.

In [1]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
import pickle
import os




#### Load Data

In [8]:
sent_df = pd.read_csv('../data/netflix_titles.csv')
print(f"Loaded {len(sent_df)} titles from Netflix dataset")

Loaded 8807 titles from Netflix dataset


#### Initialize Transformer

In [11]:
model = SentenceTransformer("paraphrase-distilroberta-base-v1")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

#### Embedings

In [13]:
def get_embeddings():
    cache_file = "netflix_embeddings.pkl"
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    descriptions = sent_df["description"].fillna("").tolist()
    print("Computing embeddings...")
    embeddings = model.encode(descriptions, batch_size=32, show_progress_bar=True)
    with open(cache_file, 'wb') as f:
        pickle.dump(embeddings, f)
    return embeddings
des_embeddings = get_embeddings()

des_embeddings_np = np.array(des_embeddings)

Computing embeddings...


Batches:   0%|          | 0/276 [00:00<?, ?it/s]

#### Recomendations

In [15]:
#description
def recommend(query, top_n=10):
    query_embedded = model.encode(query)
    query_tensor = torch.tensor(query_embedded)
    embeddings_tensor = torch.tensor(des_embeddings_np)
    cosine_scores = util.pytorch_cos_sim(query_tensor, embeddings_tensor)
    
    top_matches = torch.argsort(cosine_scores, dim=-1, descending=True).tolist()[0][:top_n]
    return top_matches

In [16]:
def get_recommendations_by_title(title, top_n=10):
    title_matches = sent_df[sent_df["title"] == title]
    
    if len(title_matches) == 0:
        print(f"Title '{title}' not found")
        return []
    query_show_des = title_matches["description"].iloc[0]
    
    if pd.isna(query_show_des):
        print(f"No description for '{title}'")
        return []
    matches = recommend(query_show_des, top_n)
    
    results = []
    for idx in matches:
        show = sent_df.iloc[idx]
        # Skip the original title
        if show["title"] == title:
            continue
        results.append(show["title"])
    
    return results[:top_n]

#### Test

In [18]:
title = "Twilight"
print(f"\nShows similar to '{title}':")
similar_shows = get_recommendations_by_title(title)
for i, show in enumerate(similar_shows, 1):
    print(f"{i}. {show}")


Shows similar to 'Twilight':
1. The Twilight Saga: New Moon
2. My Babysitter's a Vampire: The Movie
3. The Order
4. Greenhouse Academy
5. Miss in Kiss
6. The Twilight Saga: Breaking Dawn: Part 1
7. The Roommate
8. Kuch Kuch Hota Hai
9. The Twilight Saga: Eclipse


In [19]:
def get_recommendations_by_text(query_text, top_n=10):
    matches = recommend(query_text, top_n)
    
    return [sent_df.iloc[idx]["title"] for idx in matches]


custom_query = "A family comedy with heartwarming moments"
print(f"\nShows matching query: '{custom_query}':")

matches = get_recommendations_by_text(custom_query)

for i, show in enumerate(matches, 1):
    print(f"{i}. {show}")


Shows matching query: 'A family comedy with heartwarming moments':
1. Total Drama
2. The Upshaws
3. Dad Stop Embarrassing Me - The Afterparty
4. A 3 Minute Hug
5. Shameless (U.S.)
6. Jim Gaffigan: Cinco
7. Asperger's Are Us
8. Everything Will Be Fine
9. Almost Love
10. The Sound of Your Heart
