# Duke Nutrition Assistant CS 372 Project - Notebook 2: Create Embeddings
## Step 2: Embed Menu Items for Retrieval (MPS GPU)

### Setup

In [354]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import json
import time

print(f"PyTorch version: {torch.__version__}")
print(f"MPS available: {torch.backends.mps.is_available()}")
print(f"MPS built: {torch.backends.mps.is_built()}")

PyTorch version: 2.9.0
MPS available: True
MPS built: True


In [356]:
if torch.backends.mps.is_available():
    device = "mps"
    print("Using MPS (Apple Silicon GPU)")
else:
    device = "cpu"
    print("Using CPU (no GPU detected)")

print(f"\nDevice set to: {device}")

Using MPS (Apple Silicon GPU)

Device set to: mps


### Load Processed Menu Data

In [359]:
# Load the processed data from Notebook 1
with open('data/menu_processed.json', 'r') as f:
    data = json.load(f)

documents = data['documents']
items = data['items']

print(f"Original: {len(documents):,} menu items")

# Some extra removals
items_to_drop = [
    'Powdered Sugar', 'White Sugar', 'Vanilla Whey Protein', 'Chocolate Whey Protein Boost', 'Almond Milk', 'Coconut Milk', 'Oat Milk', 'Soy Milk', 'Plain Yogurt', 'Green Apple', 'Avocado', 'Banana', 'Blueberries', 'Chia Seeds', 'Shredded Coconut', 'Granola', 'Mangoes', 'Nutella', 'Peanut Butter', 'Spinach', 'Strawberries', 'Vegan Protein', 'Chocolate Whey Protein', 'Vanilla Whey Protein', 'Brown Sugar', 'Raisins',
    'Egg Your Way',
    'Sugar Free Maple Syrup', 'Maple Flavored Syrup', 'Smart Balance', 'Butter', 'Maple Syrup',
    'Fettuccine',
    'Spaghetti', 'Ketchup'
]

# create filtered lists
filtered_documents = []
filtered_items = []

for doc, item in zip(documents, items):
    # skip if item name is in drop list
    if item['item_name'] not in items_to_drop:
        filtered_documents.append(doc)
        filtered_items.append(item)

# Replace original lists
documents = filtered_documents
items = filtered_items

print(f"After filtering: {len(documents):,} menu items")
print(f"Dropped: {len(data['documents']) - len(documents)} items")

Original: 1,644 menu items
After filtering: 1,637 menu items
Dropped: 7 items


### Load Embedding Model

In [362]:
# Load embedding model (same as from homework)
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
print(f"Loading model: {embedding_model_name}")

embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)

# Move to MPS device
embedding_model = embedding_model.to(device)
embedding_model.eval()  # Set to evaluation mode

print(f"Model loaded successfully on {device}!")

Loading model: sentence-transformers/all-MiniLM-L6-v2
Model loaded successfully on mps!


### Compute Embeddings Function

**This function is COPIED EXACTLY from RAG homework with MPS compatibility!**

In [393]:
def compute_embeddings(texts, model, tokenizer, batch_size=32, device="mps"):
    """
    Compute dense embeddings for a list of texts.
        
    Args:
        texts: List of strings to embed
        model: Sentence transformer model
        tokenizer: Model tokenizer
        batch_size: Number of texts to process at once
        device: Device to use ('mps', 'cuda', or 'cpu')
    
    Returns:
        numpy array of shape (len(texts), embedding_dim)
    """
    embeddings = []
    
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Computing embeddings"):
            batch = texts[i:i+batch_size]
            
            # Tokenize batch
            inputs = tokenizer(batch, 
                             return_tensors='pt',
                             padding=True, 
                             truncation=True,
                             max_length=512)
            
            # Move to device (MPS)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = model(**inputs)
            
            # Mean pooling (from RAG homework)
            pooled = outputs.last_hidden_state.mean(dim=1)
            
            # Move to CPU and convert to numpy
            pooled_np = pooled.cpu().numpy()
            embeddings.append(pooled_np)
    
    # Stack all batches
    return np.vstack(embeddings)

print("Embedding function ready!")
def retrieve_top_k(query, context_embeddings, contexts, model, tokenizer, device, k=5):
    """
    Retrieve top-k most similar menu items.
    FROM RAG HOMEWORK
    """
    query_embedding = compute_embeddings([query], model, tokenizer, 
                                        batch_size=1, device=device)[0]
    
    # Calculate cosine similarities
    dot_product = np.dot(context_embeddings, query_embedding)
    norms = (np.linalg.norm(context_embeddings, axis=1) * 
            np.linalg.norm(query_embedding))
    similarities = dot_product / norms
    
    # Get top-k indices
    top_indices = np.argsort(similarities)[-k:][::-1]
    
    return [
        {
            'index': int(idx),
            'document': contexts[idx],
            'item_name': items[idx]['item_name'],
            'restaurant': items[idx]['restaurant'],
            'score': float(similarities[idx]),
            'item': items[idx]  # Full item details
        }
        for idx in top_indices
    ]

print(" Retrieval functions ready!")

Embedding function ready!
 Retrieval functions ready!


### Test on a Few Items First

In [368]:
# Test on first 5 items
test_docs = documents[:5]
print(" Testing embedding on 5 items...\n")

test_embeddings = compute_embeddings(test_docs, embedding_model, 
                                    embedding_tokenizer, batch_size=5, device=device)

print(f"\n Test embeddings shape: {test_embeddings.shape}")
print(f"   Embedding dimension: {test_embeddings.shape[1]}")
print(f"\n First embedding (first 10 values):")
print(f"   {test_embeddings[0][:10]}")

 Testing embedding on 5 items...



Computing embeddings: 100%|███████████████████████| 1/1 [00:00<00:00,  8.08it/s]


 Test embeddings shape: (5, 384)
   Embedding dimension: 384

 First embedding (first 10 values):
   [-0.16062553 -0.10292386 -0.05305719  0.16567005 -0.11824609 -0.02903908
  0.06736976 -0.13347669 -0.21786804 -0.11153291]





### Compute Embeddings for ALL Menu Items

**On MPS (Apple Silicon), this should take 2-5 minutes for ~3,300 items.**

In [371]:
# Compute embeddings for all documents
print(f" Computing embeddings for {len(documents):,} menu items...")
print(f"   Device: {device}")
print(f"   Batch size: 32")
print(f"\n Estimated time on MPS: 2-5 minutes")
print(f"   (On CPU it would take 10-15 minutes)\n")

start_time = time.time()

menu_embeddings = compute_embeddings(
    documents, 
    embedding_model, 
    embedding_tokenizer,
    batch_size=32, 
    device=device
)

elapsed_time = time.time() - start_time

print(f"\n Embeddings computed in {elapsed_time:.1f} seconds ({elapsed_time/60:.1f} minutes)!")
print(f" Embeddings shape: {menu_embeddings.shape}")
print(f" Average time per item: {elapsed_time/len(documents)*1000:.2f}ms")

 Computing embeddings for 1,637 menu items...
   Device: mps
   Batch size: 32

 Estimated time on MPS: 2-5 minutes
   (On CPU it would take 10-15 minutes)



Computing embeddings: 100%|█████████████████████| 52/52 [00:05<00:00,  9.85it/s]


 Embeddings computed in 5.3 seconds (0.1 minutes)!
 Embeddings shape: (1637, 384)
 Average time per item: 3.23ms





### Save Embeddings

In [374]:
import os

np.save('models/menu_embeddings.npy', menu_embeddings)

metadata = {
    'model_name': embedding_model_name,
    'num_items': len(documents),
    'embedding_dim': menu_embeddings.shape[1],
    'device_used': device,
    'computation_time_seconds': elapsed_time,
    'batch_size': 32
}

with open('models/embedding_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

file_size = os.path.getsize('models/menu_embeddings.npy') / 1024 / 1024

print(f" Embeddings saved to: models/menu_embeddings.npy")
print(f"   File size: {file_size:.2f} MB")
print(f" Metadata saved to: models/embedding_metadata.json")

 Embeddings saved to: models/menu_embeddings.npy
   File size: 2.40 MB
 Metadata saved to: models/embedding_metadata.json


### Test Loading Embeddings

In [377]:
#Check if loading matches og
loaded_embeddings = np.load('models/menu_embeddings.npy')

print(f" Loaded embeddings shape: {loaded_embeddings.shape}")
print(f" Matches original: {np.allclose(loaded_embeddings, menu_embeddings)}")

print("\n Embeddings can be loaded successfully")

 Loaded embeddings shape: (1637, 384)
 Matches original: True

 Embeddings can be loaded successfully


### Quick Retrieval Test

Test if our embeddings work for retrieval

In [380]:
def retrieve_top_k(query, context_embeddings, contexts, model, tokenizer, device, k=5):
    """
    Retrieve top-k most similar items.
    
    ADAPTED FROM RAG HOMEWORK retrieve_top_context()
    """
    query_embedding = compute_embeddings([query], model, tokenizer, 
                                        batch_size=1, device=device)[0]
    
    # Calculate cosine similarities
    dot_product = np.dot(context_embeddings, query_embedding)
    norms = (np.linalg.norm(context_embeddings, axis=1) * 
            np.linalg.norm(query_embedding))
    similarities = dot_product / norms
    
    # Get top-k indices
    top_indices = np.argsort(similarities)[-k:][::-1]
    
    return [
        {
            'index': int(idx),
            'document': contexts[idx],
            'item_name': items[idx]['item_name'],
            'restaurant': items[idx]['restaurant'],
            'score': float(similarities[idx])
        }
        for idx in top_indices
    ]

print(" Retrieval function ready!")

 Retrieval function ready!


In [382]:
# Test queries that match some potentnial use cases
test_queries = [
    "high protein low calorie",
    "I want to lean bulk",
    "I need more fiber",
    "keto friendly meal",
    "low sugar option",
    "heart healthy low sodium",
]

print(" TESTING RETRIEVAL ON QUERIES\n")
print("="*80)

for query in test_queries:
    print(f"\n Query: '{query}'")
    print("-"*80)
    
    results = retrieve_top_k(query, menu_embeddings, documents, 
                            embedding_model, embedding_tokenizer, device, k=3)
    
    for i, result in enumerate(results, 1):
        print(f"\n{i}. {result['item_name']} at {result['restaurant']}")
        print(f"   Score: {result['score']:.3f}")
        print(f"   {result['document'][:150]}...")

print("\n" + "="*80)

 TESTING RETRIEVAL ON QUERIES


 Query: 'high protein low calorie'
--------------------------------------------------------------------------------


Computing embeddings: 100%|███████████████████████| 1/1 [00:00<00:00, 12.39it/s]



1. Steak at The Pitchfork
   Score: 0.580
   Steak from The Pitchfork. Available during Lunch & Dinner. Nutrition: 120 calories, 20g protein, 4g fat, 30mg sodium. Macros: 67% protein, 30% fat. Ta...

2. Protein Shot at Saladalia @ The Perk
   Score: 0.574
   Protein Shot from Saladalia @ The Perk. Available during Specialty Drinks. Nutrition: 70 calories, 10g protein, 2g fat, 5g carbs, 3g fiber, 110mg sodi...

3. Chicken and Fresh Mozzarella Sandwich at Gothic Grill
   Score: 0.568
   Chicken and Fresh Mozzarella Sandwich from Gothic Grill. Available during Specialty Drinks. Nutrition: 750 calories, 72g protein, 24g fat, 55g carbs, ...

 Query: 'I want to lean bulk'
--------------------------------------------------------------------------------


Computing embeddings: 100%|███████████████████████| 1/1 [00:00<00:00, 41.45it/s]



1. Skinny Strawberry Smoothie at Red Mango
   Score: 0.385
   Skinny Strawberry Smoothie from Red Mango. Available during Smoothies. Nutrition: 350 calories, 26g protein, 4g fat, 55g carbs, 4g fiber, 38g sugars, ...

2. Mango Metabolizer Smoothie at Red Mango
   Score: 0.369
   Mango Metabolizer Smoothie from Red Mango. Available during Smoothies. Nutrition: 340 calories, 26g protein, 4g fat, 52g carbs, 3g fiber, 36g sugars, ...

3. Mango Smoothie at Cafe
   Score: 0.347
   Mango Smoothie from Cafe. Available during Specialty Drinks. Nutrition: 510 calories, 3g protein, 1g fat, 128g carbs, 6g fiber, 135g sugars, 20mg sodi...

 Query: 'I need more fiber'
--------------------------------------------------------------------------------


Computing embeddings: 100%|███████████████████████| 1/1 [00:00<00:00, 55.36it/s]



1. Bow Tie Pasta at Marketplace
   Score: 0.409
   Bow Tie Pasta from Marketplace. Available during Dinner. Nutrition: 270 calories, 10g protein, 2g fat, 53g carbs, 3g fiber. Dietary: Gluten Free; Vege...

2. Raisin Bran at Marketplace
   Score: 0.388
   Raisin Bran from Marketplace. Available during Breakfast. Nutrition: 90 calories, 2g protein, 0g fat, 23g carbs, 3g fiber, 9g sugars, 105mg sodium. Di...

3. Plain Bagel at Duke Marine Lab
   Score: 0.381
   Plain Bagel from Duke Marine Lab. Available during Breakfast. Nutrition: 1790 calories, 60g protein, 6g fat, 382g carbs, 12g fiber, 48g sugars, 2740mg...

 Query: 'keto friendly meal'
--------------------------------------------------------------------------------


Computing embeddings: 100%|███████████████████████| 1/1 [00:00<00:00, 56.46it/s]



1. Two Eggs to Order Any Style at The Skillet
   Score: 0.581
   Two Eggs to Order Any Style from The Skillet. Available during Breakfast. Nutrition: 150 calories, 9g protein, 12g fat, 1g carbs, 1g sugars, 135mg sod...

2. Carnitas at The Pitchfork
   Score: 0.565
   Carnitas from The Pitchfork. Available during Lunch & Dinner. Nutrition: 120 calories, 12g protein, 6g fat, 2g carbs, 2g sugars, 270mg sodium. Macros:...

3. Mushroom at The Pitchfork
   Score: 0.557
   Mushroom from The Pitchfork. Available during Lunch & Dinner. Nutrition: 20 calories, 3g protein, 3g carbs, 1g fiber, 15mg sodium. Dietary: Vegan; Veg...

 Query: 'low sugar option'
--------------------------------------------------------------------------------


Computing embeddings: 100%|███████████████████████| 1/1 [00:00<00:00, 46.67it/s]



1. Au Lait Skim Milk at Beyu Blue Coffee
   Score: 0.437
   Au Lait Skim Milk from Beyu Blue Coffee. Available during Specialty Drinks. Nutrition: 20 calories, 2g protein, 3g carbs, 3g sugars, 35mg sodium. Diet...

2. Iced Mint 2 B U Skim Milk at Beyu Blue Coffee
   Score: 0.420
   Iced Mint 2 B U Skim Milk from Beyu Blue Coffee. Available during Specialty Drinks. Nutrition: 190 calories, 5g protein, 2g fat, 39g carbs, 1g fiber, ...

3. Latte Skim Milk at Beyu Blue Coffee
   Score: 0.419
   Latte Skim Milk from Beyu Blue Coffee. Available during Specialty Drinks. Nutrition: 60 calories, 6g protein, 9g carbs, 9g sugars, 80mg sodium. Dietar...

 Query: 'heart healthy low sodium'
--------------------------------------------------------------------------------


Computing embeddings: 100%|███████████████████████| 1/1 [00:00<00:00, 42.94it/s]


1. Naan at Tandoor Indian Cuisine
   Score: 0.387
   Naan from Tandoor Indian Cuisine. Available during Lunch/Dinner. Nutrition: 220 calories, 6g protein, 1g fat, 45g carbs, 2g fiber, 80mg sodium. Dietar...

2. Grilled Yellow Fin Tuna at J.B.'s Roast & Chops
   Score: 0.384
   Grilled Yellow Fin Tuna from J.B.'s Roast & Chops. Nutrition: 140 calories, 34g protein, 0g fat, 270mg sodium. Macros: 97% protein, 3% fat. Tags: very...

3. Kidney Beans at Duke Marine Lab
   Score: 0.372
   Kidney Beans from Duke Marine Lab. Available during Lunch. Nutrition: 40 calories, 3g protein, 7g carbs, 2g fiber, 85mg sodium. Dietary: Vegan; Vegeta...






### Advanced Test: Check Tag Matching

In [391]:
# Test that retrieval finds items with the right tags
print(" Checking if retrieval matches tags correctly\n")

test_cases = [
    {
        'query': 'high protein low calorie',
        'expected_tags': ['high protein', 'low calorie']
    },
    {
        'query': 'keto friendly',
        'expected_tags': ['keto friendly', 'low carb']
    },
    {
        'query': 'high fiber',
        'expected_tags': ['fiber', 'digestive']
    },
]

for test in test_cases:
    query = test['query']
    expected = test['expected_tags']
    
    print(f"Query: '{query}'")
    print(f"Expected tags: {expected}")
    
    results = retrieve_top_k(query, menu_embeddings, documents, 
                            embedding_model, embedding_tokenizer, device, k=1)
    
    top_result = results[0]['document'].lower()
    
    matches = [tag for tag in expected if tag in top_result]
    
    if matches:
        print(f" Found tags: {matches}")
    else:
        print(f"Expected tags not found")
    
    print(f"Top result: {results[0]['item_name']}")
    print()

print("\n Tag matching test complete")

 Checking if retrieval matches tags correctly

Query: 'high protein low calorie'
Expected tags: ['high protein', 'low calorie']


Computing embeddings: 100%|███████████████████████| 1/1 [00:00<00:00,  3.70it/s]


 Found tags: ['high protein', 'low calorie']
Top result: Steak

Query: 'keto friendly'
Expected tags: ['keto friendly', 'low carb']


Computing embeddings: 100%|███████████████████████| 1/1 [00:00<00:00, 22.88it/s]


 Found tags: ['keto friendly', 'low carb']
Top result: Mushroom

Query: 'high fiber'
Expected tags: ['fiber', 'digestive']


Computing embeddings: 100%|███████████████████████| 1/1 [00:00<00:00, 34.41it/s]

 Found tags: ['fiber', 'digestive']
Top result: Pita Crisps


 Tag matching test complete





### Notebook 2 now has:
-  Working embedding model 
-  Embeddings for all 1,600+ menu items
-  Saved embeddings ready for retrieval
-  Tested retrieval works on some potential queries
-  Optimized for MPS

### Performance on MPS:
-  ~2-5 minutes for full dataset
-  Much faster than CPU (10-15 min)
-  Similar to CUDA performance

### Test Results:
Checked manually for results matching up
- "high protein low calorie" → chicken, turkey, etc.
- "lean bulk" → high protein items
- "high fiber" → fiber-rich items
- "keto" → low carb items

### Files Created:
- `models/menu_embeddings.npy` (embeddings)
- `models/embedding_metadata.json` (metadata)

### MPS Optimizations:
- Device detection (mps)
- Proper tensor movement to MPS
- Batch size optimized for Apple Silicon