## Initializations

In [1]:
from openai import OpenAI
from supabase import create_client
from dotenv import load_dotenv
import json
import os

# Let python-dotenv populate the process environment before anything is read
# from it (presumably from a local .env file - confirm where it lives).
load_dotenv()

# Service credentials; each one is None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY")

## Functions

### Embed Benefits Chunk

In [None]:
def embed_benefits_chunks(
    data_path="data/benefits_wellbeing_with_context.json",
    max_chunks=3,
    table_name="test_chunks",
):
    """
    Embed the first chunks of a benefits JSON file into a Supabase table,
    skipping every chunk that is already stored.

    A chunk is identified by the pair (source file name, position in the
    file), so running the function again does not re-embed or re-insert rows
    that are already present. If the lookup of existing rows fails, nothing
    is embedded or inserted, because proceeding could create duplicates.

    Args:
        data_path: Path to a JSON file holding a list of chunk objects. Each
            chunk must provide the keys "chunk_heading", "text" and
            "situational_context".
        max_chunks: How many chunks to take from the start of the file.
            Pass None to process every chunk in the file.
        table_name: Supabase table that is read from and inserted into.

    Returns:
        None. Progress and a final summary are printed.

    Raises:
        OSError, json.JSONDecodeError: If the data file cannot be read or
            parsed.
        KeyError: If a chunk lacks one of the required keys.
        Failures of individual embedding or insert calls are not raised;
        they are printed and counted as failed chunks.
    """

    # Initialize clients
    print("Initializing clients...")
    openai_client = OpenAI(api_key=OPENAI_API_KEY)
    supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

    # Rows are keyed by the file name only, not by the directory it lives in
    source_file = os.path.basename(data_path)

    # Check what chunks already exist
    print("Checking for existing chunks...")
    try:
        existing_chunks = (
            supabase.table(table_name)
            .select("source_file, chunk_index, chunk_heading")
            .execute()
        )
        existing_set = set()
        for chunk in existing_chunks.data:
            existing_set.add((chunk['source_file'], chunk['chunk_index']))
            print(f"  📋 Found existing: {chunk['chunk_heading']} (index {chunk['chunk_index']})")

        print(f"📊 Found {len(existing_chunks.data)} existing chunks in database")

    except Exception as e:
        # Without a reliable picture of what is stored, every chunk would be
        # treated as new and inserted again, so stop here instead.
        print(f"❌ Error checking existing chunks: {e}")
        print("⛔ Aborting: cannot tell which chunks already exist")
        return

    # Load the benefits data (explicit UTF-8 so the result does not depend on
    # the platform's locale encoding)
    print("\nLoading benefits data...")
    with open(data_path, "r", encoding="utf-8") as f:
        benefits_data = json.load(f)

    # Take the leading chunks for testing (a None limit keeps all of them)
    test_chunks = benefits_data[:max_chunks]
    print(f"📊 Processing {len(test_chunks)} chunks...")

    # Track what we actually process
    processed_count = 0
    skipped_count = 0
    failed_count = 0

    # Process each chunk
    for i, chunk in enumerate(test_chunks):
        print(f"\nProcessing chunk {i+1}: {chunk['chunk_heading']}")

        # Check if this chunk already exists
        if (source_file, i) in existing_set:
            print("Skipping - chunk already exists in database")
            skipped_count += 1
            continue

        # Prepare content for embedding (combine heading + text for better context)
        embedding_content = f"{chunk['chunk_heading']}\n\n{chunk['text']}"

        # Generate embedding
        print(f"🧠 Generating embedding for '{chunk['chunk_heading']}'...")
        try:
            response = openai_client.embeddings.create(
                model="text-embedding-3-small",
                input=embedding_content
            )
            embedding = response.data[0].embedding
            print(f"✅ Generated embedding with {len(embedding)} dimensions")

        except Exception as e:
            print(f"❌ Error generating embedding: {e}")
            failed_count += 1
            continue

        # Prepare data for insertion
        chunk_data = {
            "source_file": source_file,
            "chunk_index": i,
            "chunk_heading": chunk["chunk_heading"],
            "content": chunk["text"],
            "situational_context": chunk["situational_context"],
            "embedding": embedding
        }

        # Insert into Supabase
        print("💾 Inserting chunk into Supabase...")
        try:
            supabase.table(table_name).insert(chunk_data).execute()
            print(f"✅ Successfully inserted chunk: {chunk['chunk_heading']}")
            processed_count += 1

        except Exception as e:
            print(f"❌ Error inserting into Supabase: {e}")
            failed_count += 1
            continue

    # Summary (the failure line only appears when something went wrong)
    print("\n🎉 Processing complete!")
    print(f"   ✅ Newly embedded: {processed_count} chunks")
    print(f"   ⏭️  Skipped existing: {skipped_count} chunks")
    if failed_count:
        print(f"   ❌ Failed: {failed_count} chunks")
    print(f"   📊 Total chunks: {processed_count + skipped_count + failed_count}")

    # Show what the table holds now; only the printed columns are requested
    # so the embedding vectors are not downloaded again
    print("\n🔍 Final database state...")
    try:
        test_query = supabase.table(table_name).select("id, chunk_heading").execute()
        print(f"📊 Total chunks in database: {len(test_query.data)}")
        for chunk in test_query.data:
            print(f"  - {chunk['chunk_heading']} (ID: {str(chunk['id'])[:8]}...)")

    except Exception as e:
        print(f"❌ Error testing retrieval: {e}")


In [6]:
def test_similarity_search(query_text="health insurance plans"):
    """
    Similarity search that handles embedding data types correctly
    """
    print(f"\n🔍 Testing similarity search for: '{query_text}'")
    
    # Initialize clients
    openai_client = OpenAI(api_key=OPENAI_API_KEY)
    supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
    
    try:
        # Generate embedding for query
        print("🧠 Generating query embedding...")
        response = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input=query_text
        )
        query_embedding = response.data[0].embedding
        
        # Get all chunks
        print("🔍 Retrieving chunks from database...")
        all_chunks = supabase.table("test_chunks").select("*").execute()
        print(f"📊 Retrieved {len(all_chunks.data)} chunks for similarity comparison")
        
        # Debug: Check what type the embedding is
        if all_chunks.data:
            sample_embedding = all_chunks.data[0]['embedding']
            print(f"🔍 Debug - Embedding type: {type(sample_embedding)}")
            print(f"🔍 Debug - Embedding preview: {str(sample_embedding)[:100]}...")
        
        # Calculate similarities with proper type handling
        import numpy as np
        similarities = []
        
        for chunk in all_chunks.data:
            if chunk['embedding']:
                # Handle different embedding formats from Supabase
                chunk_embedding = chunk['embedding']
                
                # Convert to numpy array if it's a list or string
                if isinstance(chunk_embedding, list):
                    chunk_embedding = np.array(chunk_embedding)
                elif isinstance(chunk_embedding, str):
                    # Try parsing as JSON array
                    import json
                    try:
                        chunk_embedding = np.array(json.loads(chunk_embedding))
                    except:
                        print(f"❌ Could not parse embedding for {chunk['chunk_heading']}")
                        continue
                else:
                    # Assume it's already a numpy array or compatible
                    chunk_embedding = np.array(chunk_embedding)
                
                # Ensure query embedding is also numpy array
                query_embedding_np = np.array(query_embedding)
                
                # Calculate cosine similarity (dot product of normalized vectors)
                # For now just use dot product for simplicity
                similarity = np.dot(query_embedding_np, chunk_embedding)
                
                similarities.append({
                    'chunk': chunk,
                    'similarity': float(similarity)  # Ensure it's a regular float
                })
        
        # Sort by similarity
        similarities.sort(key=lambda x: x['similarity'], reverse=True)
        
        print(f"\n🎯 Top matches for '{query_text}':")
        for i, match in enumerate(similarities[:3]):
            chunk = match['chunk']
            score = match['similarity']
            print(f"  {i+1}. {chunk['chunk_heading']} (similarity: {score:.3f})")
            
    except Exception as e:
        print(f"❌ Error in similarity search: {e}")
        import traceback
        print(f"Full traceback: {traceback.format_exc()}")


### Embedding & Testing

In [4]:
# Run the embedding step. Chunks whose (source file, index) pair is already
# in the table are skipped, so re-running this cell does not re-embed them.
embed_benefits_chunks()


🔧 Initializing clients...
🔍 Checking for existing chunks...
  📋 Found existing: Leaves (index 1)
  📋 Found existing: Health Benefits (index 0)
  📋 Found existing: Perks (index 2)
📊 Found 3 existing chunks in database

📂 Loading benefits data...
📊 Processing 3 chunks...

🔄 Processing chunk 1: Health Benefits
⏭️  Skipping - chunk already exists in database

🔄 Processing chunk 2: Leaves
⏭️  Skipping - chunk already exists in database

🔄 Processing chunk 3: Perks
⏭️  Skipping - chunk already exists in database

🎉 Processing complete!
   ✅ Newly embedded: 0 chunks
   ⏭️  Skipped existing: 3 chunks
   📊 Total chunks: 3

🔍 Final database state...
📊 Total chunks in database: 3
  - Leaves (ID: a64140ab...)
  - Health Benefits (ID: fd4e01a2...)
  - Perks (ID: d62a02fe...)


In [7]:
# Try the similarity search with two sample queries
test_similarity_search(query_text="health insurance plans")
test_similarity_search(query_text="vacation time off")


🔍 Testing similarity search for: 'health insurance plans'
🧠 Generating query embedding...
🔍 Retrieving chunks from database...
📊 Retrieved 3 chunks for similarity comparison
🔍 Debug - Embedding type: <class 'str'>
🔍 Debug - Embedding preview: [-0.016314207,0.03778174,0.030891964,0.039434165,-0.038173843,-0.025262512,-0.018246705,-0.004253596...

🎯 Top matches for 'health insurance plans':
  1. Health Benefits (similarity: 0.489)
  2. Leaves (similarity: 0.173)
  3. Perks (similarity: 0.137)

🔍 Testing similarity search for: 'vacation time off'
🧠 Generating query embedding...
🔍 Retrieving chunks from database...
📊 Retrieved 3 chunks for similarity comparison
🔍 Debug - Embedding type: <class 'str'>
🔍 Debug - Embedding preview: [-0.016314207,0.03778174,0.030891964,0.039434165,-0.038173843,-0.025262512,-0.018246705,-0.004253596...

🎯 Top matches for 'vacation time off':
  1. Leaves (similarity: 0.407)
  2. Perks (similarity: 0.270)
  3. Health Benefits (similarity: 0.200)
