# 03_embedding_generation
- Sets up the embedding generator using SageMaker
- Loads job descriptions from files
- Creates chunks from job descriptions using a custom chunker
- Generates embeddings for each chunk
- Uploads the embedded vectors to Pinecone
- Includes batch processing for efficiency

In [1]:
!pip install pinecone



In [2]:
!pip show pinecone

Name: pinecone
Version: 6.0.2
Summary: Pinecone client and SDK
Home-page: https://www.pinecone.io
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages
Requires: certifi, pinecone-plugin-interface, python-dateutil, typing-extensions, urllib3
Required-by: 


In [3]:
import boto3
import json
import os
import pandas as pd
import uuid
import sys
import numpy as np
import time
from typing import List, Dict, Any, Union 
from tqdm import tqdm

# Make the notebook's parent directory importable, adding it to sys.path only
# when it is not already listed so re-running the cell does not add duplicates.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)


In [14]:
# SageMaker endpoint configuration
SAGEMAKER_ENDPOINT_NAME = "e5-embeddings-pooled-2"
AWS_REGION = "us-east-1"

# Pinecone credentials come from the environment so the key is never stored in
# the notebook itself. The default stays an empty string, as before.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
if not PINECONE_API_KEY:
    print("Warning: PINECONE_API_KEY is not set; Pinecone calls will fail until it is provided.")

PINECONE_ENVIRONMENT = "us-east-1"  # matched with AWS region

# Important: use "sample-100-strings" if you don't need job_id to avoid dups,
# and "sample-100" if you may need strings by job_id.
PINECONE_INDEX_NAME = "sample-100-strings"

# Vector length shared by the embedder and the Pinecone index.
EMBEDDING_DIMENSION = 1024

print(f"Embedding dimension: {EMBEDDING_DIMENSION}")

Embedding dimension: 1024


In [15]:
# Initialize Pinecone with credentials
from pinecone import Pinecone

def initialize_pinecone():
    """Connect to the configured Pinecone index, creating it first if missing.

    Reads PINECONE_API_KEY, PINECONE_INDEX_NAME, PINECONE_ENVIRONMENT and
    EMBEDDING_DIMENSION from the notebook's module scope.

    Returns:
        The Pinecone index handle, or None when any step fails (the error is
        printed rather than raised so the notebook can keep running).
    """
    try:
        # Initialize Pinecone client
        pc = Pinecone(api_key=PINECONE_API_KEY)

        # Check if the index exists
        existing_indexes = pc.list_indexes().names()
        print(f"Available Pinecone indexes: {existing_indexes}")

        if PINECONE_INDEX_NAME not in existing_indexes:
            print(f"Creating new index '{PINECONE_INDEX_NAME}'...")

            # Imported here because it is only needed on the creation path.
            from pinecone import ServerlessSpec

            # The current client requires a `spec` describing where the index
            # lives and no longer accepts a top-level `metadata_config`, so
            # the previous call could never succeed.
            # NOTE(review): serverless indexes are expected to make metadata
            # fields (source_type, requirement_level, job_id, resume_id)
            # filterable without extra configuration - confirm for this project.
            pc.create_index(
                name=PINECONE_INDEX_NAME,
                dimension=EMBEDDING_DIMENSION,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region=PINECONE_ENVIRONMENT),
            )
            print(f"Index '{PINECONE_INDEX_NAME}' created successfully")

        # Connect to the index
        index = pc.Index(PINECONE_INDEX_NAME)
        print(f"Connected to Pinecone index: {PINECONE_INDEX_NAME}")

        return index

    except Exception as e:
        print(f"Error initializing Pinecone: {str(e)}")
        print("Please check your API key and environment settings.")
        # Return None to indicate initialization failed
        return None

# Open the index connection; initialize_pinecone() returns None on failure.
pinecone_index = initialize_pinecone()

if pinecone_index:
    # Fetch the index statistics and print the object as-is (no JSON
    # serialization), header first and the stats on the following line.
    index_stats = pinecone_index.describe_index_stats()
    print("Index statistics:", index_stats, sep="\n")

Available Pinecone indexes: ['mirra-embeddings', 'sample-100-strings', 'sample-100', 'mirra-filtering', 'mirra']
Connected to Pinecone index: sample-100-strings
Index statistics:
{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 4480}},
 'total_vector_count': 4480,
 'vector_type': 'dense'}


In [16]:
class ResilientSageMakerEmbedder:
    """
    A wrapper around SageMaker embedding endpoints with resilience features.

    Texts are cleaned, prefixed with the E5 "passage:" marker, truncated and
    sent to the endpoint in a single request. Every returned embedding is
    reduced to one vector of `embedding_dimension` values. If anything in that
    round trip fails, random unit vectors are returned instead so callers
    always receive one vector per input text.
    """

    def __init__(self, endpoint_name, max_text_length=512, region="us-east-1",
                 embedding_dimension=None):
        """
        Initialize the embedder with a SageMaker endpoint.

        Args:
            endpoint_name: The name of the SageMaker endpoint
            max_text_length: Maximum text length (in characters, prefix
                included) to truncate to
            region: AWS region for the endpoint
            embedding_dimension: Length of the vectors to return. Defaults to
                the notebook-level EMBEDDING_DIMENSION.
        """
        self.sagemaker_runtime = boto3.client('sagemaker-runtime', region_name=region)
        self.endpoint_name = endpoint_name
        self.max_text_length = max_text_length
        # Stored on the instance so the conversion code and the fallback
        # vectors always agree on one size.
        self.embedding_dimension = (
            EMBEDDING_DIMENSION if embedding_dimension is None else int(embedding_dimension)
        )
        print(f"Initialized ResilientSageMakerEmbedder for endpoint: {endpoint_name}")

    def _prepare_text(self, text):
        """Clean and prepare text for the embedding model.

        Non-string input is converted with str(), runs of whitespace collapse
        to single spaces, the "passage:" prefix is added when missing, and the
        result is cut to at most `max_text_length` characters.
        """
        if not isinstance(text, str):
            text = str(text)

        # Remove excessive whitespace
        text = ' '.join(text.split())

        # Add E5 prefix
        if not text.startswith("passage:"):
            text = f"passage: {text}"

        # Truncate if needed
        if len(text) > self.max_text_length:
            text = text[:self.max_text_length]

        return text

    def _to_fixed_dimension(self, emb):
        """Reduce one endpoint embedding to a flat list of `embedding_dimension` values."""
        emb_array = np.array(emb)

        # Token-level (or deeper nested) output: average over the leading axis
        # until a single vector is left.
        while emb_array.ndim > 1:
            emb_array = np.mean(emb_array, axis=0)

        dim = self.embedding_dimension
        size = emb_array.shape[0]
        if size > dim:
            # Too long: keep the leading components only.
            emb_array = emb_array[:dim]
        elif size < dim:
            # Too short: pad with zeros at the end.
            padded = np.zeros(dim)
            padded[:size] = emb_array
            emb_array = padded

        return emb_array.tolist()

    def generate_embeddings(self, texts):
        """Generate embeddings using SageMaker E5 endpoint.

        Args:
            texts: A single text, or a list/tuple of texts.

        Returns:
            A list of embedding vectors (lists of numbers). An empty input
            returns an empty list without calling the endpoint. On any error
            the message is printed and one random unit vector per input text
            is returned instead.
        """
        # A list or tuple is a batch; anything else is treated as one text.
        if isinstance(texts, (list, tuple)):
            texts = list(texts)
        else:
            texts = [texts]

        # Nothing to embed: skip the network round trip.
        if not texts:
            return []

        try:
            # Prepare input for E5 model
            prepared_texts = [self._prepare_text(text) for text in texts]

            # Prepare payload with explicit pooling parameters
            payload = {
                "inputs": prepared_texts,
                "parameters": {
                    "normalize": True,
                    "pooling": "mean",
                    "return_sentence_embedding": True
                }
            }

            # Call SageMaker endpoint
            response = self.sagemaker_runtime.invoke_endpoint(
                EndpointName=self.endpoint_name,
                ContentType='application/json',
                Body=json.dumps(payload)
            )

            # Parse response
            response_body = json.loads(response['Body'].read().decode('utf-8'))

            return [self._to_fixed_dimension(emb) for emb in response_body]

        except Exception as e:
            print(f"Error generating embeddings: {str(e)}")
            # Make the degraded mode visible: these vectors carry no meaning.
            print("Falling back to random unit vectors; results based on them are not meaningful.")
            return [self._create_random_unit_vector() for _ in range(len(texts))]

    def _create_random_unit_vector(self, dim=None):
        """Create a random unit vector for fallback.

        Args:
            dim: Vector length; defaults to the instance's embedding dimension.
        """
        if dim is None:
            dim = self.embedding_dimension
        vec = np.random.normal(0, 1, size=dim)
        return (vec / np.linalg.norm(vec)).tolist()

In [17]:
def job_id_retrieval(pinecone_index):
    """Collect the distinct job identifiers stored in the index's metadata.

    For every namespace reported by `describe_index_stats()`, one filtered
    query is issued per candidate metadata key, and every candidate key found
    on a returned match contributes its value.

    Args:
        pinecone_index: Index handle exposing `describe_index_stats()` and
            `query(...)`.

    Returns:
        A sorted list of the unique identifier values found. Values of
        different types (e.g. strings and numbers) are grouped by type so the
        sort cannot fail on mixed metadata.
    """
    # Pinecone rejects top_k values above its documented maximum.
    max_top_k = 10_000
    # Try different metadata key variations
    potential_keys = ['job_id', 'jobId', 'id', 'job_identifier']

    job_ids = set()
    stats = pinecone_index.describe_index_stats()
    total_vectors = stats.get('total_vector_count', 0) or 0
    # Ask for every vector when possible, staying within [1, max_top_k].
    top_k = max(1, min(total_vectors, max_top_k))
    # The probe vector must match the index's own dimension; fall back to the
    # notebook constant only when the stats do not report one.
    dimension = stats.get('dimension') or EMBEDDING_DIMENSION

    for namespace, _ns_stats in stats.get('namespaces', {}).items():
        print(f"Checking namespace: {namespace}")
        for key in potential_keys:
            # Errors are handled per key so one failing query does not skip
            # the remaining keys of this namespace.
            try:
                namespace_vectors = pinecone_index.query(
                    vector=[0] * dimension,
                    top_k=top_k,
                    namespace=namespace,
                    filter={key: {'$exists': True}},
                    include_metadata=True
                )

                for match in namespace_vectors.get('matches', []) or []:
                    metadata = match.get('metadata', {}) or {}
                    for potential_key in potential_keys:
                        if potential_key in metadata:
                            job_ids.add(metadata[potential_key])
            except Exception as e:
                print(f"Error in namespace {namespace}: {e}")

    return sorted(job_ids, key=lambda value: (type(value).__name__, value))

In [18]:
# Run the job-id scan against the connected index and list what it found.
if not pinecone_index:
    print("Pinecone index not initialized")
else:
    print("Querying Pinecone for uploaded job IDs...")
    job_ids = job_id_retrieval(pinecone_index)

    if not job_ids:
        print("No job IDs found in Pinecone")
    else:
        print(f"\nFound {len(job_ids)} unique job IDs in Pinecone:")
        # Number the entries starting at 1.
        for position, job_id in enumerate(job_ids, start=1):
            print(f"{position}. {job_id}")

Querying Pinecone for uploaded job IDs...
Checking namespace: 
No job IDs found in Pinecone


In [19]:
# Embedder bound to the SageMaker endpoint and AWS region from the config cell.
embedder = ResilientSageMakerEmbedder(endpoint_name=SAGEMAKER_ENDPOINT_NAME, region=AWS_REGION)

def semantic_search(query_title, pinecone_index, embedder=None, similarity_threshold=0.8, top_k=100):
    """
    Search for semantic similarity from vdb.

    The query text is embedded, the index is queried for its nearest vectors,
    and matches scoring below `similarity_threshold` are dropped.

    Args:
        query_title: The text to search for
        pinecone_index: The Pinecone index to search
        embedder: Embedding generator instance (optional). When omitted, a
            RandomEmbeddingGenerator from the project's `random_embedding`
            module is created as a fallback.
        similarity_threshold: Minimum similarity score (0-1) to include in results
        top_k: Number of nearest neighbours requested from the index before
            the threshold filter is applied

    Returns:
        List of dictionaries with job_id, text, and similarity score, in the
        order returned by the index. Any error is printed and an empty list is
        returned.
    """
    try:
        # Create embedder if not provided
        if embedder is None:
            from random_embedding import RandomEmbeddingGenerator
            print("No embedder provided. Creating RandomEmbeddingGenerator as fallback.")
            embedder = RandomEmbeddingGenerator()

        query_text = f"{query_title}"
        query_embedding = embedder.generate_embeddings([query_text])[0]

        search_results = pinecone_index.query(
            vector=query_embedding,
            top_k=top_k,  # Get enough results to filter
            include_metadata=True,
            include_values=False  # only need the scores, not the vectors
        )

        similar_text = []

        # Pinecone v6+ returns objects instead of dictionaries
        matches = getattr(search_results, 'matches', None) or []

        for match in matches:
            # Extract match properties. A missing or None score/metadata falls
            # back to a neutral value so one sparse match cannot abort the
            # whole search through the broad except below.
            score = getattr(match, 'score', None) or 0
            metadata = getattr(match, 'metadata', None) or {}

            # Skip if below threshold
            if score < similarity_threshold:
                continue

            # Extract job info from metadata
            job_id = metadata.get('job_id', 'unknown')
            raw_text = metadata.get('text', '')
            text = ('' if raw_text is None else str(raw_text)).replace('Text: ', '')

            similar_text.append({
                'job_id': job_id,
                'text': text,
                'similarity': score
            })

        return similar_text

    except Exception as e:
        print(f"Error in semantic search: {str(e)}")
        return []

Initialized ResilientSageMakerEmbedder for endpoint: e5-embeddings-pooled-2


In [21]:
# Test the function with a sample job title.
# Pinecone is already imported in the initialization cell, so only the client
# and index handle are re-created here.
pc = Pinecone(api_key=PINECONE_API_KEY)
pinecone_index = pc.Index(PINECONE_INDEX_NAME)

test_query = "Python"
# Passed to the search and echoed in the header so the two cannot drift apart.
test_threshold = 0.8
results = semantic_search(
    test_query,
    pinecone_index=pinecone_index,
    embedder=embedder,
    similarity_threshold=test_threshold,
)

# semantic_search keeps matches whose score is at or above the threshold.
print(f"Text similar to '{test_query}' (similarity >= {test_threshold}):")
for job in results:
    print(f"- {job['text']}, Score: {job['similarity']:.4f}")

Text similar to 'Python' (similarity > 0.8):
- Python, Score: 1.0000)
- Python Scripting, Score: 0.9769)
- Python development, Score: 0.9587)
- Use Python, Score: 0.9516)
- Numpy, Score: 0.9443)
- PySpark, Score: 0.9364)
- Scala, Score: 0.9331)
- Perl, Score: 0.9291)
- Typescript, Score: 0.9243)
- Leverage PySpark, Score: 0.9214)
- Bash, Score: 0.9189)
- Kotlin, Score: 0.9187)
- C#, Score: 0.9171)
- Various scripting languages, Score: 0.9163)
- TypeScript, Score: 0.9162)
- VB.NET, Score: 0.9157)
- C++, Score: 0.9139)
- Utilize programming languages, Score: 0.9137)
- Pandas, Score: 0.9126)
- PowerShell, Score: 0.9124)
- JavaScript, Score: 0.9105)
- Bash scripting, Score: 0.9099)
- Powershell, Score: 0.9090)
- Ruby, Score: 0.9087)
- Java, Score: 0.9078)
- Node.JS, Score: 0.9066)
- Javascript, Score: 0.9065)
- UNIX Shell Scripting, Score: 0.9052)
- CS, Score: 0.9049)
- .NET Core C#, Score: 0.9031)
- .NET, Score: 0.9020)
- .Net, Score: 0.9018)
- JAVA, Score: 0.9018)
- Postgres, Score: 0.90