# 03_embedding_generation
- Sets up the embedding generator using SageMaker
- Loads job descriptions from files
- Creates chunks from job descriptions using a custom chunker
- Generates embeddings for each chunk
- Uploads the embedded vectors to Pinecone
- Includes batch processing for efficiency

In [1]:
!pip install pinecone



In [2]:
# Show the installed pinecone package details (version, location, dependencies)
!pip show pinecone

Name: pinecone
Version: 6.0.2
Summary: Pinecone client and SDK
Home-page: https://www.pinecone.io
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages
Requires: certifi, pinecone-plugin-interface, python-dateutil, typing-extensions, urllib3
Required-by: 


In [3]:
import boto3
import json
import os
import pandas as pd
import uuid
import sys
import numpy as np
import time
from typing import List, Dict, Any, Union 
from tqdm import tqdm

# Put the parent of the current working directory on sys.path (once) so that
# modules living one level above this notebook can be imported.
module_path = os.path.abspath('..')
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)


In [4]:
# SageMaker endpoint configuration

SAGEMAKER_ENDPOINT_NAME = "e5-embeddings-pooled-2"
AWS_REGION = "us-east-1"

# Pinecone credentials are read from the environment so the secret never lives
# in the notebook or its version history. Export PINECONE_API_KEY before
# starting the kernel.
# SECURITY: an API key used to be hard-coded on this line; treat that key as
# leaked and rotate it in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
if not PINECONE_API_KEY:
    print("WARNING: PINECONE_API_KEY is not set; Pinecone calls will fail until it is exported.")
PINECONE_ENVIRONMENT = "us-east-1"  # matched with AWS region
# Important: use "sample-100-strings" if you don't need job_id to avoid dups,
# and "sample-100" if you may need strings by job_id.
PINECONE_INDEX_NAME = "mirra-filtering"

EMBEDDING_DIMENSION = 1024

print(f"Embedding dimension: {EMBEDDING_DIMENSION}")

Embedding dimension: 1024


In [5]:
# Initialize Pinecone with credentials
from pinecone import Pinecone

def initialize_pinecone():
    """Connect to the configured Pinecone index, creating it first if missing.

    Uses the module-level settings PINECONE_API_KEY, PINECONE_INDEX_NAME,
    EMBEDDING_DIMENSION and PINECONE_ENVIRONMENT.

    Returns:
        The handle returned by ``pc.Index(PINECONE_INDEX_NAME)``, or ``None``
        when any step raises (the error is printed, not re-raised).
    """
    # Local import keeps this definition usable regardless of cell import order.
    from pinecone import Pinecone, ServerlessSpec

    try:
        # Initialize Pinecone client
        pc = Pinecone(api_key=PINECONE_API_KEY)

        # Check if the index exists
        existing_indexes = pc.list_indexes().names()
        print(f"Available Pinecone indexes: {existing_indexes}")

        if PINECONE_INDEX_NAME not in existing_indexes:
            print(f"Creating new index '{PINECONE_INDEX_NAME}'...")

            # The current client requires a `spec` and has no top-level
            # `metadata_config` argument, so the previous call could never
            # succeed. NOTE(review): cloud="aws" is assumed from the
            # "matched with AWS region" setting; selective metadata indexing
            # (source_type, requirement_level, job_id, resume_id) was dropped
            # because it is presumably a pod-index-only option - confirm.
            pc.create_index(
                name=PINECONE_INDEX_NAME,
                dimension=EMBEDDING_DIMENSION,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region=PINECONE_ENVIRONMENT),
            )
            print(f"Index '{PINECONE_INDEX_NAME}' created successfully")

        # Connect to the index
        index = pc.Index(PINECONE_INDEX_NAME)
        print(f"Connected to Pinecone index: {PINECONE_INDEX_NAME}")

        return index

    except Exception as e:
        print(f"Error initializing Pinecone: {str(e)}")
        print("Please check your API key and environment settings.")
        # Return None to indicate initialization failed
        return None

# Open the index handle that the cells below share
pinecone_index = initialize_pinecone()

if pinecone_index:
    # Fetch the stats, then print the object as-is rather than serializing it
    index_stats = pinecone_index.describe_index_stats()
    print("Index statistics:")
    print(index_stats)

Available Pinecone indexes: ['mirra-embeddings', 'sample-100-strings', 'sample-100', 'mirra-filtering', 'mirra']
Connected to Pinecone index: mirra-filtering
Index statistics:
{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 112}},
 'total_vector_count': 112,
 'vector_type': 'dense'}


In [6]:
class ResilientSageMakerEmbedder:
    """
    A wrapper around SageMaker embedding endpoints with resilience features.

    "Resilience" here means that ``generate_embeddings`` never raises: when
    the endpoint call or response handling fails, the error is printed and a
    random unit vector is returned for every input text instead.
    NOTE(review): callers cannot tell real embeddings from fallback vectors,
    so a failure can silently put meaningless vectors into a search or index.
    """

    def __init__(self, endpoint_name, max_text_length=512, region="us-east-1", embedding_dim=None):
        """
        Initialize the embedder with a SageMaker endpoint.

        Args:
            endpoint_name: The name of the SageMaker endpoint
            max_text_length: Maximum number of characters kept per text
            region: AWS region for the endpoint
            embedding_dim: Length every returned vector is forced to.
                Defaults to the notebook-level EMBEDDING_DIMENSION.
        """
        self.sagemaker_runtime = boto3.client('sagemaker-runtime', region_name=region)
        self.endpoint_name = endpoint_name
        self.max_text_length = max_text_length
        self.embedding_dim = EMBEDDING_DIMENSION if embedding_dim is None else embedding_dim
        print(f"Initialized ResilientSageMakerEmbedder for endpoint: {endpoint_name}")

    def _prepare_text(self, text):
        """Clean and prepare text for the embedding model.

        Non-string input is converted with ``str``, runs of whitespace are
        collapsed to single spaces, a "passage: " prefix is added unless the
        text already starts with "passage:", and the result (prefix included)
        is cut to ``max_text_length`` characters.
        """
        if not isinstance(text, str):
            text = str(text)

        # Remove excessive whitespace
        text = ' '.join(text.split())

        # Add E5 prefix
        if not text.startswith("passage:"):
            text = f"passage: {text}"

        # Truncate if needed (character count, not token count)
        if len(text) > self.max_text_length:
            text = text[:self.max_text_length]

        return text

    def _pool_and_resize(self, raw_embedding):
        """Reduce one endpoint result to a flat list of ``embedding_dim`` numbers."""
        vector = np.array(raw_embedding)

        # Token-level (multi-dimensional) output: average over leading axes
        # until a single vector remains.
        while vector.ndim > 1:
            vector = np.mean(vector, axis=0)

        # Force the configured length: cut extra entries or zero-pad the tail.
        current_dim = vector.shape[0]
        if current_dim > self.embedding_dim:
            vector = vector[:self.embedding_dim]
        elif current_dim < self.embedding_dim:
            padded = np.zeros(self.embedding_dim)
            padded[:current_dim] = vector
            vector = padded

        return vector.tolist()

    def generate_embeddings(self, texts):
        """Generate embeddings using SageMaker E5 endpoint

        Args:
            texts: A list of texts; any non-list value is treated as a
                single text.

        Returns:
            One list of numbers per element of the endpoint response, each
            of length ``embedding_dim``. An empty input list returns ``[]``
            without calling the endpoint. On any error, one random unit
            vector per input text is returned instead.
        """
        # Ensure texts is a list
        if not isinstance(texts, list):
            texts = [texts]

        # Nothing to embed: skip the endpoint round-trip entirely
        if not texts:
            return []

        try:
            # Prepare input for E5 model
            prepared_texts = [self._prepare_text(text) for text in texts]

            # Prepare payload with explicit pooling parameters
            payload = {
                "inputs": prepared_texts,
                "parameters": {
                    "normalize": True,
                    "pooling": "mean",
                    "return_sentence_embedding": True
                }
            }

            # Call SageMaker endpoint
            response = self.sagemaker_runtime.invoke_endpoint(
                EndpointName=self.endpoint_name,
                ContentType='application/json',
                Body=json.dumps(payload)
            )

            # Parse response
            response_body = json.loads(response['Body'].read().decode('utf-8'))

            return [self._pool_and_resize(emb) for emb in response_body]

        except Exception as e:
            print(f"Error generating embeddings: {str(e)}")
            return [self._create_random_unit_vector() for _ in range(len(texts))]

    def _create_random_unit_vector(self, dim=None):
        """Create a random unit vector for fallback

        Args:
            dim: Vector length; defaults to ``embedding_dim`` so fallback
                vectors always match the size of real embeddings.
        """
        if dim is None:
            dim = self.embedding_dim
        vec = np.random.normal(0, 1, size=dim)
        return (vec / np.linalg.norm(vec)).tolist()

In [7]:
def job_id_retrieval(pinecone_index, embedding_dimension=None, max_top_k=10000):
    """Collect the distinct job identifiers stored in the index metadata.

    For every namespace reported by ``describe_index_stats`` and for each
    candidate metadata key, the index is queried with an all-zero vector and
    an ``$exists`` filter; every candidate-key value found in the returned
    matches is collected.

    Args:
        pinecone_index: Index handle exposing ``describe_index_stats`` and ``query``.
        embedding_dimension: Length of the zero probe vector. Defaults to the
            notebook-level EMBEDDING_DIMENSION.
        max_top_k: Upper bound applied to ``top_k``. NOTE(review): 10000 is
            assumed to be the service limit - confirm. Indexes with more
            vectors than this are only partially scanned.

    Returns:
        Sorted list of the unique identifier values found.
    """
    if embedding_dimension is None:
        embedding_dimension = EMBEDDING_DIMENSION

    job_ids = set()
    stats = pinecone_index.describe_index_stats()
    total_vectors = stats.get('total_vector_count', 0) or 0

    # Keep top_k inside [1, max_top_k]: 0 (empty index) and very large counts
    # would otherwise be sent to the service unchanged.
    top_k = max(1, min(total_vectors, max_top_k))

    # Try different metadata key variations
    potential_keys = ['job_id', 'jobId', 'id', 'job_identifier']
    probe_vector = [0] * embedding_dimension

    for namespace, _ns_stats in stats.get('namespaces', {}).items():
        print(f"Checking namespace: {namespace}")
        for key in potential_keys:
            # Guard each key separately so one failing query does not skip
            # the remaining keys of this namespace.
            try:
                namespace_vectors = pinecone_index.query(
                    vector=probe_vector,
                    top_k=top_k,
                    namespace=namespace,
                    filter={key: {'$exists': True}},
                    include_metadata=True
                )

                for match in namespace_vectors.get('matches', []):
                    # Metadata may be present but empty/None
                    metadata = match.get('metadata') or {}
                    for potential_key in potential_keys:
                        if potential_key in metadata:
                            job_ids.add(metadata[potential_key])
            except Exception as e:
                print(f"Error in namespace {namespace}: {e}")

    try:
        return sorted(job_ids)
    except TypeError:
        # Mixed value types (e.g. str and int) are not mutually orderable
        return sorted(job_ids, key=str)

In [8]:
# List every job ID currently stored in the index
if not pinecone_index:
    print("Pinecone index not initialized")
else:
    print("Querying Pinecone for uploaded job IDs...")
    job_ids = job_id_retrieval(pinecone_index)

    if not job_ids:
        print("No job IDs found in Pinecone")
    else:
        print(f"\nFound {len(job_ids)} unique job IDs in Pinecone:")
        for position, job_id in enumerate(job_ids, start=1):
            print(f"{position}. {job_id}")

Querying Pinecone for uploaded job IDs...
Checking namespace: 

Found 100 unique job IDs in Pinecone:
1. 06ba6c67-c4eb-4358-aa2a-64dc8d186198
2. 08d80a05-d86e-4fbf-9203-502dfad0f0f2
3. 0b3eccfd-15c1-4a4a-b9f5-87d0b26a6de7
4. 0c9591b9-68aa-4d93-81a4-c090753a43d5
5. 1154d66d-c922-48ea-bb60-59c719b3c77d
6. 128354d9-d13b-413b-8c3c-5dc85a608cd7
7. 137098ec-6b03-47cc-adcc-4f2d672a20f9
8. 137510b3-5847-44c2-a16a-61558634845a
9. 158df241-dc50-4aff-86de-114acd3bd2e5
10. 181b9a3d-71ca-420e-8ea8-08b348ad69f2
11. 18df64e2-17f0-45b0-8759-ba1b89fad7dd
12. 195f1df8-b444-4118-ad8f-b2d8a9959c34
13. 1a8b8afd-5bf1-43a5-abb8-b533e7ba6fdc
14. 1aa0834d-5e19-4a9d-a0bf-e58d417ea413
15. 23092c01-f933-4711-ac83-cbc310045763
16. 2510ddf5-282f-4f1c-bc72-44e34c8f67d8
17. 27b81e47-a357-4bc9-bf00-3a854b2c03f1
18. 28b3cec4-9cc9-4817-b133-181c8958cc3a
19. 29cd66f5-7ab3-44a3-8633-29dfc9557a08
20. 2b2477c7-fafb-411f-b890-11cbb24d02e0
21. 2bc7d30f-d8db-48e2-a76b-040ef73044a5
22. 2ce38b1b-2ff5-44f3-b8af-45dbd5e8c4fc
23. 2

In [9]:
# Embedder instance bound to the configured SageMaker endpoint and region
embedder = ResilientSageMakerEmbedder(endpoint_name=SAGEMAKER_ENDPOINT_NAME, region=AWS_REGION)

def semantic_search(query_title, pinecone_index, embedder=None, similarity_threshold=0.8, top_k=100):
    """
    Search for semantic similarity from vdb.

    The query text is embedded, the index is asked for the ``top_k`` nearest
    vectors, and matches scoring below ``similarity_threshold`` are dropped.

    Args:
        query_title: The text to search for
        pinecone_index: The Pinecone index to search
        embedder: Embedding generator instance (optional). When omitted, a
            ``RandomEmbeddingGenerator`` from the ``random_embedding`` module
            is created as a fallback.
        similarity_threshold: Minimum similarity score (0-1) to include in results
        top_k: Number of candidates requested from the index before filtering

    Returns:
        List of dictionaries with job_id, text, and similarity score, in the
        order the index returned them. An empty list is returned when any
        error occurs (the error is printed).
    """
    try:
        if embedder is None:
            from random_embedding import RandomEmbeddingGenerator
            print("No embedder provided. Creating RandomEmbeddingGenerator as fallback.")
            embedder = RandomEmbeddingGenerator()

        # NOTE(review): with this notebook's ResilientSageMakerEmbedder the
        # query is embedded with a "passage:" prefix; E5 models presumably
        # expect "query:" for search queries - confirm against the model card.
        query_text = f"{query_title}"
        query_embedding = embedder.generate_embeddings([query_text])[0]

        search_results = pinecone_index.query(
            vector=query_embedding,
            top_k=top_k,  # Get enough results to filter
            include_metadata=True,
            include_values=False  # only need the scores, not the vectors
        )

        similar_text = []

        matches = getattr(search_results, 'matches', [])

        for match in matches:
            # Extract match properties; metadata/score can be present but None
            score = getattr(match, 'score', 0) or 0
            metadata = getattr(match, 'metadata', None) or {}

            if score < similarity_threshold:
                continue

            # Extract job info from metadata
            job_id = metadata.get('job_id', 'unknown')
            # str() guards against non-string metadata values before .replace
            text = str(metadata.get('text', '') or '').replace('Text: ', '')

            similar_text.append({
                'job_id': job_id,
                'text': text,
                'similarity': score
            })

        return similar_text

    except Exception as e:
        print(f"Error in semantic search: {str(e)}")
        return []

Initialized ResilientSageMakerEmbedder for endpoint: e5-embeddings-pooled-2


In [14]:
# Test the function with a sample job title
# (the index handle is re-created here so this cell can be re-run on its own)
from pinecone import Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
pinecone_index = pc.Index(PINECONE_INDEX_NAME)

test_query = "Data Scientist, visa = yes"
# Single source of truth for the cut-off used by the search and shown in the header
test_threshold = 0.8
results = semantic_search(
    test_query,
    pinecone_index=pinecone_index,
    embedder=embedder,
    similarity_threshold=test_threshold
)

print(f"Text similar to '{test_query}' (similarity > {test_threshold}):")
for job in results:
    # The previous format string ended with a stray ")" after the score
    print(f"- {job['job_id']}, Score: {job['similarity']:.4f}")

Text similar to 'Data Scientist, visa = yes' (similarity > 0.8):
- 63b55b4d-1162-4151-8878-d55c1262156a, Score: 0.8870)
- 96051542-318e-4fd1-aea9-035cd9a74fe5, Score: 0.8628)
- f6ee0835-b07f-4ccf-8109-1d2c5b309061, Score: 0.8580)
- 2b2477c7-fafb-411f-b890-11cbb24d02e0, Score: 0.8565)
- d9a357a7-5e50-4bf8-a771-ef50414c2bac, Score: 0.8538)
- ac01e246-fa9b-4e5f-a7ed-fdd9cd1995b3, Score: 0.8514)
- 90822d5e-c54f-49ad-a345-17a00f8b0c56, Score: 0.8510)
- 3d6abfd2-971f-4279-bd5c-e2608afb27b8, Score: 0.8510)
- 639fb093-26a8-4745-9656-ee93f3b79f19, Score: 0.8503)
- 28b3cec4-9cc9-4817-b133-181c8958cc3a, Score: 0.8492)
- c50cd8cd-60d1-48a7-8db8-7e828210c020, Score: 0.8479)
- 2510ddf5-282f-4f1c-bc72-44e34c8f67d8, Score: 0.8457)
- 7837d31d-8065-4c42-8852-5a6cdaf6335e, Score: 0.8444)
- 2bc7d30f-d8db-48e2-a76b-040ef73044a5, Score: 0.8443)
- d6e54f22-efe1-4dc7-8e5a-cb8618634691, Score: 0.8441)
- bf9aa16e-181d-46fd-9b34-999b21221885, Score: 0.8440)
- abcf05ac-0424-4858-8f1b-d5e6678a2889, Score: 0.8435)


In [20]:
def print_job_metadata_pinecone_v6(job_id, pinecone_index):
    """
    Print metadata for a specific job ID using Pinecone v6 API

    The vector is looked up under the ID ``job_<job_id>``. Its metadata is
    printed one ``key: value`` line per field. Nothing is returned; errors
    are printed together with a traceback instead of being raised.

    Args:
        job_id: The job ID to look up
        pinecone_index: Pinecone index to fetch the data from
    """
    try:
        # Format the vector ID as expected in Pinecone
        vector_id = f"job_{job_id}"

        # Fetch the vector data (Pinecone v6 style)
        response = pinecone_index.fetch(ids=[vector_id])

        # For Pinecone v6, check for 'vectors' key
        vectors = getattr(response, 'vectors', None)
        if not vectors or vector_id not in vectors:
            print(f"No data found for job ID: {job_id}")
            return

        # Get the metadata carried by the vector object (may be absent or None)
        raw_metadata = getattr(vectors[vector_id], 'metadata', None)
        if raw_metadata is None:
            print(f"No metadata found for job ID: {job_id}")
            return

        # Normalize metadata to a plain dictionary. A dict must be read by its
        # keys: scanning dir() of a dict only finds methods, so every field
        # was previously filtered out and nothing got printed.
        if isinstance(raw_metadata, dict):
            metadata = dict(raw_metadata)
        elif callable(getattr(raw_metadata, 'to_dict', None)):
            metadata = raw_metadata.to_dict()
        else:
            # Attribute-style object: keep public, non-callable attributes
            metadata = {}
            for attr in dir(raw_metadata):
                if not attr.startswith('_') and not callable(getattr(raw_metadata, attr)):
                    metadata[attr] = getattr(raw_metadata, attr)

        # Print the metadata
        print(f"\nMetadata for Job ID: {job_id}")
        print("-" * 50)

        # Print each metadata field
        for key, value in metadata.items():
            print(f"{key}: {value}")

    except Exception as e:
        print(f"Error fetching job metadata: {str(e)}")
        import traceback
        traceback.print_exc()

In [32]:
# NOTE(review): `print_job_metadata_by_id` is not defined in any cell visible
# here (the function defined above is `print_job_metadata_pinecone_v6`), so
# this call presumably relies on a definition from another cell - confirm it
# still exists, otherwise this cell fails on a fresh kernel.
print_job_metadata_by_id("63b55b4d-1162-4151-8878-d55c1262156a", pinecone_index)

Response type: <class 'pinecone.data.dataclasses.fetch_response.FetchResponse'>
Response structure:
FetchResponse(namespace='', vectors={'job_63b55b4d-1162-4151-8878-d55c1262156a': Vector(id='job_63b55b4d-1162-4151-8878-d55c1262156a', values=[-0.0832305402, 0.0841527, -1.17435408, -0.747047, 0.343526632, -0.0257578939, -0.610736668, 0.812522233, 0.969810426, -0.229124054, 0.878715694, 0.483227432, -0.195759773, -0.253812283, -0.0362168103, -0.298034787, -1.36627913, 0.0427451059, -0.230974421, -0.0540922657, 0.90534544, -0.227537885, -0.3869358, -0.73028183, -0.00923792, -0.345535874, -0.542204857, -0.864939034, -0.530479908, -0.296251804, 0.305368751, -0.214196518, 0.394803464, -0.864780486, -0.311869442, 0.680915833, 0.511995316, 0.52683264, -1.09753513, 0.810092568, 0.0165299345, 0.270888448, 0.325030744, -0.611388683, -0.363977462, 0.211122558, 0.295685768, -0.196364716, -0.397159487, 0.666268468, 0.349324942, 0.117929146, -0.274980247, -0.527582765, -0.344225436, 0.219872817, -0.5