# 03_embedding_generation
- Sets up the embedding generator using either Jina API or SageMaker
- Loads job descriptions from files
- Creates chunks from job descriptions using a custom chunker
- Generates embeddings for each chunk
- Uploads the embedded vectors to Pinecone
- Includes batch processing for efficiency

In [1]:
!pip install pinecone



In [2]:
!pip show pinecone

Name: pinecone
Version: 6.0.2
Summary: Pinecone client and SDK
Home-page: https://www.pinecone.io
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages
Requires: certifi, pinecone-plugin-interface, python-dateutil, typing-extensions, urllib3
Required-by: 


In [3]:
import boto3
import json
import os
import pandas as pd
import uuid
import sys
import numpy as np
import time
from typing import List, Dict, Any
from tqdm import tqdm

# Put the parent directory on the import path (once) so modules that live
# one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
already_registered = any(entry == module_path for entry in sys.path)
if not already_registered:
    sys.path.append(module_path)

In [4]:
# SageMaker endpoint configuration
SAGEMAKER_ENDPOINT_NAME = "e5-large-instruct-endpoint-325"
AWS_REGION = "us-east-1"

# Pinecone credentials: read the API key from the environment so the secret is
# never stored in the notebook. When the variable is unset this falls back to
# an empty string, which is the value the notebook previously hardcoded.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_ENVIRONMENT = "us-east-1"  # matched with AWS region
PINECONE_INDEX_NAME = "mirra-filtering"

# Vector length used for the Pinecone index and for zero-vector fallbacks
EMBEDDING_DIMENSION = 1024

print(f"Embedding dimension: {EMBEDDING_DIMENSION}")

Embedding dimension: 1024


In [5]:
class E5EmbeddingGenerator:
    """Generate text embeddings by invoking a SageMaker endpoint that serves an E5 model."""

    def __init__(self, endpoint_name=SAGEMAKER_ENDPOINT_NAME, region=AWS_REGION):
        """
        Create the SageMaker runtime client used for inference calls.

        Args:
            endpoint_name: Name of the SageMaker endpoint for E5 embeddings
            region: AWS region where the endpoint is deployed
        """
        self.endpoint_name = endpoint_name
        self.region = region
        self.sagemaker_runtime = boto3.client('sagemaker-runtime', region_name=region)
        print(f"Initialized SageMaker embedder for endpoint: {endpoint_name}")

    @staticmethod
    def _pool_embedding(raw_embedding):
        """
        Reduce one item of the endpoint response to a flat list of floats.

        Accepted shapes are (dim,), (tokens, dim) and (1, tokens, dim); token
        level outputs are mean-pooled over the token axis.

        Raises:
            ValueError: for any other shape, including a 3-D item whose leading
                axis is larger than 1 (averaging that axis would return a
                nested list instead of a vector).
        """
        emb_array = np.array(raw_embedding)

        # Drop a singleton batch axis: (1, tokens, dim) -> (tokens, dim)
        if emb_array.ndim == 3 and emb_array.shape[0] == 1:
            emb_array = emb_array[0]

        if emb_array.ndim == 2:
            # Average over the token axis to get a single (dim,) vector
            return np.mean(emb_array, axis=0).tolist()
        if emb_array.ndim == 1:
            return emb_array.tolist()

        raise ValueError(f"Unexpected embedding shape: {emb_array.shape}")

    def generate_embeddings(self, texts, prefix="passage: "):
        """
        Generate embeddings using the SageMaker E5 endpoint.

        Args:
            texts: String or list of strings to embed
            prefix: Text prepended to every input before it is sent to the
                endpoint. The default marks inputs as passages; callers that
                embed search queries can pass a different prefix.
                NOTE(review): confirm the prompt format expected by the
                deployed model variant.

        Returns:
            List with one embedding vector (list of floats) per input text.
            An empty input returns an empty list without calling the endpoint.
            If anything fails, the error is printed and one zero vector of
            length EMBEDDING_DIMENSION is returned per input, so callers must
            treat an all-zero vector as "embedding failed".
        """
        # Ensure texts is a list
        if not isinstance(texts, list):
            texts = [texts]

        # Nothing to embed: skip the network round trip
        if not texts:
            return []

        try:
            prefixed_texts = [prefix + text for text in texts]

            # Request payload; "normalize" is passed through to the endpoint
            payload = {
                "inputs": prefixed_texts,
                "normalize": True
            }

            response = self.sagemaker_runtime.invoke_endpoint(
                EndpointName=self.endpoint_name,
                ContentType='application/json',
                Body=json.dumps(payload)
            )
            response_body = json.loads(response['Body'].read().decode('utf-8'))

            embeddings = [self._pool_embedding(emb) for emb in response_body]

            # Callers pair embeddings with inputs by position, so a count
            # mismatch would silently attach vectors to the wrong texts.
            if len(embeddings) != len(texts):
                raise ValueError(
                    f"Endpoint returned {len(embeddings)} embeddings for {len(texts)} inputs"
                )

            return embeddings

        except Exception as e:
            print(f"Error generating embeddings with SageMaker: {str(e)}")
            # Zero-vector fallback; build a separate list per input so that
            # mutating one result cannot affect the others.
            return [[0.0] * EMBEDDING_DIMENSION for _ in texts]

In [6]:
# Initialize Pinecone
def initialize_pinecone():
    """
    Connect to the Pinecone index named by PINECONE_INDEX_NAME, creating it first if needed.

    A missing index is created as a serverless index on AWS in the region given
    by PINECONE_ENVIRONMENT, with EMBEDDING_DIMENSION dimensions and the cosine
    metric.

    Returns:
        The Pinecone index handle, or None if any step fails (the error is printed).
    """
    try:
        # Imported here so the rest of the notebook loads without the SDK
        from pinecone import Pinecone, ServerlessSpec
        pc = Pinecone(api_key=PINECONE_API_KEY)

        # Check if the index exists
        existing_indexes = pc.list_indexes().names()
        print(f"Available Pinecone indexes: {existing_indexes}")

        if PINECONE_INDEX_NAME not in existing_indexes:
            print(f"Creating new index '{PINECONE_INDEX_NAME}'...")

            # The v3+ SDK requires `spec` and does not accept a top-level
            # `metadata_config`, so the previous call always raised and this
            # function returned None whenever the index was missing.
            # Fields this pipeline stores for filtering: source_type, job_id,
            # industry, location, experience_level, visa_sponsorship.
            # NOTE(review): assumes a serverless index is wanted; selective
            # metadata indexing is a pod-based option in this SDK version.
            pc.create_index(
                name=PINECONE_INDEX_NAME,
                dimension=EMBEDDING_DIMENSION,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region=PINECONE_ENVIRONMENT)
            )
            print(f"Index '{PINECONE_INDEX_NAME}' created successfully")

        # Connect to the index
        index = pc.Index(PINECONE_INDEX_NAME)
        print(f"Connected to Pinecone index: {PINECONE_INDEX_NAME}")

        return index

    except Exception as e:
        print(f"Error initializing Pinecone: {str(e)}")
        print("Please check your API key and environment settings.")
        return None

In [7]:
def load_job_descriptions(limit=1, bucket="mirra-matcher-325", prefix="data/processed/jobs/"):
    """
    Load job descriptions stored as JSON objects in S3.

    The listing is paginated and the limit is applied to loaded ``.json``
    objects, so non-JSON keys under the prefix no longer consume the budget
    and limits larger than one listing page are honored. An object that
    cannot be read or parsed is reported and skipped.

    Args:
        limit: Maximum number of jobs to load (None loads every JSON object)
        bucket: S3 bucket name
        prefix: S3 prefix for job files

    Returns:
        List of job description dictionaries; each has a "job_id" key, taken
        from the file name when the JSON does not provide one.
    """
    jobs = []

    # Load from S3
    s3_client = boto3.client('s3')

    def _limit_reached():
        return limit is not None and len(jobs) >= limit

    try:
        print(f"Loading job descriptions from S3: s3://{bucket}/{prefix}")
        paginator = s3_client.get_paginator('list_objects_v2')
        saw_any_key = False

        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            for obj in page.get('Contents', []):
                saw_any_key = True
                if _limit_reached():
                    break

                key = obj['Key']
                if not key.endswith(".json"):
                    continue

                try:
                    s3_object = s3_client.get_object(Bucket=bucket, Key=key)
                    job_data = json.loads(s3_object['Body'].read().decode('utf-8'))
                    if not isinstance(job_data, dict):
                        raise ValueError("expected a JSON object at the top level")
                except Exception as e:
                    # One unreadable file should not discard the rest
                    print(f"Error loading {key}: {str(e)}")
                    continue

                # Add job_id if not present
                if "job_id" not in job_data:
                    job_data["job_id"] = key.split("/")[-1].replace(".json", "")
                jobs.append(job_data)
                print(f"Loaded job: {job_data.get('job_id')}")

            if _limit_reached():
                break

        if not saw_any_key:
            print(f"No job files found in S3 bucket {bucket}/{prefix}")
    except Exception as e:
        print(f"Error loading files from S3: {str(e)}")

    print(f"Loaded {len(jobs)} job descriptions from S3")
    return jobs

In [8]:
def _first_or_default(value, default):
    """Unwrap a single-value list and substitute `default` for missing or blank values."""
    if isinstance(value, list):
        value = value[0] if value else None
    if value is None:
        return default
    if isinstance(value, str) and not value.strip():
        return default
    return value


def _location_text(location_data, default="Remote"):
    """Render a location (dict, list whose first item is used, or scalar) as 'city, state, country'."""
    if isinstance(location_data, list):
        location_data = location_data[0] if location_data else None
    if isinstance(location_data, dict):
        parts = [location_data.get(key) for key in ("city", "state", "country")]
        text = ", ".join(str(part) for part in parts if part)
        # A location with no usable fields falls back to the default
        return text or default
    if location_data:
        return str(location_data)
    return default


def _experience_level_from_skills(hard_skills):
    """Map the largest 'minyears' among the skills to Senior (>=7), Mid-level (>=3) or Entry-level."""
    max_years = 0
    for skill in hard_skills:
        if not isinstance(skill, dict):
            continue
        min_years = _first_or_default(skill.get("minyears"), 0)

        # Convert to numeric if needed; unparseable values count as 0
        if not isinstance(min_years, (int, float)):
            try:
                min_years = float(min_years)
            except (ValueError, TypeError):
                min_years = 0

        max_years = max(max_years, min_years)

    if max_years >= 7:
        return "Senior"
    if max_years >= 3:
        return "Mid-level"
    return "Entry-level"


def format_job_for_embedding(job):
    """
    Format a job JSON into a comprehensive text representation for embedding.

    Every attribute may be a scalar or a list (the first element is used).
    Missing, null or blank attributes are replaced by a default, and a null
    "details" or "mandatory" section is treated as empty.

    Args:
        job: Job description dictionary

    Returns:
        Multi-line string with one "Label: value" line per attribute
    """
    details = job.get("details")
    if not isinstance(details, dict):
        details = {}
    mandatory = job.get("mandatory")
    if not isinstance(mandatory, dict):
        mandatory = {}
    hard_skills = mandatory.get("hard_skills")
    if not isinstance(hard_skills, list):
        hard_skills = []
    education_entries = mandatory.get("education")
    if not isinstance(education_entries, list):
        education_entries = []

    job_title = _first_or_default(details.get("job_title"), "Unknown")
    location = _location_text(details.get("location"))
    company_name = _first_or_default(details.get("company_name"), "Unknown")
    employment_type = _first_or_default(details.get("employment_type"), "Full-time")

    # Experience level is inferred from the required years in hard skills
    experience_level = _experience_level_from_skills(hard_skills)

    salary_range = _first_or_default(details.get("salary_range"), "Not specified")
    date_posted = _first_or_default(details.get("date_posted"), "Recent")
    industry = _first_or_default(details.get("industry"), "Technology")
    visa_sponsorship = _first_or_default(details.get("visa_sponsorship"), "Not specified")

    # Skills required
    required_skills = []
    for skill in hard_skills:
        if not isinstance(skill, dict):
            continue
        skill_name = _first_or_default(skill.get("skill"), "")
        min_years = _first_or_default(skill.get("minyears"), 0)
        if skill_name:
            required_skills.append(f"{skill_name} ({min_years} years)")

    required_skills_text = ", ".join(required_skills) if required_skills else "Not specified"

    # Education requirements
    education = []
    for edu in education_entries:
        if not isinstance(edu, dict):
            continue
        level = _first_or_default(edu.get("education_level"), "")
        field = _first_or_default(edu.get("field_of_study"), "")
        if level and field:
            education.append(f"{level} in {field}")
        elif level or field:
            # Only one of the two is known: show it without the " in " joiner
            education.append(level or field)

    education_text = ", ".join(education) if education else "Not specified"

    # Combine all attributes into a single comprehensive text
    job_text = f"""
Job Title: {job_title}
Company: {company_name}
Location: {location}
Employment Type: {employment_type}
Experience Level: {experience_level}
Salary Range: {salary_range}
Date Posted: {date_posted}
Industry/Domain: {industry}
Visa Sponsorship: {visa_sponsorship}
Required Skills: {required_skills_text}
Education Requirements: {education_text}
    """

    return job_text.strip()

In [9]:
def _metadata_scalar(value, default):
    """Return the first element of a list (or the value itself), or `default` when missing or blank."""
    if isinstance(value, list):
        value = value[0] if value else None
    if value is None or (isinstance(value, str) and not value.strip()):
        return default
    return value


def _metadata_location(location_data):
    """Build the 'city, state' filter value, or 'Unknown' when neither part is available."""
    if isinstance(location_data, list) and location_data:
        location_data = location_data[0]
    if isinstance(location_data, dict):
        parts = [location_data.get("city"), location_data.get("state")]
        text = ", ".join(str(part) for part in parts if part)
        return text or "Unknown"
    return "Unknown"


def _metadata_experience_level(hard_skills):
    """Classify by the largest 'minyears' value: Senior (>=7), Mid-level (>=3), else Entry-level."""
    max_years = 0
    for skill in hard_skills:
        if not isinstance(skill, dict):
            continue
        min_years = _metadata_scalar(skill.get("minyears"), 0)
        if not isinstance(min_years, (int, float)):
            try:
                min_years = float(min_years)
            except (ValueError, TypeError):
                min_years = 0
        max_years = max(max_years, min_years)

    if max_years >= 7:
        return "Senior"
    if max_years >= 3:
        return "Mid-level"
    return "Entry-level"


def _build_job_metadata(job):
    """Extract the metadata stored with each job vector and used for filtering."""
    details = job.get("details")
    if not isinstance(details, dict):
        details = {}
    mandatory = job.get("mandatory")
    if not isinstance(mandatory, dict):
        mandatory = {}
    hard_skills = mandatory.get("hard_skills")
    if not isinstance(hard_skills, list):
        hard_skills = []

    visa_value = _metadata_scalar(details.get("visa_sponsorship"), "No")
    offers_visa = str(visa_value).lower() in ["yes", "available", "offered"]

    return {
        "job_id": job.get("job_id") or str(uuid.uuid4()),
        "source_type": "job_description",
        "job_title": _metadata_scalar(details.get("job_title"), "Unknown"),
        "company": _metadata_scalar(details.get("company_name"), "Unknown"),
        "location": _metadata_location(details.get("location", {})),
        "experience_level": _metadata_experience_level(hard_skills),
        "industry": _metadata_scalar(details.get("industry"), "Technology"),
        "visa_sponsorship": "Yes" if offers_visa else "No",
    }


def process_jobs_batch(jobs, embedder, pinecone_index, batch_size=1, upload_batch_size=1):
    """
    Process a batch of jobs: format them, generate embeddings, and upload to Pinecone.

    Args:
        jobs: List of job dictionaries
        embedder: Object exposing generate_embeddings(list_of_texts)
        pinecone_index: Pinecone index (only its upsert method is used)
        batch_size: Number of texts sent to the embedder per call
        upload_batch_size: Number of vectors sent per upsert call. Defaults to
            1, the value this function always used; larger values reduce the
            number of requests.

    Returns:
        Number of vectors uploaded
    """
    # range() below cannot take a step of 0
    batch_size = max(1, int(batch_size))
    upload_batch_size = max(1, int(upload_batch_size))

    job_texts = []
    job_metadata = []

    # Format each job and prepare its metadata
    print(f"Formatting {len(jobs)} jobs for embedding...")
    for job in jobs:
        try:
            job_text = format_job_for_embedding(job)
            metadata = _build_job_metadata(job)
        except Exception as e:
            failed_id = job.get('job_id', 'unknown') if isinstance(job, dict) else 'unknown'
            print(f"Error processing job {failed_id}: {str(e)}")
            # Continue with next job instead of failing the entire batch
            continue

        # Append both lists together, only after both steps succeeded, so
        # texts and metadata stay aligned by position.
        job_texts.append(job_text)
        job_metadata.append(metadata)

    # Generate embeddings in batches
    vectors_to_upload = []
    total_batches = (len(job_texts) - 1) // batch_size + 1 if job_texts else 1
    for start in range(0, len(job_texts), batch_size):
        batch_texts = job_texts[start:start + batch_size]
        batch_metadata = job_metadata[start:start + batch_size]

        print(f"Processing batch {start // batch_size + 1}/{total_batches}...")

        # Generate embeddings for the batch
        batch_embeddings = embedder.generate_embeddings(batch_texts)
        if len(batch_embeddings) != len(batch_texts):
            print(
                f"Warning: expected {len(batch_texts)} embeddings but received "
                f"{len(batch_embeddings)}; unmatched items are skipped"
            )

        # Create vectors for Pinecone
        for text, metadata, embedding in zip(batch_texts, batch_metadata, batch_embeddings):
            # An all-zero vector is what the embedder returns when embedding
            # generation failed; do not store it as if it were a real embedding.
            if embedding is None or not any(embedding):
                print(f"Skipping job {metadata['job_id']}: no usable embedding was generated")
                continue

            vectors_to_upload.append({
                "id": f"job_{metadata['job_id']}",
                "values": embedding,
                "metadata": {
                    "text": text[:1000],  # Truncate text to avoid metadata size limits
                    **metadata
                }
            })

    # Upload vectors to Pinecone
    total_uploaded = 0

    if vectors_to_upload:
        for i in tqdm(range(0, len(vectors_to_upload), upload_batch_size), desc="Uploading to Pinecone"):
            batch = vectors_to_upload[i:i + upload_batch_size]
            try:
                pinecone_index.upsert(vectors=batch)
                total_uploaded += len(batch)
            except Exception as e:
                print(f"Error uploading batch to Pinecone: {str(e)}")
    else:
        print("No vectors to upload")

    return total_uploaded

In [None]:
def embed_jobs_main(limit=100, batch_size=1):
    """
    Load jobs, generate embeddings, and upload them to Pinecone.

    Args:
        limit: Maximum number of job descriptions to load
        batch_size: Number of jobs handled per processing batch; the same
            value is used as the embedding batch size. The default of 1 keeps
            the previous behavior of one job per embedding call.
    """
    # Initialize embedding generator
    embedder = E5EmbeddingGenerator()

    # Initialize Pinecone
    pinecone_index = initialize_pinecone()

    if pinecone_index is None:
        print("Failed to initialize Pinecone. Exiting.")
        return

    # Test the embedder
    print("Testing embedder")
    try:
        test_text = "Job Title: Software Engineer Location: San Francisco"
        embedding = embedder.generate_embeddings([test_text])
    except Exception as e:
        print(f"Error testing embedding generation: {str(e)}")
        return

    if not (isinstance(embedding, list) and len(embedding) > 0):
        print("Warning: Unexpected embedding format")
        return

    test_vector = embedding[0]
    if len(test_vector) != EMBEDDING_DIMENSION:
        print(f"Warning: Embedding dimension {len(test_vector)} does not match the expected {EMBEDDING_DIMENSION}")
        return
    if not any(test_vector):
        # The embedder reports failures by returning zero vectors, so an
        # all-zero result means the endpoint call did not succeed.
        print("Warning: Embedder returned an all-zero vector; embedding generation failed")
        return

    print("Embedding generation successful!")
    print(f"Embedding dimension: {len(test_vector)}")
    print(f"Embedding shape: {np.array(test_vector).shape}")
    print(f"Embedding preview (first 5 values): {test_vector[:5]}")

    # Load job descriptions
    jobs = load_job_descriptions(limit=limit)

    if not jobs:
        print("No jobs loaded. Exiting.")
        return

    # Process and embed jobs
    print(f"Processing {len(jobs)} jobs...")

    batch_size = max(1, int(batch_size))
    total_batches = (len(jobs) - 1) // batch_size + 1
    total_vectors = 0

    for i in range(0, len(jobs), batch_size):
        job_batch = jobs[i:i + batch_size]
        print(f"\nProcessing batch {i // batch_size + 1}/{total_batches} with {len(job_batch)} jobs")

        vectors_uploaded = process_jobs_batch(
            job_batch,
            embedder,
            pinecone_index,
            batch_size=batch_size
        )

        total_vectors += vectors_uploaded
        print(f"Uploaded {vectors_uploaded} vectors in this batch")

    print("\nCompleted job embedding process.")
    print(f"Total jobs processed: {len(jobs)}")
    print(f"Total vectors uploaded: {total_vectors}")

    # Verify index contents
    try:
        index_stats = pinecone_index.describe_index_stats()
        print("\nFinal Pinecone index statistics:")
        print(index_stats)
    except Exception as e:
        print(f"Error retrieving index stats: {str(e)}")

# Run the main function if executed directly
if __name__ == "__main__":
    embed_jobs_main()

In [12]:
# Notebook-level embedder instance; the search demo cell passes it as `embedder=embedder`.
embedder = E5EmbeddingGenerator()

def semantic_job_title_search(query_title, pinecone_index, embedder=None, similarity_threshold=0.8, top_k=100):
    """
    Search for semantically similar job titles in Pinecone.

    Args:
        query_title: The job title to search for
        pinecone_index: The Pinecone index to search
        embedder: Embedding generator instance (optional). When omitted, a
            RandomEmbeddingGenerator from the `random_embedding` module is used.
        similarity_threshold: Minimum similarity score (0-1) to include in results
        top_k: Number of nearest matches requested before threshold filtering

    Returns:
        List of dictionaries with job_id, job_title, and similarity score, in
        the order returned by the index. Returns an empty list if the search
        fails (the error is printed).
    """
    title_prefix = "job title:"

    def _field(obj, name, default):
        # Results may be attribute-style objects or plain dictionaries
        if isinstance(obj, dict):
            return obj.get(name, default)
        return getattr(obj, name, default)

    try:
        # Create embedder if not provided
        if embedder is None:
            from random_embedding import RandomEmbeddingGenerator
            print("No embedder provided. Creating RandomEmbeddingGenerator as fallback.")
            embedder = RandomEmbeddingGenerator()

        # 1. Create embedding for the query job title
        query_text = f"{query_title}"
        query_embedding = embedder.generate_embeddings([query_text])[0]

        # An all-zero vector carries no information to search with
        if not any(query_embedding):
            print("Error in semantic search: query embedding is all zeros (embedding generation failed)")
            return []

        # 2. Query Pinecone
        search_results = pinecone_index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
            include_values=False  # only need the scores, not the vectors
        )

        # 3. Filter and format results
        similar_jobs = []
        matches = _field(search_results, 'matches', []) or []

        for match in matches:
            score = _field(match, 'score', 0) or 0

            # Skip if below threshold
            if score < similarity_threshold:
                continue

            # A match without metadata must not abort the whole search
            metadata = _field(match, 'metadata', {}) or {}

            job_id = metadata.get('job_id', 'unknown')
            job_title = metadata.get('job_title')
            if job_title is None:
                # Fall back to the first line of the stored text
                text_lines = str(metadata.get('text', '')).strip().splitlines()
                job_title = text_lines[0] if text_lines else ''

            # Remove a leading "Job Title:" label regardless of letter case
            job_title = str(job_title).strip()
            if job_title.lower().startswith(title_prefix):
                job_title = job_title[len(title_prefix):].strip()

            similar_jobs.append({
                'job_id': job_id,
                'job_title': job_title,
                'similarity': score
            })

        return similar_jobs

    except Exception as e:
        print(f"Error in semantic search: {str(e)}")
        return []

Initialized SageMaker embedder for endpoint: e5-large-instruct-endpoint-325


In [14]:
# Test the function with a sample job title
from pinecone import Pinecone

pc = Pinecone(api_key=PINECONE_API_KEY)
pinecone_index = pc.Index(PINECONE_INDEX_NAME)

test_query = "Senior Software Engineer"
# Single source of truth for the cut-off, so the printed header always
# matches the threshold that was actually applied.
similarity_threshold = 0.8

results = semantic_job_title_search(
    test_query,
    pinecone_index=pinecone_index,
    embedder=embedder,
    similarity_threshold=similarity_threshold
)

print(f"Jobs similar to '{test_query}' (similarity > {similarity_threshold}):")
if not results:
    # The search returns an empty list both for "no matches" and on errors
    print("(no results)")
for job in results:
    print(f"- {job['job_title']} (ID: {job['job_id']}, Score: {job['similarity']:.4f})")

Jobs similar to 'Senior Software Engineer' (similarity > 0.8):
- Back-end Software Engineer (Hybrid) (ID: 901969dd-7034-47f0-b539-30dacd5e928d, Score: 0.8674)
- Senior Solutions Engineer (ID: 57cea150-f3f7-4238-95a5-1db5b6e940eb, Score: 0.8576)
- Software Engineer (ID: 1f58b149-ed8e-4fb2-96dd-d83d266f5310, Score: 0.8544)
- Software Engineer in Test (ID: 98ab83df-021a-4889-9af3-bd6664d725e2, Score: 0.8529)
- Software Engineer - ML Tools Support - 314915 (ID: a2998f35-d332-4fee-8170-c9c946fab8e0, Score: 0.8514)
- Staff Security Operations Engineer (ID: babf4bb0-7f3a-48e6-be4b-53e3934f9e0a, Score: 0.8420)
- Senior Mechanical Engineer (Rotating and Packaged Equipment) (ID: d84838c8-db39-4750-9021-64c5e2c14fe1, Score: 0.8391)
- Technology Delivery Lead (ID: 04a9e436-fbc3-4d2f-a543-a6474dc485f9, Score: 0.8354)
- Full Stack .NET Developer (ID: f6aeaba8-87f5-498c-919a-8d25d63d3d9e, Score: 0.8326)
- Senior QA/QC Technical Architect - Aviation & Federal (ID: f20e91c6-5042-44a7-a065-6a9454797a47,