#03_embedding_generation
- Sets up the embedding generator using SageMaker
- Loads job descriptions from JSON files stored in S3
- Extracts the strings to embed from each job description (mandatory, preferred, and responsibility sections)
- Generates embeddings for each chunk
- Uploads the embedded vectors to Pinecone
- Includes batch processing for efficiency

In [None]:
!pip install pinecone

In [2]:
!pip show pinecone

Name: pinecone
Version: 6.0.2
Summary: Pinecone client and SDK
Home-page: https://www.pinecone.io
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages
Requires: certifi, pinecone-plugin-interface, python-dateutil, typing-extensions, urllib3
Required-by: 


In [3]:
import boto3
import json
import os
import pandas as pd
import uuid
import sys
import numpy as np
import time
from typing import List, Dict, Any, Union 
from tqdm import tqdm

# Put the parent directory on the import path; the membership check avoids
# appending a duplicate entry when this cell is re-run.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)


In [4]:
# SageMaker endpoint configuration

SAGEMAKER_ENDPOINT_NAME = "e5-embeddings-pooled-2"
AWS_REGION = "us-east-1"

# Pinecone credentials. The key is read from the PINECONE_API_KEY environment
# variable so the secret does not have to be pasted into the notebook; it
# falls back to an empty string, which run_embedding_pipeline() treats as
# "not configured".
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_ENVIRONMENT = "us-east-1"  # matched with AWS region
PINECONE_INDEX_NAME = "mirra-embeddings"

# Length of every vector stored in the index; embeddings are padded or
# truncated to this size before upload.
EMBEDDING_DIMENSION = 1024

print(f"Embedding dimension: {EMBEDDING_DIMENSION}")

Embedding dimension: 1024


In [5]:
# Initialize Pinecone with credentials
from pinecone import Pinecone

def initialize_pinecone():
    """Connect to the configured Pinecone index, creating it if it is missing.

    Uses the module-level PINECONE_API_KEY, PINECONE_INDEX_NAME,
    PINECONE_ENVIRONMENT and EMBEDDING_DIMENSION settings.

    Returns:
        The Pinecone index handle, or None when initialization failed (the
        error is printed rather than raised).
    """
    try:
        # Initialize Pinecone client
        pc = Pinecone(api_key=PINECONE_API_KEY)

        # Check if the index exists
        existing_indexes = pc.list_indexes().names()
        print(f"Available Pinecone indexes: {existing_indexes}")

        if PINECONE_INDEX_NAME not in existing_indexes:
            print(f"Creating new index '{PINECONE_INDEX_NAME}'...")

            # The 6.x SDK's create_index() requires a `spec` and has no
            # `metadata_config` keyword, so the index is described with a
            # ServerlessSpec instead.
            # NOTE(review): assumes a serverless index on AWS in
            # PINECONE_ENVIRONMENT is what is wanted - confirm.
            from pinecone import ServerlessSpec

            pc.create_index(
                name=PINECONE_INDEX_NAME,
                dimension=EMBEDDING_DIMENSION,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region=PINECONE_ENVIRONMENT),
            )
            print(f"Index '{PINECONE_INDEX_NAME}' created successfully")

        # Connect to the index
        index = pc.Index(PINECONE_INDEX_NAME)
        print(f"Connected to Pinecone index: {PINECONE_INDEX_NAME}")

        return index

    except Exception as e:
        # Top-level boundary for the notebook: report and signal failure with
        # None so that later cells can check the handle before using it.
        print(f"Error initializing Pinecone: {str(e)}")
        print("Please check your API key and environment settings.")
        return None

# Connect once when this cell runs and keep the handle at module level.
pinecone_index = initialize_pinecone()

# Statistics are only requested when the connection succeeded.
if pinecone_index:
    index_stats = pinecone_index.describe_index_stats()
    # The stats object is printed as-is instead of being serialized.
    print("Index statistics:", index_stats, sep="\n")

Available Pinecone indexes: ['mirra-embeddings', 'mirra-filtering', 'mirra']
Connected to Pinecone index: mirra-embeddings
Index statistics:
{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 1614}},
 'total_vector_count': 1614,
 'vector_type': 'dense'}


In [6]:
class ResilientSageMakerEmbedder:
    """
    A wrapper around SageMaker embedding endpoints with resilience features.

    Texts are whitespace-normalized, prefixed with the E5 ``passage:`` marker,
    truncated, and sent to the endpoint in one request. By default any failure
    is printed and replaced by random unit vectors so callers always receive
    one vector per input; pass ``raise_on_error=True`` to propagate the
    failure instead (random vectors carry no semantic meaning).
    """

    def __init__(self, endpoint_name, max_text_length=512, region="us-east-1",
                 raise_on_error=False):
        """
        Initialize the embedder with a SageMaker endpoint.

        Args:
            endpoint_name: The name of the SageMaker endpoint
            max_text_length: Maximum number of characters (prefix included)
                kept from each text
            region: AWS region for the endpoint
            raise_on_error: When True, generate_embeddings() re-raises
                failures instead of returning random fallback vectors
        """
        import boto3

        self.sagemaker_runtime = boto3.client('sagemaker-runtime', region_name=region)
        self.endpoint_name = endpoint_name
        self.max_text_length = max_text_length
        self.raise_on_error = raise_on_error
        print(f"Initialized ResilientSageMakerEmbedder for endpoint: {endpoint_name}")

    def _prepare_text(self, text):
        """Clean and prepare text for the embedding model."""
        if not isinstance(text, str):
            text = str(text)

        # Collapse runs of whitespace (including newlines) to single spaces
        text = ' '.join(text.split())

        # Add E5 prefix unless the caller already supplied it
        if not text.startswith("passage:"):
            text = f"passage: {text}"

        # Truncation is by characters and happens after the prefix is added,
        # so the prefix counts toward max_text_length
        return text[:self.max_text_length]

    def _pool_to_fixed_dimension(self, emb):
        """Reduce one endpoint result to a flat list of EMBEDDING_DIMENSION floats."""
        emb_array = np.array(emb)

        # Token-level (multi-dimensional) output is averaged over the leading
        # axes until a single vector remains
        while emb_array.ndim > 1:
            emb_array = np.mean(emb_array, axis=0)

        # Truncate or zero-pad so the vector matches the index dimension
        size = emb_array.shape[0]
        if size > EMBEDDING_DIMENSION:
            emb_array = emb_array[:EMBEDDING_DIMENSION]
        elif size < EMBEDDING_DIMENSION:
            padded = np.zeros(EMBEDDING_DIMENSION)
            padded[:size] = emb_array
            emb_array = padded

        return emb_array.tolist()

    def generate_embeddings(self, texts):
        """Generate embeddings using SageMaker E5 endpoint.

        Args:
            texts: A single text or a list of texts

        Returns:
            A list with one embedding (list of floats) per endpoint result.
            On failure, and unless raise_on_error is set, one random unit
            vector per input text is returned instead.

        Raises:
            Exception: Whatever the request or parsing raised, but only when
                raise_on_error is True.
        """
        # Ensure texts is a list
        if not isinstance(texts, list):
            texts = [texts]

        # Nothing to embed: skip the endpoint call entirely
        if not texts:
            return []

        try:
            # Prepare input for E5 model
            prepared_texts = [self._prepare_text(text) for text in texts]

            # NOTE(review): whether the endpoint honours these pooling
            # parameters depends on its inference script - confirm. The
            # client-side pooling below covers the case where it does not.
            payload = {
                "inputs": prepared_texts,
                "parameters": {
                    "normalize": True,
                    "pooling": "mean",
                    "return_sentence_embedding": True
                }
            }

            # Call SageMaker endpoint
            response = self.sagemaker_runtime.invoke_endpoint(
                EndpointName=self.endpoint_name,
                ContentType='application/json',
                Body=json.dumps(payload)
            )

            # Parse response
            response_body = json.loads(response['Body'].read().decode('utf-8'))

            return [self._pool_to_fixed_dimension(emb) for emb in response_body]

        except Exception as e:
            print(f"Error generating embeddings: {str(e)}")
            if getattr(self, "raise_on_error", False):
                raise
            # Best-effort fallback: keep the pipeline moving with placeholder
            # vectors sized to match the index dimension
            print(f"WARNING: substituting {len(texts)} random fallback vector(s); "
                  f"they carry no semantic meaning")
            return [self._create_random_unit_vector(EMBEDDING_DIMENSION) for _ in texts]

    def _create_random_unit_vector(self, dim=1024):
        """Create a random unit vector for fallback"""
        import numpy as np
        vec = np.random.normal(0, 1, size=dim)
        return (vec / np.linalg.norm(vec)).tolist()

In [7]:
def process_jobs_and_upload(jobs, pinecone_index, batch_size=4):
    """
    Process jobs, generate embeddings, and upload to Pinecone

    Strings are extracted per job, de-duplicated across jobs, embedded in
    batches, and uploaded as one vector per (job, string position). Errors in
    any single job or batch are printed and skipped.

    Args:
        jobs: List of job dictionaries
        pinecone_index: Pinecone index instance
        batch_size: Batch size for processing

    Returns:
        Dictionary with the keys total_jobs, total_strings (unique strings),
        strings_embedded, vectors_uploaded and processing_time_seconds
    """
    import time

    if not jobs:
        print("Error: No job descriptions provided")
        return {
            "total_jobs": 0,
            "total_strings": 0,
            "strings_embedded": 0,
            "vectors_uploaded": 0,
            "processing_time_seconds": 0
        }

    if not pinecone_index:
        print("Error: No Pinecone index provided")
        return {
            "total_jobs": len(jobs),
            "total_strings": 0,
            "strings_embedded": 0,
            "vectors_uploaded": 0,
            "processing_time_seconds": 0
        }

    print(f"Processing {len(jobs)} job descriptions...")

    # Initialize the embedder
    embedder = ResilientSageMakerEmbedder(
        endpoint_name=SAGEMAKER_ENDPOINT_NAME,
        region=AWS_REGION
    )

    # Process all jobs and compute embeddings
    start_time = time.time()
    print(f"Extracting strings from {len(jobs)} jobs...")

    string_to_embedding = {}
    job_strings = {}

    for job in jobs:
        # Jobs without an id share the "unknown" key, so a later one replaces
        # an earlier one
        job_id = job.get("job_id", "unknown")
        try:
            strings = extract_all_strings_from_job(job)

            if strings:
                job_strings[job_id] = strings
                print(f"Extracted {len(strings)} strings from job {job_id}")
        except Exception as e:
            print(f"Error extracting strings from job {job_id}: {str(e)}")

    # Gather unique strings; a dict keeps first-seen order so batches are
    # reproducible between runs
    unique_strings = {}
    for strings in job_strings.values():
        for string_text in strings:
            unique_strings[string_text] = None

    all_strings_list = list(unique_strings)
    print(f"Total unique strings: {len(all_strings_list)}")

    # Process strings in batches (ceiling division gives the true batch count)
    total_batches = (len(all_strings_list) - 1) // batch_size + 1
    for start in range(0, len(all_strings_list), batch_size):
        batch = all_strings_list[start:start + batch_size]
        try:
            print(f"Processing batch {start // batch_size + 1}/{total_batches}...")
            batch_embeddings = embedder.generate_embeddings(batch)

            # Store embeddings
            for string, embedding in zip(batch, batch_embeddings):
                string_to_embedding[string] = embedding

        except Exception as e:
            print(f"Error processing batch: {str(e)}")

    # Upload to Pinecone
    vectors_uploaded = 0
    if string_to_embedding and pinecone_index:
        vectors_to_upload = []

        for job_id, strings in job_strings.items():
            for position, string_text in enumerate(strings):
                if string_text in string_to_embedding:
                    vectors_to_upload.append({
                        # The id encodes the job and the string's position in it
                        "id": f"{job_id}_string_{position}",
                        "values": string_to_embedding[string_text],
                        "metadata": {
                            "text": string_text,
                            "job_id": job_id,
                            "source_type": "job_description"
                        }
                    })

        # Upload in batches
        total_upload_batches = (len(vectors_to_upload) - 1) // batch_size + 1
        for start in range(0, len(vectors_to_upload), batch_size):
            batch = vectors_to_upload[start:start + batch_size]
            try:
                pinecone_index.upsert(vectors=batch)
                vectors_uploaded += len(batch)
                print(f"Uploaded batch {start // batch_size + 1}/{total_upload_batches}")
            except Exception as e:
                print(f"Error uploading batch to Pinecone: {str(e)}")

    total_time = time.time() - start_time

    # Return statistics
    return {
        "total_jobs": len(jobs),
        "total_strings": len(all_strings_list),
        "strings_embedded": len(string_to_embedding),
        "vectors_uploaded": vectors_uploaded,
        "processing_time_seconds": total_time
    }

In [8]:
def run_embedding_pipeline():
    """
    Complete pipeline to load jobs, generate embeddings, and upload to Pinecone

    Reuses the notebook-level `pinecone_index` and `sample_jobs` when they
    exist and are usable; otherwise it connects to Pinecone and loads up to
    25 jobs itself.

    Returns:
        The statistics dictionary from process_jobs_and_upload(), or False
        when configuration, the Pinecone connection, or job loading failed.
    """
    import time

    # Verify configuration
    if not PINECONE_API_KEY:
        print("ERROR: Pinecone API key is not set. Please set PINECONE_API_KEY in the configuration cell.")
        return False

    print("Configuring embedding pipeline...")
    print(f"Embedding dimension: {EMBEDDING_DIMENSION}")
    print(f"Using SageMaker endpoint: {SAGEMAKER_ENDPOINT_NAME}")
    print(f"Using Pinecone index: {PINECONE_INDEX_NAME}")

    # Look the handle up through globals() and keep it in a differently named
    # local: assigning to `pinecone_index` inside this function would make the
    # name local and turn the "already initialized?" check into an
    # UnboundLocalError.
    index = globals().get('pinecone_index')
    if index is None:
        print("Initializing Pinecone...")
        index = initialize_pinecone()
        if not index:
            print("ERROR: Failed to initialize Pinecone. Check your API key and index settings.")
            return False
    else:
        print("Using existing Pinecone connection")

    # Display index stats
    try:
        index_stats = index.describe_index_stats()
        print("Current index statistics:")
        print(f"  - Total vectors: {index_stats.get('total_vector_count', 0)}")
        print(f"  - Dimension: {index_stats.get('dimension', 0)}")
    except Exception as e:
        print(f"Warning: Could not retrieve index stats: {str(e)}")

    # Load jobs - use existing sample_jobs if available
    jobs = globals().get('sample_jobs')
    if jobs:
        print(f"Using {len(jobs)} previously loaded jobs")
    else:
        print("Loading new job samples...")
        jobs = load_job_descriptions(limit=25)
        if not jobs:
            print("ERROR: No jobs loaded. Cannot proceed.")
            return False

    # Process jobs and upload
    start_time = time.time()
    stats = process_jobs_and_upload(jobs, index, batch_size=4)
    total_time = time.time() - start_time

    # Print stats
    print("\nProcessing complete!")
    print(f"Processed {stats['total_jobs']} jobs")
    print(f"Total strings extracted: {stats['total_strings']}")
    print(f"Strings embedded: {stats['strings_embedded']}")
    print(f"Vectors uploaded to Pinecone: {stats['vectors_uploaded']}")
    print(f"Total processing time: {total_time:.2f} seconds")

    # Verify final index state
    try:
        final_stats = index.describe_index_stats()
        print("Final index statistics:")
        print(f"  - Total vectors: {final_stats.get('total_vector_count', 0)}")
    except Exception as e:
        print(f"Warning: Could not retrieve final index stats: {str(e)}")

    return stats

In [9]:
def load_job_descriptions(limit=100, bucket="mirra-matcher-32", prefix="data/processed/jobs/jd2-"):
    """
    Load job descriptions from S3.

    Objects under the prefix are listed page by page and every key ending in
    ".json" is downloaded and parsed until `limit` jobs have been loaded. An
    object that cannot be downloaded or parsed is reported and skipped.

    Args:
        limit: Maximum number of jobs to load
        bucket: S3 bucket name
        prefix: S3 prefix for job files

    Returns:
        List of job description dictionaries (empty when nothing could be
        loaded; errors are printed rather than raised)
    """
    jobs = []

    # Load from S3
    s3_client = boto3.client('s3')

    try:
        print(f"Loading job descriptions from S3: s3://{bucket}/{prefix}")

        # A paginator is used so `limit` counts loaded jobs rather than listed
        # keys, and so prefixes with more keys than one listing page work.
        paginator = s3_client.get_paginator('list_objects_v2')
        found_any_object = False

        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            for obj in page.get('Contents', []):
                found_any_object = True
                if len(jobs) >= limit:
                    break

                key = obj['Key']
                if not key.endswith(".json"):
                    continue

                try:
                    s3_object = s3_client.get_object(Bucket=bucket, Key=key)
                    job_data = json.loads(s3_object['Body'].read().decode('utf-8'))
                    # Add job_id if not present (derived from the file name)
                    if "job_id" not in job_data:
                        job_data["job_id"] = key.split("/")[-1].replace(".json", "")
                except Exception as e:
                    # One unreadable file must not abort the whole load
                    print(f"Error loading job file {key}: {str(e)}")
                    continue

                jobs.append(job_data)
                print(f"Loaded job: {job_data.get('job_id')}")

            if len(jobs) >= limit:
                break

        if not found_any_object:
            print(f"No job files found in S3 bucket {bucket}/{prefix}")
    except Exception as e:
        print(f"Error loading files from S3: {str(e)}")

    print(f"Loaded {len(jobs)} job descriptions from S3")
    return jobs

In [21]:
# Load up to 100 job descriptions from the default S3 bucket and prefix; the
# preview cell and run_embedding_pipeline() read this module-level list.
sample_jobs = load_job_descriptions(limit=100)

Loading job descriptions from S3: s3://mirra-matcher-32/data/processed/jobs/jd2-
Loaded job: 41e06724-0b8d-41f8-8c6e-40b3cf68f03e
Loaded job: ba800cce-6e7e-46c4-870e-ab6bf5a29598
Loaded 2 job descriptions from S3


In [22]:
def safe_preview_job(job):
    """
    Safely preview a job description with fallback values.

    The title and company are read from job["details"]. Each may be stored as
    a list (the first element is used) or as a plain string (used as-is);
    anything missing, empty, or of another shape becomes "Unknown".

    Args:
        job (dict): Job description dictionary

    Returns:
        dict: Preview with the keys job_id, job_title and company_name
    """
    details = job.get("details")
    if not isinstance(details, dict):
        # Covers both a missing section and an explicit null
        details = {}

    def first_value(field):
        value = details.get(field)
        if isinstance(value, str):
            # A bare string must not be indexed, or only its first character
            # would be returned
            return value or "Unknown"
        if isinstance(value, (list, tuple)) and value:
            return value[0]
        return "Unknown"

    return {
        "job_id": job.get("job_id", "Unknown"),
        "job_title": first_value("job_title"),
        "company_name": first_value("company_name"),
    }

# Show a compact preview of the first loaded job, when any were loaded.
if sample_jobs:
    try:
        # Build the preview first so the header is only printed on success.
        job_preview = safe_preview_job(sample_jobs[0])
        header = "\nPreview of first job:"
        print(header)
        rendered = json.dumps(job_preview, indent=2)
        print(rendered)
    except Exception as exc:
        print(f"Error previewing job: {exc}")


Preview of first job:
{
  "job_id": "41e06724-0b8d-41f8-8c6e-40b3cf68f03e",
  "job_title": "SAP Master Data Governance Consultant",
  "company_name": "Unknown"
}


In [None]:
'''# Display a preview of the first job
if sample_jobs:
    print("\nPreview of first job:")
    job_preview = {
        "job_id": sample_jobs[0].get("job_id"),
        "job_title": sample_jobs[0].get("details", {}).get("job_title", ["Unknown"])[0],
        "company_name": sample_jobs[0].get("details", {}).get("company_name", ["Unknown"])[0]
    }
    print(json.dumps(job_preview, indent=2))'''

In [23]:
def extract_all_strings_from_job(job_data):
    """
    Extract all relevant strings from a job description for embedding.

    Only the 'mandatory', 'preferred' and 'responsibility' sections are read;
    'details' and every other key are ignored. Within a section, each
    subsection that is a list of dicts contributes the string values of those
    dicts, plus the strings inside any list values. Strings are stripped and
    blank ones dropped; duplicates are kept, in encounter order. Sections,
    subsections, or items that do not have the expected shape are skipped
    instead of raising.

    Args:
        job_data: Job description dictionary

    Returns:
        List of strings to be embedded
    """
    result = []

    if not isinstance(job_data, dict):
        return result

    def add_if_text(candidate):
        # Keep only non-blank strings, stripped of surrounding whitespace
        if isinstance(candidate, str):
            cleaned = candidate.strip()
            if cleaned:
                result.append(cleaned)

    sections_to_process = ('mandatory', 'preferred', 'responsibility')

    for section in sections_to_process:
        section_data = job_data.get(section)
        if not isinstance(section_data, dict):
            # Missing or null section: nothing to extract from it
            continue

        for content in section_data.values():
            if not isinstance(content, list):
                continue

            for item in content:
                if not isinstance(item, dict):
                    continue

                for value in item.values():
                    if isinstance(value, list):
                        for string_item in value:
                            add_if_text(string_item)
                    else:
                        add_if_text(value)

    return result

In [24]:
def ensure_correct_vector_format(vector_data):
    """
    Ensures that the vector values are in the correct format for Pinecone:
    - Must be a flat list of float values, not nested lists

    NumPy arrays of any rank are flattened and converted to a list, tuples are
    converted to lists, and a list whose first element is itself a sequence is
    flattened. Any other input is returned unchanged.

    Args:
        vector_data: The vector data to check/fix

    Returns:
        Properly formatted vector data
    """
    import numpy as np

    # Arrays first: a multi-dimensional array would otherwise come back from
    # tolist() as nested lists
    if isinstance(vector_data, np.ndarray):
        if vector_data.ndim > 1:
            print("WARNING: Found nested list in vector data, flattening to 1D array")
        return vector_data.flatten().tolist()

    if isinstance(vector_data, tuple):
        vector_data = list(vector_data)

    # Only the first element is inspected to decide whether the list is nested
    if (isinstance(vector_data, list) and vector_data
            and isinstance(vector_data[0], (list, tuple, np.ndarray))):
        print("WARNING: Found nested list in vector data, flattening to 1D array")
        return np.array(vector_data).flatten().tolist()

    return vector_data

In [25]:
def process_and_upload_job_embeddings(jobs, pinecone_index, batch_size=4, max_retries=3):
    """
    Function to process jobs, generate embeddings, and upload to Pinecone

    Runs in three phases: extract strings from every job, embed the unique
    strings in shuffled batches, then upload one vector per (job, string
    position) after a single-vector test upload. Embedding and upload batches
    are retried with exponential backoff (1s, 2s, 4s, ...).

    Args:
        jobs: List of job description dictionaries
        pinecone_index: Pinecone index instance
        batch_size: Batch size for processing and uploading
        max_retries: Maximum number of retries for failed operations

    Returns:
        Dictionary with processing statistics. "status" is "error" (with a
        "reason") when inputs are missing or the test upload fails, and
        "success" otherwise. Every result carries "vectors_uploaded" and
        "processing_time" so callers can report them unconditionally.
    """
    import time
    import random
    import uuid

    # Validate inputs. The error results carry zeroed counters so that code
    # reading stats['processing_time'] / stats['vectors_uploaded'] still works.
    if not jobs:
        print("Error: No job descriptions provided")
        return {
            "status": "error",
            "reason": "no_jobs",
            "total_jobs": 0,
            "total_strings": 0,
            "strings_embedded": 0,
            "vectors_prepared": 0,
            "vectors_uploaded": 0,
            "processing_time": 0.0,
        }

    if not pinecone_index:
        print("Error: No Pinecone index provided")
        return {
            "status": "error",
            "reason": "no_index",
            "total_jobs": len(jobs),
            "total_strings": 0,
            "strings_embedded": 0,
            "vectors_prepared": 0,
            "vectors_uploaded": 0,
            "processing_time": 0.0,
        }

    # Initialize metrics
    start_time = time.time()
    all_strings = set()
    job_strings = {}
    string_to_embedding = {}
    vectors_to_upload = []
    vectors_uploaded = 0

    # Initialize the embedder
    embedder = ResilientSageMakerEmbedder(
        endpoint_name=SAGEMAKER_ENDPOINT_NAME,
        region=AWS_REGION
    )

    print(f"Processing {len(jobs)} job descriptions...")

    # Phase 1: Extract all strings from jobs
    for job in jobs:
        # Jobs without an id get a random one so their vectors do not collide
        job_id = job["job_id"] if "job_id" in job else str(uuid.uuid4())
        try:
            strings = extract_all_strings_from_job(job)
            if strings:
                job_strings[job_id] = strings
                all_strings.update(strings)
                print(f"Extracted {len(strings)} strings from job {job_id}")
        except Exception as e:
            print(f"Error extracting strings from job {job_id}: {str(e)}")

    all_strings_list = list(all_strings)
    print(f"Total unique strings: {len(all_strings_list)}")

    # Phase 2: Generate embeddings for all strings
    if all_strings_list:
        # Shuffle to avoid batching similar strings together
        random.shuffle(all_strings_list)

        total_batches = (len(all_strings_list) - 1) // batch_size + 1

        for start in range(0, len(all_strings_list), batch_size):
            batch = all_strings_list[start:start + batch_size]
            batch_num = start // batch_size + 1

            # Progress reporting
            print(f"Processing batch {batch_num}/{total_batches} "
                  f"({batch_num/total_batches*100:.1f}%)...")

            # NOTE: this retry loop only fires for exceptions that escape
            # generate_embeddings(); errors that the embedder handles
            # internally never reach it.
            for retry in range(max_retries + 1):
                try:
                    batch_embeddings = embedder.generate_embeddings(batch)

                    for text, embedding in zip(batch, batch_embeddings):
                        string_to_embedding[text] = embedding

                    break  # Success, exit retry loop

                except Exception as e:
                    if retry < max_retries:
                        sleep_time = 2 ** retry
                        print(f"Error in batch {batch_num}, retrying in {sleep_time}s: {str(e)}")
                        time.sleep(sleep_time)
                    else:
                        print(f"Failed after {max_retries} retries for batch {batch_num}: {str(e)}")

    # Phase 3: Prepare and upload vectors
    if string_to_embedding:
        for job_id, strings in job_strings.items():
            for position, text in enumerate(strings):
                if text not in string_to_embedding:
                    continue  # its embedding batch failed

                vectors_to_upload.append({
                    "id": f"{job_id}_string_{position}",
                    # Flattened once here, so no second pass is needed at
                    # upload time
                    "values": ensure_correct_vector_format(string_to_embedding[text]),
                    "metadata": {
                        # Stored text is capped at 1000 characters
                        "text": text[:1000],
                        "job_id": job_id,
                        "source_type": "job_description"
                    }
                })

        print(f"Prepared {len(vectors_to_upload)} vectors for upload")

        # Test a single vector first so a format or credential problem is
        # caught before the batch uploads and their retries start
        if vectors_to_upload:
            try:
                test_vector = vectors_to_upload[0].copy()
                print(f"Testing upload with vector ID: {test_vector['id']} "
                      f"(dimension: {len(test_vector['values'])})")
                pinecone_index.upsert(vectors=[test_vector])
                print("Single vector test successful")
            except Exception as e:
                print(f"Single vector test failed: {str(e)}")
                return {
                    "status": "error",
                    "reason": "upload_test_failed",
                    "strings_extracted": len(all_strings_list),
                    "strings_embedded": len(string_to_embedding),
                    "vectors_prepared": len(vectors_to_upload),
                    "vectors_uploaded": 0,
                    "processing_time": time.time() - start_time
                }

        # Imported here so the validation paths above do not require tqdm
        from tqdm import tqdm

        # Upload in batches with progress bar
        with tqdm(total=len(vectors_to_upload), desc="Uploading to Pinecone") as pbar:
            for start in range(0, len(vectors_to_upload), batch_size):
                batch = vectors_to_upload[start:start + batch_size]

                # Try upload with retries
                for retry in range(max_retries + 1):
                    try:
                        pinecone_index.upsert(vectors=batch)
                        vectors_uploaded += len(batch)
                        pbar.update(len(batch))
                        break  # Success, exit retry loop

                    except Exception as e:
                        if retry < max_retries:
                            sleep_time = 2 ** retry
                            print(f"Upload error, retrying in {sleep_time}s: {str(e)}")
                            time.sleep(sleep_time)
                        else:
                            print(f"Failed to upload batch after {max_retries} retries: {str(e)}")

    # Return comprehensive statistics
    total_time = time.time() - start_time

    return {
        "status": "success",
        "total_jobs": len(jobs),
        "total_strings": len(all_strings_list),
        "strings_embedded": len(string_to_embedding),
        "vectors_prepared": len(vectors_to_upload),
        "vectors_uploaded": vectors_uploaded,
        "processing_time": total_time,
        "strings_per_job_avg": sum(len(strings) for strings in job_strings.values()) / len(job_strings) if job_strings else 0
    }

In [None]:
# Load jobs
jobs = load_job_descriptions(limit=51)

# Initialize Pinecone
pinecone_index = initialize_pinecone()

# Process and upload in one step
stats = process_and_upload_job_embeddings(jobs, pinecone_index, batch_size=4)

# Print results. An error result may not carry every counter, so check the
# status first and read the remaining values defensively.
if stats.get("status") == "error":
    print(f"Processing failed: {stats.get('reason', 'unknown')}")
else:
    print(f"Processing complete in {stats.get('processing_time', 0.0):.2f} seconds")
    print(f"Uploaded {stats.get('vectors_uploaded', 0)} vectors to Pinecone")

In [16]:
def get_uploaded_job_ids(pinecone_index):
    """
    Query Pinecone to get all job IDs that have been uploaded

    A single similarity query with a fixed vector is issued and the `job_id`
    metadata of the returned matches is collected. A query returns at most
    `top_k` matches and cannot be paged, so when the index holds more vectors
    than were returned the result may be incomplete; a warning is printed in
    that case.

    Args:
        pinecone_index: Initialized Pinecone index instance

    Returns:
        Sorted list of unique job IDs found (empty on error or empty index)
    """
    try:
        # Get index stats first to see if there are any vectors
        stats = pinecone_index.describe_index_stats()
        total_vectors = stats.get('total_vector_count', 0)

        if total_vectors == 0:
            print("No vectors found in the index")
            return []

        print(f"Found {total_vectors} total vectors in the index")

        # The query vector is only a vehicle for retrieving metadata.
        # NOTE(review): an all-zero vector has no defined cosine similarity -
        # confirm the service accepts it for this index.
        import numpy as np
        query_vector = np.zeros(EMBEDDING_DIMENSION).tolist()

        # Repeating the same query would return the same matches, so it is
        # issued exactly once.
        top_k = 1000
        results = pinecone_index.query(
            vector=query_vector,
            top_k=top_k,
            include_metadata=True
        )
        matches = results.get('matches', [])

        # Extract job_ids from metadata
        job_ids = set()
        for match in matches:
            metadata = match.get('metadata', {})
            if 'job_id' in metadata:
                job_ids.add(metadata['job_id'])

        print(f"Fetched {len(matches)} vectors, found {len(job_ids)} unique job IDs")
        if len(matches) < total_vectors:
            print(f"Warning: only {len(matches)} of {total_vectors} vectors were inspected; "
                  f"the job ID list may be incomplete")

        return sorted(job_ids)

    except Exception as e:
        print(f"Error querying Pinecone: {str(e)}")
        return []



In [27]:
def job_id_retrieval(pinecone_index, max_top_k=10000):
    """
    Collect job identifiers from vector metadata across all namespaces.

    For every namespace reported by the index statistics, one query per
    candidate metadata key is issued, filtered to vectors where that key
    exists. The values of all candidate keys on the returned matches are
    collected.

    Args:
        pinecone_index: Initialized Pinecone index instance
        max_top_k: Upper bound for the number of matches requested per query

    Returns:
        Sorted list of unique identifiers found (empty for an empty index)
    """
    job_ids = set()
    stats = pinecone_index.describe_index_stats()
    total_vectors = stats.get('total_vector_count', 0)

    # A query needs top_k >= 1, so an empty index is answered directly
    if not total_vectors:
        print("No vectors found in the index")
        return []

    top_k = min(total_vectors, max_top_k)
    if top_k < total_vectors:
        print(f"Warning: requesting at most {top_k} of {total_vectors} vectors per query; "
              f"results may be incomplete")

    # Try different metadata key variations
    potential_keys = ['job_id', 'jobId', 'id', 'job_identifier']
    query_vector = [0] * EMBEDDING_DIMENSION

    for namespace in stats.get('namespaces', {}):
        print(f"Checking namespace: {namespace}")
        for key in potential_keys:
            # Each key is queried independently so one failing query does not
            # skip the remaining keys of this namespace
            try:
                namespace_vectors = pinecone_index.query(
                    vector=query_vector,
                    top_k=top_k,
                    namespace=namespace,
                    filter={key: {'$exists': True}},
                    include_metadata=True
                )
            except Exception as e:
                print(f"Error in namespace {namespace}: {e}")
                continue

            for match in namespace_vectors.get('matches', []):
                metadata = match.get('metadata', {})
                for potential_key in potential_keys:
                    if potential_key in metadata:
                        job_ids.add(metadata[potential_key])

    return sorted(job_ids)

In [None]:
# Report every job ID currently stored in Pinecone, numbered from 1.
if not pinecone_index:
    print("Pinecone index not initialized")
else:
    print("Querying Pinecone for uploaded job IDs...")
    job_ids = job_id_retrieval(pinecone_index)

    if not job_ids:
        print("No job IDs found in Pinecone")
    else:
        print(f"\nFound {len(job_ids)} unique job IDs in Pinecone:")
        for i, job_id in enumerate(job_ids):
            ordinal = i + 1
            print(f"{ordinal}. {job_id}")