# 03_embedding_generation
- Sets up the embedding generator using SageMaker
- Extracts and embeds strings from job description jsons
- Uploads the embedded vectors to Pinecone
- Includes batch processing for efficiency

In [None]:
!pip install pinecone

In [None]:
!pip show pinecone

In [3]:
import boto3
import json
import os
import pandas as pd
import uuid
import sys
import numpy as np
import time
from typing import List, Dict, Any, Union 
from tqdm import tqdm

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)


In [None]:
# SageMaker endpoint configuration

SAGEMAKER_ENDPOINT_NAME = "e5-embeddings-huggingface"
AWS_REGION = "us-east-1"

# Pinecone credentials.
# SECURITY: the API key must never live in the notebook source. It is read from
# the PINECONE_API_KEY environment variable instead. Any key that was previously
# committed in this cell should be treated as leaked and rotated.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_ENVIRONMENT = "us-east-1"  # matched with AWS region
PINECONE_INDEX_NAME = "sample-100"

EMBEDDING_DIMENSION = 1024

if not PINECONE_API_KEY:
    print("WARNING: PINECONE_API_KEY environment variable is not set.")
print(f"Embedding dimension: {EMBEDDING_DIMENSION}")

In [None]:
# Initialize Pinecone with credentials
from pinecone import Pinecone

def initialize_pinecone():
    """
    Connect to the configured Pinecone index, creating it first if it is missing.

    Reads PINECONE_API_KEY, PINECONE_INDEX_NAME, PINECONE_ENVIRONMENT and
    EMBEDDING_DIMENSION from the notebook configuration cell.

    Returns:
        The Pinecone index handle, or None when initialization fails for any
        reason (the error is printed rather than raised).
    """
    try:
        # Initialize Pinecone client
        pc = Pinecone(api_key=PINECONE_API_KEY)

        # Check if the index exists
        existing_indexes = pc.list_indexes().names()
        print(f"Available Pinecone indexes: {existing_indexes}")

        if PINECONE_INDEX_NAME not in existing_indexes:
            print(f"Creating new index '{PINECONE_INDEX_NAME}'...")

            # The `Pinecone` client class requires a `spec` describing where the
            # index lives and does not accept a top-level `metadata_config`
            # keyword, so the previous call could never create the index.
            # NOTE(review): a serverless AWS index in PINECONE_ENVIRONMENT is
            # assumed from the "matched with AWS region" config comment, and the
            # selective metadata indexing list is dropped — confirm both.
            from pinecone import ServerlessSpec

            pc.create_index(
                name=PINECONE_INDEX_NAME,
                dimension=EMBEDDING_DIMENSION,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region=PINECONE_ENVIRONMENT),
            )
            print(f"Index '{PINECONE_INDEX_NAME}' created successfully")

        # Connect to the index
        index = pc.Index(PINECONE_INDEX_NAME)
        print(f"Connected to Pinecone index: {PINECONE_INDEX_NAME}")

        return index

    except Exception as e:
        print(f"Error initializing Pinecone: {str(e)}")
        print("Please check your API key and environment settings.")
        # Return None to indicate initialization failed
        return None

# Connect once at notebook level so that later cells can reuse the handle
pinecone_index = initialize_pinecone()

if pinecone_index:
    # Fetch the current index statistics and print the object as-is,
    # which sidesteps any serialization issues
    index_stats = pinecone_index.describe_index_stats()
    print("Index statistics:")
    print(index_stats)

In [6]:
class ResilientSageMakerEmbedder:
    """
    A wrapper around a SageMaker embedding endpoint for E5-style inputs.

    Texts are whitespace-normalized, prefixed with "passage:" and truncated to
    `max_text_length` characters before being sent to the endpoint.

    Endpoint failures are re-raised by default so callers can retry the batch.
    Returning random vectors instead would store meaningless embeddings as if
    they were real; that legacy behavior is available only by passing
    `fallback_to_random=True`.
    """

    def __init__(self, endpoint_name, max_text_length=512, region="us-east-1",
                 fallback_to_random=False):
        """
        Initialize the embedder with a SageMaker endpoint.

        Args:
            endpoint_name: The name of the SageMaker endpoint
            max_text_length: Maximum text length (in characters) to truncate to
            region: AWS region for the endpoint
            fallback_to_random: When True, a failed request returns random unit
                vectors instead of raising. Off by default.
        """
        import boto3

        self.sagemaker_runtime = boto3.client('sagemaker-runtime', region_name=region)
        self.endpoint_name = endpoint_name
        self.max_text_length = max_text_length
        self.fallback_to_random = fallback_to_random
        print(f"Initialized ResilientSageMakerEmbedder for endpoint: {endpoint_name}")

    def _prepare_text(self, text):
        """Clean and prepare text for the embedding model."""
        if not isinstance(text, str):
            text = str(text)

        # Remove excessive whitespace
        text = ' '.join(text.split())

        # Add E5 prefix
        if not text.startswith("passage:"):
            text = f"passage: {text}"

        # Truncate if needed (character count, prefix included)
        if len(text) > self.max_text_length:
            text = text[:self.max_text_length]

        return text

    def generate_embeddings(self, texts):
        """
        Generate embeddings using the SageMaker E5 endpoint.

        Args:
            texts: A single text or a list of texts.

        Returns:
            A list with one embedding (list of floats) per response item, each
            coerced to EMBEDDING_DIMENSION values. An empty input list returns
            an empty list without calling the endpoint.

        Raises:
            Exception: whatever the request or response parsing raised, unless
                `fallback_to_random` is enabled.
        """
        # Ensure texts is a list
        if not isinstance(texts, list):
            texts = [texts]

        # Nothing to embed: skip the network round trip entirely
        if not texts:
            return []

        try:
            # Prepare input for E5 model
            prepared_texts = [self._prepare_text(text) for text in texts]

            # Prepare payload with explicit pooling parameters
            payload = {
                "inputs": prepared_texts,
                "parameters": {
                    "normalize": True,
                    "pooling": "mean",
                    "return_sentence_embedding": True
                }
            }

            response = self.sagemaker_runtime.invoke_endpoint(
                EndpointName=self.endpoint_name,
                ContentType='application/json',
                Body=json.dumps(payload)
            )

            response_body = json.loads(response['Body'].read().decode('utf-8'))

            embeddings = []
            for emb in response_body:
                emb_array = np.array(emb)

                # Collapse any extra leading axes by averaging until 1-D
                while emb_array.ndim > 1:
                    emb_array = np.mean(emb_array, axis=0)

                # Force the vector to the configured dimension: truncate when
                # too long, zero-pad when too short
                if emb_array.shape[0] > EMBEDDING_DIMENSION:
                    emb_array = emb_array[:EMBEDDING_DIMENSION]
                elif emb_array.shape[0] < EMBEDDING_DIMENSION:
                    padded = np.zeros(EMBEDDING_DIMENSION)
                    padded[:emb_array.shape[0]] = emb_array
                    emb_array = padded

                embeddings.append(emb_array.tolist())

            return embeddings

        except Exception as e:
            print(f"Error generating embeddings: {str(e)}")
            if self.fallback_to_random:
                return [self._create_random_unit_vector() for _ in texts]
            # Surface the failure so the caller's retry logic can act on it
            raise

    def _create_random_unit_vector(self, dim=1024):
        """Create a random unit vector (used only by the opt-in fallback)."""
        vec = np.random.normal(0, 1, size=dim)
        return (vec / np.linalg.norm(vec)).tolist()

In [7]:
def run_embedding_pipeline():
    """
    Complete pipeline to load jobs, generate embeddings, and upload to Pinecone.

    Reuses the notebook-level `pinecone_index` and `sample_jobs` when they
    exist, otherwise initializes / loads them.

    Returns:
        The stats dictionary returned by the upload step, or False when the
        configuration is incomplete, Pinecone cannot be initialized, or no jobs
        could be loaded.
    """
    import time

    # Verify configuration
    if not PINECONE_API_KEY:
        print("ERROR: Pinecone API key is not set. Please set PINECONE_API_KEY in the configuration cell.")
        return False

    print("Configuring embedding pipeline...")
    print(f"Embedding dimension: {EMBEDDING_DIMENSION}")
    print(f"Using SageMaker endpoint: {SAGEMAKER_ENDPOINT_NAME}")
    print(f"Using Pinecone index: {PINECONE_INDEX_NAME}")

    # Read the notebook-level handle through globals() and keep it in a
    # differently named local. Assigning to `pinecone_index` here would make
    # that name local to the function and the existence check would raise
    # UnboundLocalError.
    index = globals().get('pinecone_index')
    if index is None:
        print("Initializing Pinecone...")
        index = initialize_pinecone()
        if not index:
            print("ERROR: Failed to initialize Pinecone. Check your API key and index settings.")
            return False
    else:
        print("Using existing Pinecone connection")

    # Display index stats
    try:
        index_stats = index.describe_index_stats()
        print("Current index statistics:")
        print(f"  - Total vectors: {index_stats.get('total_vector_count', 0)}")
        print(f"  - Dimension: {index_stats.get('dimension', 0)}")
    except Exception as e:
        print(f"Warning: Could not retrieve index stats: {str(e)}")

    # Load jobs - use existing sample_jobs if available
    jobs = globals().get('sample_jobs')
    if jobs:
        print(f"Using {len(jobs)} previously loaded jobs")
    else:
        print("Loading new job samples...")
        jobs = load_job_descriptions(limit=101)
        if not jobs:
            print("ERROR: No jobs loaded. Cannot proceed.")
            return False

    # Process jobs and upload
    # NOTE(review): `process_jobs_and_upload` is not defined in the visible part
    # of this notebook, and the keys printed below ('total_strings') do not
    # match what `process_and_vectorize_strings` returns — confirm where this
    # helper comes from.
    start_time = time.time()
    stats = process_jobs_and_upload(jobs, index, batch_size=64)
    total_time = time.time() - start_time

    # Print stats
    print("\nProcessing complete!")
    print(f"Processed {stats['total_jobs']} jobs")
    print(f"Total strings extracted: {stats['total_strings']}")
    print(f"Strings embedded: {stats['strings_embedded']}")
    print(f"Vectors uploaded to Pinecone: {stats['vectors_uploaded']}")
    print(f"Total processing time: {total_time:.2f} seconds")

    # Verify final index state
    try:
        final_stats = index.describe_index_stats()
        print("Final index statistics:")
        print(f"  - Total vectors: {final_stats.get('total_vector_count', 0)}")
    except Exception as e:
        print(f"Warning: Could not retrieve final index stats: {str(e)}")

    return stats

In [8]:
def load_job_descriptions(limit=37310, skip=0, bucket="mirra-matcher-325", prefix="data/processed/jobs/"):
    """
    Load job descriptions from S3 with pagination support and skip capability.

    Only keys ending in ".json" are treated as jobs; other keys under the
    prefix are ignored and do not count toward `skip`.

    Args:
        limit: Maximum number of jobs to load
        skip: Number of jobs (".json" keys) to skip before starting to load
        bucket: S3 bucket name
        prefix: S3 prefix path

    Returns:
        List of job description dictionaries. Errors are printed, not raised;
        whatever was loaded before a listing error is still returned.
    """
    jobs = []
    s3_client = boto3.client('s3')

    try:
        print(f"Loading job descriptions from S3: s3://{bucket}/{prefix}")
        print(f"Skipping first {skip} jobs, loading up to {limit} jobs")

        # Initialize pagination parameters
        continuation_token = None
        has_more = True
        jobs_seen = 0

        while has_more and len(jobs) < limit:
            # Prepare request parameters
            list_params = {
                'Bucket': bucket,
                'Prefix': prefix,
                'MaxKeys': 1000  # Request 1000 keys per request for efficiency
            }

            # Add continuation token if we have one
            if continuation_token:
                list_params['ContinuationToken'] = continuation_token

            # Keep the listing response in its own variable: the pagination
            # fields below must come from this response, not from a get_object
            # response issued while iterating the page.
            listing = s3_client.list_objects_v2(**list_params)

            for obj in listing.get('Contents', []):
                key = obj['Key']
                if not key.endswith(".json"):
                    continue

                # Skip jobs until we reach our starting point
                jobs_seen += 1
                if jobs_seen <= skip:
                    continue

                try:
                    obj_response = s3_client.get_object(Bucket=bucket, Key=key)
                    job_data = json.loads(obj_response['Body'].read().decode('utf-8'))
                    # Add job_id if not present (derived from the file name)
                    if "job_id" not in job_data:
                        job_data["job_id"] = key.split("/")[-1].replace(".json", "")
                    jobs.append(job_data)
                    print(f"Loaded job: {job_data.get('job_id')}")
                except Exception as e:
                    print(f"Error loading job from {key}: {str(e)}")
                    continue

                # Stop if we've reached the limit
                if len(jobs) >= limit:
                    break

            # Check if there are more items to fetch
            has_more = listing.get('IsTruncated', False)
            continuation_token = listing.get('NextContinuationToken')

    except Exception as e:
        print(f"Error loading files from S3: {str(e)}")

    print(f"Loaded {len(jobs)} job descriptions from S3")
    return jobs

In [None]:
# Load the job corpus once into a notebook-level variable; run_embedding_pipeline
# looks for `sample_jobs` in globals() and reuses it when present.
sample_jobs = load_job_descriptions(limit=37310)

In [10]:
def safe_preview_job(job):
    """
    Safely preview a job description with fallback values.

    Fields under "details" are expected to be lists whose first element is the
    value, but a plain string is accepted too. A missing or empty field, a
    field of any other type, or a "details" entry that is not a dictionary all
    fall back to "Unknown".

    Args:
        job (dict): Job description dictionary

    Returns:
        dict: Preview with "job_id", "job_title" and "company_name"
    """
    details = job.get("details")
    if not isinstance(details, dict):
        # Covers both a missing key and an explicit null in the JSON
        details = {}

    def first_value(field):
        value = details.get(field)
        if not value:
            return "Unknown"
        if isinstance(value, str):
            # Indexing a string would return only its first character
            return value
        if isinstance(value, (list, tuple)):
            return value[0]
        return "Unknown"

    return {
        "job_id": job.get("job_id", "Unknown"),
        "job_title": first_value("job_title"),
        "company_name": first_value("company_name"),
    }

# Show a compact preview of the first loaded job, if any were loaded
if sample_jobs:
    try:
        job_preview = safe_preview_job(sample_jobs[0])
        print("\nPreview of first job:")
        print(json.dumps(job_preview, indent=2))
    except Exception as exc:
        # Previewing is best-effort: report the problem and carry on
        print(f"Error previewing job: {exc}")


Preview of first job:
{
  "job_id": "00009cf2-7c46-4365-aa06-f3d142ea4819",
  "job_title": "Information Technology Manager (MSO)",
  "company_name": "AC Wellness Network LLC"
}


In [11]:
def extract_job_strings(job_data):
    """
    Collect every string found in a job description, for embedding.

    The structure is walked depth-first. String dictionary keys are emitted
    as-is, non-blank string values are emitted stripped, and anything that is
    not a dict, list or string is ignored. No prefix is added to any string.

    Args:
        job_data: Job description dictionary

    Returns:
        List of unique strings in first-seen order
    """
    def walk(node):
        if isinstance(node, dict):
            for name, child in node.items():
                if isinstance(name, str):
                    yield name
                yield from walk(child)
        elif isinstance(node, list):
            for child in node:
                yield from walk(child)
        elif isinstance(node, str):
            trimmed = node.strip()
            if trimmed:
                yield trimmed

    # dict.fromkeys drops duplicates while keeping first-seen order
    return list(dict.fromkeys(walk(job_data)))

In [12]:
def ensure_correct_vector_format(vector_data):
    """
    Normalize vector values into the shape Pinecone expects.

    - A list whose first element is itself a list is flattened to one flat list
      (a warning is printed).
    - A NumPy array is converted with ``tolist()``.
    - Anything else is returned unchanged.

    Args:
        vector_data: The vector data to check/fix

    Returns:
        Properly formatted vector data
    """
    import numpy as np

    if isinstance(vector_data, np.ndarray):
        return vector_data.tolist()

    looks_nested = (
        isinstance(vector_data, list)
        and len(vector_data) > 0
        and isinstance(vector_data[0], list)
    )
    if not looks_nested:
        return vector_data

    print("WARNING: Found nested list in vector data, flattening to 1D array")
    return np.array(vector_data).flatten().tolist()

In [13]:
def process_and_vectorize_strings(jobs, pinecone_index, embedder, batch_size=64, max_retries=3):
    """
    Process job descriptions, extract unique strings, vectorize them, and upload to Pinecone.
    - Embeds every string returned by extract_job_strings for each job
    - Removes duplicate strings across all given jobs
    - Stores only the string text in the metadata (no job_id)
    - Uses a vector id derived from the text, so uploading the same string
      again overwrites the existing vector instead of adding a duplicate

    Args:
        jobs: List of job description dictionaries
        pinecone_index: Pinecone index instance
        embedder: Embedding generator instance
        batch_size: Batch size for both embedding requests and upserts
        max_retries: Maximum number of retries for failed operations

    Returns:
        Dictionary with processing statistics. When `jobs` or `pinecone_index`
        is missing, returns {"status": "error", "reason": ...} instead.
        Batches that still fail after all retries are skipped and counted in
        "failed_embedding_batches" / "failed_upload_batches".
    """
    import time
    import numpy as np
    import uuid
    from tqdm import tqdm

    # Validate inputs
    if not jobs:
        print("Error: No job descriptions provided")
        return {"status": "error", "reason": "no_jobs"}

    if not pinecone_index:
        print("Error: No Pinecone index provided")
        return {"status": "error", "reason": "no_index"}

    # Initialize metrics
    start_time = time.time()
    failed_embedding_batches = 0
    failed_upload_batches = 0

    # Extract all unique strings across all jobs
    print(f"Extracting strings from {len(jobs)} job descriptions...")
    all_unique_strings = set()

    for job in jobs:
        try:
            job_strings = extract_job_strings(job)
            all_unique_strings.update(job_strings)
        except Exception as e:
            job_id = job.get("job_id", "unknown")
            print(f"Error extracting strings from job {job_id}: {str(e)}")

    all_strings_list = list(all_unique_strings)
    print(f"Total unique strings: {len(all_strings_list)}")

    # Generate embeddings for all unique strings
    string_to_embedding = {}
    if all_strings_list:
        total_batches = (len(all_strings_list) - 1) // batch_size + 1

        # Process in batches
        for i in range(0, len(all_strings_list), batch_size):
            batch = all_strings_list[i:i+batch_size]
            batch_num = i//batch_size + 1

            print(f"Processing batch {batch_num}/{total_batches} ({batch_num/total_batches*100:.1f}%)...")

            # Retry with exponential backoff (1s, 2s, 4s, ...)
            for retry in range(max_retries + 1):
                try:
                    # Generate embeddings
                    batch_embeddings = embedder.generate_embeddings(batch)

                    # Store embeddings
                    for text, embedding in zip(batch, batch_embeddings):
                        string_to_embedding[text] = embedding

                    break  # Success, exit retry loop

                except Exception as e:
                    if retry < max_retries:
                        sleep_time = 2 ** retry
                        print(f"Error in batch {batch_num}, retrying in {sleep_time}s: {str(e)}")
                        time.sleep(sleep_time)
                    else:
                        failed_embedding_batches += 1
                        print(f"Failed after {max_retries} retries for batch {batch_num}: {str(e)}")

    # Upload vector embeddings to Pinecone
    vectors_uploaded = 0
    if string_to_embedding:
        vectors_to_upload = []

        # Prepare vectors - one for each unique string, no job association
        for text, embedding in string_to_embedding.items():
            # Ensure proper vector format
            vector_values = ensure_correct_vector_format(embedding)

            # Deterministic id: the same text always maps to the same id, so a
            # re-run or a later batch containing the same string upserts in
            # place. The namespace is arbitrary but must stay fixed.
            vector_id = f"string_{uuid.uuid5(uuid.NAMESPACE_URL, text)}"

            # Metadata holds only the text, capped at 1000 characters
            metadata = {"text": text[:1000]}

            # Add to upload list
            vectors_to_upload.append({
                "id": vector_id,
                "values": vector_values,
                "metadata": metadata
            })

        print(f"Prepared {len(vectors_to_upload)} vectors for upload")

        # Upload in batches
        with tqdm(total=len(vectors_to_upload), desc="Uploading to Pinecone") as pbar:
            for i in range(0, len(vectors_to_upload), batch_size):
                batch = vectors_to_upload[i:i+batch_size]

                # Try upload with retries
                for retry in range(max_retries + 1):
                    try:
                        pinecone_index.upsert(vectors=batch)
                        vectors_uploaded += len(batch)
                        pbar.update(len(batch))
                        break  # Success, exit retry loop

                    except Exception as e:
                        if retry < max_retries:
                            sleep_time = 2 ** retry
                            print(f"Upload error, retrying in {sleep_time}s: {str(e)}")
                            time.sleep(sleep_time)
                        else:
                            failed_upload_batches += 1
                            print(f"Failed to upload batch after {max_retries} retries: {str(e)}")

    # Return comprehensive statistics
    total_time = time.time() - start_time

    return {
        "status": "success",
        "total_jobs": len(jobs),
        "total_unique_strings": len(all_strings_list),
        "strings_embedded": len(string_to_embedding),
        "vectors_uploaded": vectors_uploaded,
        "failed_embedding_batches": failed_embedding_batches,
        "failed_upload_batches": failed_upload_batches,
        "processing_time": total_time
    }


# Modified search function to work with string-only vectors
def semantic_string_search(query, pinecone_index, embedder, similarity_threshold=0.8, top_k=100):
    """
    Look up strings stored in Pinecone that are semantically close to a query.

    Args:
        query: The text to search for
        pinecone_index: The Pinecone index to search
        embedder: Embedding generator instance
        similarity_threshold: Minimum similarity score (0-1) to include in results
        top_k: Maximum number of matches requested from Pinecone

    Returns:
        List of {'text', 'similarity'} dictionaries in the order Pinecone
        returned them; an empty list if anything goes wrong.
    """
    try:
        query_vector = embedder.generate_embeddings([query])[0]

        response = pinecone_index.query(
            vector=query_vector,
            top_k=top_k,
            include_metadata=True,
            include_values=False,
        )

        hits = []
        for candidate in getattr(response, 'matches', []):
            similarity = getattr(candidate, 'score', 0)
            payload = getattr(candidate, 'metadata', {})

            # Drop anything scoring under the requested threshold
            if similarity < similarity_threshold:
                continue

            hits.append({
                'text': payload.get('text', ''),
                'similarity': similarity,
            })

        return hits

    except Exception as err:
        print(f"Error in semantic search: {str(err)}")
        return []

In [14]:
def run_vectorization_pipeline():
    """
    Load jobs, embed and upload their strings, then run one sample search.

    Everything is printed; nothing is returned.
    """
    # Load jobs first, then set up Pinecone and the embedder
    jobs = load_job_descriptions(limit=37310)
    pinecone_index = initialize_pinecone()
    embedder = ResilientSageMakerEmbedder(
        endpoint_name=SAGEMAKER_ENDPOINT_NAME,
        region=AWS_REGION,
    )

    # Embed and upload with a small embedding batch size
    stats = process_and_vectorize_strings(
        jobs=jobs,
        pinecone_index=pinecone_index,
        embedder=embedder,
        batch_size=4,
    )

    print("Processing complete!")
    summary_lines = (
        ("Total unique strings", "total_unique_strings"),
        ("Strings embedded", "strings_embedded"),
        ("Vectors uploaded", "vectors_uploaded"),
    )
    for label, stat_key in summary_lines:
        print(f"{label}: {stats.get(stat_key, 0)}")

    # Smoke-test the index with a single query
    query = "Python"
    results = semantic_string_search(
        query=query,
        pinecone_index=pinecone_index,
        embedder=embedder,
        similarity_threshold=0.8,
    )

    print(f"\nStrings similar to '{query}':")
    # Only the ten best matches are shown
    for rank, result in enumerate(results[:10], start=1):
        print(f"{rank}. {result['text']} (Score: {result['similarity']:.4f})")

In [None]:
## run_vectorization_pipeline()

In [15]:
def process_all_jobs_in_batches(batch_size=1000, start_index=0, max_jobs=37312):
    """
    Process all jobs in batches of specified size, continuing until all jobs are processed.

    Progress (next index to resume from plus the ids of processed jobs) is
    stored in "embedding_progress.json" in the working directory. A batch
    counts as processed only if the vectorization step neither raised nor
    returned an error status. After a failed batch, the saved resume index
    stays at the first failed batch so a later run retries it; jobs already
    processed are filtered out by id.

    Args:
        batch_size: Number of jobs to load and process in each batch
        start_index: Index to start from (for resuming)
        max_jobs: Maximum number of jobs to process in total

    Returns:
        Dictionary with total statistics
    """
    import time
    import os
    import json

    # Path to store progress tracking
    progress_file = "embedding_progress.json"

    # Load progress if exists
    processed_ids = set()
    if os.path.exists(progress_file):
        try:
            with open(progress_file, 'r') as f:
                progress = json.load(f)
            processed_ids = set(progress.get("processed_job_ids", []))
            last_processed_index = progress.get("last_index", start_index)
            # Use the greater of the provided start_index or the last_index from the progress file
            start_index = max(start_index, last_processed_index)
        except Exception as e:
            print(f"Error loading progress file: {e}")
            processed_ids = set()

    # Initialize statistics
    total_stats = {
        "total_processed": 0,
        "total_strings_embedded": 0,
        "total_vectors_uploaded": 0,
        "batches_completed": 0,
        "errors": 0
    }

    # Initialize Pinecone and embedder
    pinecone_index = initialize_pinecone()
    if pinecone_index is None:
        # Without an index nothing can be uploaded; continuing would only mark
        # jobs as processed that never reached Pinecone.
        print("ERROR: Failed to initialize Pinecone. Aborting batch processing.")
        total_stats["errors"] += 1
        return total_stats

    embedder = ResilientSageMakerEmbedder(
        endpoint_name=SAGEMAKER_ENDPOINT_NAME,
        region=AWS_REGION
    )

    def save_progress(resume_index, last_error=None):
        """Write the resume index and processed job ids to the progress file."""
        progress = {
            "last_index": resume_index,
            "total_processed": len(processed_ids),  # Total number of unique jobs processed
            "processed_job_ids": list(processed_ids)
        }
        if last_error is not None:
            progress["last_error"] = last_error
        with open(progress_file, 'w') as f:
            json.dump(progress, f)

    current_index = start_index
    first_failed_index = None  # start index of the earliest batch that failed in this run
    remaining_jobs = max_jobs - len(processed_ids)

    print(f"Starting batch processing from index {current_index}")
    print(f"Already processed {len(processed_ids)} jobs")
    print(f"Approximately {remaining_jobs} jobs remaining")

    # Process batches until reaching max_jobs or all jobs are processed
    while current_index < max_jobs:
        batch_start_time = time.time()
        print(f"\nProcessing batch starting at index {current_index}")

        # Load a batch of jobs
        loaded_jobs = load_job_descriptions(limit=batch_size, skip=current_index)

        if not loaded_jobs:
            print(f"No more jobs to load after index {current_index}. Processing complete.")
            break

        print(f"Loaded {len(loaded_jobs)} jobs for this batch")
        next_index = current_index + len(loaded_jobs)

        # Filter out already processed jobs (and jobs without an id)
        batch_jobs = [
            job for job in loaded_jobs
            if job.get("job_id") and job.get("job_id") not in processed_ids
        ]

        if not batch_jobs:
            print("All jobs in this batch have already been processed. Moving to next batch.")
            current_index = next_index
            continue

        print(f"After filtering duplicates: {len(batch_jobs)} jobs to process")

        try:
            # Process this batch of jobs
            batch_stats = process_and_vectorize_strings(
                jobs=batch_jobs,
                pinecone_index=pinecone_index,
                embedder=embedder,
                batch_size=10  # This is the embedder batch size, not the job batch size
            )

            # The vectorizer reports invalid input through an error status
            # rather than an exception; treat that as a failed batch.
            if batch_stats.get("status") == "error":
                raise RuntimeError(
                    f"vectorization returned an error: {batch_stats.get('reason', 'unknown')}"
                )

            # Update statistics
            total_stats["total_processed"] += len(batch_jobs)
            total_stats["total_strings_embedded"] += batch_stats.get("strings_embedded", 0)
            total_stats["total_vectors_uploaded"] += batch_stats.get("vectors_uploaded", 0)
            total_stats["batches_completed"] += 1

            # Update processed IDs (every job in batch_jobs has a job_id)
            processed_ids.update(job["job_id"] for job in batch_jobs)

            # Save progress; keep pointing at an earlier failed batch if any
            save_progress(first_failed_index if first_failed_index is not None else next_index)

            batch_time = time.time() - batch_start_time
            print(f"Batch completed in {batch_time:.2f} seconds")
            print(f"Processed {len(batch_jobs)} jobs in this batch")
            print(f"Total unique strings in batch: {batch_stats.get('total_unique_strings', 0)}")
            print(f"Strings embedded in batch: {batch_stats.get('strings_embedded', 0)}")
            print(f"Vectors uploaded in batch: {batch_stats.get('vectors_uploaded', 0)}")
            print(f"Total processed so far: {len(processed_ids)} jobs")

            # Rough estimate based on the duration of the batch just completed
            remaining_batches = max(0, (max_jobs - next_index) // batch_size + 1)
            est_remaining_time = remaining_batches * batch_time
            hours, remainder = divmod(est_remaining_time, 3600)
            minutes, seconds = divmod(remainder, 60)
            print(f"Estimated remaining time: {int(hours)}h {int(minutes)}m {int(seconds)}s")

        except Exception as e:
            print(f"Error processing batch starting at index {current_index}: {str(e)}")
            total_stats["errors"] += 1

            if first_failed_index is None:
                first_failed_index = current_index
            save_progress(first_failed_index, last_error=str(e))

        current_index = next_index

        if len(processed_ids) >= max_jobs:
            print(f"Reached maximum number of jobs to process ({max_jobs}). Stopping.")
            break

    # Final report
    print("\n===== Processing Complete =====")
    print(f"Total jobs processed: {total_stats['total_processed']}")
    print(f"Total unique strings embedded: {total_stats['total_strings_embedded']}")
    print(f"Total vectors uploaded to Pinecone: {total_stats['total_vectors_uploaded']}")
    print(f"Total batches processed: {total_stats['batches_completed']}")
    print(f"Errors encountered: {total_stats['errors']}")

    return total_stats

# stats = process_all_jobs_in_batches(batch_size=1000, start_index=0, max_jobs=37310)

In [16]:
# Run the full batch processing
def run_full_process():
    """
    Kick off batch processing over the whole job corpus and print a summary.

    Returns:
        The statistics dictionary from process_all_jobs_in_batches. Any error
        from the batch run or the summary is printed and then re-raised; a
        failure while fetching the final Pinecone statistics is only printed.
    """
    # Run parameters
    batch_size = 1000
    start_index = 0
    max_jobs = 37310

    print(f"Starting full batch processing with batch size of {batch_size}")
    print(f"Will process up to {max_jobs} jobs starting from index {start_index}")

    try:
        stats = process_all_jobs_in_batches(
            batch_size=batch_size,
            start_index=start_index,
            max_jobs=max_jobs,
        )

        print("\n===== FINAL SUMMARY =====")
        summary_rows = (
            ("Total jobs processed", "total_processed"),
            ("Total strings embedded", "total_strings_embedded"),
            ("Total vectors uploaded", "total_vectors_uploaded"),
            ("Total batches completed", "batches_completed"),
            ("Errors encountered", "errors"),
        )
        for label, stat_key in summary_rows:
            print(f"{label}: {stats[stat_key]}")

        try:
            # Reconnect to report the index state after the run
            pinecone_index = initialize_pinecone()
            final_stats = pinecone_index.describe_index_stats()

            print("\n===== PINECONE INDEX STATISTICS =====")
            print(f"Total vectors in index: {final_stats.get('total_vector_count', 0)}")
            print(f"Dimension: {final_stats.get('dimension', 0)}")
            print(f"Metric: {final_stats.get('metric', 'unknown')}")

            for ns_name, ns_info in final_stats.get('namespaces', {}).items():
                # An empty namespace name is shown as "default"
                ns_name_display = f"'{ns_name}'" if ns_name else "default"
                print(f"Namespace {ns_name_display}: {ns_info.get('vector_count', 0)} vectors")

        except Exception as stats_error:
            print(f"Error getting final Pinecone statistics: {str(stats_error)}")

        return stats

    except Exception as run_error:
        print(f"Error during batch processing: {str(run_error)}")
        raise

In [None]:
# result = process_jobs_in_batches(batch_size=1000, start_index=0)
# print(f"Processing complete! {result['total_processed']} new jobs processed.")
# print(f"Next starting index: {result['last_index']}")

In [None]:
# Start the full batch run; the parameters (batch size 1000, up to 37310 jobs)
# are fixed inside run_full_process.
run_full_process()

In [None]:
# For a specific job, list all its strings
# NOTE(review): neither `job_ids` nor `list_all_strings_for_job` is defined in
# the visible part of this notebook, so this cell raises NameError unless they
# come from somewhere else — confirm or remove the cell.
if job_ids:
    job_id = job_ids[0]  # Take the first job
    strings = list_all_strings_for_job(pinecone_index, job_id)

In [None]:
# Notebook-level embedder instance, passed to semantic_search further down
embedder = ResilientSageMakerEmbedder(
    endpoint_name=SAGEMAKER_ENDPOINT_NAME,
    region=AWS_REGION,
)

def semantic_search(query_title, pinecone_index, embedder=None, similarity_threshold=0.8, top_k=100):
    """
    Search Pinecone for stored strings that are semantically similar to a query.

    Args:
        query_title: The text (e.g. a job title) to search for
        pinecone_index: The Pinecone index to search
        embedder: Embedding generator instance (optional)
        similarity_threshold: Minimum similarity score (0-1) to include in results
        top_k: Maximum number of matches requested from Pinecone before the
            threshold filter is applied

    Returns:
        List of dictionaries with 'text' (the stored string, with any
        'Text: ' marker removed) and 'similarity'. An empty list is returned
        if any error occurs.
    """
    try:
        # Create embedder if not provided
        if embedder is None:
            # NOTE(review): `random_embedding` is not part of the visible
            # notebook, and random query vectors cannot yield meaningful
            # similarity scores — callers should pass a real embedder.
            from random_embedding import RandomEmbeddingGenerator
            print("No embedder provided. Creating RandomEmbeddingGenerator as fallback.")
            embedder = RandomEmbeddingGenerator()

        query_text = f"{query_title}"  # You may need to adjust the format
        query_embedding = embedder.generate_embeddings([query_text])[0]

        search_results = pinecone_index.query(
            vector=query_embedding,
            top_k=top_k,  # Get enough results to filter
            include_metadata=True,
            include_values=False  # only need the scores, not the vectors
        )

        similar_text = []

        # Pinecone v6+ returns objects instead of dictionaries
        matches = getattr(search_results, 'matches', None) or []

        for match in matches:
            # Extract match properties
            score = getattr(match, 'score', 0)

            # Skip if below threshold
            if score < similarity_threshold:
                continue

            # A match without metadata must not abort the whole search
            metadata = getattr(match, 'metadata', None) or {}
            text = metadata.get('text', '').replace('Text: ', '')

            similar_text.append({
                'text': text,
                'similarity': score
            })

        return similar_text

    except Exception as e:
        print(f"Error in semantic search: {str(e)}")
        return []

In [None]:
# Test the function with a sample string
from pinecone import Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
pinecone_index = pc.Index(PINECONE_INDEX_NAME)

# Fill in the text to search for before running this cell
test_query = ""
results = semantic_search(
    test_query,
    pinecone_index=pinecone_index,
    embedder=embedder
)

print(f"Jobs similar to '{test_query}' (similarity > 0.8):")
for job in results:
    # The stray closing parenthesis after the score has been removed
    print(f"- {job['text']}, Score: {job['similarity']:.4f}")