In [1]:
import boto3
import json
import os
import pandas as pd
import uuid
import sys
import numpy as np
import time
from typing import List, Dict, Any, Union
from tqdm import tqdm

In [2]:
# SageMaker endpoint that serves the E5 embedding model, and the region it runs in.
SAGEMAKER_ENDPOINT_NAME = "e5-embeddings-huggingface"
AWS_REGION = "us-east-1"

# The API key is a secret: read it from the environment instead of pasting it
# into the notebook. Falls back to "" (the previous literal) when unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_ENVIRONMENT = "us-east-1"  # matched with AWS region
PINECONE_INDEX_NAME = "mirra-filtering"

# Length of the vectors stored in the index; embeddings are padded/truncated to it.
EMBEDDING_DIMENSION = 1024

In [10]:
class ResilientSageMakerEmbedder:
    """
    A wrapper around SageMaker embedding endpoints with resilience features.

    Each text is sent to the endpoint on its own, retried with exponential
    backoff, and replaced by a random unit vector when every attempt fails, so
    callers always get exactly one vector per input text.

    NOTE(review): a fallback vector cannot be told apart from a real embedding
    in the returned list; failures are only visible in the printed output.
    """

    def __init__(self, endpoint_name, max_text_length=512, region="us-east-1",
                 embedding_dimension=None):
        """
        Initialize the embedder with a SageMaker endpoint.

        Args:
            endpoint_name: The name of the SageMaker endpoint
            max_text_length: Maximum text length (in characters) to truncate to
            region: AWS region for the endpoint
            embedding_dimension: Length every returned vector is forced to.
                Defaults to the notebook-level EMBEDDING_DIMENSION.
        """
        self.sagemaker_runtime = boto3.client('sagemaker-runtime', region_name=region)
        self.endpoint_name = endpoint_name
        self.max_text_length = max_text_length
        # Resolved at call time so the class definition itself does not need
        # the config cell to have run.
        self.embedding_dimension = (
            EMBEDDING_DIMENSION if embedding_dimension is None else int(embedding_dimension)
        )
        print(f"Initialized ResilientSageMakerEmbedder for endpoint: {endpoint_name}")

    def _prepare_text(self, text):
        """Collapse whitespace, add the E5 "passage:" prefix, and truncate.

        Truncation is by characters (not tokens) and is applied after the
        prefix is added, so the prefix counts against max_text_length.
        """
        if not isinstance(text, str):
            text = str(text)

        text = ' '.join(text.split())

        # Add E5 prefix
        if not text.startswith("passage:"):
            text = f"passage: {text}"

        # Truncate if needed
        if len(text) > self.max_text_length:
            text = text[:self.max_text_length]

        return text

    def _embed_single(self, text):
        """Call the endpoint for one text and return a flat list of floats.

        Raises whatever the endpoint call or response parsing raises; the
        caller is responsible for retries and the fallback.
        """
        payload = {
            "inputs": [self._prepare_text(text)],
            "parameters": {
                "normalize": True,
                "pooling": "mean",
                "return_sentence_embedding": True
            }
        }

        response = self.sagemaker_runtime.invoke_endpoint(
            EndpointName=self.endpoint_name,
            ContentType='application/json',
            Body=json.dumps(payload)
        )
        response_body = json.loads(response['Body'].read().decode('utf-8'))

        emb_array = np.array(response_body[0])
        if emb_array.ndim == 0:
            # A scalar here means the response was not a list of vectors.
            raise ValueError("Unexpected endpoint response: first element is not a vector")

        # Token-level output: average over leading axes until one vector remains.
        while emb_array.ndim > 1:
            emb_array = np.mean(emb_array, axis=0)

        # Force the configured length: truncate long vectors, zero-pad short ones.
        target_dim = self.embedding_dimension
        if emb_array.shape[0] > target_dim:
            emb_array = emb_array[:target_dim]
        elif emb_array.shape[0] < target_dim:
            padded = np.zeros(target_dim)
            padded[:emb_array.shape[0]] = emb_array
            emb_array = padded

        return emb_array.tolist()

    def generate_embeddings(self, texts, max_retries=3):
        """
        Generate embeddings using SageMaker E5 endpoint with retries and fallbacks

        Args:
            texts: String, or list/tuple of texts, to embed
            max_retries: Maximum attempts per text. Values below 1 are treated
                as 1 so that every text still produces exactly one vector.

        Returns:
            List of embedding vectors, one per input text and in input order.
            A text whose attempts all fail gets a random unit vector.
        """
        # Ensure texts is a list
        if isinstance(texts, tuple):
            texts = list(texts)
        elif not isinstance(texts, list):
            texts = [texts]

        attempts = max(1, int(max_retries))
        all_embeddings = []

        for i, text in enumerate(texts):
            for attempt in range(attempts):
                try:
                    vector = self._embed_single(text)
                except Exception as e:
                    if attempt < attempts - 1:
                        wait_time = (2 ** attempt) * 0.5  # Exponential backoff
                        print(f"Retry {attempt+1} for text {i+1}: {str(e)}")
                        time.sleep(wait_time)
                    else:
                        print(f"Error generating embedding for text {i+1}: {str(e)}")
                        all_embeddings.append(self._create_random_unit_vector())
                else:
                    all_embeddings.append(vector)

                    # Small delay to prevent overwhelming the endpoint
                    if i < len(texts) - 1:
                        time.sleep(0.1)
                    break

        return all_embeddings

    def _create_random_unit_vector(self, dim=None):
        """Create a random unit vector for fallback.

        Args:
            dim: Vector length; defaults to this embedder's embedding_dimension
                so fallbacks always match the vectors from generate_embeddings.
        """
        if dim is None:
            dim = self.embedding_dimension
        vec = np.random.normal(0, 1, size=dim)
        return (vec / np.linalg.norm(vec)).tolist()

In [11]:
def initialize_pinecone():
    """Connect to the Pinecone index, creating it first when it does not exist.

    Returns:
        The Pinecone index handle, or None when the client cannot be
        initialized or the index cannot be created (the error is printed).
    """
    try:
        # Initialize Pinecone client
        from pinecone import Pinecone, ServerlessSpec
        pc = Pinecone(api_key=PINECONE_API_KEY)

        # Check if the index exists
        existing_indexes = pc.list_indexes().names()
        print(f"Available Pinecone indexes: {existing_indexes}")

        if PINECONE_INDEX_NAME not in existing_indexes:
            print(f"Creating new index '{PINECONE_INDEX_NAME}'...")

            # The Pinecone class client requires a `spec` and has no top-level
            # `metadata_config` argument, so the previous call could never
            # create the index.
            # NOTE(review): assumes a serverless index on AWS in
            # PINECONE_ENVIRONMENT; switch to PodSpec (which does accept
            # metadata_config) if a pod-based index is intended.
            # Metadata fields used for filtering: emp_type, job_title,
            # exp_level, domain, location, visa_sponsor, salary_range_from,
            # salary_range_to.
            pc.create_index(
                name=PINECONE_INDEX_NAME,
                dimension=EMBEDDING_DIMENSION,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region=PINECONE_ENVIRONMENT),
            )
            print(f"Index '{PINECONE_INDEX_NAME}' created successfully")

        # Connect to the index
        index = pc.Index(PINECONE_INDEX_NAME)
        print(f"Connected to Pinecone index: {PINECONE_INDEX_NAME}")

        return index

    except Exception as e:
        print(f"Error initializing Pinecone: {str(e)}")
        print("Please check your API key and environment settings.")
        return None

In [12]:
def load_job_descriptions(start=0, limit=32300, bucket="mirra-matcher-325", prefix="data/processed/jobs/"):
    """
    Load job descriptions from S3 with pagination.

    `start` and `limit` are both counted in job files (keys ending in
    ".json"), so a caller that advances `start` by the number of jobs it asked
    for resumes exactly where the previous call stopped, even when the prefix
    also contains non-JSON keys.

    Args:
        start: Number of job files to skip before loading
        limit: Maximum number of jobs to load
        bucket: S3 bucket name
        prefix: S3 prefix for job files

    Returns:
        List of job description dicts. Each dict has a "job_id"; when the file
        does not provide one it is derived from the file name. Files that are
        not valid JSON objects are reported and skipped.
    """
    jobs = []
    if limit <= 0:
        return jobs

    s3_client = boto3.client('s3')
    paginator = s3_client.get_paginator('list_objects_v2')

    skipped = 0
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # A page carries no 'Contents' key when nothing matches the prefix.
        for obj in page.get('Contents', []):
            key = obj['Key']
            if not key.endswith(".json"):
                continue
            if skipped < start:
                skipped += 1
                continue

            response = s3_client.get_object(Bucket=bucket, Key=key)
            try:
                job_data = json.loads(response['Body'].read().decode('utf-8'))
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                # One corrupt file should not abort a multi-thousand-job load.
                print(f"Skipping {key}: not valid JSON ({e})")
                continue
            if not isinstance(job_data, dict):
                print(f"Skipping {key}: expected a JSON object, got {type(job_data).__name__}")
                continue

            if "job_id" not in job_data:
                job_data["job_id"] = key.split("/")[-1][:-len(".json")]
            jobs.append(job_data)
            print(f"Loaded job: {job_data.get('job_id')}")

            if len(jobs) >= limit:
                print(f"Loaded {len(jobs)} job descriptions from S3")
                return jobs

    print(f"Loaded {len(jobs)} job descriptions from S3")
    return jobs


In [13]:
# Words that mark a work-authorization line as declining sponsorship. Matched
# as whole words so that e.g. "technology" or "now" do not count as "no".
_SPONSORSHIP_NEGATIONS = frozenset({
    "no", "not", "non", "none", "nor", "neither", "never",
    "cannot", "without", "unable",
})


def _first_or_default(value, default):
    """Return value[0] when value is a non-empty list, otherwise default."""
    if isinstance(value, list) and value:
        return value[0]
    return default


def _hourly_to_annual(hourly_rate):
    """Convert an hourly rate to an annual salary (40 h/week, 52 weeks); 0 if unparseable."""
    try:
        return int(float(hourly_rate) * 40 * 52)
    except (ValueError, TypeError, OverflowError):
        return 0


def _is_likely_hourly(value):
    """Heuristic: positive amounts under 200 are treated as hourly rates."""
    try:
        return 0 < float(value) < 200
    except (ValueError, TypeError):
        return False


def _annualize_top_level_salary(raw_value):
    """Turn a top-level salary field into a non-negative annual integer; 0 if unparseable."""
    try:
        amount = float(raw_value)
        if _is_likely_hourly(amount):
            return _hourly_to_annual(amount)
        # A negative salary carries no information; 0 means "not specified".
        return max(0, int(amount))
    except (ValueError, TypeError, OverflowError):
        return 0


def _apply_wage_items(wage_data, salary_from, salary_to):
    """Raise the salary bounds using the entries of details["wage"].

    Only dict entries whose pay_type is "hourly" or "salary" contribute; each
    bound keeps the largest value seen. An entry whose numbers cannot be
    parsed is abandoned at the point of failure.
    """
    if not (isinstance(wage_data, list) and wage_data):
        return salary_from, salary_to

    for wage_item in wage_data:
        if not isinstance(wage_item, dict):
            continue
        min_wage = wage_item.get("min", 0)
        max_wage = wage_item.get("max", 0)
        # pay_type may be missing or null in the source data.
        pay_type = str(wage_item.get("pay_type") or "").lower()

        try:
            if pay_type == "hourly":
                annual_min = _hourly_to_annual(min_wage)
                annual_max = _hourly_to_annual(max_wage)
                if annual_min > 0:
                    salary_from = max(salary_from, annual_min)
                if annual_max > 0:
                    salary_to = max(salary_to, annual_max)
            elif pay_type == "salary":
                # Declared as salary, but still annualize values that look hourly.
                if min_wage and float(min_wage) > 0:
                    min_value = int(float(min_wage))
                    if _is_likely_hourly(min_value):
                        min_value = _hourly_to_annual(min_value)
                    salary_from = max(salary_from, min_value)
                if max_wage and float(max_wage) > 0:
                    max_value = int(float(max_wage))
                    if _is_likely_hourly(max_value):
                        max_value = _hourly_to_annual(max_value)
                    salary_to = max(salary_to, max_value)
        except (ValueError, TypeError, OverflowError):
            # If there's an error converting, continue to the next item
            continue

    return salary_from, salary_to


def _section_hard_skills(job, section):
    """Return job[section]["hard_skills"], or None when the section is absent or not a dict."""
    block = job.get(section)
    if isinstance(block, dict):
        return block.get("hard_skills")
    return None


def _infer_experience_level(hard_skills):
    """Map the largest required "minyears" among the skills to a level label."""
    max_years = 0
    for skill in hard_skills:
        raw_years = skill.get("minyears", [0]) if isinstance(skill, dict) else None
        min_years = _first_or_default(raw_years, 0)

        # Convert to numeric if needed
        if not isinstance(min_years, (int, float)):
            try:
                min_years = float(min_years)
            except (ValueError, TypeError):
                min_years = 0

        max_years = max(max_years, min_years)

    if max_years >= 7:
        return "Senior"
    if max_years >= 3:
        return "Mid-level"
    return "Entry-level"


def _collect_skill_phrases(hard_skills):
    """Flatten the "skill" groups of a hard_skills list into phrases.

    A group that is a list is joined with spaces into one phrase; any other
    group is used as-is.
    """
    phrases = []
    for item in hard_skills or []:
        if not isinstance(item, dict):
            continue
        groups = item.get("skill")
        if not groups:
            continue
        if isinstance(groups, str):
            # presumably a single skill given without the list wrapper
            groups = [groups]
        for group in groups:
            if isinstance(group, list):
                phrases.append(" ".join(str(part) for part in group))
            else:
                phrases.append(str(group))
    return phrases


def _declines_sponsorship(text):
    """True when lower-cased text contains a negation as a whole word (or a "n't" contraction)."""
    if "n't" in text or "n\u2019t" in text:
        return True
    words = "".join(ch if ch.isalnum() else " " for ch in text).split()
    return any(word in _SPONSORSHIP_NEGATIONS for word in words)


def format_job_for_embedding(job):
    """
    Format a job JSON into a comprehensive text representation for embedding.

    Args:
        job: Job description dictionary

    Returns:
        A (text, metadata) tuple: the text representation of the job with all
        key attributes, and the dictionary of metadata for storage in Pinecone
        (job_id, job_title, emp_type, exp_level, domain, location,
        salary_range_from, salary_range_to, visa_sponsor). Salaries are annual
        integers, with 0 meaning "not specified".
    """
    details = job.get("details") or {}
    if not isinstance(details, dict):
        details = {}

    job_title = _first_or_default(details.get("job_title", ["Unknown"]), "Unknown")
    company_name = _first_or_default(details.get("company_name", []), "Unknown")

    # Location: "city, state, country" from the first entry, "Remote" when
    # there is nothing usable (including an entry with all fields empty).
    location = "Remote"
    location_item = _first_or_default(details.get("location", []), None)
    if isinstance(location_item, dict):
        parts = [
            location_item.get("city", ""),
            location_item.get("state", ""),
            location_item.get("country", ""),
        ]
        joined = ", ".join(str(part) for part in parts if part)
        if joined:
            location = joined

    # Employment type: fall back to tax_terms, then to "Full-time".
    employment_data = details.get("employment_type") or details.get("tax_terms") or []
    employment_type = _first_or_default(employment_data, "Full-time")

    # Experience level: explicit value first, otherwise inferred from the
    # years required by the mandatory hard skills.
    mandatory_skills = _section_hard_skills(job, "mandatory")
    experience_level = "Entry-level"
    exp_level_data = details.get("experience_level", [])
    if isinstance(exp_level_data, list) and exp_level_data:
        experience_level = exp_level_data[0]
    elif mandatory_skills:
        experience_level = _infer_experience_level(mandatory_skills)

    # Salary: top-level fields first, then raised by the wage entries.
    salary_from = 0
    salary_to = 0
    if "salary_range_from" in job:
        salary_from = _annualize_top_level_salary(job["salary_range_from"])
    if "salary_range_to" in job:
        salary_to = _annualize_top_level_salary(job["salary_range_to"])

    salary_from, salary_to = _apply_wage_items(details.get("wage", []), salary_from, salary_to)

    # Final safety net for values that still look hourly.
    if 0 < salary_from < 200:
        salary_from = _hourly_to_annual(salary_from)
    if 0 < salary_to < 200:
        salary_to = _hourly_to_annual(salary_to)

    # Never report an upper bound below the lower bound.
    if salary_to > 0 and salary_from > 0 and salary_to < salary_from:
        salary_to = salary_from

    if salary_from > 0 and salary_to > 0:
        if salary_from == salary_to:
            salary_range = f"${salary_from:,}"
        else:
            salary_range = f"${salary_from:,} - ${salary_to:,}"
    elif salary_from > 0:
        salary_range = f"From ${salary_from:,}"
    elif salary_to > 0:
        salary_range = f"Up to ${salary_to:,}"
    else:
        salary_range = "Not specified"

    # Domain: a top-level "domain" overrides the company industry.
    domain = _first_or_default(details.get("company_industry", []), "Technology")
    if job.get("domain"):
        domain = str(job["domain"]).strip('"\'')

    # Visa sponsorship: an explicit top-level value wins; otherwise look for a
    # work-authorization line that mentions sponsoring without negating it.
    visa_sponsorship = "No"
    explicit_visa = job.get("visa_sponsor")
    if explicit_visa is not None:
        if isinstance(explicit_visa, bool):
            visa_sponsorship = "Yes" if explicit_visa else "No"
        else:
            visa_sponsorship = str(explicit_visa).strip('"\'')
    else:
        work_authorization = details.get("work_authorization", [])
        if isinstance(work_authorization, list):
            for auth in work_authorization:
                auth_lower = str(auth).lower()
                if "sponsor" in auth_lower and not _declines_sponsorship(auth_lower):
                    visa_sponsorship = "Yes"
                    break

    skills_list = _collect_skill_phrases(mandatory_skills)
    skills_text = "Required skills: " + ", ".join(skills_list) if skills_list else ""

    resp_list = _collect_skill_phrases(_section_hard_skills(job, "responsibility"))
    responsibilities_text = "Responsibilities: " + ", ".join(resp_list) if resp_list else ""

    # Combine all attributes into a comprehensive text
    job_text = f"""
Job Title: {job_title}
Company: {company_name}
Location: {location}
Employment Type: {employment_type}
Experience Level: {experience_level}
Industry/Domain: {domain}
Salary Range: {salary_range}
Visa Sponsorship: {visa_sponsorship}

{skills_text}

{responsibilities_text}

Job Description Summary:
This is a {employment_type} position for a {job_title} located in {location}. 
The role requires {experience_level} experience in the {domain} industry.
{f"Salary range: {salary_range}. " if salary_range != "Not specified" else ""}
Visa sponsorship is {visa_sponsorship}.
"""

    return job_text.strip(), {
        "job_id": job.get("job_id", str(uuid.uuid4())),
        "job_title": job_title,
        "emp_type": employment_type,
        "exp_level": experience_level,
        "domain": domain,
        "location": location,
        "salary_range_from": salary_from,
        "salary_range_to": salary_to,
        "visa_sponsor": visa_sponsorship
    }

In [14]:
def process_and_upload_jobs(jobs, pinecone_index, batch_size=100, max_retries=3):
    """
    Process jobs, generate embeddings, and upload to Pinecone

    Args:
        jobs: List of job dictionaries
        pinecone_index: Pinecone index instance
        batch_size: Number of vectors per upsert call (keep low for stability);
            must be a positive integer
        max_retries: Maximum attempts for each embedding and each upsert;
            values below 1 are treated as 1

    Returns:
        Dictionary with processing statistics. On invalid input:
        {"status": "error", "reason": ...}. Otherwise "status" ("success" when
        every job was uploaded, else "partial"), "total_jobs",
        "vectors_uploaded", "vectors_failed" (formatted but not uploaded) and
        "jobs_skipped" (could not be formatted).
    """
    import traceback

    if not jobs:
        print("No jobs provided for processing")
        return {"status": "error", "reason": "no_jobs"}

    if not pinecone_index:
        print("No Pinecone index provided")
        return {"status": "error", "reason": "no_index"}

    # range() below needs a positive step; fail early with the same
    # error-dict convention as the other input checks.
    if not isinstance(batch_size, int) or batch_size < 1:
        print(f"Invalid batch_size {batch_size!r}; expected a positive integer")
        return {"status": "error", "reason": "invalid_batch_size"}

    # With zero attempts nothing would ever be uploaded.
    attempts = max(1, max_retries)

    print(f"Processing {len(jobs)} jobs...")

    embedder = ResilientSageMakerEmbedder(
        endpoint_name=SAGEMAKER_ENDPOINT_NAME,
        max_text_length=512,
        region=AWS_REGION
    )

    # Format every job first; jobs that cannot be formatted are reported and skipped.
    job_texts = []
    job_metadata = []

    for job in jobs:
        try:
            job_text, metadata = format_job_for_embedding(job)
            job_texts.append(job_text)
            job_metadata.append(metadata)
            print(f"Processed job {metadata['job_id']} - {metadata['job_title']}")
        except Exception as e:
            print(f"Error processing job {job.get('job_id', 'unknown')}: {str(e)}")

    vectors_uploaded = 0
    total_vectors = len(job_texts)

    with tqdm(total=total_vectors, desc="Processing and uploading") as progress_bar:
        for i in range(0, total_vectors, batch_size):
            batch_texts = job_texts[i:i+batch_size]
            batch_metadata = job_metadata[i:i+batch_size]

            print(f"Processing batch {i//batch_size + 1}/{(total_vectors-1)//batch_size + 1}...")

            try:
                # Process strictly one text at a time with explicit error handling
                batch_embeddings = []
                for idx, text in enumerate(batch_texts):
                    try:
                        print(f"Generating embedding for text {i+idx+1}/{total_vectors}")
                        embedding = embedder.generate_embeddings([text], max_retries=attempts)[0]
                        batch_embeddings.append(embedding)
                        print(f"Successfully generated embedding: length={len(embedding)}")
                    except Exception as e:
                        print(f"ERROR generating embedding for text {i+idx+1}: {str(e)}")
                        # Use fallback vector and continue
                        batch_embeddings.append(embedder._create_random_unit_vector())
                        print("Using fallback vector instead")
                    time.sleep(0.5)  # Brief pause between embeddings

                # Pair each embedding with its metadata (both lists are built
                # one entry per text, so they always have the same length).
                vectors_to_upload = []
                for j, (embedding, metadata) in enumerate(zip(batch_embeddings, batch_metadata)):
                    if not embedding or len(embedding) != EMBEDDING_DIMENSION:
                        print(f"WARNING: Invalid embedding at index {j}, using fallback")
                        embedding = embedder._create_random_unit_vector()

                    # Salaries go in as ints, everything else as strings.
                    metadata_copy = {}
                    for key, value in metadata.items():
                        if key in ["salary_range_from", "salary_range_to"]:
                            try:
                                metadata_copy[key] = int(value) if isinstance(value, (int, float)) else 0
                            except (ValueError, OverflowError):
                                # NaN/inf must not abort the whole batch.
                                metadata_copy[key] = 0
                        else:
                            metadata_copy[key] = str(value) if not isinstance(value, str) else value

                    vectors_to_upload.append({
                        "id": f"job_{metadata['job_id']}",
                        "values": embedding,
                        "metadata": metadata_copy
                    })

                # Print vector counts for clarity
                print(f"Prepared {len(vectors_to_upload)} vectors for upload")

                # Explicit upload with better error logging
                if vectors_to_upload:
                    upload_success = False
                    for retry in range(attempts):
                        try:
                            pinecone_index.upsert(vectors=vectors_to_upload)
                            vectors_uploaded += len(vectors_to_upload)
                            progress_bar.update(len(vectors_to_upload))
                            upload_success = True
                            print(f"Successfully uploaded {len(vectors_to_upload)} vectors")
                            break
                        except Exception as e:
                            print(f"Upload attempt {retry+1} failed: {str(e)}")
                            traceback.print_exc()
                            if retry < attempts - 1:
                                wait_time = (2 ** retry) * 2.0  # Longer wait time
                                print(f"Waiting {wait_time}s before retry...")
                                time.sleep(wait_time)

                    if not upload_success:
                        print("WARNING: Failed to upload batch after all retries")
                else:
                    print("No vectors to upload for this batch")

            except Exception as e:
                print(f"Critical error in batch {i//batch_size + 1}: {str(e)}")
                traceback.print_exc()

    return {
        "status": "success" if vectors_uploaded == len(jobs) else "partial",
        "total_jobs": len(jobs),
        "vectors_uploaded": vectors_uploaded,
        "vectors_failed": total_vectors - vectors_uploaded,
        "jobs_skipped": len(jobs) - total_vectors
    }

In [15]:
def job_embedding_pipeline(limit=2001):
    """
    End-to-end run: connect to Pinecone, load jobs from S3, embed and upload them.

    Args:
        limit: Maximum number of jobs to load and process

    Returns:
        Dictionary with processing statistics. When Pinecone cannot be
        initialized or no jobs are loaded, a {"status": "error", "reason": ...}
        dictionary is returned instead.
    """
    import time
    started_at = time.time()

    # Step 1: index handle (falsy on failure)
    index = initialize_pinecone()
    if not index:
        return {"status": "error", "reason": "Failed to initialize Pinecone"}

    # Step 2: job documents
    loaded_jobs = load_job_descriptions(limit=limit)
    if not loaded_jobs:
        return {"status": "error", "reason": "No jobs loaded"}

    # Step 3: embed and upsert, one vector per upsert call
    upload_summary = process_and_upload_jobs(loaded_jobs, index, batch_size=1)

    # Step 4: ask the index for its size; fall back to our own upload count
    # when the stats call fails.
    try:
        index_stats = index.describe_index_stats()
        index_total = index_stats.get('total_vector_count', 0)
        print(f"\nFinal index statistics:")
        print(f"  - Total vectors: {index_total}")
    except Exception as e:
        print(f"Error getting index stats: {str(e)}")
        index_total = upload_summary.get("vectors_uploaded", 0)

    elapsed_seconds = time.time() - started_at

    return {
        "status": "success",
        "jobs_processed": len(loaded_jobs),
        "vectors_uploaded": upload_summary.get("vectors_uploaded", 0),
        "total_vectors_in_index": index_total,
        "processing_time_seconds": elapsed_seconds
    }


In [16]:
def process_jobs_in_batches(batch_size=2000, start_index=0, max_jobs=37312):
    """
    Process jobs in batches of specified size, keeping track of progress.

    Progress is persisted to "embedding_progress.json" after every batch; when
    that file exists its "last_index" overrides `start_index`, and the job ids
    it lists are skipped. Every message is printed and appended to a
    timestamped "embedding_log_*.txt" file.

    Args:
        batch_size: Number of jobs to process in each batch
        start_index: Index to start from (for resuming)
        max_jobs: Maximum number of jobs to process in total

    Returns:
        Dictionary with statistics and the last processed index
        ("total_processed", "last_index", "processed_ids",
        "total_time_seconds"), or {"error": ...} when Pinecone cannot be
        initialized.
    """
    import traceback
    from datetime import datetime

    progress_file = "embedding_progress.json"
    log_file = f"embedding_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"

    # Setup logging
    def log_message(message):
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        formatted_msg = f"[{timestamp}] {message}"
        print(formatted_msg)
        with open(log_file, 'a') as f:
            f.write(formatted_msg + '\n')

    def format_duration(total_seconds):
        hours, remainder = divmod(total_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{int(hours)}h {int(minutes)}m {int(seconds)}s"

    log_message(f"Starting job processing with batch size {batch_size}")

    # Load progress if exists
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as f:
            progress = json.load(f)
        processed_ids = set(progress.get("processed_job_ids", []))
        start_index = progress.get("last_index", start_index)
        log_message(f"Resuming from index {start_index} with {len(processed_ids)} jobs already processed")
    else:
        processed_ids = set()
        log_message("Starting new processing job")

    # Initialize Pinecone
    pinecone_index = initialize_pinecone()
    if not pinecone_index:
        log_message("Failed to initialize Pinecone. Check your API key and settings.")
        return {"error": "Pinecone initialization failed"}

    total_processed = len(processed_ids)
    # Jobs handled by THIS run only; the throughput must not count jobs that a
    # previous run already finished, or the speed and ETA are inflated.
    session_processed = 0
    current_index = start_index
    overall_start_time = time.time()

    try:
        while current_index < max_jobs:
            # Load a chunk of jobs
            chunk_size = min(batch_size * 2, max_jobs - current_index)  # Load more than we'll process
            log_message(f"Loading {chunk_size} jobs from index {current_index}")

            chunk_jobs = load_job_descriptions(start=current_index, limit=chunk_size)

            if not chunk_jobs:
                log_message("No more jobs available to process")
                break

            # Process this chunk in batches
            for batch_start in range(0, len(chunk_jobs), batch_size):
                batch_end = min(batch_start + batch_size, len(chunk_jobs))
                batch_jobs = chunk_jobs[batch_start:batch_end]

                # Filter out already processed jobs
                new_batch_jobs = [
                    job for job in batch_jobs
                    if job.get("job_id") and job.get("job_id") not in processed_ids
                ]

                if not new_batch_jobs:
                    log_message("No new jobs to process in this batch - all already processed")
                    continue

                log_message(f"Processing batch of {len(new_batch_jobs)} jobs")
                batch_start_time = time.time()

                # Process the batch
                result = process_and_upload_jobs(
                    jobs=new_batch_jobs,
                    pinecone_index=pinecone_index,
                    batch_size=10  # Smaller batch for embeddings
                )

                # The batch is recorded as processed either way; make a
                # shortfall visible so it can be reconciled against the index.
                uploaded = result.get('vectors_uploaded', 0)
                if uploaded < len(new_batch_jobs):
                    log_message(
                        f"WARNING: only {uploaded} of {len(new_batch_jobs)} jobs in this batch were uploaded"
                    )

                # Update processed IDs and stats
                processed_ids.update(job.get("job_id") for job in new_batch_jobs)
                total_processed += len(new_batch_jobs)
                session_processed += len(new_batch_jobs)

                # Save progress after each batch. Write to a temp file and
                # rename so an interruption cannot leave a truncated file.
                progress = {
                    "last_index": current_index + batch_end,
                    "total_processed": total_processed,
                    "processed_job_ids": list(processed_ids),
                    "last_updated": datetime.now().isoformat()
                }
                tmp_progress_file = progress_file + ".tmp"
                with open(tmp_progress_file, 'w') as f:
                    json.dump(progress, f)
                os.replace(tmp_progress_file, progress_file)

                # Report batch statistics
                batch_time = time.time() - batch_start_time
                log_message(f"Batch completed in {batch_time:.2f} seconds")
                log_message(f"Vectors uploaded: {uploaded}")

                # Calculate and report progress
                elapsed = time.time() - overall_start_time
                jobs_per_second = session_processed / elapsed if elapsed > 0 else 0
                if jobs_per_second > 0:
                    remaining_jobs = max(0, max_jobs - total_processed)
                    time_remaining = format_duration(remaining_jobs / jobs_per_second)
                else:
                    time_remaining = "unknown"

                log_message(f"Progress: {total_processed}/{max_jobs} jobs ({total_processed/max_jobs*100:.2f}%)")
                log_message(f"Speed: {jobs_per_second:.2f} jobs/second, Estimated time remaining: {time_remaining}")

            # Move to next chunk
            current_index += chunk_size

    except Exception as e:
        log_message(f"Error during processing: {str(e)}")
        log_message(traceback.format_exc())

    # Final report
    total_time = time.time() - overall_start_time
    average_speed = session_processed / total_time if total_time > 0 else 0

    log_message("Job processing complete!")
    log_message(f"Total jobs processed: {total_processed}")
    log_message(f"Total time: {format_duration(total_time)}")
    log_message(f"Average processing speed: {average_speed:.2f} jobs/second")

    return {
        "total_processed": total_processed,
        "last_index": current_index,
        "processed_ids": len(processed_ids),
        "total_time_seconds": total_time
    }

In [None]:
result = process_jobs_in_batches(batch_size=1000)
# process_jobs_in_batches returns {"error": ...} (without the statistics keys)
# when Pinecone cannot be initialized, so check before reading them.
if "error" in result:
    print(f"Processing aborted: {result['error']}")
else:
    print(f"Processing complete! {result['total_processed']} jobs processed in {result['total_time_seconds']/3600:.2f} hours.")

In [None]:
import json
import os
import time
from datetime import datetime
from pinecone import Pinecone

# Settings for the reconciliation cells.
# PINECONE_API_KEY is not assigned here; initialize_pinecone() reads it, so it
# must already exist in the kernel (the configuration cell near the top of the
# notebook assigns it).
PINECONE_ENVIRONMENT = "us-east-1"
PINECONE_INDEX_NAME = "mirra-filtering"
# JSON file with the local record of processed jobs; the reconciliation step
# reads it and rewrites it.
PROGRESS_FILE = "embedding_progress.json"
# The timestamp is evaluated once, when this cell runs, so every execution of
# the cell starts a fresh log file.
LOG_FILE = f"reconciliation_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"

# Number of vectors to fetch in each batch when paginating
FETCH_BATCH_SIZE = 1000

def log_message(message):
    """
    Log a message to both console and file.

    The message is prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp, printed,
    and appended as a single line to LOG_FILE.

    Args:
        message: Value to log. It is interpolated into an f-string, so any
            object with a string representation is accepted.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    formatted_msg = f"[{timestamp}] {message}"
    print(formatted_msg)
    # Pin the encoding: with the platform default, a message containing
    # non-ASCII text (e.g. an exception string) can raise UnicodeEncodeError
    # on systems whose default is not UTF-8 - typically while an error is
    # already being reported.
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(formatted_msg + '\n')

def initialize_pinecone():
    """
    Build a Pinecone client and open the index named by PINECONE_INDEX_NAME.

    Returns:
        The index object on success. None when the index name is not among the
        client's listed indexes, or when any exception is raised while
        connecting (the error is logged rather than propagated).
    """
    try:
        client = Pinecone(api_key=PINECONE_API_KEY)

        # Log what the account exposes so a wrong index name is easy to spot.
        index_names = client.list_indexes().names()
        log_message(f"Available Pinecone indexes: {index_names}")

        if PINECONE_INDEX_NAME in index_names:
            connected_index = client.Index(PINECONE_INDEX_NAME)
            log_message(f"Connected to Pinecone index: {PINECONE_INDEX_NAME}")
            return connected_index

        log_message(f"Error: Index '{PINECONE_INDEX_NAME}' does not exist")
        return None

    except Exception as e:
        # Best effort: callers test the return value instead of catching.
        log_message(f"Error initializing Pinecone: {str(e)}")
        log_message("Please check your API key and environment settings.")
        return None

def _read_response_field(obj, name, default=None):
    """Return field `name` from `obj`, trying attribute access and then mapping-style ``.get``."""
    missing = object()
    value = getattr(obj, name, missing)
    if value is not missing:
        return value
    getter = getattr(obj, 'get', None)
    return getter(name, default) if callable(getter) else default


def _job_id_from_vector_id(vector_id):
    """Drop a leading 'job_' from a vector ID; any other ID is returned unchanged."""
    return vector_id[4:] if vector_id.startswith('job_') else vector_id


def get_all_job_ids_from_pinecone(pinecone_index):
    """
    Collect job IDs for the vectors stored in a Pinecone index.

    Two strategies are tried in order:

    1. Page through ``pinecone_index.list_vectors(limit=..., page_token=...)``
       until no ``page_token`` is returned.
    2. If that call raises AttributeError or TypeError, issue ONE similarity
       query with an all-zero vector and read the IDs of its matches. A query
       has no offset parameter, so this fallback can return at most
       ``min(FETCH_BATCH_SIZE, total_vectors)`` IDs; a warning is logged when
       the index holds more than that.

    Args:
        pinecone_index: Index object providing ``describe_index_stats()`` and
            ``query(...)``, and optionally ``list_vectors(...)``.

    Returns:
        A set of job IDs (without 'job_' prefix). If an unexpected error occurs
        the error is logged and whatever was collected so far is returned.
    """
    all_job_ids = set()
    total_fetched = 0

    try:
        # Get total vector count
        stats = pinecone_index.describe_index_stats()
        total_vectors = stats.get('total_vector_count', 0)
        log_message(f"Pinecone index contains {total_vectors} total vectors")

        log_message("Starting to fetch all vector IDs from Pinecone using pagination...")

        try:
            # NOTE(review): `list_vectors` is the method the notebook has always
            # called here; confirm the installed SDK actually exposes it. If it
            # does not, every run takes the query fallback below.
            page_token = None
            batch_count = 0

            while True:
                batch_count += 1
                log_message(f"Fetching batch {batch_count} of vector IDs...")

                response = pinecone_index.list_vectors(
                    limit=FETCH_BATCH_SIZE,
                    page_token=page_token
                )

                vector_ids = response.get('vectors', [])
                for vector_id in vector_ids:
                    all_job_ids.add(_job_id_from_vector_id(vector_id))

                total_fetched += len(vector_ids)
                log_message(f"Fetched {len(vector_ids)} vector IDs, total so far: {total_fetched}/{total_vectors}")

                # A missing/empty token marks the last page.
                page_token = response.get('page_token')
                if not page_token:
                    break

                # Add a small delay to avoid overwhelming the API
                time.sleep(0.5)

        except (AttributeError, TypeError) as e:
            log_message(f"list_vectors not available, using alternative method: {str(e)}")
            log_message("Falling back to dummy query approach for fetching vector IDs...")

            if total_vectors > 0:
                # NOTE(review): assumes the stats response reports the index
                # dimension under 'dimension'; 1024 (the previously hard-coded
                # value) is used when it is absent.
                dimension = stats.get('dimension') or 1024
                dummy_vector = [0.0] * dimension

                # The query call takes no offset, so repeating it returns the
                # same matches every time. Query once instead of looping: the
                # old loop re-ran the identical query, added no new IDs and
                # inflated the "total so far" counter.
                top_k = min(FETCH_BATCH_SIZE, total_vectors)
                log_message(f"Fetching batch 1 of vector IDs (offset: 0, size: {top_k})...")

                response = pinecone_index.query(
                    vector=dummy_vector,
                    top_k=top_k,
                    include_metadata=False,  # only the IDs are used
                    include_values=False
                )

                # Responses may be attribute-style objects or mappings; probe
                # lazily so an object without `.get` does not raise.
                matches = _read_response_field(response, 'matches', []) or []

                if not matches:
                    log_message("Warning: No matches returned in batch 1")

                fetched_now = 0
                for match in matches:
                    vector_id = _read_response_field(match, 'id', '')
                    if not vector_id:
                        # Skip entries without an ID instead of recording ''.
                        continue
                    all_job_ids.add(_job_id_from_vector_id(vector_id))
                    fetched_now += 1

                total_fetched += fetched_now
                log_message(f"Fetched {fetched_now} vector IDs, total so far: {total_fetched}/{total_vectors}")

                if total_vectors > top_k:
                    log_message(
                        f"Warning: fallback query returns at most {top_k} of {total_vectors} vectors; "
                        "the collected job ID set is incomplete"
                    )

        log_message(f"Completed fetching all vector IDs from Pinecone")
        log_message(f"Total unique job IDs collected: {len(all_job_ids)}")

        return all_job_ids

    except Exception as e:
        # Best effort: report the failure and hand back the partial result.
        log_message(f"Error retrieving job IDs from Pinecone: {str(e)}")
        import traceback
        log_message(traceback.format_exc())
        return all_job_ids

In [None]:
def _new_progress_data():
    """Return an empty tracking record with the keys the progress file uses."""
    return {
        "last_index": 0,
        "total_processed": 0,
        "processed_job_ids": [],
        "last_updated": datetime.now().isoformat()
    }


def reconcile_tracking_with_pinecone():
    """
    Reconcile the local tracking file with the actual state in Pinecone.

    Steps:
      1. Connect to Pinecone and read its total vector count.
      2. Load PROGRESS_FILE (or start from an empty record if it is missing
         or unreadable).
      3. Fetch the job IDs present in Pinecone and add any that the local
         record lacks. IDs that are only in the local record are reported but
         kept.
      4. Set ``total_processed`` to Pinecone's vector count, refresh
         ``last_updated`` and write the record back to PROGRESS_FILE.

    Returns:
        True when the progress file was written; False when Pinecone could not
        be reached, its stats could not be read, or the file could not be
        saved.
    """
    log_message("Starting full reconciliation between local tracking and Pinecone")

    # Initialize Pinecone
    pinecone_index = initialize_pinecone()
    if not pinecone_index:
        log_message("Reconciliation failed: Could not connect to Pinecone")
        return False

    # Get Pinecone stats
    try:
        stats = pinecone_index.describe_index_stats()
        pinecone_count = stats.get('total_vector_count', 0)
        log_message(f"Pinecone reports {pinecone_count} total vectors")
    except Exception as e:
        log_message(f"Error getting Pinecone stats: {str(e)}")
        return False

    # Load local tracking data
    local_ids = set()
    if os.path.exists(PROGRESS_FILE):
        try:
            with open(PROGRESS_FILE, 'r') as f:
                progress_data = json.load(f)
            local_ids = set(progress_data.get("processed_job_ids", []))

            log_message(f"Local tracking reports {progress_data.get('total_processed', 0)} processed jobs")
            log_message(f"Local tracking has {len(local_ids)} job IDs")
            log_message(f"Last index: {progress_data.get('last_index', 0)}")
            log_message(f"Last updated: {progress_data.get('last_updated', '')}")

            # The reads above tolerate a missing key; make the later
            # in-place update tolerate it too instead of raising KeyError.
            progress_data.setdefault("processed_job_ids", [])
        except Exception as e:
            log_message(f"Error reading progress file: {str(e)}")
            log_message("Creating new progress file")
            progress_data = _new_progress_data()
            local_ids = set()
    else:
        log_message("No local tracking file found. Creating new one.")
        progress_data = _new_progress_data()

    # Get all job IDs from Pinecone
    pinecone_job_ids = get_all_job_ids_from_pinecone(pinecone_index)
    log_message(f"Retrieved {len(pinecone_job_ids)} job IDs from Pinecone")

    # IDs present on one side only
    missing_locally = pinecone_job_ids - local_ids
    missing_in_pinecone = local_ids - pinecone_job_ids

    log_message(f"Found {len(missing_locally)} job IDs in Pinecone that are missing from local tracking")
    log_message(f"Found {len(missing_in_pinecone)} job IDs in local tracking that are missing from Pinecone")

    # Update local tracking with the complete set of job IDs from Pinecone
    if missing_locally:
        log_message(f"Adding {len(missing_locally)} missing job IDs to local tracking")
        # Sorted so the written file does not depend on set iteration order.
        progress_data["processed_job_ids"].extend(sorted(missing_locally))

    # Log some info about IDs that are in local tracking but not in Pinecone
    if missing_in_pinecone:
        log_message(f"Warning: {len(missing_in_pinecone)} job IDs in local tracking are not found in Pinecone")
        log_message("This could indicate jobs that failed to upload or were removed from Pinecone")

        # Optionally, remove these IDs from local tracking.
        # Uncomment the following code to remove them:
        # log_message(f"Removing {len(missing_in_pinecone)} job IDs from local tracking that are not in Pinecone")
        # progress_data["processed_job_ids"] = list(pinecone_job_ids)

    # Update the total count to match Pinecone
    log_message(f"Updating total_processed from {progress_data.get('total_processed', 0)} to {pinecone_count}")
    progress_data["total_processed"] = pinecone_count

    # Update last_updated timestamp
    progress_data["last_updated"] = datetime.now().isoformat()

    # Save updated progress data
    try:
        with open(PROGRESS_FILE, 'w') as f:
            json.dump(progress_data, f, indent=2)
        log_message(f"Successfully updated local tracking file: {PROGRESS_FILE}")
        log_message(f"New total_processed: {progress_data['total_processed']}")
        log_message(f"Number of job IDs in tracking: {len(progress_data['processed_job_ids'])}")
        return True
    except Exception as e:
        log_message(f"Error saving progress file: {str(e)}")
        return False

if __name__ == "__main__":
    # Run the reconciliation and log a one-line verdict from its boolean result.
    success = reconcile_tracking_with_pinecone()
    log_message(
        "Reconciliation completed successfully"
        if success
        else "Reconciliation encountered errors"
    )

In [None]:
if __name__ == "__main__":
    # Run the pipeline on a small sample and show its summary.
    result = job_embedding_pipeline(limit=101)
    print(f"\nPipeline results: {result}")

    # Example search
    # NOTE(review): search_jobs is defined in a cell below this one; unless it
    # is also defined earlier in the notebook, that cell has to run first.
    pinecone_index = initialize_pinecone()
    if pinecone_index:
        embedder = ResilientSageMakerEmbedder(
            endpoint_name=SAGEMAKER_ENDPOINT_NAME,
            region=AWS_REGION
        )

        # Search for data science jobs that sponsor visas
        search_results = search_jobs(
            query_text="Data Science",
            pinecone_index=pinecone_index,
            embedder=embedder,
            filters={"visa_sponsor": "Yes"}
        )

        print("\nSearch results:")
        # Use a dedicated loop name: iterating with `result` would overwrite
        # the pipeline summary stored in `result` above.
        for rank, hit in enumerate(search_results, start=1):
            print(f"{rank}. {hit['job_title']} ({hit['location']}) - Score: {hit['similarity_score']:.4f}")

In [None]:
def _field_of(obj, name, default=None):
    """Return field `name` from `obj`, trying attribute access and then mapping-style ``.get``."""
    missing = object()
    value = getattr(obj, name, missing)
    if value is not missing:
        return value
    getter = getattr(obj, 'get', None)
    return getter(name, default) if callable(getter) else default


def _is_empty_filter_value(value):
    """True for None and for empty strings/containers; 0 and False count as real values."""
    if value is None:
        return True
    return isinstance(value, (str, bytes, list, tuple, set, dict)) and len(value) == 0


def search_jobs(query_text, pinecone_index, embedder=None, top_k=10, filters=None):
    """
    Search for jobs using semantic embedding similarity

    Args:
        query_text: Query text to search for
        pinecone_index: Pinecone index to search
        embedder: Embedding generator (optional); when omitted a
            ResilientSageMakerEmbedder is created for SAGEMAKER_ENDPOINT_NAME
        top_k: Number of results to return
        filters: Dictionary of metadata filters. Entries whose value is None
            or an empty string/container are ignored. Numeric
            ``salary_range_from`` / ``salary_range_to`` values become
            ``$gte`` / ``$lte`` conditions; everything else is passed through
            as an equality filter.

    Returns:
        List of job matches with scores (one dict per match with the keys
        job_id, job_title, location, emp_type, exp_level, domain,
        visa_sponsor, similarity_score). An empty list is returned if the
        index query or result formatting raises.
    """
    if not embedder:
        embedder = ResilientSageMakerEmbedder(
            endpoint_name=SAGEMAKER_ENDPOINT_NAME,
            region=AWS_REGION
        )

    # Generate embedding for query
    # NOTE(review): the embedder's _prepare_text prepends "passage:" to any
    # text that lacks it; if generate_embeddings routes the query through it,
    # the query is embedded as a passage. E5-style models usually expect a
    # "query:" prefix for queries - confirm this is intended.
    query_embedding = embedder.generate_embeddings([query_text])[0]
    # NOTE(review): the embedder's return type is not visible here; if it
    # yields array-like rows, hand the index a plain list instead.
    if hasattr(query_embedding, 'tolist'):
        query_embedding = query_embedding.tolist()

    # Prepare filters if any
    filter_dict = {}
    if filters:
        for key, value in filters.items():
            # Skip only genuinely empty values. A plain truthiness test would
            # also drop legitimate filters such as 0 or False.
            if _is_empty_filter_value(value):
                continue
            # Handle numeric values for salary filters
            if key in ["salary_range_from", "salary_range_to"] and isinstance(value, (int, float)):
                filter_dict[key] = {"$gte": value} if key == "salary_range_from" else {"$lte": value}
            else:
                filter_dict[key] = value

    print(f"Searching with filters: {filter_dict}")

    # Perform search
    try:
        search_results = pinecone_index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
            filter=filter_dict if filter_dict else None
        )

        # Responses may be attribute-style objects or mappings. Probe the
        # attribute first so an object without `.get` no longer raises (which
        # used to be swallowed below and reported as "no results").
        matches = _field_of(search_results, 'matches', []) or []

        results = []
        for match in matches:
            # A match without metadata yields the defaults instead of
            # aborting the whole result list.
            metadata = _field_of(match, 'metadata', {}) or {}
            score = _field_of(match, 'score', 0)

            results.append({
                'job_id': metadata.get('job_id', 'unknown'),
                'job_title': metadata.get('job_title', 'Unknown'),
                'location': metadata.get('location', 'Unknown'),
                'emp_type': metadata.get('emp_type', 'Unknown'),
                'exp_level': metadata.get('exp_level', 'Unknown'),
                'domain': metadata.get('domain', 'Unknown'),
                'visa_sponsor': metadata.get('visa_sponsor', 'No'),
                'similarity_score': score
            })

        return results

    except Exception as e:
        # Best effort: report the failure and return no results.
        print(f"Error searching jobs: {str(e)}")
        import traceback
        traceback.print_exc()
        return []



In [None]:
# Example search: embed a query with the SageMaker endpoint and list the best
# matching jobs that sponsor visas.
pinecone_index = initialize_pinecone()
if pinecone_index:
    embedder = ResilientSageMakerEmbedder(
        endpoint_name=SAGEMAKER_ENDPOINT_NAME,
        region=AWS_REGION
    )

    # Search for data science jobs that sponsor visas
    search_results = search_jobs(
        query_text="Data Science",
        pinecone_index=pinecone_index,
        embedder=embedder,
        filters={"visa_sponsor": "Yes"}
    )

    print("\nSearch results:")
    # Use a dedicated loop name: iterating with `result` would overwrite the
    # `result` value that earlier cells leave in the kernel.
    for rank, hit in enumerate(search_results, start=1):
        print(f"{rank}. {hit['job_title']} ({hit['location']}) - Score: {hit['similarity_score']:.4f}")