#03_embedding_generation
- Sets up the embedding generator using either Jina API or SageMaker
- Loads job descriptions from files
- Creates chunks from job descriptions using a custom chunker
- Generates embeddings for each chunk
- Uploads the embedded vectors to Pinecone
- Includes batch processing for efficiency

In [None]:
!pip install jinaai

In [2]:
from jinaai import JinaAI
client = JinaAI()


In [3]:
import boto3
import json
import os
import pandas as pd
import uuid
import sys
import numpy as np
import time
from typing import List, Dict, Any
from tqdm import tqdm

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import modules
from src.chunking.job_chunker import chunk_job_description
from src.embedding.jina_embedder import JinaEmbedder

In [4]:
# 1: SageMaker endpoint configuration
SAGEMAKER_ENDPOINT_NAME = "jina-embeddings-v3-endpoint-325"
AWS_REGION = "us-east-1"

# alt option 2: Direct Jina API configuration
USE_JINA_API = False  # Set to True to use direct Jina API instead of SageMaker, need to add API key
JINA_API_HOST = "grpc://api.jina.ai:443"  # Jina API endpoint

# Set Pinecone credentials directly
PINECONE_API_KEY = "redacted"
PINECONE_ENVIRONMENT = "us-east-1" # matched with AWS region
PINECONE_INDEX_NAME = "mirra"

EMBEDDING_DIMENSION = 1024

print(f"Embedding method: {'Direct Jina API' if USE_JINA_API else 'SageMaker Endpoint'}")
print(f"Embedding dimension: {EMBEDDING_DIMENSION}")

Embedding method: SageMaker Endpoint
Embedding dimension: 1024


In [None]:
# Set variables - skipping because endpoint is already created
"""
region = "us-east-1" 
model_name = "jina-embeddings-v3-model"
endpoint_name = "jina-embeddings-v3-endpoint-325"
instance_type = "ml.g5.xlarge" # limited to compatible options
model_package_arn = "redacted"

# Initialize boto3 clients
sm_client = boto3.client('sagemaker', region_name=region)
sm_runtime = boto3.client('sagemaker-runtime', region_name=region)

# Create model
try:
    model_response = sm_client.create_model(
        ModelName=model_name,
        PrimaryContainer={
            'ModelPackageName': model_package_arn
        },
        ExecutionRoleArn="redacted"
    )
    print(f"Model created: {model_name}")
except Exception as e:
    print(f"Error creating model: {str(e)}")
    # If model already exists, continue
    print("Continuing with existing model if available.")

# Create endpoint configuration
try:
    endpoint_config_response = sm_client.create_endpoint_config(
        EndpointConfigName=endpoint_name,
        ProductionVariants=[{
            'VariantName': 'default',
            'ModelName': model_name,
            'InstanceType': instance_type,
            'InitialInstanceCount': 1
        }]
    )
    print(f"Endpoint configuration created: {endpoint_name}")
except Exception as e:
    print(f"Error creating endpoint configuration: {str(e)}")
    # If config already exists, continue
    print("Continuing with existing endpoint configuration if available.")

# Check if endpoint exists
try:
    endpoint_response = sm_client.describe_endpoint(EndpointName=endpoint_name)
    print(f"Endpoint already exists: {endpoint_name}")
    print(f"Current status: {endpoint_response['EndpointStatus']}")
except sm_client.exceptions.ClientError:
    # Create endpoint if it doesn't exist
    try:
        create_endpoint_response = sm_client.create_endpoint(
            EndpointName=endpoint_name,
            EndpointConfigName=endpoint_name
        )
        print(f"Creating endpoint: {endpoint_name}")
        print("Endpoint creation initiated. This will take several minutes...")
    except Exception as e:
        print(f"Error creating endpoint: {str(e)}")
        raise

# wait for endpoint to be in service
wait_for_endpoint = True
if wait_for_endpoint:
    print("Waiting for endpoint to be ready... (this may take 5-10 minutes)")
    waiter = sm_client.get_waiter('endpoint_in_service')
    waiter.wait(EndpointName=endpoint_name)
    print(f"Endpoint {endpoint_name} is now ready for use!")
"""

In [6]:
SAGEMAKER_ENDPOINT_NAME = "jina-embeddings-v3-endpoint-325" 
AWS_REGION = "us-east-1"

class EmbeddingGenerator:
    """Unified class for embedding generation using either SageMaker or Jina API"""
    
    def __init__(self, use_jina_api=True, jina_api_key=None):
        """
        Initialize with embedding method.
        
        Args:
            use_jina_api (bool): If True, use direct Jina API; otherwise use SageMaker.
            jina_api_key: API key for Jina. Must be provided if using the Jina API.
        """
        self.use_jina_api = use_jina_api
        
        if use_jina_api:
            try:
                if not jina_api_key:
                    print("Warning: Jina API key not provided. Some functionality may be limited.")
                
                # Initialize the Jina API client and configure it with the API key if provided
                self.jina_client = JinaAI()
                if jina_api_key:
                    self.jina_client.configure(api_key=jina_api_key)
                print("Initialized Jina API client")
            except Exception as e:
                print(f"Error initializing Jina API client: {str(e)}")
                raise
        else:
            # Initialize the SageMaker embedder
            self.embedder = SageMakerEmbedder(
                endpoint_name=SAGEMAKER_ENDPOINT_NAME, 
                region=AWS_REGION
            )
            print(f"Initialized SageMaker embedder for endpoint: {SAGEMAKER_ENDPOINT_NAME}")
            
    def generate_embeddings(self, texts, instructions=None):
        """Generate embeddings using the configured method"""
        # Ensure texts is a list
        if not isinstance(texts, list):
            texts = [texts]
        # Process using the appropriate method
        if self.use_jina_api:
            return self._generate_with_jina_api(texts)
        else:
            return self._generate_with_sagemaker(texts)
            
    def _generate_with_jina_api(self, texts):
        """Generate embeddings using Jina API"""
        try:
            # Call the embeddings API using the JinaAI client
            embeddings_response = self.jina_client.embeddings(
                texts=texts,
                model="jina-embeddings-v3-base-en"
            )
            
            if isinstance(embeddings_response, dict) and 'embeddings' in embeddings_response:
                # If response is {embeddings: [[values], [values]]}
                embeddings = embeddings_response['embeddings']
            elif isinstance(embeddings_response, list):
                # If response is a list of objects with 'embedding' field
                embeddings = [item['embedding'] for item in embeddings_response]
            else:
                # Fallback case
                print(f"Unexpected response format: {type(embeddings_response)}")
                print(f"Response sample: {str(embeddings_response)[:100]}...")
                raise ValueError("Could not parse embedding response")
                
            return embeddings
        except Exception as e:
            print(f"Error generating embeddings with Jina API: {str(e)}")
            raise 
            
    def _generate_with_sagemaker(self, texts, instructions=None):
        """Generate embeddings using SageMaker endpoint"""
        try:
            # Format the payload directly here rather than in SageMakerEmbedder
            payload = {
                "data": [{"text": text} for text in texts],
                "parameters": {
                    "task": "text-matching",
                    "late_chunking": False,
                    "dimensions": 1024
                }
            }
            
            # Pass the properly formatted payload to the embedder
            embeddings = self.embedder.generate_embeddings(payload)
            return embeddings
        except Exception as e:
            print(f"Error with SageMaker embedding: {str(e)}")
            # Return zero vectors as fallback
            return [[0.0] * EMBEDDING_DIMENSION] * len(texts)

In [7]:
# Initialize Pinecone with credentials
from pinecone import Pinecone

def initialize_pinecone():
    """Initialize Pinecone and return the index"""
    try:
        # Initialize Pinecone 
        pc = Pinecone(api_key=PINECONE_API_KEY)
        
        # Check if the index exists
        existing_indexes = pc.list_indexes().names()
        print(f"Available Pinecone indexes: {existing_indexes}")
        
        if PINECONE_INDEX_NAME not in existing_indexes:
            print(f"Creating new index '{PINECONE_INDEX_NAME}'...")
            # Create the index
            pc.create_index(
                name=PINECONE_INDEX_NAME,
                dimension=EMBEDDING_DIMENSION,
                metric="cosine",
                metadata_config={
                    "indexed": [
                        "source_type",
                        "chunk_type",
                        "requirement_level",
                        "job_id",
                        "resume_id"
                    ]
                }
            )
            print(f"Index '{PINECONE_INDEX_NAME}' created successfully")
        
        # Connect to the index
        index = pc.Index(PINECONE_INDEX_NAME)
        print(f"Connected to Pinecone index: {PINECONE_INDEX_NAME}")
        return index
    except Exception as e:
        print(f"Error initializing Pinecone: {str(e)}")
        print("Please check your API key and environment settings.")
        # Return None to indicate initialization failed
        return None

# Initialize Pinecone
pinecone_index = initialize_pinecone()
if pinecone_index:
    # Check index stats
    index_stats = pinecone_index.describe_index_stats()
    # Convert to dictionary for JSON serialization
    index_stats_dict = index_stats.to_dict()
    print(f"Index statistics: {json.dumps(index_stats_dict, indent=2)}")

Available Pinecone indexes: ['mirra']
Connected to Pinecone index: mirra
Index statistics: {
  "namespaces": {},
  "index_fullness": 0.0,
  "total_vector_count": 0,
  "dimension": 1024,
  "metric": "cosine",
  "vector_type": "dense"
}


In [9]:
# Check how JinaEmbedder is defined
print(JinaEmbedder.__init__.__code__.co_varnames)

('self', 'api_key', 'Embeddings')


In [None]:

print("Payload:", json.dumps(payload, indent=2))


In [12]:
# update
class SageMakerEmbedder:
    def __init__(self, endpoint_name, region="us-east-1"):
        self.endpoint_name = endpoint_name
        self.region = region
        self.client = boto3.client('sagemaker-runtime', region_name=region)
        
    def generate_embeddings(self, texts_or_payload):
        """
        Generate embeddings using SageMaker endpoint.
        
        Args:
            texts_or_payload: Either a string, list of strings, or pre-formatted payload
            
        Returns:
            List of embedding vectors
        """
        # Check if texts_or_payload is already formatted as a payload
        if isinstance(texts_or_payload, dict) and "data" in texts_or_payload:
            payload = texts_or_payload
        else:
            # Convert input to proper format
            if isinstance(texts_or_payload, str):
                texts = [texts_or_payload]
            else:
                texts = texts_or_payload
                
            # Format as Jina Embeddings v3 payload
            payload = {
                "data": [{"text": text} for text in texts],
                "parameters": {
                    "task": "text-matching",
                    "late_chunking": False,
                    "dimensions": 1024
                }
            }
        
        # Call the SageMaker endpoint
        response = self.client.invoke_endpoint(
            EndpointName=self.endpoint_name,
            ContentType='application/json',
            Body=json.dumps(payload)
        )
        
        # Parse the response
        result = json.loads(response['Body'].read().decode())
        
        # Handle the specific response format we've observed
        if isinstance(result, list) and all("embedding" in item for item in result):
            # Response contains a list of objects with 'embedding' key
            embeddings = [item["embedding"] for item in result]
            return embeddings
        
        # Fallback to previous handling methods if format changes
        if "embeddings" in result:
            return result["embeddings"]
        elif "data" in result:
            return result["data"]
        else:
            # Try to find embeddings in the result structure
            for key in result:
                if isinstance(result[key], list) and len(result[key]) > 0:
                    return result[key]
        
        raise ValueError("Could not find embeddings in response")

In [13]:
# Use the SageMaker embedder
embedder = SageMakerEmbedder(endpoint_name=SAGEMAKER_ENDPOINT_NAME, region=AWS_REGION)

try:
    test_text = "Required skill: Python programming with 3 years experience"
    # Correct payload matching the required JSON format:
    payload = {
        "data": [{"text": test_text}],
        "parameters": {
            "task": "text-matching",
            "late_chunking": False,
            "dimensions": 1024
        }
    }
    print("Payload:", json.dumps(payload, indent=2))  # Debug output

    result = embedder.generate_embeddings(payload)
    
    print("Full embedding result:", result)
    
    # Extract the embedding vector from the first document in the returned list
    embedding_vector = result[0].get("embedding")
    if not embedding_vector:
        raise ValueError("No embedding vector found in the result.")
    
    # calculate its dimension; print a preview
    embedding_dim = len(embedding_vector)
    print("Embedding generation successful!")
    print(f"Embedding dimension: {embedding_dim}")
    print(f"Embedding preview (first 5 values): {embedding_vector[:5]}")
    
    if 'EMBEDDING_DIMENSION' in globals() and embedding_dim != EMBEDDING_DIMENSION:
        print(f"Warning: Expected dimension {EMBEDDING_DIMENSION}, but got {embedding_dim}")
except Exception as e:
    print(f"Error testing embedding generation: {str(e)}")
    print("Please check your configuration and try again")


Payload: {
  "data": [
    {
      "text": "Required skill: Python programming with 3 years experience"
    }
  ],
  "parameters": {
    "task": "text-matching",
    "late_chunking": false,
    "dimensions": 1024
  }
}
Full embedding result: [{'id': '285a6bd41a2db6ba675bebc53577806b', 'embedding': [0.042495023, -0.100973584, 0.13122457, 0.09683984, 0.010602621, 0.029769432, -0.11279018, -0.03456933, 0.005282447, -0.07970909, 0.01447604, 0.049952693, 0.035989024, 0.020272836, 0.070766844, 0.0012989584, -0.01198133, 0.08423315, -0.05487654, 0.031116141, -0.07024854, -0.008453575, 0.09742766, 0.11486971, 0.002564517, -0.044930875, 0.0024241956, 0.048844982, -0.05904031, -0.016179081, -0.015846059, 0.0410697, -0.06578453, -0.08232428, 0.06688908, 0.021171479, 0.05903241, -0.007505971, -0.025006961, -0.078260064, 0.0702359, 0.01094404, 0.01820606, 0.042301454, -0.0039463546, 0.01971612, 0.023533441, 0.006535729, 0.074573494, -0.0012242949, 0.014283259, -0.05642985, 0.007336387, -0.033324443

In [14]:
def load_job_descriptions(source_type="local", limit=10):
    """
    Load job descriptions from local storage or S3.

    Args:
        source_type: 'local' or 's3'
        limit: Maximum number of jobs to load

    Returns:
        List of job description dictionaries
    """
    jobs = []

    if source_type == "local":
        # Load from local directory
        job_dir = "../data/processed/jobs"

        # Create directory if it doesn't exist
        os.makedirs(job_dir, exist_ok=True)

        # Check if directory contains files
        if not os.path.exists(job_dir) or not os.listdir(job_dir):
            print(f"No job files found in {job_dir}")

            # Create a sample job for testing if no files found
            print("Creating a sample job for testing...")
            sample_job = {
                "job_id": "sample_job_001",
                "details": {
                    "job_title": ["Software Engineer"],
                    "job_title_base": ["Software Engineer"],
                    "company_name": ["Sample Company"],
                    "employment_type": ["Full-time"],
                    "location": [{"city": "New York", "state": "NY", "country": "US"}]
                },
                "mandatory": {
                    "hard_skills": [
                        {"skill": ["Python"], "minyears": [3]},
                        {"skill": ["Machine Learning"], "minyears": [1]}
                    ],
                    "education": [
                        {"education_level": ["Bachelor's"], "field_of_study": ["Computer Science"]}
                    ],
                    "credentials": [
                        {"credential": ["AWS Certified Developer"]}
                    ],
                    "professional_background": [
                        {"background": ["Software Development"], "minyears": [2], "industry": ["Technology"]}
                    ]
                },
                "preferred": {
                    "hard_skills": [
                        {"skill": ["AWS"], "minyears": [1]},
                        {"skill": ["Docker"], "minyears": [1]}
                    ],
                    "education": [],
                    "credentials": [],
                    "professional_background": []
                },
                "responsibility": {
                    "hard_skills": [
                        {"skill": ["Develop and maintain software applications"]}
                    ],
                    "professional_background": [
                        {"background": ["Work in a collaborative team environment"]}
                    ]
                }
            }

            # Save the sample job
            os.makedirs(job_dir, exist_ok=True)
            with open(os.path.join(job_dir, "sample_job_001.json"), 'w') as f:
                json.dump(sample_job, f, indent=2)

            # Add to jobs list
            jobs.append(sample_job)
            return jobs

        # Load job files
        job_files = os.listdir(job_dir)[:limit]

        for filename in job_files:
            if filename.endswith(".json"):
                with open(os.path.join(job_dir, filename), 'r') as f:
                    job_data = json.load(f)
                    # Add job_id if not present
                    if "job_id" not in job_data:
                        job_data["job_id"] = filename.replace(".json", "")
                    jobs.append(job_data)

    elif source_type == "s3":
        # Load from S3
        s3_client = boto3.client('s3')
        bucket = "mirra-matcher-325"
        prefix = "processed/jobs/"

        try:
            response = s3_client.list_objects_v2(
                Bucket=bucket,
                Prefix=prefix,
                MaxKeys=limit
            )

            if 'Contents' in response:
                for obj in response['Contents']:
                    key = obj['Key']
                    if key.endswith(".json"):
                        response = s3_client.get_object(Bucket=bucket, Key=key)
                        job_data = json.loads(response['Body'].read().decode('utf-8'))
                        # Add job_id if not present
                        if "job_id" not in job_data:
                            job_data["job_id"] = key.split("/")[-1].replace(".json", "")
                        jobs.append(job_data)
            else:
                print(f"No job files found in S3 bucket {bucket}/{prefix}")
        except Exception as e:
            print(f"Error loading files from S3: {str(e)}")

    print(f"Loaded {len(jobs)} job descriptions")
    return jobs

# Load job descriptions
sample_jobs = load_job_descriptions(source_type="local", limit=3)

# Display a preview of the first job
if sample_jobs:
    print("\nPreview of first job:")
    company_names = sample_jobs[0].get("details", {}).get("company_name", ["Unknown"])
    job_title = sample_jobs[0].get("details", {}).get("job_title", ["Unknown"])
    
    job_preview = {
        "job_id": sample_jobs[0].get("job_id"),
        "job_title": job_title[0] if job_title else "Unknown",
        "company_name": company_names[0] if company_names else "Unknown"
    }
    print(json.dumps(job_preview, indent=2))

Loaded 3 job descriptions

Preview of first job:
{
  "job_id": "f6aeaba8-87f5-498c-919a-8d25d63d3d9e",
  "job_title": "Full Stack .NET Developer",
  "company_name": "Unknown"
}


In [15]:
# Generate and display chunks for a sample job
if sample_jobs:
    sample_job = sample_jobs[0]

    print(f"Generating chunks for job: {sample_job.get('job_id')}")
    chunks = chunk_job_description(sample_job)

    print(f"Generated {len(chunks)} chunks")

    # Display a few sample chunks
    print("\nSample chunks:")
    for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
        print(f"\nChunk {i+1}:")
        print(f"Text: {chunk['text']}")
        print(f"Metadata: {json.dumps({k: v for k, v in chunk['metadata'].items() if k != 'source_type'}, indent=2)}")

Generating chunks for job: f6aeaba8-87f5-498c-919a-8d25d63d3d9e
Generated 45 chunks

Sample chunks:

Chunk 1:
Text: Job title: Full Stack .NET Developer
Metadata: {
  "chunk_type": "job_title",
  "job_id": "f6aeaba8-87f5-498c-919a-8d25d63d3d9e"
}

Chunk 2:
Text: Employment type: Contract
Metadata: {
  "chunk_type": "employment_type",
  "job_id": "f6aeaba8-87f5-498c-919a-8d25d63d3d9e"
}

Chunk 3:
Text: Work arrangement: On-site
Metadata: {
  "chunk_type": "wfh_policy",
  "job_id": "f6aeaba8-87f5-498c-919a-8d25d63d3d9e"
}


In [19]:
def process_and_embed_job(job_data, embedder):
    """
    Process a job into chunks and generate embeddings.
    
    Args:
        job_data: Job description dictionary
        embedder: EmbeddingGenerator instance or SageMakerEmbedder instance
        
    Returns:
        List of vectors ready for Pinecone
    """
    import uuid

    # Validate job data
    if not isinstance(job_data, dict) or "details" not in job_data:
        print("Invalid job data format")
        return []
    
    # Ensure job has an ID
    job_id = job_data.get("job_id")
    if not job_id:
        job_id = str(uuid.uuid4())
        job_data["job_id"] = job_id
    
    # Generate chunks
    chunks = chunk_job_description(job_data)
    print(f"Generated {len(chunks)} chunks for job {job_id}")
    
    if not chunks:
        print("No chunks generated. Check the job data structure.")
        return []
    
    # Create batch for embedding
    texts = [chunk["text"] for chunk in chunks]
    
    # Generate embeddings in batches
    vectors = []
    batch_size = 32  # Adjust as needed
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        
        try:
            # Check which type of embedder we're using
            if isinstance(embedder, SageMakerEmbedder):
                # If using SageMakerEmbedder directly, only pass texts
                batch_embeddings = embedder.generate_embeddings(batch_texts)
            else:
                # If using EmbeddingGenerator, prepare instructions
                batch_instructions = []
                for j in range(i, min(i+batch_size, len(chunks))):
                    chunk_type = chunks[j]["metadata"]["chunk_type"]
                    if chunk_type == "skill":
                        instruction = "Represent this skill qualification for job-resume matching"
                    elif chunk_type == "education":
                        instruction = "Represent this educational qualification for job-resume matching"
                    elif chunk_type == "experience":
                        instruction = "Represent this professional experience for job-resume matching"
                    elif chunk_type == "credential":
                        instruction = "Represent this professional credential for job-resume matching"
                    else:
                        instruction = "Represent this qualification for job-resume matching"
                    batch_instructions.append(instruction)
                
                # Pass instructions only if using EmbeddingGenerator
                batch_embeddings = embedder.generate_embeddings(
                    batch_texts,
                    batch_instructions if not USE_JINA_API else None
                )
            
            # Create vectors for Pinecone
            for j, emb in enumerate(batch_embeddings):
                chunk_index = i + j
                if chunk_index < len(chunks):  # Safety check
                    vector_id = f"{job_id}_chunk_{chunk_index}"
                    
                    # Extract the actual embedding vector if emb is a dict
                    if isinstance(emb, dict) and "embedding" in emb:
                        embedding_vector = emb["embedding"]
                    else:
                        embedding_vector = emb
                    
                    # Convert each element in the embedding vector to a float
                    try:
                        embedding_floats = [float(v) for v in embedding_vector]
                    except Exception as conv_e:
                        print(f"Error converting embedding to floats for chunk {chunk_index}: {conv_e}")
                        continue
                    
                    vector = {
                        "id": vector_id,
                        "values": embedding_floats,
                        "metadata": {
                            "chunk_text": chunks[chunk_index]["text"],
                            **chunks[chunk_index]["metadata"]
                        }
                    }
                    
                    vectors.append(vector)
            
            print(f"Processed batch of {len(batch_texts)} chunks")
        
        except Exception as e:
            print(f"Error processing batch: {str(e)}")
        
    return vectors



In [20]:
def upload_vectors_to_pinecone(vectors, index):
    """
    Upload vectors to Pinecone in batches.

    Args:
        vectors: List of vectors
        index: Pinecone index

    Returns:
        Number of vectors uploaded
    """
    if not vectors:
        print("No vectors to upload")
        return 0

    if not index:
        print("No valid Pinecone index provided")
        return 0

    batch_size = 100  # Pinecone recommends max 100 per batch

    total_uploaded = 0
    for i in tqdm(range(0, len(vectors), batch_size), desc="Uploading to Pinecone"):
        batch = vectors[i:i+batch_size]
        try:
            index.upsert(vectors=batch)
            total_uploaded += len(batch)
        except Exception as e:
            print(f"Error uploading batch to Pinecone: {str(e)}")

    return total_uploaded

In [26]:
# Process and embed a sample job
if sample_jobs and pinecone_index:
    sample_job = sample_jobs[0]
    job_id = sample_job.get("job_id")

    print(f"Processing job: {job_id}")

    try:
        # Generate vectors
        start_time = time.time()
        vectors = process_and_embed_job(sample_job, embedder)
        embedding_time = time.time() - start_time

        if vectors:
            print(f"Generated {len(vectors)} vectors in {embedding_time:.2f} seconds")

            # Upload to Pinecone
            upload_start = time.time()
            uploaded = upload_vectors_to_pinecone(vectors, pinecone_index)
            upload_time = time.time() - upload_start

            print(f"Successfully processed and indexed job {job_id} with {uploaded} vectors")
            print(f"Upload took {upload_time:.2f} seconds")
        else:
            print("No vectors were generated. Check for errors above.")
    except Exception as e:
        print(f"Error processing job {job_id}: {str(e)}")

Processing job: f6aeaba8-87f5-498c-919a-8d25d63d3d9e
Generated 45 chunks for job f6aeaba8-87f5-498c-919a-8d25d63d3d9e
Processed batch of 32 chunks
Processed batch of 13 chunks
Generated 45 vectors in 0.24 seconds


Uploading to Pinecone: 100%|██████████| 1/1 [00:00<00:00,  3.16it/s]

Successfully processed and indexed job f6aeaba8-87f5-498c-919a-8d25d63d3d9e with 45 vectors
Upload took 0.32 seconds





In [27]:
# Test retrieving vectors for sample job
if pinecone_index and 'job_id' in locals() and job_id:
    try:
        # Query for vectors related to this job
        results = pinecone_index.query(
            vector=[0.0] * EMBEDDING_DIMENSION,  #Using a zero vector just to retrieve by metadata
            filter={"job_id": job_id},
            top_k=5,
            include_metadata=True
        )

        print(f"\nRetrieved {len(results['matches'])} vectors for job {job_id}")

        # Show a sample of retrieved vectors
        if results["matches"]:
            print("\nSample retrieved vector:")
            sample_match = results["matches"][0]
            # Only show relevant metadata for clarity
            relevant_metadata = {k: v for k, v in sample_match["metadata"].items()
                               if k in ["chunk_text", "chunk_type", "requirement_level", "skill_name"]}
            print(json.dumps(relevant_metadata, indent=2))
        else:
            print("No vectors retrieved. Check if the upload was successful.")
    except Exception as e:
        print(f"Error querying Pinecone: {str(e)}")


Retrieved 5 vectors for job f6aeaba8-87f5-498c-919a-8d25d63d3d9e

Sample retrieved vector:
{
  "chunk_text": "Preferred skill: NServiceBus",
  "chunk_type": "skill",
  "requirement_level": "preferred",
  "skill_name": "NServiceBus"
}


In [28]:
def process_all_jobs(jobs, embedder, index, batch_size=5):
    """
    Process multiple jobs and upload to Pinecone.

    Args:
        jobs: List of job dictionaries
        embedder: EmbeddingGenerator instance
        index: Pinecone index
        batch_size: Number of jobs to process at once

    Returns:
        Dictionary with processing statistics
    """
    if not jobs:
        print("No jobs to process")
        return {"total_jobs": 0}

    if not index:
        print("No valid Pinecone index provided")
        return {"total_jobs": len(jobs), "successful_jobs": 0, "failed_jobs": [job.get("job_id", "unknown") for job in jobs]}

    total_jobs = len(jobs)
    total_chunks = 0
    total_vectors = 0
    failed_jobs = []

    print(f"Starting processing of {total_jobs} jobs")

    start_time = time.time()

    for i, job in enumerate(jobs):
        job_id = job.get("job_id", str(uuid.uuid4()))
        print(f"\nProcessing job {i+1}/{total_jobs}: {job_id}")

        try:
            # Generate vectors
            vectors = process_and_embed_job(job, embedder)
            total_chunks += len(vectors)

            # Upload to Pinecone
            uploaded = upload_vectors_to_pinecone(vectors, index)
            total_vectors += uploaded

            print(f"Successfully processed job {job_id} with {uploaded} vectors")
        except Exception as e:
            print(f"Error processing job {job_id}: {str(e)}")
            failed_jobs.append(job_id)

        # Simple progress update
        elapsed = time.time() - start_time
        jobs_per_second = (i + 1) / elapsed if elapsed > 0 else 0
        estimated_remaining = (total_jobs - (i + 1)) / jobs_per_second if jobs_per_second > 0 else 0

        print(f"Progress: {i+1}/{total_jobs} jobs processed")
        print(f"Speed: {jobs_per_second:.2f} jobs/second")
        print(f"Estimated time remaining: {estimated_remaining/60:.1f} minutes")

    # Final stats
    total_time = time.time() - start_time

    stats = {
        "total_jobs": total_jobs,
        "successful_jobs": total_jobs - len(failed_jobs),
        "failed_jobs": failed_jobs,
        "total_chunks": total_chunks,
        "total_vectors": total_vectors,
        "processing_time_seconds": total_time,
        "jobs_per_second": total_jobs / total_time if total_time > 0 else 0
    }

    return stats

# Process all sample jobs
if sample_jobs and pinecone_index:
    print(f"Processing all {len(sample_jobs)} sample jobs")

    stats = process_all_jobs(sample_jobs, embedder, pinecone_index)

    print("\nProcessing complete!")
    print(f"Successfully processed {stats['successful_jobs']}/{stats['total_jobs']} jobs")
    print(f"Total vectors created: {stats.get('total_vectors', 0)}")
    print(f"Processing time: {stats.get('processing_time_seconds', 0):.2f} seconds")

    if stats.get('processing_time_seconds', 0) > 0:
        print(f"Average processing speed: {stats.get('jobs_per_second', 0):.2f} jobs/second")

    if stats.get('failed_jobs', []):
        print(f"Failed jobs: {stats['failed_jobs']}")

Processing all 3 sample jobs
Starting processing of 3 jobs

Processing job 1/3: f6aeaba8-87f5-498c-919a-8d25d63d3d9e
Generated 45 chunks for job f6aeaba8-87f5-498c-919a-8d25d63d3d9e
Processed batch of 32 chunks
Processed batch of 13 chunks


Uploading to Pinecone: 100%|██████████| 1/1 [00:00<00:00,  3.02it/s]


Successfully processed job f6aeaba8-87f5-498c-919a-8d25d63d3d9e with 45 vectors
Progress: 1/3 jobs processed
Speed: 1.91 jobs/second
Estimated time remaining: 0.0 minutes

Processing job 2/3: 3acaf400-50b0-4906-8c86-ed7184df8712
Generated 20 chunks for job 3acaf400-50b0-4906-8c86-ed7184df8712
Processed batch of 20 chunks


Uploading to Pinecone: 100%|██████████| 1/1 [00:00<00:00,  7.83it/s]


Successfully processed job 3acaf400-50b0-4906-8c86-ed7184df8712 with 20 vectors
Progress: 2/3 jobs processed
Speed: 2.65 jobs/second
Estimated time remaining: 0.0 minutes

Processing job 3/3: 0ca4cd66-bba5-4efa-8b26-ecc57bb3939e
Generated 37 chunks for job 0ca4cd66-bba5-4efa-8b26-ecc57bb3939e
Processed batch of 32 chunks
Processed batch of 5 chunks


Uploading to Pinecone: 100%|██████████| 1/1 [00:00<00:00,  4.06it/s]

Successfully processed job 0ca4cd66-bba5-4efa-8b26-ecc57bb3939e with 37 vectors
Progress: 3/3 jobs processed
Speed: 2.53 jobs/second
Estimated time remaining: 0.0 minutes

Processing complete!
Successfully processed 3/3 jobs
Total vectors created: 102
Processing time: 1.19 seconds
Average processing speed: 2.53 jobs/second





In [29]:
def print_job_chunks(job_id, num_chunks=None):
    """
    Print chunks for a specific job
    
    Args:
        job_id: ID of the job to display chunks for
        num_chunks: Number of chunks to display (None for all)
    """
    # Find the job in your loaded jobs
    job = None
    for j in sample_jobs:
        if j.get("job_id") == job_id:
            job = j
            break
    
    if not job:
        print(f"Job with ID {job_id} not found")
        return
    
    # Generate chunks for the job
    chunks = chunk_job_description(job)
    
    # Determine how many chunks to display
    display_count = num_chunks if num_chunks else len(chunks)
    display_count = min(display_count, len(chunks))
    
    print(f"Job ID: {job_id}")
    print(f"Total chunks: {len(chunks)}")
    print(f"Displaying {display_count} chunks:\n")
    
    # Display the chunks
    for i, chunk in enumerate(chunks[:display_count]):
        print(f"Chunk {i+1}:")
        print(f"Text: {chunk['text']}")
        print(f"Metadata: {json.dumps(chunk['metadata'], indent=2)}")
        print("-" * 50)


In [30]:
# Print all chunks for a job
print_job_chunks("0ca4cd66-bba5-4efa-8b26-ecc57bb3939e", 20)

# Print only the first 10 chunks
print_job_chunks("f6aeaba8-87f5-498c-919a-8d25d63d3d9e", 10)

Job ID: 0ca4cd66-bba5-4efa-8b26-ecc57bb3939e
Total chunks: 37
Displaying 20 chunks:

Chunk 1:
Text: Job title: Program Manager
Metadata: {
  "source_type": "job_description",
  "chunk_type": "job_title",
  "job_id": "0ca4cd66-bba5-4efa-8b26-ecc57bb3939e"
}
--------------------------------------------------
Chunk 2:
Text: Required skill: SAP
Metadata: {
  "source_type": "job_description",
  "chunk_type": "skill",
  "requirement_level": "mandatory",
  "job_id": "0ca4cd66-bba5-4efa-8b26-ecc57bb3939e",
  "skill_name": "SAP",
  "minyears": 0
}
--------------------------------------------------
Chunk 3:
Text: Required skill: project delivery
Metadata: {
  "source_type": "job_description",
  "chunk_type": "skill",
  "requirement_level": "mandatory",
  "job_id": "0ca4cd66-bba5-4efa-8b26-ecc57bb3939e",
  "skill_name": "project delivery",
  "minyears": 0
}
--------------------------------------------------
Chunk 4:
Text: Required skill: business operations
Metadata: {
  "source_type": "job_descr