In [1]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document

load_dotenv()

# === 1. Load documents ===
resume_loader = TextLoader("resume_candidate_a.txt", encoding="utf-8")
job_loader = TextLoader("jd_genai_engineer.txt", encoding="utf-8")

resume_docs = resume_loader.load()
job_docs = job_loader.load()

# Add metadata to distinguish sources
for doc in resume_docs:
    doc.metadata["source"] = "resume"
for doc in job_docs:
    doc.metadata["source"] = "job_description"

all_docs = resume_docs + job_docs

# === 2. Semantic Chunking (Agentic-style) ===
# Use smaller chunk_size to preserve job/resume sections
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
    separators=["\n\n", "\n", ". ", " ", ""],
    keep_separator=True
)

chunks = text_splitter.split_documents(all_docs)

# Optional: Add chunk index for debugging
for i, chunk in enumerate(chunks):
    chunk.metadata["chunk_id"] = i

print(f"‚úÖ Created {len(chunks)} chunks")

# === 3. Embed & Store in FAISS ===
embeddings = HuggingFaceEmbeddings(model="hkunlp/instructor-large")
vectorstore = FAISS.from_documents(chunks, embeddings)

# Save for later (optional)
# vectorstore.save_local("faiss_resume_job_index")

# === 4. Test Retrieval ===
chunks
# query = "What Python and AWS experience does the candidate have, and does it match the job requirements?"

# # Get top 5 most relevant chunks with scores
# retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
# docs_and_scores = vectorstore.similarity_search_with_relevance_scores(query, k=5)

# print("\nüîç Retrieval Results:\n")
# for i, (doc, score) in enumerate(docs_and_scores):
#     print(f"--- Chunk {i+1} (Score: {score:.4f}) ---")
#     print(f"Source: {doc.metadata['source']}")
#     print(f"Content: {doc.page_content[:300]}...\n")

  from .autonotebook import tqdm as notebook_tqdm


‚úÖ Created 36 chunks


[Document(metadata={'source': 'resume', 'chunk_id': 0}, page_content='John Doe\nSan Francisco, CA | johndoe@email.com | (123) 456-7890 | linkedin.com/in/johndoe | github.com/johndoe'),
 Document(metadata={'source': 'resume', 'chunk_id': 1}, page_content='Generative AI Engineer'),
 Document(metadata={'source': 'resume', 'chunk_id': 2}, page_content='Innovative and detail-oriented Generative AI Engineer with 1.5 years of experience designing, training, and deploying large language models (LLMs) and multimodal AI systems'),
 Document(metadata={'source': 'resume', 'chunk_id': 3}, page_content='. Skilled in prompt engineering, fine-tuning transformer architectures, and building scalable generative pipelines using modern ML frameworks.'),
 Document(metadata={'source': 'resume', 'chunk_id': 4}, page_content='PROFESSIONAL EXPERIENCE\n\nAI Engineer\nNeuraLabs Inc., San Francisco, CA\nJune 2023 ‚Äì Present'),
 Document(metadata={'source': 'resume', 'chunk_id': 5}, page_content='Developed and dep

In [2]:
query = "What Python and AWS experience does the candidate have, and does it match the job requirements?"

# Get top 5 most relevant chunks with scores
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
docs_and_scores = vectorstore.similarity_search_with_relevance_scores(query, k=5)

print("\nüîç Retrieval Results:\n")
for i, (doc, score) in enumerate(docs_and_scores):
    print(f"--- Chunk {i+1} (Score: {score:.4f}) ---")
    print(f"Source: {doc.metadata['source']}")
    print(f"Content: {doc.page_content[:300]}...\n")


üîç Retrieval Results:

--- Chunk 1 (Score: 0.8516) ---
Source: job_description
Content: - Proficiency in Python, TensorFlow, and PyTorch for developing AI models.

- Experience in generative AI techniques such as GANs and VAEs.

- Ability to design and implement scalable and efficient AI systems.

- Advanced knowledge of natural language processing for text generation tasks....

--- Chunk 2 (Score: 0.8308) ---
Source: job_description
Content: - Familiarity with computer vision and image generation using AI.

- Skills in data preprocessing and feature engineering for AI model training.

- Strong understanding of neural network architectures and optimization techniques.

- Experience in deploying AI models into production environments....

--- Chunk 3 (Score: 0.8298) ---
Source: resume
Content: CERTIFICATIONS

AWS Certified Machine Learning ‚Äì Specialty (2024)
DeepLearning.AI Generative AI with LLMs Specialization (2023)
Google Cloud Professional Machine Learning Engineer (2023)

TEC

In [3]:
#Agentic Chunking

In [5]:
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = "AIzaSyC96fwdtFTkgtQaYA0wtbsktG7PV_VOa8M"

In [7]:
resume_loader = TextLoader("resume_candidate_a.txt", encoding="utf-8")
job_loader = TextLoader("jd_genai_engineer.txt", encoding="utf-8")

resume_text = resume_loader.load()
job_text = job_loader.load()

In [9]:
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import ChatHuggingFace, HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

load_dotenv()

# === 1. Load JD (primary) and Resume (secondary) ===
jd_text = TextLoader("txts/job_description.txt", encoding="utf-8").load()[0].page_content
resume_text = TextLoader("txts/resume.txt", encoding="utf-8").load()[0].page_content

# === 2. STEP 1: LLM reads JD ‚Üí generates verification questions ===
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.3)

question_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a senior hiring agent. Based on the job description, generate a list of 3-5 specific, factual questions to verify a candidate's qualifications. "
                "Each question must be answerable by looking at a resume. Focus on skills, experience duration, tools, or domains mentioned in the JD."),
    ("human", "Job Description:\n{jd}\n\nGenerate verification questions (one per line):")
])

questions_chain = question_prompt | llm
response = questions_chain.invoke({"jd": jd_text})
questions = [q.strip("-‚Ä¢ 1234567890. ") for q in response.content.strip().split("\n") if q.strip()]

print("üìã JD-Driven Verification Questions:")
for i, q in enumerate(questions, 1):
    print(f"{i}. {q}")

# === 3. STEP 2: Prepare Resume for Retrieval (Chunk or Keep Whole) ===
# For resumes, often better to keep as 1-2 chunks (they're short)
# But we'll split by section if possible
from langchain_text_splitters import RecursiveCharacterTextSplitter

resume_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50,
    separators=["\n\n", "\n", ". ", " "]
)
resume_chunks = resume_splitter.create_documents([resume_text])
# Mark all as resume source
for chunk in resume_chunks:
    chunk.metadata["source"] = "resume"

# Create vector store from RESUME ONLY
embeddings = HuggingFaceEmbeddings(model="text-embedding-3-small")
resume_vectorstore = FAISS.from_documents(resume_chunks, embeddings)
resume_retriever = resume_vectorstore.as_retriever(search_kwargs={"k": 2})

# === 4. STEP 3: For each JD-driven question, retrieve FROM RESUME ONLY ===
print("\nüîç Retrieving Evidence from Resume:\n")

all_results = []
for q in questions:
    print(f"‚ùì Question: {q}")
    # Retrieve ONLY from resume
    retrieved = resume_vectorstore.similarity_search_with_relevance_scores(q, k=2)
    
    evidence = []
    for doc, score in retrieved:
        if score > 0.4:  # Only show relevant matches
            print(f"  ‚Üí (Score: {score:.3f}) {doc.page_content[:200]}...")
            evidence.append(doc.page_content)
        else:
            print("  ‚Üí No strong evidence found.")
    
    all_results.append({
        "question": q,
        "evidence": evidence,
        "retrieved_chunks": retrieved
    })
    print()

# === OPTIONAL: Save for deeper analysis ===
# You can now feed `all_results` to an LLM for final scoring

üîç Agent identified these requirements to verify:
1. Here's a list of specific, verifiable requirements from the job description:
2. *   Background in machine learning algorithms
3. *   Background in deep learning algorithms
4. *   Proficiency in Python
5. *   Proficiency in TensorFlow
6. *   Proficiency in PyTorch
7. *   Experience with GANs (Generative Adversarial Networks)
8. *   Experience with VAEs (Variational Autoencoders)
9. *   Ability to design scalable AI systems
10. *   Ability to implement scalable AI systems
11. *   Ability to design efficient AI systems
12. *   Ability to implement efficient AI systems
13. *   Advanced knowledge of NLP for text generation tasks
14. *   Familiarity with computer vision
15. *   Familiarity with image generation using AI
16. *   Skills in data preprocessing
17. *   Skills in feature engineering
18. *   Understanding of neural network architectures
19. *   Understanding of neural network optimization techniques
20. *   Experience deploying

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 10
Please retry in 24.322744475s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, retry_delay {
  sec


‚úÖ Created 36 AGENTIC chunks (evidence + requirements)

üéØ Retrieved Agentic Chunks:

--- Match 1 (Score: 0.861) ---
Source: job_description
Requirement: N/A
Content: Here's a list of specific, verifiable requirements from the job description:

--- Match 2 (Score: 0.842) ---
Source: job_description
Requirement: N/A
Content: *   Ability to implement efficient AI systems

--- Match 3 (Score: 0.841) ---
Source: job_description
Requirement: N/A
Content: *   Ability to implement scalable AI systems

--- Match 4 (Score: 0.840) ---
Source: job_description
Requirement: N/A
Content: *   Familiarity with computer vision

--- Match 5 (Score: 0.835) ---
Source: job_description
Requirement: N/A
Content: *   Proficiency in PyTorch


In [10]:
agentic_chunks

[Document(metadata={'source': 'resume', 'requirement': '*   Background in machine learning algorithms', 'chunk_type': 'agentic_evidence'}, page_content='Skilled in prompt engineering, fine-tuning transformer architectures, and building scalable generative pipelines using modern ML frameworks. AWS Certified Machine Learning ‚Äì Specialty (2024). Google Cloud Professional Machine Learning Engineer (2023).'),
 Document(metadata={'source': 'resume', 'requirement': '*   Background in deep learning algorithms', 'chunk_type': 'agentic_evidence'}, page_content='Generative AI Engineer with 1.5 years of experience designing, training, and deploying large language models (LLMs) and multimodal AI systems. Skilled in fine-tuning transformer architectures. DeepLearning.AI Generative AI with LLMs Specialization (2023).'),
 Document(metadata={'source': 'resume', 'requirement': '*   Proficiency in Python', 'chunk_type': 'agentic_evidence'}, page_content='Languages: Python, JavaScript, SQL\nImplemented 

In [11]:
query = "Does the candidate have Familiarity with CV?"
retrieved = vectorstore.similarity_search_with_relevance_scores(query, k=5)

print("\nüéØ Retrieved Agentic Chunks:")
for i, (doc, score) in enumerate(retrieved):
    print(f"\n--- Match {i+1} (Score: {score:.3f}) ---")
    print(f"Source: {doc.metadata['source']}")
    print(f"Requirement: {doc.metadata.get('requirement', 'N/A')}")
    print(f"Content: {doc.page_content}")


üéØ Retrieved Agentic Chunks:

--- Match 1 (Score: 0.859) ---
Source: job_description
Requirement: N/A
Content: *   Familiarity with computer vision

--- Match 2 (Score: 0.825) ---
Source: job_description
Requirement: N/A
Content: *   Skills in data preprocessing

--- Match 3 (Score: 0.822) ---
Source: job_description
Requirement: N/A
Content: *   Proficiency in Python

--- Match 4 (Score: 0.818) ---
Source: job_description
Requirement: N/A
Content: Here's a list of specific, verifiable requirements from the job description:

--- Match 5 (Score: 0.814) ---
Source: job_description
Requirement: N/A
Content: *   Familiarity with image generation using AI


In [26]:
import os
import json
from typing import List, Dict
from dotenv import load_dotenv

# LangChain components
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# --- 1. JSON SCHEMA DEFINITION ---
# Define the desired structured output as a JSON Schema dictionary.

JSON_RESUME_SCHEMA = {
    "title": "Resume",
    "description": "The structured representation of a resume.",
    "type": "object",
    "properties": {
        "name": {"type": "string", "description": "The full name of the candidate."},
        "summary": {"type": "string", "description": "A brief summary of the candidate's profile."},
        "work_experience": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "role": {"type": "string", "description": "The job title or role."},
                    "company": {"type": "string", "description": "The name of the company."},
                    "start_date": {"type": "string", "description": "The start date of the employment."},
                    "end_date": {"type": "string", "description": "The end date of the employment (or 'Present')."},
                    "responsibilities": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["role", "company", "responsibilities"],
            },
        },
        "education": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "degree": {"type": "string", "description": "The degree obtained."},
                    "institution": {"type": "string", "description": "The name of the institution."},
                    "graduation_date": {"type": "string", "description": "The graduation date."},
                },
                "required": ["degree", "institution"],
            },
        },
        "skills": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["name", "summary", "work_experience", "education", "skills"],
}

# --- 2. AGENTIC EXTRACTION (LANGCHAIN + GEMINI) ---

def extract_resume_data(resume_text: str) -> Dict:
    """
    Uses LangChain and Gemini to extract structured data from resume text
    based on a JSON schema dictionary.
    """
    llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.3)
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "You are an expert HR assistant specializing in parsing resumes. Your task is to extract relevant information"
            "from the following resume text and format it as a valid JSON object. Adhere strictly to the provided schema."
            "If a piece of information is not found, use null or an empty list."),
            ("human", "{resume_text}"),
        ]
    )


    # Chain the prompt and model with the structured output schema
    structured_runnable = prompt | llm.with_structured_output(schema=JSON_RESUME_SCHEMA)

    print("Invoking Gemini for agentic extraction...")
    result = structured_runnable.invoke({"resume_text": resume_text})
    print("Extraction complete.")
    return result

# --- 3. SEMANTIC CHUNKING ---

def create_semantic_chunks(resume_data: Dict) -> List[Document]:
    """
    Converts the structured dictionary into a list of semantic LangChain Documents.
    """
    chunks = []

    # Summary chunk
    chunks.append(Document(
        page_content=f"Summary: {resume_data.get('summary', '')}",
        metadata={"category": "summary", "name": resume_data.get('name', '')}
    ))

    # Work Experience chunks
    for job in resume_data.get('work_experience', []):
        content = (
            f"Role: {job.get('role')} at {job.get('company')} ({job.get('start_date')} - {job.get('end_date')}). "
            f"Responsibilities: {' '.join(job.get('responsibilities', []))}"
        )
        chunks.append(Document(
            page_content=content,
            metadata={"category": "work_experience", "company": job.get('company'), "role": job.get('role')}
        ))

    # Education chunks
    for edu in resume_data.get('education', []):
        content = f"Degree: {edu.get('degree')} from {edu.get('institution')} (Graduated: {edu.get('graduation_date')})."
        chunks.append(Document(
            page_content=content,
            metadata={"category": "education", "institution": edu.get('institution')}
        ))

    # Skills chunk
    skills = resume_data.get('skills', [])
    if skills:
        chunks.append(Document(
            page_content=f"Skills: {', '.join(skills)}",
            metadata={"category": "skills"}
        ))

    print(f"Created {len(chunks)} semantic chunks.")
    return chunks

# --- MAIN EXECUTION ---

if __name__ == "__main__":
    load_dotenv()

    if not os.getenv("GOOGLE_API_KEY"):
        raise ValueError("GOOGLE_API_KEY not found in environment variables.")

    # --- Build Phase ---
    print("--- Starting Resume Processing Pipeline ---")

    with open("txts/resume.txt", "r", encoding='utf-8') as f:
        resume_text = f.read()

    structured_resume = extract_resume_data(resume_text)
    documents = create_semantic_chunks(structured_resume)

    print("\n--- Sample Chunk ---")
    print(documents[1])
    print("--------------------\n")

    # --- 4. VECTORIZATION & STORAGE ---
    print("Initializing embedding model and FAISS vector store...")
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = FAISS.from_documents(documents, embeddings)
    db.save_local("faiss_resume_index")
    print("FAISS index saved locally to 'faiss_resume_index'.")
    print("--- Pipeline Completed Successfully ---\n")

    # --- 5. VERIFICATION ---
    print("--- Verification: Loading index and performing search ---")
    loaded_db = FAISS.load_local("faiss_resume_index", embeddings, allow_dangerous_deserialization=True)
    retriever = loaded_db.as_retriever(search_kwargs={"k": 2})

    query = "What is his experience with CI/CD pipelines?"
    results = retriever.invoke(query)

    print(f"Query: '{query}'")
    print("--- Search Results ---")
    for doc in results:
        print(f"Content: {doc.page_content}")
        print(f"Metadata: {doc.metadata}\n")
    print("----------------------")

Key 'parameters' is not supported in schema, ignoring


--- Starting Resume Processing Pipeline ---
Invoking Gemini for agentic extraction...
Extraction complete.
Created 7 semantic chunks.

--- Sample Chunk ---
page_content='Role: Senior Generative AI Engineer at NeuraLabs Inc. (June 2023 - Present). Responsibilities: Spearheaded end-to-end development of an enterprise LLM platform supporting 10+ internal products; fine-tuned Llama-3-70B and Mistral-7B using QLoRA and DPO, achieving 92% human preference alignment on domain-specific QA tasks. Designed and deployed a multimodal RAG architecture combining CLIP, BLIP-2, and FAISS to ground generative responses in proprietary documentation, reducing hallucination rates by 58%. Led a team of 6 engineers to build a model evaluation framework for generative quality, safety, and latency‚Äîadopted org-wide and integrated into CI/CD pipelines. Reduced cloud inference costs by 65% via dynamic batching, model quantization (GGUF), and speculative decoding; saved $1.2M/year in AWS spend. Authored interna

In [27]:
loaded_db = FAISS.load_local("faiss_resume_index", embeddings, allow_dangerous_deserialization=True)
retriever = loaded_db.as_retriever(search_kwargs={"k": 53})


query = ""
results = retriever.invoke(query)

print(f"Query: '{query}'")
print("--- Search Results ---")
for doc in results:
    print(f"Content: {doc.page_content}")
    print(f"Metadata: {doc.metadata}\n")
print("----------------------")


Query: ''
--- Search Results ---
Content: Summary: Visionary AI scientist and engineer with 4+ years of experience at the intersection of large language models (LLMs), multimodal systems, and production-scale generative infrastructure. Combines deep theoretical expertise from PhD research in neural representation learning with hands-on experience building and deploying state-of-the-art generative AI products used by millions. Proven ability to lead cross-functional teams, publish high-impact research, and translate academic innovation into business value.
Metadata: {'category': 'summary', 'name': 'John Doe'}

Content: Degree: B.S. in Computer Science (Honors) from University of California, Berkeley (Graduated: 2017).
Metadata: {'category': 'education', 'institution': 'University of California, Berkeley'}

Content: Skills: Python, C++, JavaScript, SQL, Rust, Fine-tuning (LoRA, QLoRA, DPO, RLHF), RAG, Prompt Engineering, LLM Agents, Guardrails, Multimodal Models (LLaVA, Stable Diffusion 

In [24]:
resume_text

'John Doe\nSan Francisco, CA | johndoe@email.com | (123) 456-7890 | linkedin.com/in/johndoe | github.com/johndoe\n\nGenerative AI Engineer\nInnovative and detail-oriented Generative AI Engineer with 1.5 years of experience designing, training, and deploying large language models (LLMs) and multimodal AI systems. Skilled in prompt engineering, fine-tuning transformer architectures, and building scalable generative pipelines using modern ML frameworks.\n\nPROFESSIONAL EXPERIENCE\n\nAI Engineer\nNeuraLabs Inc., San Francisco, CA\nJune 2023 √¢‚Ç¨‚Äú Present\n\nDeveloped and deployed a fine-tuned Llama-2-based customer support chatbot, reducing human agent workload by 35% and improving response accuracy by 22%.\nBuilt a multimodal content generation pipeline using CLIP and Stable Diffusion to auto-generate marketing visuals from text prompts, adopted by 3 internal product teams.\nOptimized inference latency by 40% through quantization and ONNX runtime integration for generative models in pr

In [1]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [35]:
import os
import json
from typing import List, Dict
from dotenv import load_dotenv

# LangChain components
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# --- 1. JSON SCHEMA DEFINITION FOR JOB DESCRIPTION ---
# Define the desired structured output for a job description.

JSON_JD_SCHEMA = {
    "title": "JobDescription",
    "description": "The structured representation of a job description.",
    "type": "object",
    "properties": {
        "job_title": {"type": "string", "description": "The title of the job position."},
        "company": {"type": "string", "description": "The name of the company hiring."},
        "location": {"type": "string", "description": "The location of the job (e.g., city, state, remote)."},
        "company_summary": {"type": "string", "description": "A brief summary of the company."},
        "responsibilities": {
            "type": "array",
            "description": "A list of key responsibilities for the role.",
            "items": {"type": "string"},
        },
        "required_qualifications": {
            "type": "array",
            "description": "A list of essential qualifications and skills.",
            "items": {"type": "string"},
        },
        "preferred_qualifications": {
            "type": "array",
            "description": "A list of desired but not essential qualifications.",
            "items": {"type": "string"},
        },
    },
    "required": ["job_title", "company", "responsibilities", "required_qualifications"],
}

# --- 2. AGENTIC EXTRACTION (LANGCHAIN + GEMINI) ---

def extract_jd_data(jd_text: str) -> Dict:
    """
    Uses LangChain and Gemini to extract structured data from job description text
    based on a JSON schema dictionary.
    """
    llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.2)
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "You are an expert recruiting assistant specializing in parsing job descriptions. "
             "Your task is to extract relevant information from the following job description text and format it as a valid JSON object. "
             "Adhere strictly to the provided schema. If a piece of information is not found, use null or an empty list."),
            ("human", "{jd_text}"),
        ]
    )

    # Chain the prompt and model with the structured output schema
    structured_runnable = prompt | llm.with_structured_output(schema=JSON_JD_SCHEMA)

    print("Invoking Gemini for agentic extraction of Job Description...")
    result = structured_runnable.invoke({"jd_text": jd_text})
    print("Extraction complete.")
    return result

# --- 3. SEMANTIC CHUNKING ---

def create_jd_semantic_chunks(jd_data: Dict) -> List[Document]:
    """
    Converts the structured JD dictionary into a list of semantic LangChain Documents.
    """
    chunks = []
    company = jd_data.get('company', 'N/A')
    job_title = jd_data.get('job_title', 'N/A')

    # Job Overview chunk
    overview_content = (
        f"Job Title: {job_title} at {company}. "
        f"Location: {jd_data.get('location', 'N/A')}. "
        f"Company Summary: {jd_data.get('company_summary', '')}"
    )
    chunks.append(Document(
        page_content=overview_content.strip(),
        metadata={"category": "overview", "company": company, "job_title": job_title}
    ))

    # Responsibilities chunk
    responsibilities = jd_data.get('responsibilities', [])
    if responsibilities:
        chunks.append(Document(
            page_content=f"Responsibilities: {' '.join(responsibilities)}",
            metadata={"category": "responsibilities", "company": company, "job_title": job_title}
        ))

    # Required Qualifications chunk
    required_qualifications = jd_data.get('required_qualifications', [])
    if required_qualifications:
        chunks.append(Document(
            page_content=f"Required Qualifications: {' '.join(required_qualifications)}",
            metadata={"category": "required_qualifications", "company": company, "job_title": job_title}
        ))

    # Preferred Qualifications chunk
    preferred_qualifications = jd_data.get('preferred_qualifications', [])
    if preferred_qualifications:
        chunks.append(Document(
            page_content=f"Preferred Qualifications: {' '.join(preferred_qualifications)}",
            metadata={"category": "preferred_qualifications", "company": company, "job_title": job_title}
        ))

    print(f"Created {len(chunks)} semantic chunks for the job description.")
    return chunks

# --- MAIN EXECUTION ---

if __name__ == "__main__":
    load_dotenv()

    if not os.getenv("GOOGLE_API_KEY"):
        raise ValueError("GOOGLE_API_KEY not found in environment variables.")

    # --- Build Phase ---
    print("--- Starting Job Description Processing Pipeline ---")

    with open("txts/job_description.txt", "r", encoding='utf-8') as f:
        jd_text = f.read()

    structured_jd = extract_jd_data(jd_text)
    documents = create_jd_semantic_chunks(structured_jd)

    print("\n--- Sample Chunk ---")
    if documents:
        print(documents[1]) # Print the responsibilities chunk
    print("--------------------\n")

    # --- 4. VECTORIZATION & STORAGE ---
    print("Initializing embedding model and FAISS vector store...")
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = FAISS.from_documents(documents, embeddings)
    db.save_local("faiss_jd_index")
    print("FAISS index saved locally to 'faiss_jd_index'.")
    print("--- Pipeline Completed Successfully ---\n")

Key 'parameters' is not supported in schema, ignoring


--- Starting Job Description Processing Pipeline ---
Invoking Gemini for agentic extraction of Job Description...
Extraction complete.
Created 3 semantic chunks for the job description.

--- Sample Chunk ---
page_content='Responsibilities: Design and develop algorithms for generative models using deep learning techniques. Collaborate with cross-functional teams to integrate generative AI solutions into existing workflow systems. Research and stay up-to-date on the latest advancements in generative AI technologies and methodologies. Optimize and fine-tune generative models for performance and efficiency. Troubleshoot and resolve issues related to generative AI models and implementations. Create and maintain documentation for generative AI models and their applications. Communicate complex technical concepts and findings to non-technical stakeholders.' metadata={'category': 'responsibilities', 'company': 'null', 'job_title': 'Generative AI Engineer'}
--------------------

Initializing em

In [4]:
loaded_db = FAISS.load_local("faiss/faiss_jd_index", embeddings, allow_dangerous_deserialization=True)
retriever = loaded_db.as_retriever(search_kwargs={"k": 1})


query = "key responsibilities and essential qualifications for this role"
results = retriever.invoke(query)

print(f"Query: '{query}'")
print("--- Search Results ---")
for doc in results:
    print(f"Content: {doc.page_content}")
    print(f"Metadata: {doc.metadata}\n")
print("----------------------")


Query: 'key responsibilities and essential qualifications for this role'
--- Search Results ---
Content: Required Qualifications: Strong background in machine learning and deep learning algorithms. Proficiency in Python, TensorFlow, and PyTorch for developing AI models. Experience in generative AI techniques such as GANs and VAEs. Ability to design and implement scalable and efficient AI systems. Advanced knowledge of natural language processing for text generation tasks. Familiarity with computer vision and image generation using AI. Skills in data preprocessing and feature engineering for AI model training. Strong understanding of neural network architectures and optimization techniques. Experience in deploying AI models into production environments. Ability to stay updated with the latest advancements in generative AI research and incorporate them into work.
Metadata: {'category': 'required_qualifications', 'company': 'null', 'job_title': 'Generative AI Engineer'}

-----------------