In [1]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = "AIzaSyC96fwdtFTkgtQaYA0wtbsktG7PV_VOa8M"

In [3]:
gemini_2_5_flash = ChatGoogleGenerativeAI(
    model = "gemini-2.5-flash",
    temperature = 0,
)


In [4]:
def load_text(file_path: str) -> str:
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()


In [27]:
def recursive_split_text(text: str, chunk_size=250, chunk_overlap=10) -> list:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        separators=['\n\n','\n', '.', ' ', '' ],
    )
    return splitter.create_documents([text])


In [30]:
text = load_text("jd_genai_engineer.txt")
print(text)

Generative AI Engineer job brief

We are seeking a skilled Generative AI Engineer to join our team and lead the development of innovative AI solutions. Your expertise in generative models, deep learning, and data analysis will be critical in creating intelligent and transformative AI applications. You will work closely with cross-functional teams to conceptualize, design, test, and deploy AI projects that drive innovation and provide value in the rapidly evolving field of artificial intelligence. Join us and be part of a dynamic team that is shaping the future of AI.


Generative AI Engineer job responsibilities:
- Design and develop algorithms for generative models using deep learning techniques.

- Collaborate with cross-functional teams to integrate generative AI solutions into existing workflow systems.

- Research and stay up-to-date on the latest advancements in generative AI technologies and methodologies.

- Optimize and fine-tune generative models for performance and efficienc

In [32]:
recursive_chunks=recursive_split_text(text)
recursive_chunks

[Document(metadata={}, page_content='Generative AI Engineer job brief'),
 Document(metadata={}, page_content='We are seeking a skilled Generative AI Engineer to join our team and lead the development of innovative AI solutions'),
 Document(metadata={}, page_content='. Your expertise in generative models, deep learning, and data analysis will be critical in creating intelligent and transformative AI applications'),
 Document(metadata={}, page_content='. You will work closely with cross-functional teams to conceptualize, design, test, and deploy AI projects that drive innovation and provide value in the rapidly evolving field of artificial intelligence'),
 Document(metadata={}, page_content='. Join us and be part of a dynamic team that is shaping the future of AI.'),
 Document(metadata={}, page_content='Generative AI Engineer job responsibilities:\n- Design and develop algorithms for generative models using deep learning techniques.\n\n- Collaborate with cross-functional teams to integra

In [34]:
import os
import argparse
import numpy as np
from dotenv import load_dotenv
from typing import List

import nltk
from sklearn.cluster import KMeans

from langchain_core.documents import Document


try:
    nltk.download('punkt')
except nltk.downloader.DownloadError:
    print("Downloading NLTK 'punkt' model...")
    

# --- 2. Semantic Chunking Helper Function ---
def semantic_cluster_chunker(text: str, embeddings_model, sentences_per_chunk=8) -> List[Document]:
    """
    Splits text into semantically coherent chunks using sentence embeddings and clustering.
    """
    # Step 1: Split text into sentences
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < sentences_per_chunk:
        return [Document(page_content=text)]

    # Step 2: Embed each sentence
    print(f"Embedding {len(sentences)} sentences for semantic clustering...")
    embeddings = embeddings_model.embed_documents(sentences)

    # Step 3: Cluster the embeddings
    num_clusters = max(1, len(sentences) // sentences_per_chunk)
    print(f"Clustering sentences into {num_clusters} chunks...")
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto').fit(embeddings)

    # Step 4: Group sentences by cluster
    clusters = [[] for _ in range(num_clusters)]
    for i, sentence in enumerate(sentences):
        cluster_id = kmeans.labels_[i]
        clusters[cluster_id].append(sentence)

    # Step 5: Create Document objects from clustered sentences
    semantic_chunks = []
    for cluster in clusters:
        chunk_text = " ".join(cluster).strip()
        if chunk_text:
            semantic_chunks.append(Document(page_content=chunk_text))

    print("Semantic chunking complete.")
    return semantic_chunks

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joshua.david\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [None]:
import nltk
nltk.download('punkt_tab')

In [46]:
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceInstructEmbeddings

def _load_and_split_docs(jd, resume):
        
        """
        Loads documents and splits them using the semantic clustering strategy.
        """
        
        embeddings_model = HuggingFaceInstructEmbeddings(
            model_name="hkunlp/instructor-large",
            model_kwargs={"device": "cpu"}
)
        jd_loader = TextLoader(jd, encoding='utf-8')
        resume_loader = TextLoader(resume, encoding='utf-8')

        jd_text = jd_loader.load()[0].page_content
        resume_text = resume_loader.load()[0].page_content

        print("\n--- Starting Semantic Chunking for Job Description ---")
        jd_chunks = semantic_cluster_chunker(jd_text, embeddings_model)
        for doc in jd_chunks: doc.metadata = {"source": "job_description"}

        print("\n--- Starting Semantic Chunking for Resume ---")
        resume_chunks = semantic_cluster_chunker(resume_text, embeddings_model)
        for doc in resume_chunks: doc.metadata = {"source": "resume"}

        return jd_chunks + resume_chunks

In [47]:
job_description_path="jd_genai_engineer.txt"
resume_path="resume_candidate_a.txt"



In [51]:
docs = _load_and_split_docs(job_description_path, resume_path)

No sentence-transformers model found with name hkunlp/instructor-large. Creating a new one with mean pooling.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.



--- Starting Semantic Chunking for Job Description ---
Embedding 29 sentences for semantic clustering...


`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


Clustering sentences into 3 chunks...
Semantic chunking complete.

--- Starting Semantic Chunking for Resume ---
Embedding 20 sentences for semantic clustering...
Clustering sentences into 2 chunks...
Semantic chunking complete.


In [52]:
print(docs)

[Document(metadata={'source': 'job_description'}, page_content='- Communicate complex technical concepts and findings to non-technical stakeholders. - Strong understanding of neural network architectures and optimization techniques.'), Document(metadata={'source': 'job_description'}, page_content="Generative AI Engineer job brief\n\nWe are seeking a skilled Generative AI Engineer to join our team and lead the development of innovative AI solutions. Your expertise in generative models, deep learning, and data analysis will be critical in creating intelligent and transformative AI applications. You will work closely with cross-functional teams to conceptualize, design, test, and deploy AI projects that drive innovation and provide value in the rapidly evolving field of artificial intelligence. Join us and be part of a dynamic team that is shaping the future of AI. Generative AI Engineer job responsibilities:\n- Design and develop algorithms for generative models using deep learning techn

In [53]:
docs

[Document(metadata={'source': 'job_description'}, page_content='- Communicate complex technical concepts and findings to non-technical stakeholders. - Strong understanding of neural network architectures and optimization techniques.'),
 Document(metadata={'source': 'job_description'}, page_content="Generative AI Engineer job brief\n\nWe are seeking a skilled Generative AI Engineer to join our team and lead the development of innovative AI solutions. Your expertise in generative models, deep learning, and data analysis will be critical in creating intelligent and transformative AI applications. You will work closely with cross-functional teams to conceptualize, design, test, and deploy AI projects that drive innovation and provide value in the rapidly evolving field of artificial intelligence. Join us and be part of a dynamic team that is shaping the future of AI. Generative AI Engineer job responsibilities:\n- Design and develop algorithms for generative models using deep learning tech