In [0]:
%pip install -U langchain langchain-community langchain-databricks faiss-cpu tiktoken

In [0]:

# Restart the Python process right after the %pip install cell so that the
# cells below import the freshly installed/upgraded package versions.
dbutils.library.restartPython()

In [0]:
import re
from typing import List, Callable
from langchain_text_splitters import TokenTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import TokenTextSplitter
from langchain_databricks import ChatDatabricks, DatabricksEmbeddings


In [0]:
# -----------------------
# 1) Databricks LLM + Embeddings
# -----------------------
# Databricks authentication must already be configured
# (e.g., DATABRICKS_HOST + DATABRICKS_TOKEN).
LLM_ENDPOINT_NAME = "databricks-meta-llama-3-1-8b-instruct"
EMBEDDING_ENDPOINT_NAME = "databricks-bge-large-en"  # <-- change to your embedding endpoint name

# Chat model that writes the final answers.
llm = ChatDatabricks(
    endpoint=LLM_ENDPOINT_NAME,
    temperature=0.1,
)

# Embedding model used to vectorize both chunks and queries.
embeddings = DatabricksEmbeddings(
    endpoint=EMBEDDING_ENDPOINT_NAME,
)

In [0]:
# -----------------------
# 2) Sentence split + sentence-window splitter (LlamaIndex-like)
# -----------------------
# Sentence boundaries are either:
#   * whitespace that follows a sentence terminator (CJK or ASCII, including "."), or
#   * a blank line (paragraph break).
# A single newline is NOT a boundary: it is treated as a soft line wrap inside
# a sentence, so hard-wrapped text is not chopped into line fragments.
_SENT_SPLIT_RE = re.compile(r"(?<=[。！？!?.])\s+|\n\s*\n")

# A soft line wrap (newline plus any surrounding whitespace) inside a sentence.
_SOFT_WRAP_RE = re.compile(r"\s*\n\s*")


def split_sentences(text: str) -> List[str]:
    """
    Split ``text`` into sentences.

    A sentence ends at ``。 ！ ？ ! ? .`` followed by whitespace, or at a blank
    line. Line wraps inside a sentence are folded into a single space.

    This is a lightweight heuristic: a period followed by whitespace always
    ends a sentence, so abbreviations such as "e.g. " are split as well.
    Periods that are not followed by whitespace (e.g. "3.1") are left alone.

    Args:
        text: Raw text, possibly hard-wrapped across multiple lines.

    Returns:
        The non-empty sentences in original order, each stripped of
        leading/trailing whitespace. Returns ``[]`` for empty/blank input.
    """
    sentences: List[str] = []
    for piece in _SENT_SPLIT_RE.split(text):
        # Re-join a sentence that the source text wrapped over several lines.
        cleaned = _SOFT_WRAP_RE.sub(" ", piece).strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences

def sentence_window_splitter(
    documents: List[Document],
    window_size: int = 2,
) -> List[Document]:
    """
    Explode documents into one Document per sentence, in the spirit of
    LlamaIndex's SentenceWindowNodeParser.

    Each produced Document has the core sentence as ``page_content`` and a
    copy of the source document's metadata extended with:
        - original_text: the core sentence
        - window: the core sentence joined (by spaces) with up to
          ``window_size`` sentences on each side
        - sent_index: position of the core sentence within its document
        - window_size: the ``window_size`` argument used

    Args:
        documents: Source documents to split.
        window_size: Number of neighbouring sentences kept on each side.

    Returns:
        The sentence-level Documents, in document order then sentence order.
    """
    windowed: List[Document] = []
    for source_doc in documents:
        sentences = split_sentences(source_doc.page_content)
        total = len(sentences)
        base_meta = source_doc.metadata or {}

        for idx in range(total):
            sentence = sentences[idx]
            # Clamp the window to the bounds of this document's sentences.
            start = max(0, idx - window_size)
            stop = min(total, idx + window_size + 1)

            meta = dict(base_meta)
            meta.update(
                original_text=sentence,
                window=" ".join(sentences[start:stop]),
                sent_index=idx,
                window_size=window_size,
            )
            windowed.append(Document(page_content=sentence, metadata=meta))
    return windowed

In [0]:
# -----------------------
# 3) Prompt + QA
# -----------------------
# System message: restrict the model to the retrieved context.
_QA_SYSTEM_MESSAGE = (
    "You are a helpful technical assistant. Answer using ONLY the provided context. "
    "If the context is insufficient, say what is missing."
)
# Human message: expects the `question` and `context` template variables.
_QA_HUMAN_MESSAGE = "Question:\n{question}\n\nContext:\n{context}\n\nAnswer in English:"

QA_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", _QA_SYSTEM_MESSAGE),
        ("human", _QA_HUMAN_MESSAGE),
    ]
)

def answer_with_retrieval(
    docs: List[Document],
    question: str,
    top_k: int = 5,
    use_window_metadata: bool = False,
) -> None:
    """
    Build a FAISS index over ``docs``, retrieve the ``top_k`` closest chunks for
    ``question``, print them, then ask the Databricks LLM and print its answer.

    Args:
        docs: Chunks to index. If empty, a notice is printed and nothing else
            happens (no index is built and the LLM is not called).
        question: The user question; used both as the retrieval query and in
            the prompt.
        top_k: Number of chunks to retrieve.
        use_window_metadata: If True, chunks that carry ``metadata['window']``
            contribute that window text to the LLM context (Sentence Window
            style) instead of their ``page_content``.

    Returns:
        None. All results are printed.

    Note:
        The printed ``score`` is whatever ``similarity_search_with_score``
        returns; with LangChain's default FAISS settings this is a distance,
        so lower means closer (TODO confirm if a custom distance strategy is
        ever configured).
    """
    print_header = "\n--- Top retrieved chunks ---"

    if not docs:
        # An index cannot be built from zero documents (there is no embedding
        # to size it from); fail soft with a clear message instead of an
        # opaque error from inside the vector store.
        print(print_header)
        print("(no chunks to index; skipping retrieval and LLM call)")
        return

    vs = FAISS.from_documents(docs, embeddings)
    retrieved = vs.similarity_search_with_score(question, k=top_k)

    print(print_header)
    context_blocks = []
    for rank, (d, score) in enumerate(retrieved, 1):
        meta = d.metadata or {}
        print(f"\n[{rank}] score={score:.4f}")
        if use_window_metadata and "window" in meta:
            window = meta["window"]
            core = meta.get("original_text", d.page_content)
            print(f"Core: {core}")
            print(f"Window:\n{window}")
            context_blocks.append(window)
        else:
            print(d.page_content)
            context_blocks.append(d.page_content)

    # Blank line between blocks keeps retrieved chunks visually separate in the prompt.
    context = "\n\n".join(context_blocks)
    msg = QA_PROMPT.format_messages(question=question, context=context)
    resp = llm.invoke(msg)

    print("\n--- LLM Answer ---")
    print(resp.content)


In [0]:
# -----------------------
# 4) Runner (prints raw chunks + retrieval behavior)
# -----------------------
def evaluate_splitter(
    splitter_name: str,
    split_fn: Callable[[], List[Document]],
    question: str,
    use_window_metadata: bool = False,
    top_k: int = 5,
    max_print_chunks: int = 50,
) -> None:
    """
    Run one splitter end to end: produce chunks, print (up to
    ``max_print_chunks`` of) them, then run retrieval + QA on all chunks.

    Args:
        splitter_name: Label used in the printed banners.
        split_fn: Zero-argument callable returning the chunk Documents.
        question: Question passed to ``answer_with_retrieval``.
        use_window_metadata: If True, chunks with ``metadata['window']`` are
            printed as core sentence + window, and the window is used as LLM
            context downstream.
        top_k: Number of chunks to retrieve.
        max_print_chunks: Maximum number of raw chunks to print.

    Returns:
        None. All results are printed.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Testing splitter: {splitter_name}")
    print(banner)

    chunks = split_fn()
    total = len(chunks)

    print(f"\n[Raw chunks generated] total={total}")
    for position, chunk in enumerate(chunks[:max_print_chunks], start=1):
        print(f"\n--- Chunk {position} ---")
        meta = chunk.metadata or {}
        if not (use_window_metadata and "window" in meta):
            print(chunk.page_content)
            continue
        print(f"Core: {meta.get('original_text')}")
        print(f"Window: {meta.get('window')}")

    if total > max_print_chunks:
        print(f"\n... (only printed first {max_print_chunks} chunks)")

    print(f"\nQuestion: {question}")
    answer_with_retrieval(
        docs=chunks,
        question=question,
        top_k=top_k,
        use_window_metadata=use_window_metadata,
    )

    print(f"\n{splitter_name} done.")
    print(f"{banner}\n")


In [0]:
# -----------------------
# 5) Example document + question (English version)
# -----------------------
# Single sample document used by every splitter run below. It contains three
# sections: a RAG overview, an off-topic section about Python tooling, and a
# section on sentence-window vs. semantic chunking. The text is hard-wrapped,
# so every line break inside it is part of the input the splitters see.
documents = [
    Document(page_content="""
Retrieval-Augmented Generation (RAG) is a common architecture for building
LLM-powered applications that combine external knowledge retrieval with
text generation.
Instead of relying solely on a model’s internal parameters, RAG systems
retrieve relevant information from external data sources and use it as
context during answer generation.
This approach is widely used for applications such as question answering,
knowledge assistants, and domain-specific search.

A typical RAG pipeline consists of several core components, including
document ingestion, text chunking, embedding generation, vector indexing,
retrieval, and response synthesis.
Documents from various sources—such as PDFs, databases, APIs, or web pages—
are first ingested and preprocessed.
They are then split into smaller units, often referred to as chunks,
to make retrieval more precise and efficient.

Each chunk is converted into a vector representation using an embedding model
and stored in a vector index.
During inference, a user query is embedded and compared against the indexed
vectors to retrieve the most relevant chunks.
These retrieved chunks are provided to a Large Language Model (LLM) as context,
allowing the model to generate answers that are grounded in external knowledge.

--- The following content is less directly related to the RAG topic ---

In addition, Python, as a general-purpose programming language, is widely used
in the AI field due to its simplicity and rich ecosystem.
For example, NumPy and Pandas are foundational tools for data processing,
providing powerful capabilities for numerical computation and structured data.
Scikit-learn offers a comprehensive suite of machine learning algorithms
for tasks such as classification, regression, and clustering.
Together, these tools form a powerful toolbox for data scientists and AI practitioners,
enabling efficient development and deployment of complex AI models.

--- The following is another related but conceptually independent section ---

Sentence window chunking is an advanced chunking strategy in which each chunk
contains a target sentence along with a configurable number of surrounding
“window” sentences as context.
This approach aims to provide rich local context to the LLM during retrieval,
thereby improving the coherence and factual consistency of generated answers.
Semantic chunking, on the other hand, attempts to split text based on semantic
content rather than relying solely on fixed character counts or sentence boundaries.
It leverages embedding models to compute semantic similarity between sentences
or phrases and identify natural breakpoints where topics or meanings shift.
Both advanced methods can significantly improve retrieval quality and
downstream generation performance in RAG applications.
Choosing the right chunking strategy typically depends on the characteristics
of the data and the expected query types.
""")
]

# Two-part question: the first half is answered by the RAG overview section,
# the second half by the chunking section, so good retrieval must pull chunks
# from both.
question = (
    "What are the main components of a Retrieval-Augmented Generation (RAG) system, "
    "and how do sentence window chunking and semantic chunking differ?"
)

# -----------------------
# 6) Run splitters
# -----------------------
# Two token-based splitters (30-token chunks) that differ only in overlap.
splitter_a = TokenTextSplitter(chunk_size=30, chunk_overlap=0)
splitter_b = TokenTextSplitter(chunk_size=30, chunk_overlap=10)

# Each entry: (display name, chunk factory, feed metadata['window'] into LLM context?)
splitter_runs = [
    (
        "Token Split (chunk_size=30, overlap=0)",
        lambda: splitter_a.split_documents(documents),
        False,
    ),
    (
        "Token Split (chunk_size=30, overlap=10)",
        lambda: splitter_b.split_documents(documents),
        False,
    ),
    (
        "Sentence Window Split (window_size=2)",
        lambda: sentence_window_splitter(documents, window_size=2),
        True,
    ),
]

# Evaluate every configuration with the same question and retrieval depth.
for run_label, make_chunks, feed_window in splitter_runs:
    evaluate_splitter(
        splitter_name=run_label,
        split_fn=make_chunks,
        question=question,
        use_window_metadata=feed_window,
        top_k=5,
    )


### Evaluate RecursiveCharacterTextSplitter

In [0]:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA

# NOTE: despite the variable name, this is a RecursiveCharacterTextSplitter
# (chunks of up to 512 with an overlap of 50), not a sentence-level splitter.
sentence_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)


def run_rag_pipeline(
    splitter,
    documents: List[Document],
    question: str,
    splitter_name: str,
    top_k: int = 5,
):
    """
    Run a complete RAG pass with the given splitter and print every stage.

    Steps: split ``documents`` into chunks, index them in FAISS, build a
    "stuff" RetrievalQA chain on top of the module-level ``llm``, ask
    ``question``, then print the answer and the retrieved source chunks.

    Args:
        splitter: Any object exposing ``split_documents(documents)``.
        documents: Source documents to split and index.
        question: Question to ask the chain.
        splitter_name: Label used in the printed output.
        top_k: Number of chunks the retriever returns (defaults to 5, the
            value that was previously hard-coded).

    Returns:
        None. All results are printed.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(f"Running RAG pipeline with splitter: {splitter_name}")
    print(f"{rule}\n")

    # -----------------------
    # Step 1: Split documents
    # -----------------------
    chunks = splitter.split_documents(documents)

    print(f"[{splitter_name}] Generated document chunks:")
    for i, chunk in enumerate(chunks, 1):
        print(f"\n--- Chunk {i} ---")
        print(chunk.page_content.strip())
        print("-" * 40)

    # -----------------------
    # Step 2: Build vector store
    # -----------------------
    print("\nBuilding vector store...")
    vectorstore = FAISS.from_documents(chunks, embeddings)

    retriever = vectorstore.as_retriever(
        search_kwargs={"k": top_k}
    )

    # -----------------------
    # Step 3: Build RAG chain
    # -----------------------
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        return_source_documents=True,
    )

    # -----------------------
    # Step 4: Ask question
    # -----------------------
    print(f"\nQuestion:\n{question}\n")
    print("Model Answer:\n")

    # Use .invoke() rather than calling the chain object directly: the
    # direct-call form is deprecated. RetrievalQA reads its input from "query".
    result = qa_chain.invoke({"query": question})
    print(result["result"])

    # -----------------------
    # Step 5: Show retrieved context
    # -----------------------
    print(f"\n[{splitter_name}] Retrieved source documents:")
    for i, doc in enumerate(result["source_documents"], 1):
        print(f"\n--- Source Document {i} ---")
        print(doc.page_content.strip())
        print("-" * 60)

    print(f"\nFinished RAG pipeline with {splitter_name}")
    print(f"{rule}\n")


# Run the RetrievalQA-based pipeline on the same sample document and question.
run_rag_pipeline(
    sentence_splitter,
    documents,
    question=question,
    splitter_name="RecursiveCharacterTextSplitter",
)