Install required packages for the RAG system.

1. LangGraph
2. LangChain
3. Qdrant
4. Gradio

In [2]:
import os

# Configuration
DOCS_DIR = "docs"                    # directory containing the source PDF files
MARKDOWN_DIR = "markdown"            # directory containing the PDFs converted to Markdown
PARENT_STORE_PATH = "parent_store"   # directory for parent-chunk JSON files
CHILD_COLLECTION = "document_child_chunks"

# Make sure each working directory is present (no-op when it already exists).
for _directory in (DOCS_DIR, MARKDOWN_DIR, PARENT_STORE_PATH):
    os.makedirs(_directory, exist_ok=True)

In [3]:
# Initialize LLM Setup

from langchain_ollama import ChatOllama
# Chat model handle built with the Ollama integration; temperature is pinned to 0.
llm = ChatOllama(
    model="gpt-oss:20b-cloud",
    temperature=0,
)


In [None]:
# Embeddings: Set up a Hybrid Search approach:

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant.fastembed_sparse import FastEmbedSparse

# Dense side of the hybrid search: semantic embeddings.
dense_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Sparse side of the hybrid search: keyword-style (BM25) embeddings.
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

Fetching 18 files: 100%|██████████| 18/18 [00:01<00:00, 13.42it/s]


In [None]:
# Vector Database Setup 
from qdrant_client import QdrantClient
from qdrant_client.http import models as qdrant_models
from langchain_qdrant import QdrantVectorStore
from langchain_qdrant.qdrant import RetrievalMode


# Initialize the client.
# The "qdrant_db" folder only holds the persisted data; its presence on disk
# does not mean `client` is bound in this kernel. Checking the folder (as the
# previous version did) left `client` undefined on a fresh kernel whenever the
# database already existed, so later cells failed with NameError.
# Instead, reuse a client that is already bound in this kernel (so re-running
# the cell does not open the same storage folder twice) and create one
# otherwise.
if isinstance(globals().get("client"), QdrantClient):
    print("✓ Qdrant client already exists")
else:
    client = QdrantClient(path="qdrant_db")

# Embedding dimension, measured by embedding a throwaway query string.
embedding_dimension = len(dense_embeddings.embed_query("test"))

def ensure_collections(collection_name):
    """Create the named Qdrant collection unless it is already present.

    A newly created collection gets one unnamed dense vector space with
    ``embedding_dimension`` components and cosine distance, plus a sparse
    vector space registered under the name ``"sparse"``.

    Args:
        collection_name: Name of the collection to look up or create.

    Returns:
        None. The outcome is reported with ``print``.
    """
    # Guard clause: nothing to do when the collection is already there.
    if client.collection_exists(collection_name):
        print(f"✓ Collection {collection_name} already exists")
        return

    dense_params = qdrant_models.VectorParams(
        size=embedding_dimension,
        distance=qdrant_models.Distance.COSINE,
    )
    sparse_params = {"sparse": qdrant_models.SparseVectorParams()}

    client.create_collection(
        collection_name=collection_name,
        vectors_config=dense_params,
        sparse_vectors_config=sparse_params,
    )
    print(f"✓ Created collection: {collection_name}")

In [2]:
# PDF TO MARKDOWN
import os
import pymupdf.layout
import pymupdf4llm
from pathlib import Path
import glob

# Set TOKENIZERS_PARALLELISM to "false" in this process's environment.
# NOTE(review): presumably meant to silence the Hugging Face tokenizers
# parallelism warning — confirm it is set early enough to take effect.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def _markdown_path_for(pdf_path, output_dir):
    """Return the ``.md`` path inside ``output_dir`` that mirrors ``pdf_path``.

    ".md" is appended to the full stem instead of being applied with
    ``with_suffix``: ``with_suffix`` replaces everything after the last dot,
    so ``report.v2.pdf`` used to become ``report.md`` and could collide with
    ``report.v1.pdf``.
    """
    return Path(output_dir) / f"{Path(pdf_path).stem}.md"


def pdf_to_markdown(pdf_path, output_dir):
    """Convert one PDF to Markdown and write it into ``output_dir``.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives ``<pdf stem>.md``.

    Returns:
        None. The Markdown is written to disk as UTF-8 bytes.
    """
    doc = pymupdf.open(pdf_path)
    try:
        md = pymupdf4llm.to_markdown(doc, header=False, footer=False, page_separators=True, ignore_images=True, write_images=False, image_path=None)
    finally:
        # Release the file handle even when conversion fails; skip the call if
        # the document was already closed, since closing twice raises.
        if not doc.is_closed:
            doc.close()

    # Round-trip through bytes: lone surrogates are encoded via
    # 'surrogatepass' and then dropped by the 'ignore' decode, leaving text
    # that is valid UTF-8.
    md_cleaned = md.encode('utf-8', errors='surrogatepass').decode('utf-8', errors='ignore')
    # write_bytes keeps the newlines exactly as produced (no platform translation).
    _markdown_path_for(pdf_path, output_dir).write_bytes(md_cleaned.encode('utf-8'))


def pdfs_to_markdowns(path_pattern, overwrite: bool = False):
    """Convert every PDF matching ``path_pattern`` into ``MARKDOWN_DIR``.

    Args:
        path_pattern: Glob pattern selecting the PDF files.
        overwrite: When False (default), PDFs whose Markdown file already
            exists are skipped; when True, every match is converted again.

    Returns:
        None.
    """
    output_dir = Path(MARKDOWN_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)

    for pdf_path in map(Path, glob.glob(path_pattern)):
        # Same path computation as pdf_to_markdown, so the existence check
        # always looks at the file the conversion would write.
        md_path = _markdown_path_for(pdf_path, output_dir)
        if overwrite or not md_path.exists():
            pdf_to_markdown(pdf_path, output_dir)


# Convert every PDF under DOCS_DIR. DOCS_DIR and MARKDOWN_DIR are assigned in
# the configuration cell, so that cell must have been run in the current
# kernel first; otherwise this call fails with NameError.
pdfs_to_markdowns(f"{DOCS_DIR}/*.pdf")               

NameError: name 'DOCS_DIR' is not defined

In [None]:
# Document Indexing
# Implements parent-child chunking:
# - Parent chunks (large context windows) are stored as JSON files
# - Child chunks are stored in Qdrant
# - Small chunks are merged and large ones are split for consistency
# - Parent and child chunks are linked in both directions


# Chunking Strategy:

# Split by Markdown headers (#, ##, ###)
# Merge chunks smaller than 2000 characters
# Split chunks larger than 10000 characters
# Create child chunks (500 chars) from each parent
# Store parent chunks in JSON files
# Index child chunks in vector database

The Core Problem: "Search" vs. "Synthesis"
In a standard RAG system, you have to choose a chunk size (e.g., 1000 characters). You face a dilemma:

Small Chunks are great for searching. They are specific, so the vector embedding represents a single clear idea. (e.g., "The capital of France is Paris").
Large Chunks are great for generation. The LLM needs surrounding sentences to understand the nuance, tone, and full details to write a good answer.
If you only use small chunks, the LLM lacks context. If you only use large chunks, the vector search becomes "diluted" (the embedding tries to represent too many topics at once) and you miss relevant content.

Why Your Strategy Solves This
Your implementation acts as the perfect middle ground:

1. High Precision Search (The Child Chunks)
Why 500 chars? Embedding models (like all-mpnet-base-v2) work best on focused text. By keeping the search chunks small, you ensure that when a user asks a specific question, the system finds the exact paragraph containing the answer.
Result: You get highly accurate search hits.
2. High Quality Answers (The Parent Chunks)
Why JSON/Parent Store? Once the search finds the "Child," we don't just feed that tiny 500-char snippet to the LLM. Instead, we look up its "Parent"—the full 2,000–10,000 character section.
Result: The LLM gets the full chapter or section. It sees not just the specific fact, but the introduction, the caveats, and the conclusion surrounding it. This reduces the risk of hallucinating or missing key details.
3. Semantic Boundaries (Markdown Splitting)
Why Headers (#)? Arbitrary splitting (e.g., every 1000 chars) often cuts sentences in the middle or breaks a logical argument in half.
Result: By splitting at Markdown headers, you respect the document's original structure. The "Parent" chunk becomes a logical unit (like "Chapter 1: Installation" or "Section 2.3: Pricing"), which makes much more sense to an LLM.