Add a unique `chunk_id` to each chunk's metadata during the chunking and embedding phase of the RAG ingestion pipeline.<br>
“Enterprise Knowledge Lake” for RAG<br>

One central vector database serving the entire organization →
data is partitioned (department, time) →
retrieval is filtered by context + access control →
older data is archived or moved to cold storage.

In [None]:
import hashlib
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

def generate_chunk_id(doc_id: str, chunk_content: str, chunk_index: int) -> str:
    """Build a deterministic identifier for a single chunk.

    The identifier has the form ``<doc_id>-<chunk_index>-<digest>``, where
    ``<digest>`` is the first 8 hexadecimal characters of the SHA-256 hash of
    the UTF-8 encoded chunk text. The same three inputs always yield the same
    identifier.

    Args:
        doc_id: Identifier of the document the chunk was cut from.
        chunk_content: Text of the chunk; only its hash ends up in the ID.
        chunk_index: Position of the chunk within its document.

    Returns:
        The composed chunk identifier string.
    """
    hasher = hashlib.sha256()
    hasher.update(chunk_content.encode("utf-8"))
    # A short prefix of the digest keeps the ID readable; the document ID and
    # index already carry most of the distinguishing information.
    short_digest = hasher.hexdigest()[:8]
    return f"{doc_id}-{chunk_index}-{short_digest}"

# Step 1: stand in for a document loader with a hard-coded text sample.
source_file_id = "report-2025-Q1"
raw_text = (
    "Chapter 1. Introduction to RAG. RAG is a great technique. "
    "Chapter 2. Advanced Techniques. Metadata is key."
)

# Wrap the whole text in one Document that carries the document-level metadata.
original_doc = Document(
    metadata={"source": "annual_report.pdf", "doc_id": source_file_id},
    page_content=raw_text,
)

# Step 2: split into chunks, then enrich each chunk's metadata with its ID.
splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
chunks = splitter.split_documents([original_doc])

enriched_chunks = []
for i, chunk in enumerate(chunks):
    # The parent document ID is read from the chunk's own metadata; the chunk
    # ID is derived from it together with the chunk text and its position.
    chunk.metadata.update(
        {
            "chunk_id": generate_chunk_id(
                chunk.metadata["doc_id"], chunk.page_content, i
            ),
            "chunk_index": i,  # kept alongside the ID so order is queryable
        }
    )
    enriched_chunks.append(chunk)

# Step 3: print a short preview and the full metadata of every chunk.
for chunk in enriched_chunks:
    print(f"Content: '{chunk.page_content[:30]}...'")
    print(f"  Metadata: {chunk.metadata}")

# Example output (illustrative only: actual chunk boundaries, previews, and hash values will differ):
# Content: 'Chapter 1. Introduction to RAG...'
#   Metadata: {'source': 'annual_report.pdf', 'doc_id': 'report-2025-Q1', 'chunk_id': 'report-2025-Q1-0-282e35b7', 'chunk_index': 0}
# Content: 'RAG is a great technique. Chapte...'
#   Metadata: {'source': 'annual_report.pdf', 'doc_id': 'report-2025-Q1', 'chunk_id': 'report-2025-Q1-1-678a9c3d', 'chunk_index': 1}
# Content: 'r 2. Advanced Techniques. Metadata is key.'
#   Metadata: {'source': 'annual_report.pdf', 'doc_id': 'report-2025-Q1', 'chunk_id': 'report-2025-Q1-2-5d4f1a2e', 'chunk_index': 2}