In [63]:
import os
from dotenv import load_dotenv
from pprint import pprint

import pandas as pd

import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings

import google.generativeai as genai

from IPython.display import Markdown

In [64]:
# Call python-dotenv's loader first so a key kept in a local .env file is
# visible in the process environment before it is looked up below.
load_dotenv()

# Look the key up from the environment (None when the variable is unset) and
# pass it to the Gemini SDK; `api_key` stays available to later cells.
api_key = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=api_key)

In [65]:
# Print the name of every model that advertises the embedContent method.
for model in genai.list_models():
    if 'embedContent' not in model.supported_generation_methods:
        continue
    print(model.name)

models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp
models/gemini-embedding-001


In [66]:
# Location of the .docx source document, relative to the current working directory.
filepath = "./document/british_gp_2020.docx"

In [67]:
from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph

# Open the .docx at `filepath` with python-docx; `doc` is what iter_block_items() walks below.
doc = Document(filepath)

def iter_block_items(parent):
    """Yield paragraphs and tables in document order.

    Args:
        parent: Object exposing ``_element.body`` (here, the python-docx
            document); it is also passed as the parent of every yielded item.

    Yields:
        ``Paragraph`` for each body child whose local tag name is exactly
        ``p`` and ``Table`` for each whose local tag name is exactly ``tbl``.
        Every other child (section properties, comments, ...) is skipped.
    """
    body = parent._element.body
    for child in body.iterchildren():
        tag = child.tag
        # In lxml, comments and processing instructions carry a non-string
        # tag; skip them instead of failing on a string method.
        if not isinstance(tag, str):
            continue
        # Compare the local name exactly (drop the "{namespace}" prefix)
        # rather than by suffix, so an unrelated element whose name merely
        # ends in "p" or "tbl" is never mistaken for a paragraph or table.
        local_name = tag.rsplit('}', 1)[-1]
        if local_name == 'p':
            yield Paragraph(child, parent)
        elif local_name == 'tbl':
            yield Table(child, parent)

def get_heading_level(paragraph):
    """Get heading level from paragraph style (1-9 for headings, None for body text).

    Args:
        paragraph: Object with a ``style`` attribute whose ``name`` is the
            style's display name (e.g. ``"Heading 2"``). ``style`` and
            ``style.name`` may each be ``None``.

    Returns:
        The integer parsed from the last whitespace-separated token of a style
        name that starts with ``"Heading"``; ``None`` when the paragraph has no
        style or style name, the name does not start with ``"Heading"``, or the
        last token is not an integer (e.g. ``"Heading 1 Char"``).
    """
    style = paragraph.style
    # A style can exist without a name; normalise both "no style" and
    # "no name" to an empty string so the prefix test below cannot fail.
    style_name = (style.name if style else None) or ""
    if not style_name.startswith("Heading"):
        return None
    try:
        return int(style_name.split()[-1])
    except ValueError:
        return None

# Extract structured chunks based on document structure.
# Each chunk is a dict with keys "heading", "content" and "type"
# ("section" for runs of paragraphs, "table" for tables).

def _section_chunk(section):
    """Build a "section" chunk from a heading and its buffered paragraphs."""
    return {
        "heading": section["heading"],
        "content": "\n".join(section["content"]),
        "type": "section"
    }

structured_chunks = []
current_section = {"heading": None, "content": []}

for block in iter_block_items(doc):
    if isinstance(block, Paragraph):
        text = block.text.strip()
        if not text:
            continue

        heading_level = get_heading_level(block)

        if heading_level is not None:
            # A heading closes the running section; emit it only if it holds text
            if current_section["content"]:
                structured_chunks.append(_section_chunk(current_section))
            # Start new section
            current_section = {"heading": text, "content": []}
        else:
            current_section["content"].append(text)

    elif isinstance(block, Table):
        # Tables are kept as separate chunks: one line per non-empty row,
        # cells joined with " | " and in-cell newlines flattened to spaces.
        # NOTE(review): merged cells may be reported once per spanned column
        # by python-docx, which would repeat their text here - confirm.
        table_rows = []
        for row in block.rows:
            cells = [cell.text.strip().replace("\n", " ") for cell in row.cells]
            if any(cells):
                table_rows.append(" | ".join(cells))

        if table_rows:
            # Emit the text buffered before the table first, so chunks stay in
            # document order and text on either side of the table is not
            # merged into a single chunk that appears after it.
            if current_section["content"]:
                structured_chunks.append(_section_chunk(current_section))
                current_section = {"heading": current_section["heading"], "content": []}
            structured_chunks.append({
                "heading": current_section["heading"],
                "content": "\n".join(table_rows),
                "type": "table"
            })

# Don't forget the last section
if current_section["content"]:
    structured_chunks.append(_section_chunk(current_section))

# Quick preview of the first three chunks
print(f"Total structured chunks: {len(structured_chunks)}")
for i, chunk in enumerate(structured_chunks[:3]):
    print(f"\n--- Chunk {i+1} ({chunk['type']}) ---")
    print(f"Heading: {chunk['heading']}")
    print(f"Content preview: {chunk['content'][:200]}...")

Total structured chunks: 12

--- Chunk 1 (section) ---
Heading: None
Content preview: British Grand Prix 2020 — Silverstone Circuit Track Reference
Purpose: Canonical track facts + commentary cues for an LLM-powered race commentary bot.
Last updated: 02 January 2026...

--- Chunk 2 (table) ---
Heading: 1) Quick facts (Silverstone Grand Prix configuration used in 2020)
Content preview: Field | Value
Circuit name | Silverstone Circuit (Grand Prix / 'Arena' layout used since 2011)
Location | Silverstone, Northamptonshire, England, United Kingdom
Direction | Clockwise
FIA licence grade...

--- Chunk 3 (section) ---
Heading: 2) Layout overview
Content preview: Silverstone is one of the fastest circuits on the F1 calendar, built on a former RAF airfield. It combines long full-throttle sections with sustained high-speed direction changes that generate very hi...


In [68]:
# Document structure-based chunking, second pass:
# break up oversized chunks while keeping their heading and type.

MAX_CHUNK_SIZE = 1000  # Maximum characters per chunk

def split_large_chunk(chunk, max_size=MAX_CHUNK_SIZE):
    """Break an oversized chunk into smaller ones along newline boundaries.

    Args:
        chunk: Dict with "heading", "content" and "type" keys.
        max_size: Character budget for one chunk's content.

    Returns:
        ``[chunk]`` (the same object) when the content already fits the
        budget. Otherwise a list of new dicts that share the original heading
        and type, each holding a run of consecutive lines. Every line is
        counted as its length plus one, and a run is closed as soon as adding
        the next line would push that count past ``max_size``. A single line
        longer than the budget is kept whole.
    """
    body = chunk["content"]
    if len(body) <= max_size:
        return [chunk]

    def _piece(lines):
        # Same key order and values as the parent chunk, new content only.
        return {
            "heading": chunk["heading"],
            "content": "\n".join(lines),
            "type": chunk["type"]
        }

    pieces = []
    pending = []
    pending_len = 0

    for line in body.split("\n"):
        cost = len(line) + 1  # the line itself plus its newline
        if pending and pending_len + cost > max_size:
            # Budget exceeded: close the current run and start afresh.
            pieces.append(_piece(pending))
            pending, pending_len = [], 0
        pending.append(line)
        pending_len += cost

    if pending:
        pieces.append(_piece(pending))

    return pieces

# Run every structured chunk through the splitter and flatten the results
# into a single list, preserving order.
final_chunks = [
    piece
    for structured in structured_chunks
    for piece in split_large_chunk(structured)
]

print(f"Final chunks after splitting large sections: {len(final_chunks)}")

Final chunks after splitting large sections: 15


In [69]:
# Convert to LangChain Document format for compatibility with downstream processing
from langchain_core.documents import Document as LangchainDoc

texts = []
for i, chunk in enumerate(final_chunks):
    # Create metadata with structural information
    metadata = {
        "heading": chunk["heading"],
        "chunk_type": chunk["type"],
        "chunk_index": i
    }
    # Bound to `lc_doc`, not `doc`: `doc` already names the python-docx
    # Document loaded earlier in the notebook and must not be overwritten.
    lc_doc = LangchainDoc(page_content=chunk["content"], metadata=metadata)
    texts.append(lc_doc)

print(f"Created {len(texts)} LangChain documents\n")
# Preview the first document only when there is one, so an empty chunk list
# does not end the cell with an IndexError.
if texts:
    print("--- First Document ---")
    pprint(texts[0].page_content)
    print(f"\nMetadata: {texts[0].metadata}")

Created 15 LangChain documents

--- First Document ---
('British Grand Prix 2020 — Silverstone Circuit Track Reference\n'
 'Purpose: Canonical track facts + commentary cues for an LLM-powered race '
 'commentary bot.\n'
 'Last updated: 02 January 2026')

Metadata: {'heading': None, 'chunk_type': 'section', 'chunk_index': 0}
