# In[2]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv


# In[3]:
import re, hashlib
from dataclasses import dataclass
from typing import List, Dict, Any
from langchain_core.documents import Document

# Matches a numbered heading that sits entirely on ONE line, e.g.
#   "1 Overview", "2.3. Getting there", "Section 1.2) From airport".
# Named groups: "num" (dotted number) and "title" (heading text).
#
# Whitespace is matched with [^\S\n] (any whitespace except a newline) rather
# than \s: under MULTILINE, \s would let "^\s*" swallow preceding blank lines
# (so match.start() points before the heading) and would let a bare number on
# one line pick up a later line as its title.
HEADING_RE = re.compile(
    r"""(?mx)
    ^[^\S\n]*(?:Section|Sec\.)?[^\S\n]*
    (?P<num>\d+(?:\.\d+)*)        # 1 or 1.2 or 1.2.3
    [.)]?                         # optional trailing "." or ")"
    [^\S\n]+(?P<title>\S.*?)      # non-empty title (one character is enough)
    [^\S\n]*$                     # trailing spaces / "\r" up to end of line
    """
)

@dataclass
class Section:
    """One numbered section found in a source text.

    Instances are built by ``split_by_numbered_headings``; the field comments
    below describe the values that function stores.
    """

    number: str  # dotted heading number as written, e.g. "1.2.3"
    title: str  # heading text with surrounding whitespace stripped
    level: int  # nesting depth: number of dots in ``number`` plus one
    start_char: int  # offset in the source text where the heading match starts
    end_char: int  # offset where the next heading match starts, or len(text)
    content: str  # text between this heading and the next

def split_by_numbered_headings(text: str) -> List[Section]:
    """Cut *text* into sections at every heading matched by ``HEADING_RE``.

    Each section's body runs from the end of its heading match to the start
    of the next heading match (or to the end of *text* for the last one),
    with leading/trailing newlines removed. Text that precedes the first
    heading is not returned. An input without headings yields an empty list.
    """
    headings = list(HEADING_RE.finditer(text))
    # Where each section stops: the start of the following heading, and the
    # end of the text for the final section.
    stops = [nxt.start() for nxt in headings[1:]] + [len(text)]

    result: List[Section] = []
    for heading, stop in zip(headings, stops):
        number = heading.group("num")
        result.append(
            Section(
                number=number,
                title=heading.group("title").strip(),
                level=number.count(".") + 1,
                start_char=heading.start(),
                end_char=stop,
                content=text[heading.end():stop].strip("\n"),
            )
        )
    return result

def _ancestors(num: str) -> List[str]:
    """Return the proper dotted prefixes of *num*, shortest first.

    Example: "1.2.3" -> ["1", "1.2"]; a number without dots has no ancestors.
    """
    lineage: List[str] = []
    prefix = ""
    # Walk every component except the last, growing the prefix as we go.
    for component in num.split(".")[:-1]:
        prefix = f"{prefix}.{component}" if lineage else component
        lineage.append(prefix)
    return lineage

def _section_id(source_path: str, number: str) -> str:
    """Derive a stable 16-hex-character id from a source path and section number.

    The id is the first 16 hex digits of the SHA-1 of "<source_path>::<number>",
    so the same pair always maps to the same id (useful for dedup/versioning).
    """
    key = "{}::{}".format(source_path, number)
    digest = hashlib.sha1(key.encode()).hexdigest()
    return digest[:16]

def sections_to_documents(sections: List[Section], *, source_path: str) -> List[Document]:
    """Convert parsed sections into ``Document`` objects, one per section.

    The page content is "<number> <title>", a blank line, then the section
    body (the heading is kept in the content to improve retrieval quality).
    The metadata records the source path, the section's number/title/level,
    its ancestor numbers, top-level number, heading line, character offsets
    and a stable section id.
    """

    def _to_document(section: Section) -> Document:
        label = f"{section.number} {section.title}"
        text = f"{label}\n\n{section.content}".strip()
        metadata: Dict[str, Any] = {
            "source": source_path,
            "section_number": section.number,                 # "1.2.3"
            "section_title": section.title,                   # "From airport"
            "level": section.level,                           # 1,2,3...
            "ancestors": _ancestors(section.number),          # ["1","1.2"]
            "top_level": int(section.number.split(".")[0]),   # 1,2,3...
            "heading": label,
            "start_char": section.start_char,
            "end_char": section.end_char,
            "section_id": _section_id(source_path, section.number),
        }
        return Document(page_content=text, metadata=metadata)

    return [_to_document(section) for section in sections]