In [4]:
import json
import re
from pathlib import Path


In [7]:
# --- Configuration -------------------------------------------------------
# Structured chapter JSON that the chunking cell reads.
input_path = "../data_structured/chapter2_structured.json"
# Destination JSON file for the generated chunk list.
output_path = "../data/pec_chunks_chapter2.json"

# Sliding-window parameters, both measured in characters of cleaned text.
chunk_size = 800  # maximum length of a single chunk
overlap = 80      # characters shared between two consecutive chunks

In [8]:

def _split_with_overlap(text, size, shared):
    """Cut ``text`` into overlapping character windows.

    Args:
        text: Cleaned text to split.
        size: Maximum number of characters per window.
        shared: Number of characters consecutive windows have in common.

    Returns:
        A list of whitespace-stripped windows (individual entries may be
        empty strings). The list is empty only when ``text`` is empty.

    Raises:
        ValueError: If ``shared`` is negative or not smaller than ``size``;
            the window could then never advance.
    """
    step = size - shared
    if shared < 0 or step <= 0:
        raise ValueError(
            f"need 0 <= overlap < chunk_size, got chunk_size={size}, overlap={shared}"
        )
    if not text:
        return []

    # ceil((len - shared) / step) windows cover the whole text. Clamp to one so
    # that texts no longer than `shared` still produce a chunk instead of
    # being dropped silently.
    count = max(1, (len(text) - shared + step - 1) // step)
    # Slicing past the end of the string is safe and simply truncates.
    return [text[i * step:i * step + size].strip() for i in range(count)]


raw = json.loads(Path(input_path).read_text(encoding="utf-8"))
chunks = []

for article in raw:
    # `or []` / `or ""` below also cover keys that are present but null.
    for sec in article.get("sections") or []:
        # A section without subsections is chunked as a single item.
        items = sec.get("subsections") or [sec]

        for sub in items:
            sec_id = (sub.get("section") or "").strip()
            title = (sub.get("title") or "").strip()
            text = sub.get("text") or ""

            # Flatten to one line and strip BEFORE removing the leading
            # section number: the pattern is anchored at ^, so leading
            # whitespace would otherwise leave the number in place.
            flat = " ".join(text.splitlines()).strip()
            full = re.sub(r'^[0-9]+(?:\.[0-9]+)+\s*', '', flat).strip()

            pieces = _split_with_overlap(full, chunk_size, overlap)
            parts = sec_id.split(".")

            for i, piece in enumerate(pieces, start=1):
                if not piece:
                    continue

                # Suffix the id only when the section spans several windows.
                chunk_id = sec_id if len(pieces) == 1 else f"{sec_id}_{i}"

                chunks.append({
                    "id": chunk_id,
                    "text": piece,
                    # A fresh dict per chunk so entries never share state.
                    "metadata": {
                        "chapter": parts[0] if parts else "",
                        "article": ".".join(parts[:2]) if len(parts) >= 2 else "",
                        "section_title": title,
                    },
                })

out_file = Path(output_path)
# Create the target directory on first run so the write cannot fail on it.
out_file.parent.mkdir(parents=True, exist_ok=True)
# Kept as the last expression so the cell shows the number of characters written.
out_file.write_text(
    json.dumps(chunks, ensure_ascii=False, indent=2),
    encoding="utf-8"
)



457326