In [1]:
import re
import xml.etree.ElementTree as ET
import pandas as pd

###############################################################################
# 1. Helpers
###############################################################################

REF_RE = re.compile(
    r"""                 # jci1.3:c:BWBR0011353&artikel=10.2&lid=7
        (?P<law>BWBR\d+) # BWBR-number
        .*?&artikel=(?P<artikel>[^&]+)  # artikel=10.2                         │
        (?:&lid=(?P<lid>[^&]+))?        # optional lid                        │
    """,
    re.VERBOSE,
)

def parse_reference(jci_doc: str):
    """
    Extract (law_id, artikel_id, lid_id or None) from the extref/@doc string.
    """
    m = REF_RE.search(jci_doc)
    return (m.group("law"), m.group("artikel"), m.group("lid") or None) if m else None


def normalise_text(elem):
    """
    Collect full, whitespace-normalised text inside <al>, preserving <extref> text.
    """
    parts = []
    for node in elem.iter():
        if node.tag == "extref":
            parts.append(node.text or "")
        else:
            if node is elem:
                # skip root wrapper; its .text is first chunk
                if node.text:
                    parts.append(node.text)
            else:
                if node.text:
                    parts.append(node.text)
        if node.tail:
            parts.append(node.tail)
    return " ".join("".join(parts).split())


###############################################################################
# 2. Main extractor
###############################################################################

def extract_lids_from_article(xml_path) -> pd.DataFrame:
    tree = ET.parse(xml_path)
    root = tree.getroot()

    rows = []
    for artikel in root.iter("artikel"):
        artikel_id = artikel.attrib.get("label") or artikel.attrib.get("id")
        law_text_id = artikel.attrib.get("bwb-ng-variabel-deel", "").split("/")[0]

        for lid in artikel.iter("lid"):
            lid_id = lid.attrib.get("label-id") or lid.attrib.get("id")  # fallback
            al = lid.find("al")
            text = normalise_text(al)

            # collect references inside this lid
            ref_tuples = []
            for ext in lid.iter("extref"):
                ref = parse_reference(ext.attrib.get("doc", ""))
                if ref:
                    ref_tuples.append(ref)

            rows.append(
                dict(
                    law_text_id=law_text_id,
                    artikel_id=artikel_id,
                    lid_id=lid_id,
                    text=text,
                    reference_ids=tuple(ref_tuples),
                )
            )

    return pd.DataFrame(rows)


###############################################################################
# 3. Demo with the snippet you supplied
###############################################################################

# If your XML lives in a string (as in your example) you can parse it via ET.fromstring
from io import StringIO

snippet = """<artikel bwb-ng-variabel-deel="/HoofdstukI/Artikel2" stam-id="2689373" versie-id="30518442" id="C43751131" label-id="2634774" inwerking="2025-01-01" label="Artikel 2" bron="Stb.2024-435" effect="wijziging" ondertekening_bron="2024-12-18" publicatie_bron="2024-12-23" publicatie_iwt="2024-12-23" status="goed">
<lid bwb-ng-variabel-deel="/HoofdstukI/Artikel2/Lid7" label-id="2634774L7">
<lidnr status="officieel">7</lidnr>
<al>
Bij het begin van het kalenderjaar ...
<extref doc="jci1.3:c:BWBR0011353&artikel=10.2" label-id="2786694" verwijzing-id="2974014" bwb-id="BWBR0011353">artikel 10.2 van de Wet inkomstenbelasting 2001</extref>
, en vervolgens ...
</al>
<meta-data> ... </meta-data>
</lid>
</artikel>
"""

demo_df = extract_lids_from_article(StringIO(f"<root>{snippet}</root>"))
print(demo_df)


ParseError: not well-formed (invalid token): line 6, column 41 (<string>)