jsvine · dhdaines · Jul 19, 2023 · Jul 19, 2023 · Jul 19, 2023 · Jul 19, 2023
diff --git a/README.md b/README.md
@@ -108,6 +108,7 @@ The `pdfplumber.Page` class is at the core of `pdfplumber`. Most things you'll d
 |`.width`| The page's width.|
 |`.height`| The page's height.|
 |`.objects` / `.chars` / `.lines` / `.rects` / `.curves` / `.images`| Each of these properties is a list, and each list contains one dictionary for each such object embedded on the page. For more detail, see "[Objects](#objects)" below.|
+|`.structure_tree`| A list of `dict`s, one per top-level structure element on the page, giving each element's type and its associated marked content IDs.|
 
 ... and these main methods:
 
@@ -158,6 +159,7 @@ Each object is represented as a simple Python `dict`, with the following propert
 |`bottom`| Distance of bottom of the character from top of page.|
 |`doctop`| Distance of top of character from top of document.|
 |`matrix`| The "current transformation matrix" for this character. (See below for details.)|
+|`mcid`| The marked content section ID for this character, if any (otherwise `None`).|
 |`ncs`|TKTK|
 |`stroking_pattern`|TKTK|
 |`non_stroking_pattern`|TKTK|
@@ -504,6 +506,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
 - [Shannon Shen](https://github.com/lolipopshock)
 - [Matsumoto Toshi](https://github.com/toshi1127)
 - [John West](https://github.com/jwestwsj)
+- [David Huggins-Daines](https://github.com/dhdaines)
 
 ## Contributing
 

diff --git a/docs/structure.md b/docs/structure.md
@@ -0,0 +1,63 @@
+# Structure Tree
+
+Since PDF 1.3 it is possible for a PDF to contain logical structure,
+contained in a *structure tree*.  In conjunction with PDF 1.2 [marked
+content sections](#marked-content-sections) this forms the basis of
+Tagged PDF and other accessibility features.
+
+Unfortunately, since all of these standards are optional and variably
+implemented in PDF authoring tools, and are frequently not enabled by
+default, it is not possible to rely on them to extract the structure
+of a PDF and associated content.  Nonetheless they can be useful as
+features for a heuristic or machine-learning based system, or for
+extracting particular structures such as tables.
+
+Since `pdfplumber`'s API is page-based, the structure is available for
+a particular page, using the `structure_tree` attribute:
+
+    with pdfplumber.open(pdffile) as pdf:
+        for element in pdf.pages[0].structure_tree:
+            print(element["type"], element.get("mcids"))
+            for child in element.get("children", []):
+                print(child["type"], child.get("mcids"))
+
+The `type` field contains the type of the structure element - the
+standard structure types can be seen in section 10.7.3 of [the PDF 1.7
+reference
+document](https://ghostscript.com/~robin/pdf_reference17.pdf#page=898),
+but usually they are rather HTML-like, if created by a recent PDF
+authoring tool (notably, older tools may simply produce `P` for
+everything).
+
+The `mcids` field contains the list of marked content section IDs
+corresponding to this element.  You can use this to match the element
+to words or characters using the API described below.
+
+The `lang` field is often present as well, and contains a language
+code for the text content, e.g. `"EN-US"` or `"FR-CA"`.
+
+The `alt_text` field will be present if the author has helpfully added
+alternate text to an image.  In theory, `title` and `actual_text` may
+also be present, but not all tools seem to support these.
+
+The `id` field is of unknown origin and use.  Please find a PDF that
+contains it so we can test it.
+
+Likewise, attributes for structure elements (which, confusingly, come
+as a *list* of dictionaries) are not supported because I haven't found
+a PDF using them to test with yet.
+
+# Marked Content Sections
+
+The structure of a PDF obviously isn't all that useful unless you can,
+minimally, attach some text to the elements.  This is where marked
+content sections come in.
+
+`pdfplumber` adds an optional field called `mcid` to the items in the
+`objects` and `chars` properties of a page, which tells you which
+marked content section a given character or other object belongs to.
+
+You can propagate `mcid` to the words returned by `extract_words` by
+adding it to the `extra_attrs` argument, e.g.:
+
+    words = pdf.pages[0].extract_words(extra_attrs=["mcid"])
diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -22,13 +22,14 @@
     LTPage,
     LTTextContainer,
 )
-from pdfminer.pdfinterp import PDFPageInterpreter
+from pdfminer.pdfinterp import PDFPageInterpreter, PDFStackT
 from pdfminer.pdfpage import PDFPage
 from pdfminer.psparser import PSLiteral
 
 from . import utils
 from ._typing import T_bbox, T_num, T_obj, T_obj_list
 from .container import Container
+from .structure import get_page_structure
 from .table import T_table_settings, Table, TableFinder, TableSettings
 from .utils import decode_text, resolve_all, resolve_and_decode
 from .utils.text import TextMap
@@ -62,6 +63,7 @@
         "stream",
         "stroke",
         "stroking_color",
+        "mcid",
     ]
 )
 
@@ -115,6 +117,52 @@ def normalize_color(
     return separate_pattern(tuplefied)
 
 
+class PDFPageAggregatorWithMarkedContent(PDFPageAggregator):
+    """Extract layout from a specific page, adding marked-content IDs to
+    objects where found."""
+
+    cur_mcid: Optional[int] = None
+
+    def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
+        """Handle beginning of tag, setting current MCID if any."""
+        if isinstance(props, dict) and "MCID" in props:
+            self.cur_mcid = props["MCID"]
+        else:
+            self.cur_mcid = None
+
+    def end_tag(self) -> None:
+        """Handle end of tag, clearing current MCID."""
+        self.cur_mcid = None
+
+    def tag_cur_item(self) -> None:
+        """Add current MCID to what we hope to be the most recent object created
+        by pdfminer.six (does nothing if the current container is empty)."""
+        # This is somewhat hacky and would not be necessary if
+        # pdfminer.six supported MCIDs.  In reading the code it's
+        # clear that the `render_*` methods will only ever
+        # create one object, but that is far from being guaranteed.
+        # Even if pdfminer.six's API would just return the objects it
+        # creates, we wouldn't have to do this.
+        if self.cur_item._objs:  # avoid IndexError when nothing was created
+            self.cur_item._objs[-1].mcid = self.cur_mcid  # type: ignore
+
+    def render_char(self, *args, **kwargs) -> float:  # type: ignore
+        """Hook for rendering characters, adding the `mcid` attribute."""
+        adv = super().render_char(*args, **kwargs)
+        self.tag_cur_item()
+        return adv
+
+    def render_image(self, *args, **kwargs) -> None:  # type: ignore
+        """Hook for rendering images, adding the `mcid` attribute."""
+        super().render_image(*args, **kwargs)
+        self.tag_cur_item()
+
+    def paint_path(self, *args, **kwargs) -> None:  # type: ignore
+        """Hook for rendering lines and curves, adding the `mcid` attribute."""
+        super().paint_path(*args, **kwargs)
+        self.tag_cur_item()
+
+
 class Page(Container):
     cached_properties: List[str] = Container.cached_properties + ["_layout"]
     is_original: bool = True
@@ -170,11 +218,31 @@ def width(self) -> T_num:
     def height(self) -> T_num:
         return self.bbox[3] - self.bbox[1]
 
+    @property
+    def structure_tree(self) -> T_obj_list:
+        """Return the structure tree for a page.
+
+        This is a list of dictionaries, one per top-level structure
+        element.  Each contains the key `type`, the type of the
+        structure element (see
+        https://ghostscript.com/~robin/pdf_reference17.pdf#page=898),
+        and, if it has any, `mcids`, the marked content IDs of its contents.
+
+        It may also contain the key `children` with child elements,
+        and possibly some other keys like `id`, `lang`, `title`,
+        `alt_text`, and `actual_text` if the PDF provides them.
+
+        """
+        tree = get_page_structure(
+            self.pdf.stream, self.page_number - 1, self.pdf.password
+        )
+        return [child.to_dict() for child in tree.children]
+
     @property
     def layout(self) -> LTPage:
         if hasattr(self, "_layout"):
             return self._layout
-        device = PDFPageAggregator(
+        device = PDFPageAggregatorWithMarkedContent(
             self.pdf.rsrcmgr,
             pageno=self.page_number,
             laparams=self.pdf.laparams,

diff --git a/pdfplumber/structure.py b/pdfplumber/structure.py
@@ -0,0 +1,157 @@
+import ctypes
+from io import BufferedReader, BytesIO
+from typing import TYPE_CHECKING, Callable, Iterator, Optional, Union
+
+import pypdfium2  # type: ignore
+import pypdfium2.raw as pdfium_c  # type: ignore
+
+from ._typing import T_obj
+
+if TYPE_CHECKING:  # pragma: nocover
+    fpdf_structelement_t = ctypes._Pointer[pdfium_c.fpdf_structelement_t__]
+    fpdf_structtree_t = ctypes._Pointer[pdfium_c.fpdf_structtree_t__]
+    c_char_array = ctypes.Array[ctypes.c_char]
+else:
+    fpdf_structelement_t = ctypes._Pointer
+    fpdf_structtree_t = ctypes._Pointer
+    c_char_array = ctypes.Array
+
+
+class PdfStructElement:
+    def __init__(self, raw: fpdf_structelement_t):
+        self.raw = raw
+
+    @property
+    def children(self) -> Iterator["PdfStructElement"]:
+        n_children = pdfium_c.FPDF_StructElement_CountChildren(self.raw)
+        for idx in range(n_children):
+            child = PdfStructElement(
+                pdfium_c.FPDF_StructElement_GetChildAtIndex(self.raw, idx)
+            )
+            if child.type:  # skip children that report no structure type
+                yield child
+
+    def string_accessor(
+        self,
+        pdffunc: Callable[
+            [
+                fpdf_structelement_t,
+                Optional[c_char_array],
+                int,
+            ],
+            int,
+        ],
+    ) -> str:
+        n_bytes = pdffunc(self.raw, None, 0)  # first call only asks for the size
+        buffer = ctypes.create_string_buffer(n_bytes)
+        pdffunc(self.raw, buffer, n_bytes)
+        return buffer.raw[: n_bytes - 2].decode("utf-16-le")  # drop 2-byte NUL
+
+    @property
+    def id(self) -> str:
+        return self.string_accessor(pdfium_c.FPDF_StructElement_GetID)
+
+    @property
+    def lang(self) -> str:
+        return self.string_accessor(pdfium_c.FPDF_StructElement_GetLang)
+
+    @property
+    def title(self) -> str:
+        return self.string_accessor(pdfium_c.FPDF_StructElement_GetTitle)
+
+    @property
+    def type(self) -> str:
+        return self.string_accessor(pdfium_c.FPDF_StructElement_GetType)
+
+    @property
+    def alt_text(self) -> str:
+        return self.string_accessor(pdfium_c.FPDF_StructElement_GetAltText)
+
+    @property
+    def actual_text(self) -> str:
+        return self.string_accessor(pdfium_c.FPDF_StructElement_GetActualText)
+
+    @property
+    def mcid(self) -> Optional[int]:
+        mcid: int = pdfium_c.FPDF_StructElement_GetMarkedContentID(self.raw)
+        if mcid == -1:
+            return None
+        else:
+            return mcid
+
+    @property
+    def mcids(self) -> Iterator[int]:
+        mcid_count = pdfium_c.FPDF_StructElement_GetMarkedContentIdCount(self.raw)
+        if mcid_count == -1:
+            return
+        else:
+            for idx in range(mcid_count):
+                mcid = pdfium_c.FPDF_StructElement_GetMarkedContentIdAtIndex(
+                    self.raw, idx
+                )
+                if mcid != -1:
+                    yield mcid
+
+    def to_dict(self) -> T_obj:
+        eldict: T_obj = {}
+        if self.id:
+            eldict["id"] = self.id  # pragma: nocover
+        if self.lang:
+            eldict["lang"] = self.lang
+        if self.title:
+            eldict["title"] = self.title  # pragma: nocover
+        if self.type:
+            eldict["type"] = self.type
+        if self.alt_text:
+            eldict["alt_text"] = self.alt_text
+        if self.actual_text:
+            eldict["actual_text"] = self.actual_text
+        if self.mcid is not None:  # MCID 0 is valid; only None means "absent"
+            eldict["mcids"] = [self.mcid]
+        else:
+            mcids = list(self.mcids)
+            if mcids:
+                eldict["mcids"] = mcids
+        children = []
+        for child in self.children:
+            if child.type:
+                children.append(child.to_dict())
+        if children:
+            eldict["children"] = children
+        return eldict
+
+
+class PdfStructTree:
+    def __init__(self, raw: fpdf_structtree_t):
+        self.raw = raw
+
+    @classmethod
+    def from_page(cls, page: pypdfium2.PdfPage) -> "PdfStructTree":
+        raw = pdfium_c.FPDF_StructTree_GetForPage(page)
+        return cls(raw)  # use cls so subclasses construct their own type
+
+    @property
+    def children(self) -> Iterator[PdfStructElement]:
+        n_children = pdfium_c.FPDF_StructTree_CountChildren(self.raw)
+        for idx in range(n_children):
+            child = PdfStructElement(
+                pdfium_c.FPDF_StructTree_GetChildAtIndex(self.raw, idx)
+            )
+            if child.type:  # skip children that report no structure type
+                yield child
+
+
+def get_page_structure(
+    stream: Union[BufferedReader, BytesIO],
+    page_ix: int,
+    password: Optional[str] = None,
+) -> PdfStructTree:
+    # If we are working with a file object saved to disk
+    if hasattr(stream, "name"):
+        src = stream.name
+    # If we instead are working with a BytesIO stream
+    else:
+        stream.seek(0)
+        src = stream
+    pdf = pypdfium2.PdfDocument(src, password=password)  # needed if encrypted
+    return PdfStructTree.from_page(pdf[page_ix])
diff --git a/tests/pdfs/2023-06-20-PV.pdf b/tests/pdfs/2023-06-20-PV.pdf
diff --git a/tests/pdfs/figure_structure.pdf b/tests/pdfs/figure_structure.pdf
diff --git a/tests/pdfs/image_structure.pdf b/tests/pdfs/image_structure.pdf
diff --git a/tests/pdfs/pdf_structure.pdf b/tests/pdfs/pdf_structure.pdf
diff --git a/tests/pdfs/word365_structure.pdf b/tests/pdfs/word365_structure.pdf
diff --git a/tests/pdfs/zonage_crlf.pdf b/tests/pdfs/zonage_crlf.pdf
diff --git a/tests/test_convert.py b/tests/test_convert.py
@@ -70,7 +70,7 @@ def test_csv(self):
         assert c.split("\r\n")[9] == (
             "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
             '18.0,12.996,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"'
-            ',DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,'
+            ',,DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,'
         )
 
         io = StringIO()
@@ -125,7 +125,7 @@ def test_cli_csv(self):
         assert res.decode("utf-8").split("\r\n")[9] == (
             "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
             '18.0,12.996,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"'
-            ',DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,'
+            ',,DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,'
         )
 
     def test_cli_csv_exclude(self):
@@ -141,6 +141,7 @@ def test_cli_csv_exclude(self):
                 "3",
                 "--exclude-attrs",
                 "matrix",
+                "mcid",
                 "ncs",
                 "non_stroking_pattern",
                 "stroking_pattern",