jsvine · afriedman412 · Nov 4, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@ All notable changes to this project will be documented in this file. The format
 
 - Add support for marked-content sequences, represented by `mcid` and `tag` attributes on `char`/`rect`/`line`/`curve`/`image` objects (h/t @dhdaines). ([#961](https://github.com/jsvine/pdfplumber/pulls/961))
 - Add `gs_path` argument to `pdfplumber.open(...)` and `pdfplumber.repair(...)`, to allow passing a custom Ghostscript path to be used for repairing. ([#953](https://github.com/jsvine/pdfplumber/issues/953))
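+- Add `utils.text.extract_text_dir_sensitive(...)`, for extracting text with explicitly specified character and line directions (`char_dir`/`line_dir`). ([#848](https://github.com/jsvine/pdfplumber/issues/848))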
 
 ### Fixed
 

diff --git a/pdfplumber/utils/text.py b/pdfplumber/utils/text.py
@@ -3,7 +3,18 @@
 import re
 import string
 from operator import itemgetter
-from typing import Any, Dict, Generator, List, Match, Optional, Pattern, Tuple, Union
+from typing import (
+    Any,
+    Dict,
+    Generator,
+    List,
+    Literal,
+    Match,
+    Optional,
+    Pattern,
+    Tuple,
+    Union,
+)
 
 from .._typing import T_num, T_obj, T_obj_iter, T_obj_list
 from .clustering import cluster_objects
@@ -72,7 +83,6 @@ def search(
         return_chars: bool = True,
         main_group: int = 0,
     ) -> List[Dict[str, Any]]:
-
         if isinstance(pattern, Pattern):
             if regex is False:
                 raise ValueError(
@@ -562,6 +572,118 @@ def extract_text_simple(
     return "\n".join(collate_line(c, x_tolerance) for c in clustered)
 
 
+def extract_text_dir_sensitive(
+    chars: T_obj_list,
+    x_tolerance: T_num = 1,
+    y_tolerance: T_num = 1,
+    x_tolerance_ratio: Union[int, float, None] = None,
+    y_tolerance_ratio: Union[int, float, None] = None,
+    char_dir: Literal["ltr", "rtl", "ttb", "btt"] = "ltr",
+    line_dir: Literal["ltr", "rtl", "ttb", "btt"] = "ttb",
+) -> str:
+    """
+    Extracts text from `chars` for an explicit pair of reading directions.
+
+    Chars are clustered into lines on one coordinate, ordered within each
+    line on another, and concatenated; a space is inserted wherever the
+    gap between consecutive chars exceeds the relevant tolerance.
+
+    Both tolerances are expressed in the same units as the chars'
+    coordinates.
+
+    Arguments:
+      chars: char objects to collate; an empty list yields "".
+      x_tolerance: absolute tolerance for comparisons of x-axis
+        coordinates (`x0`, `x1`).
+      y_tolerance: absolute tolerance for comparisons of y-axis
+        coordinates (`doctop`, `top`, `bottom`).
+      x_tolerance_ratio: if not None, `x_tolerance` is replaced by this
+        ratio times the height (`bottom - top`) of the first char.
+      y_tolerance_ratio: if not None, `y_tolerance` is replaced by this
+        ratio times the height (`bottom - top`) of the first char.
+      char_dir: direction in which chars advance within a line.
+      line_dir: direction in which lines advance.
+
+    Returns the collated lines, joined by newlines.
+
+    Raises KeyError if (`char_dir`, `line_dir`) is not one of the eight
+    supported pairs (one horizontal plus one vertical direction).
+    """
+    # Each row maps (char_dir, line_dir) to a 4-tuple:
+    #   - the key on which chars are clustered into lines;
+    #   - the pair of keys (k0, k1): chars are sorted on k0, and a gap runs
+    #     from the previous char's k1 to the next char's k0;
+    #   - whether to reverse the line clusters before collating them;
+    #   - whether to reverse the collated lines afterwards.
+    # There is no per-row flag for reversing chars: no supported pair uses one.
+    # NOTE(review): some rows sort chars on the same axis that is used to
+    # cluster them into lines. The values are carried over unchanged; confirm
+    # them against the rotated fixture PDFs.
+    dir_key: Dict[Tuple[str, str], Tuple[str, Tuple[str, str], bool, bool]] = {
+        # chars left-to-right, lines top-to-bottom
+        ("ltr", "ttb"): ("doctop", ("x0", "x1"), False, False),
+        # chars top-to-bottom, lines right-to-left
+        ("ttb", "rtl"): ("x0", ("x0", "x1"), True, False),
+        # chars right-to-left, lines bottom-to-top
+        ("rtl", "btt"): ("doctop", ("doctop", "bottom"), False, True),
+        # chars bottom-to-top, lines left-to-right
+        ("btt", "ltr"): ("x0", ("x0", "x1"), True, True),
+        # chars right-to-left, lines top-to-bottom
+        ("rtl", "ttb"): ("doctop", ("bottom", "bottom"), True, True),
+        # chars bottom-to-top, lines right-to-left
+        ("btt", "rtl"): ("x0", ("x0", "x1"), False, True),
+        # chars left-to-right, lines bottom-to-top
+        ("ltr", "btt"): ("doctop", ("x0", "x1"), False, True),
+        # chars top-to-bottom, lines left-to-right
+        ("ttb", "ltr"): ("x0", ("x0", "x1"), False, False),
+    }
+
+    # Look the pair up first, so that unsupported directions always raise.
+    line_key, (k0, k1), reverse_lines, reverse_words = dir_key[(char_dir, line_dir)]
+
+    if not chars:
+        # Nothing to collate; this also guards the `chars[0]` lookups below.
+        return ""
+
+    # Writing through `locals()` does not rebind a function's local
+    # variables, so the ratio-derived tolerances are assigned explicitly.
+    if x_tolerance_ratio is not None:
+        x_tolerance = set_tolerance(chars[0], x_tolerance_ratio)
+    if y_tolerance_ratio is not None:
+        y_tolerance = set_tolerance(chars[0], y_tolerance_ratio)
+
+    # Pair each tolerance with the axis of the coordinate it is compared
+    # against: `y_tolerance` for vertical keys, `x_tolerance` otherwise.
+    y_keys = ("doctop", "top", "bottom")
+    line_tolerance = y_tolerance if line_key in y_keys else x_tolerance
+    char_tolerance = y_tolerance if k0 in y_keys else x_tolerance
+
+    # Group the chars into lines by clustering them on `line_key`.
+    line_clusters = cluster_objects(chars, itemgetter(line_key), line_tolerance)
+    if reverse_lines:
+        line_clusters = line_clusters[::-1]
+
+    # Collate each cluster into one string, with its chars ordered on `k0`.
+    lines = []
+    for cluster in line_clusters:
+        parts = []
+        last_k1 = None
+        for char in sorted(cluster, key=itemgetter(k0)):
+            # A gap wider than the tolerance between the previous char's
+            # `k1` edge and this char's `k0` edge becomes a single space.
+            if last_k1 is not None and char[k0] > last_k1 + char_tolerance:
+                parts.append(" ")
+            last_k1 = char[k1]
+            parts.append(char["text"])
+        lines.append("".join(parts))
+
+    if reverse_words:
+        # Despite the flag's name, this reverses the order of whole lines.
+        lines.reverse()
+
+    return "\n".join(lines)
+
+
 def dedupe_chars(chars: T_obj_list, tolerance: T_num = 1) -> T_obj_list:
     """
     Removes duplicate chars — those sharing the same text, fontname, size,
@@ -583,3 +705,7 @@ def yield_unique_chars(chars: T_obj_list) -> Generator[T_obj, None, None]:
 
     deduped = yield_unique_chars(chars)
     return sorted(deduped, key=chars.index)
+
+
+def set_tolerance(t: T_obj, tolerance_ratio: float) -> Any:  # ratio -> abs. tolerance
+    return tolerance_ratio * (t["bottom"] - t["top"])  # ratio times the height of `t`
diff --git a/tests/pdfs/issue-848/robocop_-0.pdf b/tests/pdfs/issue-848/robocop_-0.pdf
diff --git a/tests/pdfs/issue-848/robocop_-180.pdf b/tests/pdfs/issue-848/robocop_-180.pdf
diff --git a/tests/pdfs/issue-848/robocop_-270.pdf b/tests/pdfs/issue-848/robocop_-270.pdf
diff --git a/tests/pdfs/issue-848/robocop_-90.pdf b/tests/pdfs/issue-848/robocop_-90.pdf
diff --git a/tests/pdfs/issue-848/robocop_0.pdf b/tests/pdfs/issue-848/robocop_0.pdf
diff --git a/tests/pdfs/issue-848/robocop_180.pdf b/tests/pdfs/issue-848/robocop_180.pdf
diff --git a/tests/pdfs/issue-848/robocop_270.pdf b/tests/pdfs/issue-848/robocop_270.pdf
diff --git a/tests/pdfs/issue-848/robocop_90.pdf b/tests/pdfs/issue-848/robocop_90.pdf
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -88,6 +88,31 @@ def test_extract_words(self):
         assert words_rtl[1]["text"] == "baaabaaA/AAA"
         assert words_rtl[1]["direction"] == -1
 
+    def test_extract_words_dir_sensitve(self):
+        """
+        Every `robocop_{n}.pdf` variant, read with its matching directions,
+        should yield the text `extract_text_simple` gives for `robocop_0.pdf`.
+        """
+        pdf_dir = os.path.join(HERE, "pdfs/issue-848")
+        with pdfplumber.open(os.path.join(pdf_dir, "robocop_0.pdf")) as pdf:
+            expected = utils.text.extract_text_simple(pdf.pages[0].chars)
+        rotation_key = {
+            "0": ("ltr", "ttb"),
+            "90": ("ttb", "rtl"),
+            "180": ("rtl", "btt"),
+            "270": ("btt", "ltr"),
+            "-0": ("rtl", "ttb"),
+            "-90": ("btt", "rtl"),
+            "-180": ("ltr", "btt"),
+            "-270": ("ttb", "ltr"),
+        }
+        for n, (char_dir, line_dir) in rotation_key.items():
+            with pdfplumber.open(os.path.join(pdf_dir, f"robocop_{n}.pdf")) as pdf:
+                output = utils.text.extract_text_dir_sensitive(
+                    chars=pdf.pages[0].chars, char_dir=char_dir, line_dir=line_dir
+                )
+            assert output == expected
+
     def test_extract_words_punctuation(self):
         path = os.path.join(HERE, "pdfs/test-punkt.pdf")
         with pdfplumber.open(path) as pdf: