Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding extract_text_dir_sensitive #1040

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ All notable changes to this project will be documented in this file. The format

- Add support for marked-content sequences, represented by `mcid` and `tag` attributes on `char`/`rect`/`line`/`curve`/`image` objects (h/t @dhdaines). ([#961](https://github.com/jsvine/pdfplumber/pulls/961))
- Add `gs_path` argument to `pdfplumber.open(...)` and `pdfplumber.repair(...)`, to allow passing a custom Ghostscript path to be used for repairing. ([#953](https://github.com/jsvine/pdfplumber/issues/953))
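- Add `utils.text.extract_text_dir_sensitive(...)`, which extracts text using explicitly specified character and line directions (`char_dir`, `line_dir`). ([#848](https://github.com/jsvine/pdfplumber/issues/848))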

### Fixed

Expand Down
130 changes: 128 additions & 2 deletions pdfplumber/utils/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,18 @@
import re
import string
from operator import itemgetter
from typing import Any, Dict, Generator, List, Match, Optional, Pattern, Tuple, Union
from typing import (
Any,
Dict,
Generator,
List,
Literal,
Match,
Optional,
Pattern,
Tuple,
Union,
)

from .._typing import T_num, T_obj, T_obj_iter, T_obj_list
from .clustering import cluster_objects
Expand Down Expand Up @@ -72,7 +83,6 @@ def search(
return_chars: bool = True,
main_group: int = 0,
) -> List[Dict[str, Any]]:

if isinstance(pattern, Pattern):
if regex is False:
raise ValueError(
Expand Down Expand Up @@ -562,6 +572,118 @@ def extract_text_simple(
return "\n".join(collate_line(c, x_tolerance) for c in clustered)


def extract_text_dir_sensitive(
    chars: T_obj_list,
    x_tolerance: T_num = 1,
    y_tolerance: T_num = 1,
    x_tolerance_ratio: Union[int, float, None] = None,
    y_tolerance_ratio: Union[int, float, None] = None,
    char_dir: Literal["ltr", "rtl", "ttb", "btt"] = "ltr",
    line_dir: Literal["ltr", "rtl", "ttb", "btt"] = "ttb",
) -> str:
    """
    Extract text from `chars`, given an explicit character direction and
    line direction.

    The chars are first clustered into lines, then each line's chars are
    sorted and joined, with a single space inserted wherever the gap
    between consecutive chars exceeds the applicable tolerance. Lines are
    joined with newlines.

    Parameters:
        chars: The char objects to extract text from.
        x_tolerance: Tolerance applied when comparing horizontal
            coordinates (`x0`/`x1`).
        y_tolerance: Tolerance applied when comparing vertical
            coordinates (`doctop`/`bottom`).
        x_tolerance_ratio: If not None, overrides `x_tolerance` with
            `set_tolerance(chars[0], x_tolerance_ratio)`.
        y_tolerance_ratio: If not None, overrides `y_tolerance` with
            `set_tolerance(chars[0], y_tolerance_ratio)`.
        char_dir: Direction in which characters progress within a line.
        line_dir: Direction in which lines progress.

    Returns:
        The extracted text; an empty string if `chars` is empty.

    Raises:
        KeyError: If `(char_dir, line_dir)` is not one of the eight
            supported combinations.
    """
    # Per-combination settings:
    #   line_cluster_key  - coordinate used to cluster chars into lines
    #   char_cluster_keys - (sort/start key, end key) used within a line
    #   reverse_lines     - reverse the line clusters before collating
    #   reverse_words     - reverse the list of collated lines (despite
    #                       its name, it operates on whole lines)
    #   reverse_chars     - reverse the sorted chars within each line
    dir_key: Dict[Tuple[str, str], Dict[str, Any]] = {
        ("ltr", "ttb"): {
            "line_cluster_key": "doctop",
            "char_cluster_keys": ("x0", "x1"),
            "reverse_lines": False,
            "reverse_words": False,
            "reverse_chars": False,
        },
        ("ttb", "rtl"): {
            "line_cluster_key": "x0",
            "char_cluster_keys": ("x0", "x1"),
            "reverse_lines": True,
            "reverse_words": False,
            "reverse_chars": False,
        },
        ("rtl", "btt"): {
            "line_cluster_key": "doctop",
            "char_cluster_keys": ("doctop", "bottom"),
            "reverse_lines": False,
            "reverse_words": True,
            "reverse_chars": False,
        },
        ("btt", "ltr"): {
            "line_cluster_key": "x0",
            "char_cluster_keys": ("x0", "x1"),
            "reverse_lines": True,
            "reverse_words": True,
            "reverse_chars": False,
        },
        ("rtl", "ttb"): {
            "line_cluster_key": "doctop",
            "char_cluster_keys": ("bottom", "bottom"),
            "reverse_lines": True,
            "reverse_words": True,
            "reverse_chars": False,
        },
        ("btt", "rtl"): {
            "line_cluster_key": "x0",
            "char_cluster_keys": ("x0", "x1"),
            "reverse_lines": False,
            "reverse_words": True,
            "reverse_chars": False,
        },
        ("ltr", "btt"): {
            "line_cluster_key": "doctop",
            "char_cluster_keys": ("x0", "x1"),
            "reverse_lines": False,
            "reverse_words": True,
            "reverse_chars": False,
        },
        ("ttb", "ltr"): {
            "line_cluster_key": "x0",
            "char_cluster_keys": ("x0", "x1"),
            "reverse_lines": False,
            "reverse_words": False,
            "reverse_chars": False,
        },
    }

    try:
        params = dir_key[(char_dir, line_dir)]
    except KeyError:
        # Same exception type as a bare lookup, but with a usable message.
        raise KeyError(
            f"Unsupported direction combination: char_dir={char_dir!r}, "
            f"line_dir={line_dir!r}"
        ) from None

    if not chars:
        # Nothing to extract; also avoids indexing chars[0] below.
        return ""

    # Assigning through locals() does not rebind function locals, so the
    # ratio overrides have to be applied explicitly.
    if x_tolerance_ratio is not None:
        x_tolerance = set_tolerance(chars[0], x_tolerance_ratio)
    if y_tolerance_ratio is not None:
        y_tolerance = set_tolerance(chars[0], y_tolerance_ratio)

    # Choose the tolerance by the axis of the coordinate being compared:
    # horizontal keys use x_tolerance, vertical keys use y_tolerance.
    axis_tolerance: Dict[str, Any] = {
        "x0": x_tolerance,
        "x1": x_tolerance,
        "doctop": y_tolerance,
        "bottom": y_tolerance,
    }

    line_key = params["line_cluster_key"]
    k0, k1 = params["char_cluster_keys"]
    line_tolerance = axis_tolerance[line_key]
    char_tolerance = axis_tolerance[k0]

    line_clusters = cluster_objects(chars, itemgetter(line_key), line_tolerance)
    if params["reverse_lines"]:
        line_clusters = line_clusters[::-1]

    lines: List[str] = []
    for cluster in line_clusters:
        ordered = sorted(cluster, key=itemgetter(k0))
        if params["reverse_chars"]:
            ordered.reverse()

        pieces: List[str] = []
        last_k1 = None
        for char in ordered:
            # A gap wider than the tolerance marks a word boundary.
            if last_k1 is not None and char[k0] > last_k1 + char_tolerance:
                pieces.append(" ")
            last_k1 = char[k1]
            pieces.append(char["text"])
        lines.append("".join(pieces))

    if params["reverse_words"]:
        lines.reverse()

    return "\n".join(lines)


def dedupe_chars(chars: T_obj_list, tolerance: T_num = 1) -> T_obj_list:
"""
Removes duplicate chars — those sharing the same text, fontname, size,
Expand All @@ -583,3 +705,7 @@ def yield_unique_chars(chars: T_obj_list) -> Generator[T_obj, None, None]:

deduped = yield_unique_chars(chars)
return sorted(deduped, key=chars.index)


def set_tolerance(t: T_obj, tolerance_ratio: float) -> Any:
    """
    Return `tolerance_ratio` multiplied by the height of `t`, where the
    height is computed as `t["bottom"] - t["top"]`.
    """
    height = t["bottom"] - t["top"]
    return tolerance_ratio * height
Binary file added tests/pdfs/issue-848/robocop_-0.pdf
Binary file not shown.
Binary file added tests/pdfs/issue-848/robocop_-180.pdf
Binary file not shown.
Binary file added tests/pdfs/issue-848/robocop_-270.pdf
Binary file not shown.
Binary file added tests/pdfs/issue-848/robocop_-90.pdf
Binary file not shown.
Binary file added tests/pdfs/issue-848/robocop_0.pdf
Binary file not shown.
Binary file added tests/pdfs/issue-848/robocop_180.pdf
Binary file not shown.
Binary file added tests/pdfs/issue-848/robocop_270.pdf
Binary file not shown.
Binary file added tests/pdfs/issue-848/robocop_90.pdf
Binary file not shown.
25 changes: 25 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,31 @@ def test_extract_words(self):
assert words_rtl[1]["text"] == "baaabaaA/AAA"
assert words_rtl[1]["direction"] == -1

def test_extract_words_dir_sensitve(self):
    """
    Check that `extract_text_dir_sensitive`, given the matching
    (char_dir, line_dir) pair, yields the same text for each rotated or
    mirrored fixture as `extract_text_simple` does for the unrotated one.
    """
    pdf_dir = os.path.join(HERE, "pdfs/issue-848")

    # Baseline: the unrotated document, extracted with the simple method.
    # Resolved relative to HERE so the test does not depend on one
    # developer's absolute checkout path.
    with pdfplumber.open(os.path.join(pdf_dir, "robocop_0.pdf")) as pdf:
        expected = utils.text.extract_text_simple(pdf.pages[0].chars)

    # Fixture-name suffix -> (char_dir, line_dir) expected to read it.
    rotation_key = {
        "0": ("ltr", "ttb"),
        "90": ("ttb", "rtl"),
        "180": ("rtl", "btt"),
        "270": ("btt", "ltr"),
        "-0": ("rtl", "ttb"),
        "-90": ("btt", "rtl"),
        "-180": ("ltr", "btt"),
        "-270": ("ttb", "ltr"),
    }
    for n, (char_dir, line_dir) in rotation_key.items():
        path = os.path.join(pdf_dir, f"robocop_{n}.pdf")
        # Context manager ensures each PDF is closed after use.
        with pdfplumber.open(path) as pdf:
            output = utils.text.extract_text_dir_sensitive(
                chars=pdf.pages[0].chars, char_dir=char_dir, line_dir=line_dir
            )
        assert output == expected

def test_extract_words_punctuation(self):
path = os.path.join(HERE, "pdfs/test-punkt.pdf")
with pdfplumber.open(path) as pdf:
Expand Down
Loading