Skip to content

Commit

Permalink
Handle use_text_flow more consistently
Browse files Browse the repository at this point in the history
As noted in #912, `use_text_flow` was not being handled consistently:
characters and words were re-sorted without first checking whether this
parameter was set to `True`.
  • Loading branch information
jsvine committed Jul 1, 2023
1 parent ae676ae commit b1db5b8
Showing 1 changed file with 16 additions and 6 deletions.
22 changes: 16 additions & 6 deletions pdfplumber/utils/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def to_textmap(
x_shift: T_num = 0,
y_shift: T_num = 0,
y_tolerance: T_num = DEFAULT_Y_TOLERANCE,
use_text_flow: bool = False,
presorted: bool = False,
expand_ligatures: bool = True,
) -> TextMap:
Expand Down Expand Up @@ -213,17 +214,19 @@ def to_textmap(

num_newlines = 0

words_sorted = (
words_sorted_doctop = (
self.tuples
if presorted
else sorted(self.tuples, key=lambda x: (x[0]["doctop"], x[0]["x0"]))
if presorted or use_text_flow
else sorted(self.tuples, key=lambda x: float(x[0]["doctop"]))
)

first_word = words_sorted[0][0]
first_word = words_sorted_doctop[0][0]
doctop_start = first_word["doctop"] - first_word["top"]

for i, ws in enumerate(
cluster_objects(words_sorted, lambda x: float(x[0]["doctop"]), y_tolerance)
cluster_objects(
words_sorted_doctop, lambda x: float(x[0]["doctop"]), y_tolerance
)
):
y_dist = (
(ws[0][0]["doctop"] - (doctop_start + y_shift)) / y_density
Expand All @@ -245,7 +248,14 @@ def to_textmap(
num_newlines += num_newlines_prepend

line_len = 0
for word, chars in sorted(ws, key=lambda x: float(x[0]["x0"])):

line_words_sorted_x0 = (
ws
if presorted or use_text_flow
else sorted(ws, key=lambda x: float(x[0]["x0"]))
)

for word, chars in line_words_sorted_x0:
x_dist = (word["x0"] - x_shift) / x_density if layout else 0
num_spaces_prepend = max(min(1, line_len), round(x_dist) - line_len)
_textmap += [(" ", None)] * num_spaces_prepend
Expand Down

0 comments on commit b1db5b8

Please sign in to comment.