Skip to content

Commit

Permalink
Fix utils.extract_words & .resize_object, + tests
Browse files: browse the repository at this point in the history
- Fixes `.extract_words`, which had been returning incorrect results when `horizontal_ltr = False`
- Fixes `.resize_object`, which had been failing in various permutations
- Brings utils.py test coverage to 100%
  • Loading branch information
jsvine committed Aug 1, 2020
1 parent 8e2a166 commit d16aa13
Show file tree
Hide file tree
Showing 4 changed files with 204 additions and 78 deletions.
26 changes: 4 additions & 22 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,29 +216,11 @@ def sorter(x):
largest = list(sorted(tables, key=sorter))[0]
return largest.extract()

def extract_text(
self,
x_tolerance=utils.DEFAULT_X_TOLERANCE,
y_tolerance=utils.DEFAULT_Y_TOLERANCE,
):

return utils.extract_text(
self.chars, x_tolerance=x_tolerance, y_tolerance=y_tolerance
)
def extract_text(self, **kwargs):
return utils.extract_text(self.chars, **kwargs)

def extract_words(
self,
x_tolerance=utils.DEFAULT_X_TOLERANCE,
y_tolerance=utils.DEFAULT_Y_TOLERANCE,
keep_blank_chars=False,
):

return utils.extract_words(
self.chars,
x_tolerance=x_tolerance,
y_tolerance=y_tolerance,
keep_blank_chars=keep_blank_chars,
)
def extract_words(self, **kwargs):
return utils.extract_words(self.chars, **kwargs)

def crop(self, bbox):
return CroppedPage(self, self.decimalize(bbox))
Expand Down
97 changes: 41 additions & 56 deletions pdfplumber/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pdfminer.pdftypes import PDFObjRef
from decimal import Decimal, ROUND_HALF_UP
import numbers
from operator import itemgetter
from operator import itemgetter, gt, lt, add, sub
import itertools
from functools import lru_cache as cache

Expand All @@ -13,7 +13,7 @@

def cluster_list(xs, tolerance=0):
tolerance = decimalize(tolerance)
if tolerance == 0:
if tolerance == Decimal(0):
return [[x] for x in sorted(xs)]
if len(xs) < 2:
return [[x] for x in sorted(xs)]
Expand Down Expand Up @@ -112,16 +112,8 @@ def resolve_all(x):

@cache(maxsize=int(10e4))
def _decimalize(v, q=None):
# If already a decimal, just return itself
if type(v) == Decimal:
return v

# If tuple/list passed, bulk-convert
elif isinstance(v, (tuple, list)):
return type(v)(decimalize(x, q) for x in v)

# Convert int-like
elif isinstance(v, numbers.Integral):
if isinstance(v, numbers.Integral):
return Decimal(int(v))

# Convert float-like
Expand Down Expand Up @@ -211,37 +203,39 @@ def extract_words(
def process_word_chars(chars, upright):
x0, top, x1, bottom = objects_to_bbox(chars)

if upright:
if horizontal_ltr:
sorted_chars = chars
else:
sorted_chars = sorted(chars, key=lambda x: -x["x1"])
else:
if vertical_ttb:
sorted_chars = sorted(chars, key=itemgetter("doctop"))
else:
sorted_chars = sorted(chars, key=lambda x: -x["bottom"])

return {
"x0": x0,
"x1": x1,
"top": top,
"bottom": bottom,
"upright": upright,
"text": "".join(map(itemgetter("text"), sorted_chars)),
"text": "".join(map(itemgetter("text"), chars)),
}

def get_line_words(chars, upright, tolerance=DEFAULT_X_TOLERANCE):
def get_line_words(chars, upright, tolerance):
get_text = itemgetter("text")
min_key = "x0" if upright else "top"
max_key = "x1" if upright else "bottom"

chars_sorted = sorted(chars, key=itemgetter(min_key))
if upright:
min_key, max_key = ("x0", "x1") if horizontal_ltr else ("x1", "x0")
else:
min_key, max_key = ("top", "bottom") if vertical_ttb else ("bottom", "top")

words = []
current_word = []

for char in chars_sorted:
asc_order = (
(upright and horizontal_ltr)
or (not upright and vertical_ttb)
)

comp_fn = gt if asc_order else lt
tol_fn = add if asc_order else sub

def sort_key(x):
return tol_fn(0, x[min_key])

sorted_chars = sorted(chars, key=sort_key)

for char in sorted_chars:
if not keep_blank_chars and get_text(char).isspace():
if len(current_word) > 0:
words.append(current_word)
Expand All @@ -252,7 +246,8 @@ def get_line_words(chars, upright, tolerance=DEFAULT_X_TOLERANCE):
current_word.append(char)
else:
last_char = current_word[-1]
if char[min_key] > (last_char[max_key] + tolerance):
prev_pos = tol_fn(last_char[max_key], tolerance)
if comp_fn(char[min_key], prev_pos):
words.append(current_word)
current_word = []
current_word.append(char)
Expand Down Expand Up @@ -417,36 +412,26 @@ def resize_object(obj, key, value):
assert key in ("x0", "x1", "top", "bottom")
old_value = obj[key]
diff = value - old_value
if key in ("x0", "x1"):
if key == "x0":
assert value <= obj["x1"]
else:
assert value >= obj["x0"]
new_items = (
(key, value),
("width", obj["width"] + diff),
)
if key == "top":
new_items = [
(key, value),
]
if key == "x0":
assert value <= obj["x1"]
new_items.append(("width", obj["x1"] - value))
elif key == "x1":
assert value >= obj["x0"]
new_items.append(("width", value - obj["x0"]))
elif key == "top":
assert value <= obj["bottom"]
new_items = [
(key, value),
("doctop", obj["doctop"] + diff),
("height", obj["height"] - diff),
]
new_items.append(("doctop", obj["doctop"] + diff))
new_items.append(("height", obj["height"] - diff))
if "y1" in obj:
new_items += [
("y1", obj["y1"] - diff),
]
if key == "bottom":
new_items.append(("y1", obj["y1"] - diff))
elif key == "bottom":
assert value >= obj["top"]
new_items = [
(key, value),
("height", obj["height"] + diff),
]
new_items.append(("height", obj["height"] + diff))
if "y0" in obj:
new_items += [
("y0", obj["y0"] - diff),
]
new_items.append(("y0", obj["y0"] - diff))
return obj.__class__(tuple(obj.items()) + tuple(new_items))


Expand Down
Binary file added tests/pdfs/issue-192-example.pdf
Binary file not shown.
159 changes: 159 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
#!/usr/bin/env python
import unittest
import pytest
import pandas as pd
import pdfplumber
from pdfplumber import utils
from pdfminer.pdfparser import PDFObjRef
from pdfminer.psparser import PSLiteral
from decimal import Decimal
import sys, os

import logging
# Suppress log records of severity ERROR and below for the whole test run.
logging.disable(logging.ERROR)

# Absolute path of the directory containing this test module; the fixture
# PDFs used below are located relative to it.
HERE = os.path.abspath(os.path.dirname(__file__))

class Test(unittest.TestCase):
    """Tests for the helper functions in ``pdfplumber.utils``.

    Most tests share a single ``pdffill-demo.pdf`` document, opened once
    for the whole class; tests that need a different fixture open it
    themselves.
    """

    @classmethod
    def setup_class(cls):
        """Open the shared demo PDF once (pytest xunit-style class hook)."""
        path = os.path.join(HERE, "pdfs/pdffill-demo.pdf")
        cls.pdf = pdfplumber.open(path)

    @classmethod
    def teardown_class(cls):
        """Close the shared demo PDF opened in ``setup_class``."""
        cls.pdf.close()

    def test_cluster_list(self):
        """Values cluster only when neighbours fall within the tolerance."""
        a = [1, 2, 3, 4]
        # Default tolerance of 0: every value ends up in its own cluster.
        assert utils.cluster_list(a) == [[x] for x in a]
        assert utils.cluster_list(a, tolerance=1) == [a]

        a = [1, 2, 5, 6]
        assert utils.cluster_list(a, tolerance=1) == [[1, 2], [5, 6]]

    def test_cluster_objects(self):
        """Objects are grouped by the value of the supplied key function."""
        a = ["a", "ab", "abc", "b"]
        assert utils.cluster_objects(a, len, 0) == [["a", "b"], ["ab"], ["abc"]]

    def test_resolve(self):
        """``resolve`` dereferences an indirect object found in an annotation."""
        annot = self.pdf.annots[0]
        annot_ad0 = utils.resolve(annot["data"]["A"]["D"][0])
        assert annot_ad0["MediaBox"] == [0, 0, 612, 792]

    def test_resolve_all(self):
        """``resolve_all`` dereferences references nested in lists and dicts."""
        info = self.pdf.doc.xrefs[0].trailer["Info"]
        # Precondition: the trailer entry really is an unresolved reference.
        assert isinstance(info, PDFObjRef)
        a = [{"info": info}]
        a_res = utils.resolve_all(a)
        assert a_res[0]["info"]["Producer"] == self.pdf.doc.info[0]["Producer"]

    def test_decimalize(self):
        """Floats and ints become ``Decimal``; unsupported types raise."""
        d = Decimal("1.011")
        assert utils.decimalize(1.011) == d
        assert [utils.decimalize(1.011)] == [d]
        assert utils.decimalize(d) == d
        # An existing Decimal must be returned as the very same object.
        assert utils.decimalize(d) is d
        assert utils.decimalize(1) == Decimal("1")
        with pytest.raises(ValueError):
            utils.decimalize("1")

    def test_decode_psl_list(self):
        """``PSLiteral`` items are decoded to text; plain strings pass through."""
        a = [PSLiteral("test"), "test_2"]
        assert utils.decode_psl_list(a) == ["test", "test_2"]

    def test_extract_words(self):
        """Word extraction honours ``vertical_ttb`` and ``horizontal_ltr``."""
        path = os.path.join(HERE, "pdfs/issue-192-example.pdf")
        with pdfplumber.open(path) as pdf:
            p = pdf.pages[0]
            words = p.extract_words(vertical_ttb=False)
            words_rtl = p.extract_words(horizontal_ltr=False)

        assert words[0]["text"] == "Agaaaaa:"
        # Non-upright words are the vertically set ones.
        vertical = [w for w in words if w["upright"] == 0]
        assert vertical[0]["text"] == "Aaaaaabag8"
        assert words_rtl[1]["text"] == "baaabaaA/AAA"

    def test_extract_text(self):
        """Page text comes back line by line; an empty crop yields ``None``."""
        text = self.pdf.pages[0].extract_text()
        goal = "\n".join([
            "First Page Previous Page Next Page Last Page",
            "Print",
            "PDFill: PDF Drawing",
            "You can open a PDF or create a blank PDF by PDFill.",
            "Online Help",
            "Here are the PDF drawings created by PDFill",
            "Please save into a new PDF to see the effect!",
            "Goto Page 2: Line Tool",
            "Goto Page 3: Arrow Tool",
            "Goto Page 4: Tool for Rectangle, Square and Rounded Corner",
            "Goto Page 5: Tool for Circle, Ellipse, Arc, Pie",
            "Goto Page 6: Tool for Basic Shapes",
            "Goto Page 7: Tool for Curves",
            "Here are the tools to change line width, style, arrow style and colors",
        ])

        assert text == goal
        # A zero-area crop contains no characters at all.
        assert self.pdf.pages[0].crop((0, 0, 0, 0)).extract_text() is None

    def test_resize_object(self):
        """Moving one edge updates that edge and every derived measurement."""
        obj = {
            "x0": 5,
            "x1": 10,
            "top": 20,
            "bottom": 30,
            "width": 5,
            "height": 10,
            "doctop": 120,
            "y0": 40,
            "y1": 50,
        }
        # Moving the left edge changes only x0 and width.
        assert utils.resize_object(obj, "x0", 0) == {
            "x0": 0,
            "x1": 10,
            "top": 20,
            "doctop": 120,
            "bottom": 30,
            "width": 10,
            "height": 10,
            "y0": 40,
            "y1": 50,
        }
        # Moving the right edge changes only x1 and width.
        assert utils.resize_object(obj, "x1", 50) == {
            "x0": 5,
            "x1": 50,
            "top": 20,
            "doctop": 120,
            "bottom": 30,
            "width": 45,
            "height": 10,
            "y0": 40,
            "y1": 50,
        }
        # Moving the top edge shifts top, doctop, height and y1 together.
        assert utils.resize_object(obj, "top", 0) == {
            "x0": 5,
            "x1": 10,
            "top": 0,
            "doctop": 100,
            "bottom": 30,
            "height": 30,
            "width": 5,
            "y0": 40,
            "y1": 70,
        }
        # Moving the bottom edge shifts bottom, height and y0 together.
        assert utils.resize_object(obj, "bottom", 40) == {
            "x0": 5,
            "x1": 10,
            "top": 20,
            "doctop": 120,
            "bottom": 40,
            "height": 20,
            "width": 5,
            "y0": 30,
            "y1": 50,
        }

    def test_filter_edges(self):
        """An unrecognised orientation value is rejected with ``ValueError``."""
        with pytest.raises(ValueError):
            utils.filter_edges([], "x")

0 comments on commit d16aa13

Please sign in to comment.