Fix utils.extract_words & .resize_object, + tests

- Fixes `.extract_words`, which had been returning incorrect results when `horizontal_ltr = False`
- Fixes `.resize_object`, which had been failing in various permutations
- Brings utils.py test coverage to 100%
jsvine · Aug 1, 2020 · d16aa13 · d16aa13
1 parent 8e2a166
commit d16aa13
Show file tree

Hide file tree

Showing 4 changed files with 204 additions and 78 deletions.
diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -216,29 +216,11 @@ def sorter(x):
         largest = list(sorted(tables, key=sorter))[0]
         return largest.extract()
 
-    def extract_text(
-        self,
-        x_tolerance=utils.DEFAULT_X_TOLERANCE,
-        y_tolerance=utils.DEFAULT_Y_TOLERANCE,
-    ):
-
-        return utils.extract_text(
-            self.chars, x_tolerance=x_tolerance, y_tolerance=y_tolerance
-        )
+    def extract_text(self, **kwargs):
+        return utils.extract_text(self.chars, **kwargs)
 
-    def extract_words(
-        self,
-        x_tolerance=utils.DEFAULT_X_TOLERANCE,
-        y_tolerance=utils.DEFAULT_Y_TOLERANCE,
-        keep_blank_chars=False,
-    ):
-
-        return utils.extract_words(
-            self.chars,
-            x_tolerance=x_tolerance,
-            y_tolerance=y_tolerance,
-            keep_blank_chars=keep_blank_chars,
-        )
+    def extract_words(self, **kwargs):
+        return utils.extract_words(self.chars, **kwargs)
 
     def crop(self, bbox):
         return CroppedPage(self, self.decimalize(bbox))

diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py
@@ -3,7 +3,7 @@
 from pdfminer.pdftypes import PDFObjRef
 from decimal import Decimal, ROUND_HALF_UP
 import numbers
-from operator import itemgetter
+from operator import itemgetter, gt, lt, add, sub
 import itertools
 from functools import lru_cache as cache
 
@@ -13,7 +13,7 @@
 
 def cluster_list(xs, tolerance=0):
     tolerance = decimalize(tolerance)
-    if tolerance == 0:
+    if tolerance == Decimal(0):
         return [[x] for x in sorted(xs)]
     if len(xs) < 2:
         return [[x] for x in sorted(xs)]
@@ -112,16 +112,8 @@ def resolve_all(x):
 
 @cache(maxsize=int(10e4))
 def _decimalize(v, q=None):
-    # If already a decimal, just return itself
-    if type(v) == Decimal:
-        return v
-
-    # If tuple/list passed, bulk-convert
-    elif isinstance(v, (tuple, list)):
-        return type(v)(decimalize(x, q) for x in v)
-
     # Convert int-like
-    elif isinstance(v, numbers.Integral):
+    if isinstance(v, numbers.Integral):
         return Decimal(int(v))
 
     # Convert float-like
@@ -211,37 +203,39 @@ def extract_words(
     def process_word_chars(chars, upright):
         x0, top, x1, bottom = objects_to_bbox(chars)
 
-        if upright:
-            if horizontal_ltr:
-                sorted_chars = chars
-            else:
-                sorted_chars = sorted(chars, key=lambda x: -x["x1"])
-        else:
-            if vertical_ttb:
-                sorted_chars = sorted(chars, key=itemgetter("doctop"))
-            else:
-                sorted_chars = sorted(chars, key=lambda x: -x["bottom"])
-
         return {
             "x0": x0,
             "x1": x1,
             "top": top,
             "bottom": bottom,
             "upright": upright,
-            "text": "".join(map(itemgetter("text"), sorted_chars)),
+            "text": "".join(map(itemgetter("text"), chars)),
         }
 
-    def get_line_words(chars, upright, tolerance=DEFAULT_X_TOLERANCE):
+    def get_line_words(chars, upright, tolerance):
         get_text = itemgetter("text")
-        min_key = "x0" if upright else "top"
-        max_key = "x1" if upright else "bottom"
-
-        chars_sorted = sorted(chars, key=itemgetter(min_key))
+        if upright:
+            min_key, max_key = ("x0", "x1") if horizontal_ltr else ("x1", "x0")
+        else:
+            min_key, max_key = ("top", "bottom") if vertical_ttb else ("bottom", "top")
 
         words = []
         current_word = []
 
-        for char in chars_sorted:
+        asc_order = (
+            (upright and horizontal_ltr)
+            or (not upright and vertical_ttb)
+        )
+
+        comp_fn = gt if asc_order else lt
+        tol_fn = add if asc_order else sub
+
+        def sort_key(x):
+            return tol_fn(0, x[min_key])
+
+        sorted_chars = sorted(chars, key=sort_key)
+
+        for char in sorted_chars:
             if not keep_blank_chars and get_text(char).isspace():
                 if len(current_word) > 0:
                     words.append(current_word)
@@ -252,7 +246,8 @@ def get_line_words(chars, upright, tolerance=DEFAULT_X_TOLERANCE):
                 current_word.append(char)
             else:
                 last_char = current_word[-1]
-                if char[min_key] > (last_char[max_key] + tolerance):
+                prev_pos = tol_fn(last_char[max_key], tolerance)
+                if comp_fn(char[min_key], prev_pos):
                     words.append(current_word)
                     current_word = []
                 current_word.append(char)
@@ -417,36 +412,26 @@ def resize_object(obj, key, value):
     assert key in ("x0", "x1", "top", "bottom")
     old_value = obj[key]
     diff = value - old_value
-    if key in ("x0", "x1"):
-        if key == "x0":
-            assert value <= obj["x1"]
-        else:
-            assert value >= obj["x0"]
-        new_items = (
-            (key, value),
-            ("width", obj["width"] + diff),
-        )
-    if key == "top":
+    new_items = [
+        (key, value),
+    ]
+    if key == "x0":
+        assert value <= obj["x1"]
+        new_items.append(("width", obj["x1"] - value))
+    elif key == "x1":
+        assert value >= obj["x0"]
+        new_items.append(("width", value - obj["x0"]))
+    elif key == "top":
         assert value <= obj["bottom"]
-        new_items = [
-            (key, value),
-            ("doctop", obj["doctop"] + diff),
-            ("height", obj["height"] - diff),
-        ]
+        new_items.append(("doctop", obj["doctop"] + diff))
+        new_items.append(("height", obj["height"] - diff))
         if "y1" in obj:
-            new_items += [
-                ("y1", obj["y1"] - diff),
-            ]
-    if key == "bottom":
+            new_items.append(("y1", obj["y1"] - diff))
+    elif key == "bottom":
         assert value >= obj["top"]
-        new_items = [
-            (key, value),
-            ("height", obj["height"] + diff),
-        ]
+        new_items.append(("height", obj["height"] + diff))
         if "y0" in obj:
-            new_items += [
-                ("y0", obj["y0"] - diff),
-            ]
+            new_items.append(("y0", obj["y0"] - diff))
     return obj.__class__(tuple(obj.items()) + tuple(new_items))
 
 

diff --git a/tests/pdfs/issue-192-example.pdf b/tests/pdfs/issue-192-example.pdf
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python
+import unittest
+import pytest
+import pandas as pd
+import pdfplumber
+from pdfplumber import utils
+from pdfminer.pdfparser import PDFObjRef
+from pdfminer.psparser import PSLiteral
+from decimal import Decimal
+import sys, os
+
+import logging
+logging.disable(logging.ERROR)
+
+HERE = os.path.abspath(os.path.dirname(__file__))
+
+class Test(unittest.TestCase):
+
+    @classmethod
+    def setup_class(self):
+        path = os.path.join(HERE, "pdfs/pdffill-demo.pdf")
+        self.pdf = pdfplumber.open(path)
+
+    @classmethod
+    def teardown_class(self):
+        self.pdf.close()
+
+    def test_cluster_list(self):
+        a = [1, 2, 3, 4]
+        assert utils.cluster_list(a) == [[x] for x in a]
+        assert utils.cluster_list(a, tolerance=1) == [a]
+
+        a = [1, 2, 5, 6]
+        assert utils.cluster_list(a, tolerance=1) == [[1, 2], [5, 6]]
+
+    def test_cluster_objects(self):
+        a = ["a", "ab", "abc", "b"]
+        assert utils.cluster_objects(a, len, 0) == [["a", "b"], ["ab"], ["abc"]]
+
+    def test_resolve(self):
+        annot = self.pdf.annots[0]
+        annot_ad0 = utils.resolve(annot["data"]["A"]["D"][0])
+        assert annot_ad0["MediaBox"] == [0, 0, 612, 792]
+
+    def test_resolve_all(self):
+        info = self.pdf.doc.xrefs[0].trailer["Info"]
+        assert type(info) == PDFObjRef
+        a = [ { "info": info } ]
+        a_res = utils.resolve_all(a)
+        assert a_res[0]["info"]["Producer"] == self.pdf.doc.info[0]["Producer"]
+
+    def test_decimalize(self):
+        d = Decimal("1.011")
+        assert utils.decimalize(1.011) == d
+        assert [ utils.decimalize(1.011) ] == [ d ]
+        assert utils.decimalize(d) == d
+        assert id(utils.decimalize(d)) == id(d)
+        assert utils.decimalize(1) == Decimal("1")
+        with pytest.raises(ValueError):
+            utils.decimalize("1")
+
+    def test_decode_psl_list(self):
+        a = [ PSLiteral("test"), "test_2" ]
+        assert utils.decode_psl_list(a) == ["test", "test_2"]
+
+    def test_extract_words(self):
+        path = os.path.join(HERE, "pdfs/issue-192-example.pdf")
+        with pdfplumber.open(path) as pdf:
+            p = pdf.pages[0]
+            words = p.extract_words(vertical_ttb=False)
+            words_rtl = p.extract_words(horizontal_ltr=False)
+
+        assert words[0]["text"] == "Agaaaaa:"
+        vertical = [w for w in words if w["upright"] == 0]
+        assert vertical[0]["text"] == "Aaaaaabag8"
+        assert words_rtl[1]["text"] == "baaabaaA/AAA"
+
+    def test_extract_text(self):
+        text = self.pdf.pages[0].extract_text()
+        goal = "\n".join([
+            "First Page Previous Page Next Page Last Page",
+            "Print",
+            "PDFill: PDF Drawing",
+            "You can open a PDF or create a blank PDF by PDFill.",
+            "Online Help",
+            "Here are the PDF drawings created by PDFill",
+            "Please save into a new PDF to see the effect!",
+            "Goto Page 2: Line Tool",
+            "Goto Page 3: Arrow Tool",
+            "Goto Page 4: Tool for Rectangle, Square and Rounded Corner",
+            "Goto Page 5: Tool for Circle, Ellipse, Arc, Pie",
+            "Goto Page 6: Tool for Basic Shapes",
+            "Goto Page 7: Tool for Curves",
+            "Here are the tools to change line width, style, arrow style and colors",
+        ])
+
+        assert text == goal
+        assert self.pdf.pages[0].crop((0, 0, 0, 0)).extract_text() == None
+
+    def test_resize_object(self):
+        obj = {
+            "x0": 5,
+            "x1": 10,
+            "top": 20,
+            "bottom": 30,
+            "width": 5,
+            "height": 10,
+            "doctop": 120,
+            "y0": 40,
+            "y1": 50,
+        }
+        assert utils.resize_object(obj, "x0", 0) == {
+            "x0": 0,
+            "x1": 10,
+            "top": 20,
+            "doctop": 120,
+            "bottom": 30,
+            "width": 10,
+            "height": 10,
+            "y0": 40,
+            "y1": 50,
+        }
+        assert utils.resize_object(obj, "x1", 50) == {
+            "x0": 5,
+            "x1": 50,
+            "top": 20,
+            "doctop": 120,
+            "bottom": 30,
+            "width": 45,
+            "height": 10,
+            "y0": 40,
+            "y1": 50,
+        }
+        assert utils.resize_object(obj, "top", 0) == {
+            "x0": 5,
+            "x1": 10,
+            "top": 0,
+            "doctop": 100,
+            "bottom": 30,
+            "height": 30,
+            "width": 5,
+            "y0": 40,
+            "y1": 70,
+        }
+        assert utils.resize_object(obj, "bottom", 40) == {
+            "x0": 5,
+            "x1": 10,
+            "top": 20,
+            "doctop": 120,
+            "bottom": 40,
+            "height": 20,
+            "width": 5,
+            "y0": 30,
+            "y1": 50,
+        }
+
+    def test_filter_edges(self):
+        with pytest.raises(ValueError):
+            utils.filter_edges([], "x")