Skip to content

Commit

Permalink
Add convert.py/.to_json/.to_csv & improve testcov
Browse files Browse the repository at this point in the history
Moves most of the logic previously in cli.py to convert.py, for usage by
other submodules. Adds Container.to_json and Container.to_csv. Makes
adjustments/fixes to other parts of the library, based on edge-cases
encountered (such as infinite recursion in annotations).
  • Loading branch information
jsvine committed Aug 13, 2020
1 parent 3f4b4b2 commit cbc91c6
Show file tree
Hide file tree
Showing 11 changed files with 296 additions and 108 deletions.
4 changes: 2 additions & 2 deletions README.md
Expand Up @@ -41,9 +41,9 @@ The output will be a CSV containing info about every character, line, and rectangle

| Argument | Description |
|----------|-------------|
|`--format [format]`| `csv` or `json`. The `json` format returns slightly more information; it includes PDF-level metadata and height/width information about each page.|
|`--format [format]`| `csv` or `json`. The `json` format returns more information; it includes PDF-level and page-level metadata, plus dictionary-nested attributes.|
|`--pages [list of pages]`| A space-delimited, `1`-indexed list of pages or hyphenated page ranges. E.g., `1 11-15`, which would return data for pages 1, 11, 12, 13, 14, and 15.|
|`--types [list of object types to extract]`| Choices are `char`, `line`, `curve`, `rect`, `rect_edge`. Defaults to `char`, `line`, `curve`, `rect`.|
|`--types [list of object types to extract]`| Choices are `char`, `rect`, `line`, `curve`, `image`, `annot`. Defaults to all.|

## Python library

Expand Down
104 changes: 19 additions & 85 deletions pdfplumber/cli.py 100755 → 100644
@@ -1,116 +1,50 @@
#!/usr/bin/env python
import pdfplumber
from . import convert
from .pdf import PDF
import argparse
from itertools import chain

try:
from cdecimal import Decimal, ROUND_HALF_UP
except ImportError:
from decimal import Decimal, ROUND_HALF_UP
import unicodecsv
import codecs
import json
import sys


class DecimalEncoder(json.JSONEncoder):
def default(self, o):
if isinstance(o, Decimal):
return float(o.quantize(Decimal(".0001"), rounding=ROUND_HALF_UP))
return super(DecimalEncoder, self).default(o)


def parse_page_spec(p_str):
if "-" in p_str:
return list(range(*map(int, p_str.split("-"))))
start, end = map(int, p_str.split("-"))
return range(start, end + 1)
else:
return [int(p_str)]


def parse_args():
def parse_args(args_raw):
parser = argparse.ArgumentParser("pdfplumber")

stdin = sys.stdin.buffer if sys.version_info[0] >= 3 else sys.stdin
parser.add_argument(
"infile", nargs="?", type=argparse.FileType("rb"), default=stdin
"infile", nargs="?", type=argparse.FileType("rb"), default=sys.stdin.buffer
)

parser.add_argument("--format", choices=["csv", "json"], default="csv")

parser.add_argument("--encoding", default="utf-8")

TYPE_DEFAULTS = ["char", "anno", "line", "curve", "rect"]
parser.add_argument(
"--types",
nargs="+",
choices=TYPE_DEFAULTS + ["rect_edge"],
default=TYPE_DEFAULTS,
default=convert.DEFAULT_TYPES,
choices=convert.DEFAULT_TYPES,
)

parser.add_argument("--pages", nargs="+", type=parse_page_spec)

args = parser.parse_args()
parser.add_argument(
"--indent", type=int, help="Indent level for JSON pretty-printing."
)

args = parser.parse_args(args_raw)
if args.pages is not None:
args.pages = list(chain(*args.pages))
return args


def to_csv(pdf, types, encoding):
objs = []
fields = set()
for t in types:
new_objs = getattr(pdf, t + "s")
if len(new_objs):
objs += new_objs
fields = fields.union(set(new_objs[0].keys()))

first_columns = [
"object_type",
"page_number",
"x0",
"x1",
"y0",
"y1",
"doctop",
"top",
"bottom",
"width",
"height",
]

cols = first_columns + list(sorted(set(fields) - set(first_columns)))
stdout = sys.stdout.buffer if sys.version_info[0] >= 3 else sys.stdout
w = unicodecsv.DictWriter(stdout, fieldnames=cols, encoding=encoding)
w.writeheader()
w.writerows(objs)


def to_json(pdf, types, encoding):
data = {"metadata": pdf.metadata}

def get_page_data(page):
d = dict((t + "s", getattr(page, t + "s")) for t in types)
d["width"] = page.width
d["height"] = page.height
return d

data["pages"] = list(map(get_page_data, pdf.pages))

if hasattr(sys.stdout, "buffer"):
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer, "strict")
json.dump(data, sys.stdout, cls=DecimalEncoder)
else:
json.dump(data, sys.stdout, cls=DecimalEncoder, encoding=encoding)


def main():
args = parse_args()
pdf = pdfplumber.open(args.infile, pages=args.pages)
if args.format == "csv":
to_csv(pdf, args.types, args.encoding)
else:
to_json(pdf, args.types, args.encoding)


if __name__ == "__main__":
main()
def main(args_raw=None):
    """Command-line entry point: parse arguments, open the PDF, and
    dump its objects to stdout in the requested format.

    args_raw: list of CLI argument strings. Defaults to sys.argv[1:],
        evaluated at call time — a literal ``sys.argv[1:]`` default
        would be captured once at import time and silently ignore any
        later changes to argv (and makes the function harder to test).
    """
    if args_raw is None:
        args_raw = sys.argv[1:]
    args = parse_args(args_raw)
    # Select the converter and its format-specific keyword arguments;
    # only the JSON converter accepts an indent option.
    converter = {"csv": convert.to_csv, "json": convert.to_json}[args.format]
    kwargs = {"csv": {}, "json": {"indent": args.indent}}[args.format]
    with PDF.open(args.infile, pages=args.pages) as pdf:
        converter(pdf, sys.stdout, args.types, **kwargs)
6 changes: 5 additions & 1 deletion pdfplumber/container.py
@@ -1,5 +1,5 @@
from itertools import chain
from . import utils
from . import utils, convert


class Container(object):
Expand Down Expand Up @@ -64,3 +64,7 @@ def test(x):
return x["orientation"] == "v"

return list(filter(test, self.edges))


# Attach the converters as methods so every Container subclass
# (e.g. PDF, Page) gains .to_json(...) and .to_csv(...); the functions
# take the container as their first argument, so they bind cleanly.
Container.to_json = convert.to_json
Container.to_csv = convert.to_csv
132 changes: 132 additions & 0 deletions pdfplumber/convert.py
@@ -0,0 +1,132 @@
from .utils import decode_text
from decimal import Decimal, ROUND_HALF_UP
from pdfminer.pdftypes import PDFStream, PDFObjRef
from pdfminer.psparser import PSLiteral
import json
import csv
import base64
from io import StringIO

# Object types extracted by default, for both the CSV and JSON
# converters; each maps to a `<type>s` property on a Page/Container.
DEFAULT_TYPES = [
    "char",
    "rect",
    "line",
    "curve",
    "image",
    "annot",
]

# Columns emitted first (in this order) in CSV output; any remaining
# fields are appended after these, sorted alphabetically.
COLS_TO_PREPEND = [
    "object_type",
    "page_number",
    "x0",
    "x1",
    "y0",
    "y1",
    "doctop",
    "top",
    "bottom",
    "width",
    "height",
]

# Candidate encodings for decoding raw bytestrings, tried in order.
# NOTE: latin-1 can decode any byte sequence, so in practice it acts
# as the universal fallback and the utf-16 entries are rarely reached.
ENCODINGS_TO_TRY = [
    "utf-8",
    "latin-1",
    "utf-16",
    "utf-16le",
]


def to_b64(data_bytes):
    """Return *data_bytes* encoded as an ASCII base64 string."""
    return base64.b64encode(data_bytes).decode("ascii")


def serialize(obj):
    """Recursively convert *obj* into JSON-serializable primitives.

    Handles Decimals (rounded half-up to 4 places, emitted as float),
    lists/tuples (container type preserved), dicts, bytes (decoded via
    ENCODINGS_TO_TRY), pdfminer PDFStream/PSLiteral objects, and falls
    back to str() for anything else.
    """
    t = type(obj)
    # Exact-type checks are mutually exclusive, so branch order does
    # not affect results; built-in types are checked first for clarity.
    if t is Decimal:
        return float(obj.quantize(Decimal(".0001"), rounding=ROUND_HALF_UP))
    # If tuple/list passed, bulk-convert, preserving the container type
    elif t in (list, tuple):
        return t(serialize(x) for x in obj)
    elif t is dict:
        return {k: serialize(v) for k, v in obj.items()}
    elif t is bytes:
        # Try each candidate encoding individually. The previous
        # version wrapped the whole loop in a single try block, so the
        # unconditional `return` meant only the FIRST encoding was ever
        # attempted — non-utf-8 bytes raised instead of falling back.
        for e in ENCODINGS_TO_TRY:
            try:
                return obj.decode(e)
            except UnicodeDecodeError:
                continue
        # If none of the decodings work, raise whatever error decoding
        # with the primary encoding causes. (Unreachable while latin-1
        # is in the list, since latin-1 accepts every byte sequence.)
        return obj.decode(ENCODINGS_TO_TRY[0])  # pragma: no cover
    elif obj is None:
        return None
    elif t in (int, float, str, bool):
        return obj
    elif t is PDFStream:
        # Raw stream data is binary, not text; expose it as base64
        return {"rawdata": to_b64(obj.rawdata)}
    elif t is PSLiteral:
        return decode_text(obj.name)
    else:
        return str(obj)


def to_json(container, stream=None, types=DEFAULT_TYPES, indent=None):
    """Serialize *container* (a whole PDF or a single Page) to JSON.

    When *stream* is provided, the JSON is written to it (returns
    None); otherwise the JSON text is returned as a string. *types*
    selects which object lists to include; *indent* enables
    pretty-printing.
    """

    def page_as_dict(page):
        # Page-level metadata, followed by one "<type>s" list per
        # requested object type.
        info = {
            "page_number": page.page_number,
            "initial_doctop": page.initial_doctop,
            "rotation": page.rotation,
            "cropbox": page.cropbox,
            "mediabox": page.mediabox,
            "bbox": page.bbox,
            "width": page.width,
            "height": page.height,
        }
        info.update((t + "s", getattr(page, t + "s")) for t in types)
        return info

    if hasattr(container, "pages"):
        # Whole-document container: PDF metadata plus every page
        data = {
            "metadata": container.metadata,
            "pages": [page_as_dict(p) for p in container.pages],
        }
    else:
        # Single-page container
        data = page_as_dict(container)

    serialized = serialize(data)

    if stream is None:
        return json.dumps(serialized, indent=indent)
    return json.dump(serialized, stream, indent=indent)


def to_csv(container, stream=None, types=DEFAULT_TYPES):
    """Write the container's objects to *stream* as CSV.

    When *stream* is None, the CSV text is returned as a string;
    otherwise rows are written to *stream* and None is returned.
    *types* selects which object lists to include.
    """
    return_string = stream is None
    if return_string:
        stream = StringIO()

    rows = []

    # Determine the union of fields across all requested object types,
    # using the first object of each type as the schema sample.
    field_names = set()
    for t in types:
        found = getattr(container, t + "s")
        if found:
            rows.extend(found)
            # Dict-valued attributes don't flatten into CSV columns
            field_names.update(
                k for k, v in found[0].items() if type(v) is not dict
            )

    cols = COLS_TO_PREPEND + sorted(field_names - set(COLS_TO_PREPEND))

    writer = csv.DictWriter(stream, fieldnames=cols, extrasaction="ignore")
    writer.writeheader()
    writer.writerows(serialize(rows))
    if return_string:
        stream.seek(0)
        return stream.read()
39 changes: 28 additions & 11 deletions pdfplumber/page.py
Expand Up @@ -2,7 +2,6 @@
from .utils import resolve, resolve_all
from .table import TableFinder
from .container import Container

import re

lt_pat = re.compile(r"^LT")
Expand Down Expand Up @@ -60,30 +59,45 @@ def layout(self):
@property
def annots(self):
def parse(annot):
data = resolve(annot.resolve())
rect = self.decimalize(resolve_all(data["Rect"]))
rect = self.decimalize(annot["Rect"])

a = annot.get("A", {})
extras = {
"uri": a.get("URI"),
"title": annot.get("T"),
"contents": annot.get("Contents"),
}
for k, v in extras.items():
if v is not None:
extras[k] = v.decode("utf-8")

parsed = {
"page_number": self.page_number,
"object_type": "annot",
"x0": rect[0],
"y0": rect[1],
"x1": rect[2],
"y1": rect[3],
"doctop": self.initial_doctop + self.height - rect[3],
"top": self.height - rect[3],
"x0": rect[0],
"bottom": self.height - rect[1],
"x1": rect[2],
"width": rect[2] - rect[0],
"height": rect[3] - rect[1],
"data": data,
}
uri = data.get("A", {}).get("URI")
if uri is not None:
parsed["URI"] = uri.decode("utf-8")
parsed.update(extras)
# Replace the indirect reference to the page dictionary
# with a pointer to our actual page
if "P" in annot:
annot["P"] = self
parsed["data"] = annot
return parsed

raw = resolve(self.page_obj.annots) or []
raw = resolve_all(self.page_obj.annots) or []
return list(map(parse, raw))

@property
def hyperlinks(self):
return [a for a in self.annots if "URI" in a]
return [a for a in self.annots if a["uri"] is not None]

@property
def objects(self):
Expand Down Expand Up @@ -246,6 +260,9 @@ def to_image(self, **conversion_kwargs):
kwargs["resolution"] = DEFAULT_RESOLUTION
return PageImage(self, **kwargs)

def __repr__(self):
return f"<Page:{self.page_number}>"


class DerivedPage(Page):
is_original = False
Expand Down

0 comments on commit cbc91c6

Please sign in to comment.