diff --git a/README.md b/README.md index 474eda2..be004c7 100644 --- a/README.md +++ b/README.md @@ -41,9 +41,9 @@ The output will be a CSV containing info about every character, line, and rectan | Argument | Description | |----------|-------------| -|`--format [format]`| `csv` or `json`. The `json` format returns slightly more information; it includes PDF-level metadata and height/width information about each page.| +|`--format [format]`| `csv` or `json`. The `json` format returns more information; it includes PDF-level and page-level metadata, plus dictionary-nested attributes.| |`--pages [list of pages]`| A space-delimited, `1`-indexed list of pages or hyphenated page ranges. E.g., `1, 11-15`, which would return data for pages 1, 11, 12, 13, 14, and 15.| -|`--types [list of object types to extract]`| Choices are `char`, `line`, `curve`, `rect`, `rect_edge`. Defaults to `char`, `line`, `curve`, `rect`.| +|`--types [list of object types to extract]`| Choices are `char`, `rect`, `line`, `curve`, `image`, `annot`. Defaults to all.| ## Python library diff --git a/pdfplumber/cli.py b/pdfplumber/cli.py old mode 100755 new mode 100644 index a2d1c5e..3b72193 --- a/pdfplumber/cli.py +++ b/pdfplumber/cli.py @@ -1,116 +1,50 @@ #!/usr/bin/env python -import pdfplumber +from . import convert +from .pdf import PDF import argparse from itertools import chain - -try: - from cdecimal import Decimal, ROUND_HALF_UP -except ImportError: - from decimal import Decimal, ROUND_HALF_UP -import unicodecsv -import codecs -import json import sys -class DecimalEncoder(json.JSONEncoder): - def default(self, o): - if isinstance(o, Decimal): - return float(o.quantize(Decimal(".0001"), rounding=ROUND_HALF_UP)) - return super(DecimalEncoder, self).default(o) - - def parse_page_spec(p_str): if "-" in p_str: - return list(range(*map(int, p_str.split("-")))) + start, end = map(int, p_str.split("-")) + return range(start, end + 1) else: return [int(p_str)] -def parse_args(): +def parse_args(args_raw): parser = argparse.ArgumentParser("pdfplumber") - stdin = sys.stdin.buffer if sys.version_info[0] >= 3 else sys.stdin parser.add_argument( - "infile", nargs="?", type=argparse.FileType("rb"), default=stdin + "infile", nargs="?", type=argparse.FileType("rb"), default=sys.stdin.buffer ) parser.add_argument("--format", choices=["csv", "json"], default="csv") - parser.add_argument("--encoding", default="utf-8") - - TYPE_DEFAULTS = ["char", "anno", "line", "curve", "rect"] parser.add_argument( "--types", nargs="+", - choices=TYPE_DEFAULTS + ["rect_edge"], - default=TYPE_DEFAULTS, + default=convert.DEFAULT_TYPES, + choices=convert.DEFAULT_TYPES, ) parser.add_argument("--pages", nargs="+", type=parse_page_spec) - args = parser.parse_args() + parser.add_argument( + "--indent", type=int, help="Indent level for JSON pretty-printing." + ) + + args = parser.parse_args(args_raw) if args.pages is not None: args.pages = list(chain(*args.pages)) return args -def to_csv(pdf, types, encoding): - objs = [] - fields = set() - for t in types: - new_objs = getattr(pdf, t + "s") - if len(new_objs): - objs += new_objs - fields = fields.union(set(new_objs[0].keys())) - - first_columns = [ - "object_type", - "page_number", - "x0", - "x1", - "y0", - "y1", - "doctop", - "top", - "bottom", - "width", - "height", - ] - - cols = first_columns + list(sorted(set(fields) - set(first_columns))) - stdout = sys.stdout.buffer if sys.version_info[0] >= 3 else sys.stdout - w = unicodecsv.DictWriter(stdout, fieldnames=cols, encoding=encoding) - w.writeheader() - w.writerows(objs) - - -def to_json(pdf, types, encoding): - data = {"metadata": pdf.metadata} - - def get_page_data(page): - d = dict((t + "s", getattr(page, t + "s")) for t in types) - d["width"] = page.width - d["height"] = page.height - return d - - data["pages"] = list(map(get_page_data, pdf.pages)) - - if hasattr(sys.stdout, "buffer"): - sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer, "strict") - json.dump(data, sys.stdout, cls=DecimalEncoder) - else: - json.dump(data, sys.stdout, cls=DecimalEncoder, encoding=encoding) - - -def main(): - args = parse_args() - pdf = pdfplumber.open(args.infile, pages=args.pages) - if args.format == "csv": - to_csv(pdf, args.types, args.encoding) - else: - to_json(pdf, args.types, args.encoding) - - -if __name__ == "__main__": - main() +def main(args_raw=sys.argv[1:]): + args = parse_args(args_raw) + converter = {"csv": convert.to_csv, "json": convert.to_json}[args.format] + kwargs = {"csv": {}, "json": {"indent": args.indent}}[args.format] + with PDF.open(args.infile, pages=args.pages) as pdf: + converter(pdf, sys.stdout, args.types, **kwargs) diff --git a/pdfplumber/container.py b/pdfplumber/container.py index e483674..2a142c6 100644 --- a/pdfplumber/container.py +++ b/pdfplumber/container.py @@ -1,5 +1,5 @@ from itertools import chain -from . import utils +from . import utils, convert class Container(object): @@ -64,3 +64,7 @@ def test(x): return x["orientation"] == "v" return list(filter(test, self.edges)) + + +Container.to_json = convert.to_json +Container.to_csv = convert.to_csv diff --git a/pdfplumber/convert.py b/pdfplumber/convert.py new file mode 100644 index 0000000..fb34003 --- /dev/null +++ b/pdfplumber/convert.py @@ -0,0 +1,132 @@ +from .utils import decode_text +from decimal import Decimal, ROUND_HALF_UP +from pdfminer.pdftypes import PDFStream, PDFObjRef +from pdfminer.psparser import PSLiteral +import json +import csv +import base64 +from io import StringIO + +DEFAULT_TYPES = [ + "char", + "rect", + "line", + "curve", + "image", + "annot", +] + +COLS_TO_PREPEND = [ + "object_type", + "page_number", + "x0", + "x1", + "y0", + "y1", + "doctop", + "top", + "bottom", + "width", + "height", +] + +ENCODINGS_TO_TRY = [ + "utf-8", + "latin-1", + "utf-16", + "utf-16le", +] + + +def to_b64(data_bytes): + return base64.b64encode(data_bytes).decode("ascii") + + +def serialize(obj): + # Convert int-like + t = type(obj) + if t is Decimal: + return float(obj.quantize(Decimal(".0001"), rounding=ROUND_HALF_UP)) + # If tuple/list passed, bulk-convert + elif t in (list, tuple): + return t(serialize(x) for x in obj) + elif t is dict: + return {k: serialize(v) for k, v in obj.items()} + elif t is PDFStream: + return {"rawdata": to_b64(obj.rawdata)} + elif t is PSLiteral: + return decode_text(obj.name) + elif t is bytes: + try: + for e in ENCODINGS_TO_TRY: + return obj.decode(e) + # If none of the decodings work, raise whatever error + # decoding with utf-8 causes + except: # pragma: no cover + obj.decode(ENCODINGS_TO_TRY[0]) + elif obj is None: + return None + elif t in (int, float, str, bool): + return obj + else: + return str(obj) + + +def to_json(container, stream=None, types=DEFAULT_TYPES, indent=None): + def page_to_dict(page): + d = { + "page_number": page.page_number, + "initial_doctop": page.initial_doctop, + "rotation": page.rotation, + "cropbox": page.cropbox, + "mediabox": page.mediabox, + "bbox": page.bbox, + "width": page.width, + "height": page.height, + } + for t in types: + d[t + "s"] = getattr(page, t + "s") + return d + + if hasattr(container, "pages"): + data = { + "metadata": container.metadata, + "pages": list(map(page_to_dict, container.pages)), + } + else: + data = page_to_dict(container) + + serialized = serialize(data) + + if stream is None: + return json.dumps(serialized, indent=indent) + else: + return json.dump(serialized, stream, indent=indent) + + +def to_csv(container, stream=None, types=DEFAULT_TYPES): + if stream is None: + stream = StringIO() + to_string = True + else: + to_string = False + + objs = [] + + # Determine set of fields for all objects + fields = set() + for t in types: + new_objs = getattr(container, t + "s") + if len(new_objs): + objs += new_objs + new_keys = [k for k, v in new_objs[0].items() if type(v) is not dict] + fields = fields.union(set(new_keys)) + + cols = COLS_TO_PREPEND + list(sorted(set(fields) - set(COLS_TO_PREPEND))) + + w = csv.DictWriter(stream, fieldnames=cols, extrasaction="ignore") + w.writeheader() + w.writerows(serialize(objs)) + if to_string: + stream.seek(0) + return stream.read() diff --git a/pdfplumber/page.py b/pdfplumber/page.py index 38c9667..1a4f232 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -2,7 +2,6 @@ from .utils import resolve, resolve_all from .table import TableFinder from .container import Container - import re lt_pat = re.compile(r"^LT") @@ -60,30 +59,45 @@ def layout(self): @property def annots(self): def parse(annot): - data = resolve(annot.resolve()) - rect = self.decimalize(resolve_all(data["Rect"])) + rect = self.decimalize(annot["Rect"]) + + a = annot.get("A", {}) + extras = { + "uri": a.get("URI"), + "title": annot.get("T"), + "contents": annot.get("Contents"), + } + for k, v in extras.items(): + if v is not None: + extras[k] = v.decode("utf-8") + parsed = { "page_number": self.page_number, + "object_type": "annot", + "x0": rect[0], + "y0": rect[1], + "x1": rect[2], + "y1": rect[3], "doctop": self.initial_doctop + self.height - rect[3], "top": self.height - rect[3], - "x0": rect[0], "bottom": self.height - rect[1], - "x1": rect[2], "width": rect[2] - rect[0], "height": rect[3] - rect[1], - "data": data, } - uri = data.get("A", {}).get("URI") - if uri is not None: - parsed["URI"] = uri.decode("utf-8") + parsed.update(extras) + # Replace the indirect reference to the page dictionary + # with a pointer to our actual page + if "P" in annot: + annot["P"] = self + parsed["data"] = annot return parsed - raw = resolve(self.page_obj.annots) or [] + raw = resolve_all(self.page_obj.annots) or [] return list(map(parse, raw)) @property def hyperlinks(self): - return [a for a in self.annots if "URI" in a] + return [a for a in self.annots if a["uri"] is not None] @property def objects(self): @@ -246,6 +260,9 @@ def to_image(self, **conversion_kwargs): kwargs["resolution"] = DEFAULT_RESOLUTION return PageImage(self, **kwargs) + def __repr__(self): + return f"" + class DerivedPage(Page): is_original = False diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py index ea8791a..4d1edda 100644 --- a/pdfplumber/utils.py +++ b/pdfplumber/utils.py @@ -1,6 +1,8 @@ from pdfminer.utils import PDFDocEncoding from pdfminer.psparser import PSLiteral from pdfminer.pdftypes import PDFObjRef +from pdfminer.pdfdocument import PDFDocument +from pdfminer.pdfpage import PDFPage from decimal import Decimal, ROUND_HALF_UP import numbers from operator import itemgetter, gt, lt, add, sub @@ -92,20 +94,37 @@ def resolve(x): return x -# via pdfminer.pdftypes, altered slightly +def get_dict_type(d): + if type(d) is not dict: + return None + t = d.get("Type") + if type(t) is PSLiteral: + return decode_text(t.name) + else: + return t + + def resolve_all(x): """ Recursively resolves the given object and all the internals. """ t = type(x) if t == PDFObjRef: - return resolve_all(x.resolve()) - elif t == list: - return [resolve_all(v) for v in x] - elif t == tuple: - return tuple(resolve_all(v) for v in x) + resolved = x.resolve() + + # Avoid infinite recursion + if get_dict_type(resolved) == "Page": + return x + + return resolve_all(resolved) + elif t in (list, tuple): + return t(resolve_all(v) for v in x) elif t == dict: - return dict((k, resolve_all(v)) for k, v in x.items()) + if get_dict_type(x) == "Annot": + exceptions = ["Parent"] + else: + exceptions = [] + return dict((k, v if k in exceptions else resolve_all(v)) for k, v in x.items()) else: return x diff --git a/tests/pdfs/annotations.pdf b/tests/pdfs/annotations.pdf new file mode 100644 index 0000000..e8f76d9 Binary files /dev/null and b/tests/pdfs/annotations.pdf differ diff --git a/tests/pdfs/pdffill-demo.pdf b/tests/pdfs/pdffill-demo.pdf index dcc7eb3..a63d021 100644 Binary files a/tests/pdfs/pdffill-demo.pdf and b/tests/pdfs/pdffill-demo.pdf differ diff --git a/tests/test_basics.py b/tests/test_basics.py index 414dd98..37f55f9 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -28,7 +28,8 @@ def test_pagecount(self): assert(len(self.pdf.pages) == 1) def test_page_number(self): - assert(self.pdf.pages[0].page_number == 1) + assert self.pdf.pages[0].page_number == 1 + assert str(self.pdf.pages[0]) == "" def test_objects(self): assert len(self.pdf.chars) @@ -46,7 +47,11 @@ def test_annots(self): assert len(pdf.annots) assert len(pdf.hyperlinks) == 17 uri = "http://www.pdfill.com/pdf_drawing.html" - assert pdf.hyperlinks[0]["URI"] == uri + assert pdf.hyperlinks[0]["uri"] == uri + + path = os.path.join(HERE, "pdfs/annotations.pdf") + with pdfplumber.open(path) as pdf: + assert len(pdf.annots) def test_crop_and_filter(self): def test(obj): diff --git a/tests/test_convert.py b/tests/test_convert.py new file mode 100644 index 0000000..0ebc8d8 --- /dev/null +++ b/tests/test_convert.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +import unittest +import pytest +import pandas as pd +import pdfplumber +from subprocess import Popen, PIPE +from io import StringIO +import json +import sys +import os + +import logging +logging.disable(logging.ERROR) + +HERE = os.path.abspath(os.path.dirname(__file__)) + +def run(cmd): + return Popen(cmd, stdout = PIPE).communicate()[0] + +class Test(unittest.TestCase): + + @classmethod + def setup_class(self): + self.path = os.path.join(HERE, "pdfs/pdffill-demo.pdf") + self.pdf = pdfplumber.open(self.path, pages = [ 1, 2, 5 ]) + + @classmethod + def teardown_class(self): + self.pdf.close() + + def test_json(self): + c = json.loads(self.pdf.to_json()) + assert c["pages"][0]["rects"][0]["bottom"] == float(self.pdf.pages[0].rects[0]["bottom"]) + + def test_single_pages(self): + c = json.loads(self.pdf.pages[0].to_json()) + assert c["rects"][0]["bottom"] == float(self.pdf.pages[0].rects[0]["bottom"]) + + def test_additional_attr_types(self): + path = os.path.join(HERE, "pdfs/issue-67-example.pdf") + with pdfplumber.open(path, pages = [ 1 ]) as pdf: + c = json.loads(pdf.to_json()) + assert len(c["pages"][0]["images"]) + + def test_csv(self): + c = self.pdf.to_csv() + assert c.split("\r\n")[1] == ( + 'char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,' + '18.0,12.996,,,,,,TimesNewRomanPSMT,,,,"(0, 0, 0)",,,18.0,,,,,Y,,1,' + ) + + io = StringIO() + self.pdf.to_csv(io) + io.seek(0) + c_from_io = io.read() + assert c == c_from_io + + + def test_cli(self): + res = run([ + "pdfplumber", + self.path, + "--format", + "json", + "--pages", + "1-2", + "5", + "--indent", + "2", + ]) + + c = json.loads(res) + assert c["pages"][0]["page_number"] == 1 + assert c["pages"][1]["page_number"] == 2 + assert c["pages"][2]["page_number"] == 5 + assert c["pages"][0]["rects"][0]["bottom"] == float(self.pdf.pages[0].rects[0]["bottom"]) diff --git a/tests/test_utils.py b/tests/test_utils.py index fb06d26..67cc718 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -41,6 +41,7 @@ def test_resolve(self): annot = self.pdf.annots[0] annot_ad0 = utils.resolve(annot["data"]["A"]["D"][0]) assert annot_ad0["MediaBox"] == [0, 0, 612, 792] + assert utils.resolve(1) == 1 def test_resolve_all(self): info = self.pdf.doc.xrefs[0].trailer["Info"]