Skip to content

Commit

Permalink
Add convert.py/.to_json/.to_csv & improve testcov
Browse files Browse the repository at this point in the history
Moves most of the logic previously in cli.py to convert.py, for usage by
other submodules. Adds Container.to_json and Container.to_csv. Makes
adjustments/fixes to other parts of the library, based on edge-cases
encountered (such as infinite recursion in annotations).
  • Loading branch information
jsvine committed Aug 13, 2020
1 parent 3f4b4b2 commit cbc91c6
Show file tree
Hide file tree
Showing 11 changed files with 296 additions and 108 deletions.
4 changes: 2 additions & 2 deletions README.md
Expand Up @@ -41,9 +41,9 @@ The output will be a CSV containing info about every character, line, and rectangle

| Argument | Description |
|----------|-------------|
|`--format [format]`| `csv` or `json`. The `json` format returns slightly more information; it includes PDF-level metadata and height/width information about each page.|
|`--format [format]`| `csv` or `json`. The `json` format returns more information; it includes PDF-level and page-level metadata, plus dictionary-nested attributes.|
|`--pages [list of pages]`| A space-delimited, `1`-indexed list of pages or hyphenated page ranges. E.g., `1 11-15`, which would return data for pages 1, 11, 12, 13, 14, and 15.|
|`--types [list of object types to extract]`| Choices are `char`, `line`, `curve`, `rect`, `rect_edge`. Defaults to `char`, `line`, `curve`, `rect`.|
|`--types [list of object types to extract]`| Choices are `char`, `rect`, `line`, `curve`, `image`, `annot`. Defaults to all.|

## Python library

Expand Down
104 changes: 19 additions & 85 deletions pdfplumber/cli.py 100755 → 100644
@@ -1,116 +1,50 @@
#!/usr/bin/env python
import pdfplumber
from . import convert
from .pdf import PDF
import argparse
from itertools import chain

try:
from cdecimal import Decimal, ROUND_HALF_UP
except ImportError:
from decimal import Decimal, ROUND_HALF_UP
import unicodecsv
import codecs
import json
import sys


class DecimalEncoder(json.JSONEncoder):
def default(self, o):
if isinstance(o, Decimal):
return float(o.quantize(Decimal(".0001"), rounding=ROUND_HALF_UP))
return super(DecimalEncoder, self).default(o)


def parse_page_spec(p_str):
if "-" in p_str:
return list(range(*map(int, p_str.split("-"))))
start, end = map(int, p_str.split("-"))
return range(start, end + 1)
else:
return [int(p_str)]


def parse_args():
def parse_args(args_raw):
parser = argparse.ArgumentParser("pdfplumber")

stdin = sys.stdin.buffer if sys.version_info[0] >= 3 else sys.stdin
parser.add_argument(
"infile", nargs="?", type=argparse.FileType("rb"), default=stdin
"infile", nargs="?", type=argparse.FileType("rb"), default=sys.stdin.buffer
)

parser.add_argument("--format", choices=["csv", "json"], default="csv")

parser.add_argument("--encoding", default="utf-8")

TYPE_DEFAULTS = ["char", "anno", "line", "curve", "rect"]
parser.add_argument(
"--types",
nargs="+",
choices=TYPE_DEFAULTS + ["rect_edge"],
default=TYPE_DEFAULTS,
default=convert.DEFAULT_TYPES,
choices=convert.DEFAULT_TYPES,
)

parser.add_argument("--pages", nargs="+", type=parse_page_spec)

args = parser.parse_args()
parser.add_argument(
"--indent", type=int, help="Indent level for JSON pretty-printing."
)

args = parser.parse_args(args_raw)
if args.pages is not None:
args.pages = list(chain(*args.pages))
return args


def to_csv(pdf, types, encoding):
objs = []
fields = set()
for t in types:
new_objs = getattr(pdf, t + "s")
if len(new_objs):
objs += new_objs
fields = fields.union(set(new_objs[0].keys()))

first_columns = [
"object_type",
"page_number",
"x0",
"x1",
"y0",
"y1",
"doctop",
"top",
"bottom",
"width",
"height",
]

cols = first_columns + list(sorted(set(fields) - set(first_columns)))
stdout = sys.stdout.buffer if sys.version_info[0] >= 3 else sys.stdout
w = unicodecsv.DictWriter(stdout, fieldnames=cols, encoding=encoding)
w.writeheader()
w.writerows(objs)


def to_json(pdf, types, encoding):
data = {"metadata": pdf.metadata}

def get_page_data(page):
d = dict((t + "s", getattr(page, t + "s")) for t in types)
d["width"] = page.width
d["height"] = page.height
return d

data["pages"] = list(map(get_page_data, pdf.pages))

if hasattr(sys.stdout, "buffer"):
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer, "strict")
json.dump(data, sys.stdout, cls=DecimalEncoder)
else:
json.dump(data, sys.stdout, cls=DecimalEncoder, encoding=encoding)


def main():
args = parse_args()
pdf = pdfplumber.open(args.infile, pages=args.pages)
if args.format == "csv":
to_csv(pdf, args.types, args.encoding)
else:
to_json(pdf, args.types, args.encoding)


if __name__ == "__main__":
main()
def main(args_raw=None):
    """Command-line entry point: parse arguments, open the PDF, and
    dump its objects to stdout in the requested format.

    args_raw: list of CLI argument strings. Defaults to sys.argv[1:],
        evaluated at call time — a literal ``sys.argv[1:]`` default
        would be captured once at import time and silently ignore any
        later changes to argv (and makes the function harder to test).
    """
    if args_raw is None:
        args_raw = sys.argv[1:]
    args = parse_args(args_raw)
    # Select the converter and its format-specific keyword arguments;
    # only the JSON converter accepts an indent option.
    converter = {"csv": convert.to_csv, "json": convert.to_json}[args.format]
    kwargs = {"csv": {}, "json": {"indent": args.indent}}[args.format]
    with PDF.open(args.infile, pages=args.pages) as pdf:
        converter(pdf, sys.stdout, args.types, **kwargs)
6 changes: 5 additions & 1 deletion pdfplumber/container.py
@@ -1,5 +1,5 @@
from itertools import chain
from . import utils
from . import utils, convert


class Container(object):
Expand Down Expand Up @@ -64,3 +64,7 @@ def test(x):
return x["orientation"] == "v"

return list(filter(test, self.edges))


# Attach the converters as methods so every Container subclass
# (e.g. PDF, Page) gains .to_json(...) and .to_csv(...); the functions
# take the container as their first argument, so they bind cleanly.
Container.to_json = convert.to_json
Container.to_csv = convert.to_csv
132 changes: 132 additions & 0 deletions pdfplumber/convert.py
@@ -0,0 +1,132 @@
from .utils import decode_text
from decimal import Decimal, ROUND_HALF_UP
from pdfminer.pdftypes import PDFStream, PDFObjRef
from pdfminer.psparser import PSLiteral
import json
import csv
import base64
from io import StringIO

# Object types extracted by default, for both the CSV and JSON
# converters; each maps to a `<type>s` property on a Page/Container.
DEFAULT_TYPES = [
    "char",
    "rect",
    "line",
    "curve",
    "image",
    "annot",
]

# Columns emitted first (in this order) in CSV output; any remaining
# fields are appended after these, sorted alphabetically.
COLS_TO_PREPEND = [
    "object_type",
    "page_number",
    "x0",
    "x1",
    "y0",
    "y1",
    "doctop",
    "top",
    "bottom",
    "width",
    "height",
]

# Candidate encodings for decoding raw bytestrings, tried in order.
# NOTE: latin-1 can decode any byte sequence, so in practice it acts
# as the universal fallback and the utf-16 entries are rarely reached.
ENCODINGS_TO_TRY = [
    "utf-8",
    "latin-1",
    "utf-16",
    "utf-16le",
]


def to_b64(data_bytes):
    """Return *data_bytes* encoded as an ASCII base64 string."""
    return base64.b64encode(data_bytes).decode("ascii")


def serialize(obj):
    """Recursively convert *obj* into JSON-serializable primitives.

    Handles Decimals (rounded half-up to 4 places, emitted as float),
    lists/tuples (container type preserved), dicts, bytes (decoded via
    ENCODINGS_TO_TRY), pdfminer PDFStream/PSLiteral objects, and falls
    back to str() for anything else.
    """
    t = type(obj)
    # Exact-type checks are mutually exclusive, so branch order does
    # not affect results; built-in types are checked first for clarity.
    if t is Decimal:
        return float(obj.quantize(Decimal(".0001"), rounding=ROUND_HALF_UP))
    # If tuple/list passed, bulk-convert, preserving the container type
    elif t in (list, tuple):
        return t(serialize(x) for x in obj)
    elif t is dict:
        return {k: serialize(v) for k, v in obj.items()}
    elif t is bytes:
        # Try each candidate encoding individually. The previous
        # version wrapped the whole loop in a single try block, so the
        # unconditional `return` meant only the FIRST encoding was ever
        # attempted — non-utf-8 bytes raised instead of falling back.
        for e in ENCODINGS_TO_TRY:
            try:
                return obj.decode(e)
            except UnicodeDecodeError:
                continue
        # If none of the decodings work, raise whatever error decoding
        # with the primary encoding causes. (Unreachable while latin-1
        # is in the list, since latin-1 accepts every byte sequence.)
        return obj.decode(ENCODINGS_TO_TRY[0])  # pragma: no cover
    elif obj is None:
        return None
    elif t in (int, float, str, bool):
        return obj
    elif t is PDFStream:
        # Raw stream data is binary, not text; expose it as base64
        return {"rawdata": to_b64(obj.rawdata)}
    elif t is PSLiteral:
        return decode_text(obj.name)
    else:
        return str(obj)


def to_json(container, stream=None, types=DEFAULT_TYPES, indent=None):
    """Serialize *container* (a whole PDF or a single Page) to JSON.

    When *stream* is provided, the JSON is written to it (returns
    None); otherwise the JSON text is returned as a string. *types*
    selects which object lists to include; *indent* enables
    pretty-printing.
    """

    def page_as_dict(page):
        # Page-level metadata, followed by one "<type>s" list per
        # requested object type.
        info = {
            "page_number": page.page_number,
            "initial_doctop": page.initial_doctop,
            "rotation": page.rotation,
            "cropbox": page.cropbox,
            "mediabox": page.mediabox,
            "bbox": page.bbox,
            "width": page.width,
            "height": page.height,
        }
        info.update((t + "s", getattr(page, t + "s")) for t in types)
        return info

    if hasattr(container, "pages"):
        # Whole-document container: PDF metadata plus every page
        data = {
            "metadata": container.metadata,
            "pages": [page_as_dict(p) for p in container.pages],
        }
    else:
        # Single-page container
        data = page_as_dict(container)

    serialized = serialize(data)

    if stream is None:
        return json.dumps(serialized, indent=indent)
    return json.dump(serialized, stream, indent=indent)


def to_csv(container, stream=None, types=DEFAULT_TYPES):
    """Write the container's objects to *stream* as CSV.

    When *stream* is None, the CSV text is returned as a string;
    otherwise rows are written to *stream* and None is returned.
    *types* selects which object lists to include.
    """
    return_string = stream is None
    if return_string:
        stream = StringIO()

    rows = []

    # Determine the union of fields across all requested object types,
    # using the first object of each type as the schema sample.
    field_names = set()
    for t in types:
        found = getattr(container, t + "s")
        if found:
            rows.extend(found)
            # Dict-valued attributes don't flatten into CSV columns
            field_names.update(
                k for k, v in found[0].items() if type(v) is not dict
            )

    cols = COLS_TO_PREPEND + sorted(field_names - set(COLS_TO_PREPEND))

    writer = csv.DictWriter(stream, fieldnames=cols, extrasaction="ignore")
    writer.writeheader()
    writer.writerows(serialize(rows))
    if return_string:
        stream.seek(0)
        return stream.read()
39 changes: 28 additions & 11 deletions pdfplumber/page.py
Expand Up @@ -2,7 +2,6 @@
from .utils import resolve, resolve_all
from .table import TableFinder
from .container import Container

import re

lt_pat = re.compile(r"^LT")
Expand Down Expand Up @@ -60,30 +59,45 @@ def layout(self):
@property
def annots(self):
def parse(annot):
data = resolve(annot.resolve())
rect = self.decimalize(resolve_all(data["Rect"]))
rect = self.decimalize(annot["Rect"])

a = annot.get("A", {})
extras = {
"uri": a.get("URI"),
"title": annot.get("T"),
"contents": annot.get("Contents"),
}
for k, v in extras.items():
if v is not None:
extras[k] = v.decode("utf-8")

parsed = {
"page_number": self.page_number,
"object_type": "annot",
"x0": rect[0],
"y0": rect[1],
"x1": rect[2],
"y1": rect[3],
"doctop": self.initial_doctop + self.height - rect[3],
"top": self.height - rect[3],
"x0": rect[0],
"bottom": self.height - rect[1],
"x1": rect[2],
"width": rect[2] - rect[0],
"height": rect[3] - rect[1],
"data": data,
}
uri = data.get("A", {}).get("URI")
if uri is not None:
parsed["URI"] = uri.decode("utf-8")
parsed.update(extras)
# Replace the indirect reference to the page dictionary
# with a pointer to our actual page
if "P" in annot:
annot["P"] = self
parsed["data"] = annot
return parsed

raw = resolve(self.page_obj.annots) or []
raw = resolve_all(self.page_obj.annots) or []
return list(map(parse, raw))

@property
def hyperlinks(self):
return [a for a in self.annots if "URI" in a]
return [a for a in self.annots if a["uri"] is not None]

@property
def objects(self):
Expand Down Expand Up @@ -246,6 +260,9 @@ def to_image(self, **conversion_kwargs):
kwargs["resolution"] = DEFAULT_RESOLUTION
return PageImage(self, **kwargs)

def __repr__(self):
return f"<Page:{self.page_number}>"


class DerivedPage(Page):
is_original = False
Expand Down

0 comments on commit cbc91c6

Please sign in to comment.