From 507429c1d99ea06583c456efefc5a510f90f1b2b Mon Sep 17 00:00:00 2001 From: Koen de Leijer Date: Tue, 17 Oct 2023 16:05:00 +0200 Subject: [PATCH] NEXTPY-569 -- Make pdfquery compatible with Python 3.9 and 3.11 --- .travis.yml | 11 +- CHANGES.txt | 2 +- README.rst | 2 +- appveyor.yml | 6 +- pdfquery/__init__.py | 2 +- pdfquery/cache.py | 23 +- pdfquery/pdfquery.py | 313 +++++---- pdfquery/pdftranslator.py | 11 +- setup.cfg | 74 +++ setup.py | 47 +- tests/saved_output/IRS_1040A_output_py35.xml | 646 ------------------- tests/test_main.py | 192 +++--- tests/utils.py | 34 +- 13 files changed, 408 insertions(+), 955 deletions(-) create mode 100644 setup.cfg delete mode 100644 tests/saved_output/IRS_1040A_output_py35.xml diff --git a/.travis.yml b/.travis.yml index 48a0f15..f6824d5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,7 @@ language: python python: - - "3.6" - - "3.7" - - "3.8" - "3.9" + - "3.11" env: CFLAGS="-O0" cache: @@ -11,11 +9,10 @@ cache: - $HOME/.cache/pip install: - - pip install -r requirements.txt -script: - python setup.py test + - pip install -e . +script: python setup.py test after_success: - coveralls # See: http://docs.travis-ci.com/user/migrating-from-legacy/ -sudo: false \ No newline at end of file +sudo: false diff --git a/CHANGES.txt b/CHANGES.txt index e932a2c..149c1e0 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,7 +2,7 @@ v0.5.0, 2021-05-03 -- 0.5.1 (unreleased) -- Nothing changed yet. +- Make pdfquery compatible with Python 3.9 and 3.11 0.5.0 (2021-05-04) diff --git a/README.rst b/README.rst index abc1a6a..d73277b 100644 --- a/README.rst +++ b/README.rst @@ -27,7 +27,7 @@ Installation as a package Installation for development ============================ -``pip install -e .[test,flake8,docs,release]`` +``pip install -e ".[test,flake8,docs,release]"`` Quick Start =========== diff --git a/appveyor.yml b/appveyor.yml index e165e0c..d88f778 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,11 +1,9 @@ environment: matrix: - - PYTHON: "C:\\Python36" - - PYTHON: "C:\\Python37" - - PYTHON: "C:\\Python38" - PYTHON: "C:\\Python39" + - PYTHON: "C:\\Python311" build: off test_script: - - "%PYTHON%\\python.exe setup.py test" \ No newline at end of file + - "%PYTHON%\\python.exe setup.py test" diff --git a/pdfquery/__init__.py b/pdfquery/__init__.py index 8f1dd85..de574bd 100644 --- a/pdfquery/__init__.py +++ b/pdfquery/__init__.py @@ -1 +1 @@ -from .pdfquery import PDFQuery \ No newline at end of file +from .pdfquery import PDFQuery diff --git a/pdfquery/cache.py b/pdfquery/cache.py index e007f75..517abcb 100644 --- a/pdfquery/cache.py +++ b/pdfquery/cache.py @@ -5,7 +5,6 @@ class BaseCache(object): - def __init__(self): self.hash_key = None @@ -34,30 +33,34 @@ class DummyCache(BaseCache): class FileCache(BaseCache): - - def __init__(self, directory='/tmp/'): + def __init__(self, directory="/tmp/"): self.directory = directory super(FileCache, self).__init__() def get_cache_filename(self, page_range_key): return "pdfquery_{hash_key}{page_range_key}.xml".format( - hash_key=self.hash_key, - page_range_key=page_range_key + hash_key=self.hash_key, page_range_key=page_range_key ) def get_cache_file(self, page_range_key, mode): try: - return zipfile.ZipFile(self.directory+self.get_cache_filename(page_range_key)+".zip", mode) + return zipfile.ZipFile( + self.directory + self.get_cache_filename(page_range_key) + ".zip", mode + ) except IOError: return None def set(self, page_range_key, tree): - xml = etree.tostring(tree, encoding='utf-8', pretty_print=False, xml_declaration=True) - cache_file = self.get_cache_file(page_range_key, 'w') + xml = etree.tostring( + tree, encoding="utf-8", pretty_print=False, xml_declaration=True + ) + cache_file = self.get_cache_file(page_range_key, "w") cache_file.writestr(self.get_cache_filename(page_range_key), xml) cache_file.close() def get(self, page_range_key): - cache_file = self.get_cache_file(page_range_key, 'r') + cache_file = self.get_cache_file(page_range_key, "r") if cache_file: - return etree.fromstring(cache_file.read(self.get_cache_filename(page_range_key))) \ No newline at end of file + return etree.fromstring( + cache_file.read(self.get_cache_filename(page_range_key)) + ) diff --git a/pdfquery/pdfquery.py b/pdfquery/pdfquery.py index 8f8461c..cedf597 100644 --- a/pdfquery/pdfquery.py +++ b/pdfquery/pdfquery.py @@ -3,7 +3,6 @@ import json import numbers import re -import sys from collections import OrderedDict import chardet @@ -19,17 +18,16 @@ from pdfminer.pdftypes import resolve1 from pdfminer.psparser import PSLiteral from pyquery import PyQuery -from six.moves import map, zip +from six.moves import zip from .cache import DummyCache -# local imports from .pdftranslator import PDFQueryTranslator # Re-sort the PDFMiner Layout tree so elements that fit inside other elements # will be children of them def _append_sorted(root, el, comparator): - """ Add el as a child of root, or as a child of one of root's children. + """Add el as a child of root, or as a child of one of root's children. Comparator is a function(a, b) returning > 0 if a is a child of b, < 0 if b is a child of a, 0 if neither. """ @@ -47,21 +45,24 @@ def _append_sorted(root, el, comparator): def _box_in_box(el, child): - """ Return True if child is contained within el. """ + """Return True if child is contained within el.""" return ( - float(el.get('x0')) <= float(child.get('x0')) and - float(el.get('x1')) >= float(child.get('x1')) and - float(el.get('y0')) <= float(child.get('y0')) and - float(el.get('y1')) >= float(child.get('y1')) + float(el.get("x0")) <= float(child.get("x0")) + and float(el.get("x1")) >= float(child.get("x1")) + and float(el.get("y0")) <= float(child.get("y0")) + and float(el.get("y1")) >= float(child.get("y1")) ) -_comp_bbox_keys_required = set(['x0', 'x1', 'y0', 'y1']) +_comp_bbox_keys_required = set(["x0", "x1", "y0", "y1"]) + + def _comp_bbox(el, el2): - """ Return 1 if el in el2, -1 if el2 in el, else 0""" + """Return 1 if el in el2, -1 if el2 in el, else 0""" # only compare if both elements have x/y coordinates - if _comp_bbox_keys_required <= set(el.keys()) and \ - _comp_bbox_keys_required <= set(el2.keys()): + if _comp_bbox_keys_required <= set(el.keys()) and _comp_bbox_keys_required <= set( + el2.keys() + ): if _box_in_box(el2, el): return 1 if _box_in_box(el, el2): @@ -82,35 +83,38 @@ def _flatten(l): i -= 1 break else: - l[i:i + 1] = l[i] + l[i : i + 1] = l[i] i += 1 return ltype(l) + # these might have to be removed from the start of a decoded string after # conversion -bom_headers = set([ - six.text_type(codecs.BOM_UTF8, 'utf8'), - six.text_type(codecs.BOM_UTF16_LE, 'utf-16LE'), - six.text_type(codecs.BOM_UTF16_BE, 'utf-16BE'), - six.text_type(codecs.BOM_UTF32_LE, 'utf-32LE'), - six.text_type(codecs.BOM_UTF32_BE, 'utf-32BE'), -]) +bom_headers = set( + [ + six.text_type(codecs.BOM_UTF8, "utf8"), + six.text_type(codecs.BOM_UTF16_LE, "utf-16LE"), + six.text_type(codecs.BOM_UTF16_BE, "utf-16BE"), + six.text_type(codecs.BOM_UTF32_LE, "utf-32LE"), + six.text_type(codecs.BOM_UTF32_BE, "utf-32BE"), + ] +) def smart_unicode_decode(encoded_string): """ - Given an encoded string of unknown format, detect the format with - chardet and return the unicode version. - Example input from bug #11: - ('\xfe\xff\x00I\x00n\x00s\x00p\x00e\x00c\x00t\x00i\x00o\x00n\x00' - '\x00R\x00e\x00p\x00o\x00r\x00t\x00 \x00v\x002\x00.\x002') + Given an encoded string of unknown format, detect the format with + chardet and return the unicode version. + Example input from bug #11: + ('\xfe\xff\x00I\x00n\x00s\x00p\x00e\x00c\x00t\x00i\x00o\x00n\x00' + '\x00R\x00e\x00p\x00o\x00r\x00t\x00 \x00v\x002\x00.\x002') """ if not encoded_string: - return u'' + return "" # optimization -- first try ascii try: - return encoded_string.decode('ascii') + return encoded_string.decode("ascii") except UnicodeDecodeError: pass @@ -118,11 +122,13 @@ def smart_unicode_decode(encoded_string): detected_encoding = chardet.detect(encoded_string) # bug 54 -- depending on chardet version, if encoding is not guessed, # either detected_encoding will be None or detected_encoding['encoding'] will be None - detected_encoding = detected_encoding['encoding'] if detected_encoding and detected_encoding.get('encoding') else 'utf8' + detected_encoding = ( + detected_encoding["encoding"] + if detected_encoding and detected_encoding.get("encoding") + else "utf8" + ) decoded_string = six.text_type( - encoded_string, - encoding=detected_encoding, - errors='replace' + encoded_string, encoding=detected_encoding, errors="replace" ) # unicode string may still have useless BOM character at the beginning @@ -131,6 +137,7 @@ def smart_unicode_decode(encoded_string): return decoded_string + def prepare_for_json_encoding(obj): """ Convert an arbitrary object into just JSON data types (list, dict, unicode str, int, bool, null). @@ -141,18 +148,24 @@ def prepare_for_json_encoding(obj): if obj_type == dict: # alphabetizing keys lets us compare attributes for equality across runs return OrderedDict( - (prepare_for_json_encoding(k), - prepare_for_json_encoding(obj[k])) for k in sorted(obj.keys()) + (prepare_for_json_encoding(k), prepare_for_json_encoding(obj[k])) + for k in sorted(obj.keys()) ) if obj_type == six.binary_type: return smart_unicode_decode(obj) - if obj_type == bool or obj is None or obj_type == six.text_type or isinstance(obj, numbers.Number): + if ( + obj_type == bool + or obj is None + or obj_type == six.text_type + or isinstance(obj, numbers.Number) + ): return obj if obj_type == PSLiteral: # special case because pdfminer.six currently adds extra quotes to PSLiteral.__repr__ - return u"/%s" % obj.name + return "/%s" % obj.name return six.text_type(obj) + def obj_to_string(obj, top=True): """ Turn an arbitrary object into a unicode string. If complex (dict/list/tuple), will be json-encoded. @@ -164,9 +177,13 @@ def obj_to_string(obj, top=True): # via http://stackoverflow.com/a/25920392/307769 -invalid_xml_chars_re = re.compile(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]+') +invalid_xml_chars_re = re.compile( + "[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]+" +) + + def strip_invalid_xml_chars(s): - return invalid_xml_chars_re.sub(r'', s) + return invalid_xml_chars_re.sub(r"", s) # custom PDFDocument class @@ -201,12 +218,13 @@ def get_page_number(self, index): """ # get and cache page ranges - if not hasattr(self, 'page_range_pairs'): + if not hasattr(self, "page_range_pairs"): try: - page_ranges = resolve1(self.catalog['PageLabels'])['Nums'] + page_ranges = resolve1(self.catalog["PageLabels"])["Nums"] assert len(page_ranges) > 1 and len(page_ranges) % 2 == 0 self.page_range_pairs = list( - reversed(list(zip(page_ranges[::2], page_ranges[1::2])))) + reversed(list(zip(page_ranges[::2], page_ranges[1::2]))) + ) except: self.page_range_pairs = [] @@ -222,32 +240,32 @@ def get_page_number(self, index): page_label = "" # handle numeric part of label - if 'S' in label_format: - + if "S" in label_format: # first find number for this page ... page_label = index - starting_index - if 'St' in label_format: # alternate start value - page_label += label_format['St'] + if "St" in label_format: # alternate start value + page_label += label_format["St"] else: page_label += 1 # ... then convert to correct format - num_type = label_format['S'].name + num_type = label_format["S"].name # roman (upper or lower) - if num_type.lower() == 'r': + if num_type.lower() == "r": import roman + page_label = roman.toRoman(page_label) - if num_type == 'r': + if num_type == "r": page_label = page_label.lower() # letters - elif num_type.lower() == 'a': + elif num_type.lower() == "a": # a to z for the first 26 pages, aa to zz for the next 26, and # so on letter = chr(page_label % 26 + 65) letter *= page_label / 26 + 1 - if num_type == 'a': + if num_type == "a": letter = letter.lower() page_label = letter @@ -256,24 +274,27 @@ def get_page_number(self, index): page_label = obj_to_string(page_label) # handle string prefix - if 'P' in label_format: - page_label = smart_unicode_decode(label_format['P']) + page_label + if "P" in label_format: + page_label = smart_unicode_decode(label_format["P"]) + page_label return page_label # create etree parser using custom Element class + class LayoutElement(etree.ElementBase): @property def layout(self): - if not hasattr(self, '_layout'): + if not hasattr(self, "_layout"): self._layout = None return self._layout @layout.setter def layout(self, value): self._layout = value + + parser_lookup = etree.ElementDefaultClassLookup(element=LayoutElement) parser = etree.XMLParser() parser.set_element_class_lookup(parser_lookup) @@ -282,17 +303,17 @@ def layout(self, value): # main class class PDFQuery(object): def __init__( - self, - file, - merge_tags=('LTChar', 'LTAnno'), - round_floats=True, - round_digits=3, - input_text_formatter=None, - normalize_spaces=True, - resort=True, - parse_tree_cacher=None, - laparams={'all_texts':True, 'detect_vertical':True}, - password='' + self, + file, + merge_tags=("LTChar", "LTAnno"), + round_floats=True, + round_digits=3, + input_text_formatter=None, + normalize_spaces=True, + resort=True, + parse_tree_cacher=None, + laparams={"all_texts": True, "detect_vertical": True}, + password="", ): # store input self.merge_tags = merge_tags @@ -304,15 +325,15 @@ def __init__( if input_text_formatter: self.input_text_formatter = input_text_formatter elif normalize_spaces: - r = re.compile(r'\s+') - self.input_text_formatter = lambda s: re.sub(r, ' ', s) + r = re.compile(r"\s+") + self.input_text_formatter = lambda s: re.sub(r, " ", s) else: self.input_text_formatter = None # open doc - if not hasattr(file, 'read'): + if not hasattr(file, "read"): try: - file = open(file, 'rb') + file = open(file, "rb") except TypeError: raise TypeError("File must be file object or filepath string.") @@ -366,11 +387,11 @@ def load(self, *page_numbers): def extract(self, searches, tree=None, as_dict=True): """ - >>> foo = pdf.extract([['pages', 'LTPage']]) - >>> foo - {'pages': [, ]} - >>> pdf.extract([['bar', ':in_bbox("100,100,400,400")']], foo['pages'][0]) - {'bar': [, ,... + >>> foo = pdf.extract([['pages', 'LTPage']]) + >>> foo + {'pages': [, ]} + >>> pdf.extract([['bar', ':in_bbox("100,100,400,400")']], foo['pages'][0]) + {'bar': [, ,... """ if self.tree is None or self.pq is None: self.load() @@ -385,25 +406,31 @@ def extract(self, searches, tree=None, as_dict=True): if len(search) < 3: search = list(search) + [formatter] key, search, tmp_formatter = search - if key == 'with_formatter': + if key == "with_formatter": if isinstance(search, six.string_types): # is a pyquery method name, e.g. 'text' formatter = lambda o, search=search: getattr(o, search)() - elif hasattr(search, '__call__') or not search: + elif hasattr(search, "__call__") or not search: # is a method, or None to end formatting formatter = search else: - raise TypeError("Formatter should be either a pyquery " - "method name or a callable function.") - elif key == 'with_parent': + raise TypeError( + "Formatter should be either a pyquery " + "method name or a callable function." + ) + elif key == "with_parent": parent = pq(search) if search else pq else: try: - result = parent("*").filter(search) if \ - hasattr(search, '__call__') else parent(search) + result = ( + parent("*").filter(search) + if hasattr(search, "__call__") + else parent(search) + ) except cssselect.SelectorSyntaxError as e: raise cssselect.SelectorSyntaxError( - "Error applying selector '%s': %s" % (search, e)) + "Error applying selector '%s': %s" % (search, e) + ) if tmp_formatter: result = tmp_formatter(result) results += result if type(result) == tuple else [[key, result]] @@ -414,9 +441,9 @@ def extract(self, searches, tree=None, as_dict=True): # tree building stuff def get_pyquery(self, tree=None, page_numbers=None): """ - Wrap given tree in pyquery and return. - If no tree supplied, will generate one from given page_numbers, or - all page numbers. + Wrap given tree in pyquery and return. + If no tree supplied, will generate one from given page_numbers, or + all page numbers. """ if not page_numbers: page_numbers = [] @@ -425,17 +452,17 @@ def get_pyquery(self, tree=None, page_numbers=None): tree = self.tree else: tree = self.get_tree(page_numbers) - if hasattr(tree, 'getroot'): + if hasattr(tree, "getroot"): tree = tree.getroot() return PyQuery(tree, css_translator=PDFQueryTranslator()) def get_tree(self, *page_numbers): """ - Return lxml.etree.ElementTree for entire document, or page numbers - given if any. + Return lxml.etree.ElementTree for entire document, or page numbers + given if any. """ hasher = hashlib.md5() - hasher.update(str(page_numbers).encode('UTF-8')) + hasher.update(str(page_numbers).encode("UTF-8")) cache_key = "_{}".format(hasher.hexdigest()) tree = self._parse_tree_cacher.get(cache_key) if tree is None: @@ -453,24 +480,26 @@ def get_tree(self, *page_numbers): # If that happens we just replace non-word characters # with '_'. if "Invalid attribute name" in e.args[0]: - k = re.sub(r'\W', '_', k) + k = re.sub(r"\W", "_", k) root.set(k, v) # Parse pages and append to root. # If nothing was passed in for page_numbers, we do this for all # pages, but if None was explicitly passed in, we skip it. - if not(len(page_numbers) == 1 and page_numbers[0] is None): + if not (len(page_numbers) == 1 and page_numbers[0] is None): if page_numbers: - pages = [[n, self.get_layout(self.get_page(n))] for n in - _flatten(page_numbers)] + pages = [ + [n, self.get_layout(self.get_page(n))] + for n in _flatten(page_numbers) + ] else: pages = enumerate(self.get_layouts()) for n, page in pages: page = self._xmlize(page) if self.resort: self._sort(page) - page.set('page_index', obj_to_string(n)) - page.set('page_label', self.doc.get_page_number(n)) + page.set("page_index", obj_to_string(n)) + page.set("page_label", self.doc.get_page_number(n)) root.append(page) self._clean_text(root) @@ -482,8 +511,8 @@ def get_tree(self, *page_numbers): def _clean_text(self, branch): """ - Remove text from node if same text exists in its children. - Apply string formatter if set. + Remove text from node if same text exists in its children. + Apply string formatter if set. """ if branch.text and self.input_text_formatter: branch.text = self.input_text_formatter(branch.text) @@ -491,7 +520,7 @@ def _clean_text(self, branch): for child in branch: self._clean_text(child) if branch.text and branch.text.find(child.text) >= 0: - branch.text = branch.text.replace(child.text, '', 1) + branch.text = branch.text.replace(child.text, "", 1) except TypeError: # not an iterable node pass @@ -502,20 +531,39 @@ def _xmlize(self, node, root=None): else: # collect attributes of current node tags = self._getattrs( - node, 'y0', 'y1', 'x0', 'x1', 'width', 'height', 'bbox', - 'linewidth', 'pts', 'index', 'name', 'matrix', 'word_margin' + node, + "y0", + "y1", + "x0", + "x1", + "width", + "height", + "bbox", + "linewidth", + "pts", + "index", + "name", + "matrix", + "word_margin", ) if type(node) == LTImage: - tags.update(self._getattrs( - node, 'colorspace', 'bits', 'imagemask', 'srcsize', - 'stream', 'name', 'pts', 'linewidth') + tags.update( + self._getattrs( + node, + "colorspace", + "bits", + "imagemask", + "srcsize", + "stream", + "name", + "pts", + "linewidth", + ) ) elif type(node) == LTChar: - tags.update(self._getattrs( - node, 'fontname', 'adv', 'upright', 'size') - ) + tags.update(self._getattrs(node, "fontname", "adv", "upright", "size")) elif type(node) == LTPage: - tags.update(self._getattrs(node, 'pageid', 'rotate')) + tags.update(self._getattrs(node, "pageid", "rotate")) # create node branch = parser.makeelement(node.__class__.__name__, tags) @@ -526,11 +574,11 @@ def _xmlize(self, node, root=None): root = branch # add text - if hasattr(node, 'get_text'): + if hasattr(node, "get_text"): branch.text = strip_invalid_xml_chars(node.get_text()) # add children if node is an iterable - if hasattr(node, '__iter__'): + if hasattr(node, "__iter__"): last = None for child in node: child = self._xmlize(child, root) @@ -540,8 +588,8 @@ def _xmlize(self, node, root=None): elif last is not None and last.tag in self.merge_tags: last.text += child.text last.set( - '_obj_id', - last.get('_obj_id','') + "," + child.get('_obj_id','') + "_obj_id", + last.get("_obj_id", "") + "," + child.get("_obj_id", ""), ) continue # sort children by bounding boxes @@ -553,15 +601,18 @@ def _xmlize(self, node, root=None): return branch def _sort(self, tree): - """ Sort same-level elements top to bottom and left to right. """ + """Sort same-level elements top to bottom and left to right.""" children = list(tree) if children: - tree[:] = sorted(children, key=lambda child: (-float(child.get('y1')), float(child.get('x0')))) + tree[:] = sorted( + children, + key=lambda child: (-float(child.get("y1")), float(child.get("x0"))), + ) for child in children: self._sort(child) def _getattrs(self, obj, *attrs): - """ Return dictionary of given attrs on given object, if they exist, + """Return dictionary of given attrs on given object, if they exist, processing through _filter_value(). """ filtered_attrs = {} @@ -576,17 +627,17 @@ def _filter_value(self, val): if self.round_floats: if type(val) == float: val = round(val, self.round_digits) - elif hasattr(val, '__iter__') and not isinstance(val, six.string_types): + elif hasattr(val, "__iter__") and not isinstance(val, six.string_types): val = [self._filter_value(item) for item in val] return val # page access stuff def get_page(self, page_number): - """ Get PDFPage object -- 0-indexed.""" + """Get PDFPage object -- 0-indexed.""" return self._cached_pages(target_page=page_number) def get_layout(self, page): - """ Get PDFMiner Layout object for given page object or page number. """ + """Get PDFMiner Layout object for given page object or page number.""" if type(page) == int: page = self.get_page(page) self.interpreter.process_page(page) @@ -595,7 +646,7 @@ def get_layout(self, page): return layout def get_layouts(self): - """ Get list of PDFMiner Layout objects for each page. """ + """Get list of PDFMiner Layout objects for each page.""" return (self.get_layout(page) for page in self._cached_pages()) def _cached_pages(self, target_page=-1): @@ -622,22 +673,21 @@ def _cached_pages(self, target_page=-1): return self._pages def _add_annots(self, layout, annots): - """Adds annotations to the layout object - """ + """Adds annotations to the layout object""" if annots: for annot in resolve1(annots): annot = resolve1(annot) - if annot.get('Rect') is not None: - annot['bbox'] = annot.pop('Rect') # Rename key + if annot.get("Rect") is not None: + annot["bbox"] = annot.pop("Rect") # Rename key annot = self._set_hwxy_attrs(annot) try: - annot['URI'] = resolve1(annot['A'])['URI'] + annot["URI"] = resolve1(annot["A"])["URI"] except: # noqa pass for k, v in six.iteritems(annot): if not isinstance(v, six.string_types): annot[k] = obj_to_string(v) - elem = parser.makeelement('Annot', annot) + elem = parser.makeelement("Annot", annot) layout.add(elem) return layout @@ -646,17 +696,18 @@ def _set_hwxy_attrs(attr): """Using the bbox attribute, set the h, w, x0, x1, y0, and y1 attributes. """ - bbox = attr['bbox'] - attr['x0'] = bbox[0] - attr['x1'] = bbox[2] - attr['y0'] = bbox[1] - attr['y1'] = bbox[3] - attr['height'] = attr['y1'] - attr['y0'] - attr['width'] = attr['x1'] - attr['x0'] + bbox = attr["bbox"] + attr["x0"] = bbox[0] + attr["x1"] = bbox[2] + attr["y0"] = bbox[1] + attr["y1"] = bbox[3] + attr["height"] = attr["y1"] - attr["y0"] + attr["width"] = attr["x1"] - attr["x0"] return attr if __name__ == "__main__": import doctest + pdf = PDFQuery("../examples/sample.pdf") - doctest.testmod(extraglobs={'pdf': pdf}, optionflags=doctest.ELLIPSIS) + doctest.testmod(extraglobs={"pdf": pdf}, optionflags=doctest.ELLIPSIS) diff --git a/pdfquery/pdftranslator.py b/pdfquery/pdftranslator.py index bd41f5c..14a18aa 100644 --- a/pdfquery/pdftranslator.py +++ b/pdfquery/pdftranslator.py @@ -5,12 +5,11 @@ class PDFQueryTranslator(cssselect_xpath.GenericTranslator): - def xpath_in_bbox_function(self, xpath, fn): if len(fn.arguments) > 1: - x0,y0,x1,y1 = [float(t.value) for t in fn.arguments] + x0, y0, x1, y1 = [float(t.value) for t in fn.arguments] else: - x0,y0,x1,y1 = map(float, fn.arguments[0].value.split(",")) + x0, y0, x1, y1 = map(float, fn.arguments[0].value.split(",")) # TODO: seems to be doing < rather than <= ??? xpath.add_condition("@x0 >= %s" % x0) xpath.add_condition("@y0 >= %s" % y0) @@ -20,12 +19,12 @@ def xpath_in_bbox_function(self, xpath, fn): def xpath_overlaps_bbox_function(self, xpath, fn): if len(fn.arguments) > 1: - x0,y0,x1,y1 = [float(t.value) for t in fn.arguments] + x0, y0, x1, y1 = [float(t.value) for t in fn.arguments] else: - x0,y0,x1,y1 = map(float, fn.arguments[0].value.split(",")) + x0, y0, x1, y1 = map(float, fn.arguments[0].value.split(",")) # TODO: seems to be doing < rather than <= ??? xpath.add_condition("@x0 <= %s" % x1) xpath.add_condition("@y0 <= %s" % y1) xpath.add_condition("@x1 >= %s" % x0) xpath.add_condition("@y1 >= %s" % y0) - return xpath \ No newline at end of file + return xpath diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..bf1acc3 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,74 @@ + +# setuptools config +# see http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files + +[metadata] +name = pdfquery +version = 0.5.1.dev0 +description = Concise and friendly PDF scraper using JQuery or XPath selectors +long_description = file: README.rst +license = MIT +author = Jack Cushman +author_email = jcushman@gmail.com +url = https://github.com/jcushman/pdfquery +classifiers = + Development Status :: 5 - Production/Stable + Intended Audience :: Developers + Operating System :: Unix + Operating System :: MacOS + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.11 + Topic :: Software Development :: Libraries :: Python Modules + +[options] +zip_safe = False +include_package_data = True +packages = find: +install_requires = + cssselect + chardet + lxml + pdfminer.six + pyquery + roman +tests_require = + pytest + tox + pytest-remove-stale-bytecode + +[options.extras_require] +dev = + pre-commit + pdbpp + black + isort + flake8 +test = + pytest + pytest-remove-stale-bytecode + tox +coverage = pytest-cov +docs = + sphinx + doc8 +release = zest.releaser +pycodestyle = flake8 + +[aliases] +test=pytest + +[bdist_wheel] +universal = 1 + +[tool:pytest] +testpaths = tests + +[flake8] +max-line-length = 120 +exclude = env,.tox,doc + +[zest.releaser] +create-wheel = yes + +[distutils] +index-servers = isp diff --git a/setup.py b/setup.py index fff67d4..6068493 100644 --- a/setup.py +++ b/setup.py @@ -1,46 +1,3 @@ -import sys +from setuptools import setup -from setuptools import find_packages, setup - -TESTS_REQUIRE = [ - 'pytest', - 'tox', - 'isort', - 'freezegun', - 'pre-commit' -] - -setup( - name='pdfquery', - version='0.5.1.dev0', - author=u'Jack Cushman', - author_email='jcushman@gmail.com', - packages=find_packages(), - url='https://github.com/jcushman/pdfquery', - license='MIT', - description='Concise and friendly PDF scraper using JQuery or XPath selectors.', - keywords='', - long_description=open('README.rst').read(), - install_requires=open('requirements.txt').read(), - classifiers=[ - "Development Status :: 4 - Beta", - "Topic :: Text Processing", - "Topic :: Utilities", - "License :: OSI Approved :: MIT License", - "Intended Audience :: Developers", - "Operating System :: OS Independent", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.9", - ], - extras_require=dict( - test=TESTS_REQUIRE, - pep8=['flake8'], - coverage=['pytest-cov'], - docs=['sphinx'], - release=['zest.releaser'], - ), - test_suite='tests', -) +setup() diff --git a/tests/saved_output/IRS_1040A_output_py35.xml b/tests/saved_output/IRS_1040A_output_py35.xml deleted file mode 100644 index d0fd5a5..0000000 --- a/tests/saved_output/IRS_1040A_output_py35.xml +++ /dev/null @@ -1,646 +0,0 @@ - - - 2 TO BE REMOVED BEFORE PRINTING - TLS, have you transmitted all R text files for this cycle update? - - - - - - I.R.S. SPECIFICATIONS INSTRUCTIONS TO PRINTERS FORM 1040A, PAGE 1 of 2 MARGINS: TOP 13 mm (1⁄2 "), CENTER SIDES. PRINTS: HEAD TO HEAD PAPER: WHITE WRITING, SUB. 20. INK: BLACK FLAT SIZE: 203 mm (8") x 279 mm (11") PERFORATE: (NONE) DO NOT PRINT — DO NOT PRINT — DO NOT PRINT — DO NOT PRINT - Action - Date - Signature - - - - O.K. to print - - Revised proofs requested - - Date - - - - - - Separation 1 of 2: Black - Separation 2 of 2: PMS 185 (RED) - - - - - - Write the words “Stimulus Payment” across the top of the form you file. - - Form 1040A Label (See page 15.) - Department of the Treasury—Internal Revenue Service U.S. Individual Income Tax Return Your first name and initial Last name - 2007 - IRS Use Only—Do not write or staple in this space. - (99) - - - - - - - - LABEL HERE - John E. - Michaels - - If a joint return, spouse’s first name and initial - Last name - Susan R. - Michaels - - - Home address (number and street). If you have a P.O. box, see page 15. - Apt. no. - 1040 Main Street - - - City, town or post office, state, and ZIP code. If you have a foreign address, see page 15. - - - - - OMB No. 1545-0074 - - Your social security number - - - 011 00 2222 Spouse’s social security number - - Use the IRS label. Otherwise, please print or type. - - - 011 00 1111 You must enter your SSN(s) above. - - (cid:4) - (cid:4) - - - Checking a box below will not change your tax or refund. - - - Hometown, TX 77099 - Presidential Election Campaign (cid:2) Check here if you, or your spouse if filing jointly, want $3 to go to this fund (see page 15) (cid:2) - - - - - - - - Spouse - You - - - - Y - - L - - - - - - - 1 - 4 - Yourself. Single Married filing jointly (even if only one had income) Married filing separately. Enter spouse’s SSN above and full name here. (cid:2) Yourself. - - Head of household (with qualifying person). (See page 16.) If the qualifying person is a child but not your dependent, enter this child’s name here. (cid:2) Qualifying widow(er) with dependent child (see page 17) - Filing status Check only one box. - - - N E O T F I L - - - - 2 3 - - - - - - - - - - 5 - E - (cid:2) - - - - - 6a - If someone can claim you as a dependent, do not check box 6a. - Boxes checked on 6a and 6b - Exemptions - - - - L - - A M P b -c -b -c -O N D - - Spouse Dependents: - No. of children on 6c who: ● lived with you - - - - - - (4) - if qualifying - (3) Dependent’s relationship to you - - - - (2) Dependent’s social security number - child for child tax credit (see page 18) - O - (1) First name - Last name - - X - ● did not live with you due to divorce or separation (see page 19) - If more than six dependents, see page 18. - - - - - - - E - - - - - - - - - - - - - - - - - - - - - - - - - - Dependents on 6c not entered above - - - - - - - - - - - - - - - - - - - - - - - If you were self-employed or a partner, include the amount you would enter on Schedule SE, line 3. - - - - - - Add numbers on lines above (cid:2) - d - Total number of exemptions claimed. - - - - - Income - - (cid:2) - 7 - - Wages, salaries, tips, etc. Attach Form(s) W-2. - 7 - Attach Form(s) W-2 here. Also attach Form(s) 1099-R if tax was withheld. - - - 8a b 9a b Taxable interest. Attach Schedule 1 if required. Tax-exempt interest. Do not include on line 8a. Ordinary dividends. Attach Schedule 1 if required. Qualified dividends (see page 22). 10 11a Capital gain distributions (see page 22). IRA distributions. 11a 12a Pensions and annuities. - 8a - - - - - 8b - 9a - - - 9b - 10 - - - 11b - Taxable amount (see page 22). Taxable amount (see page 23). - 11b - - 12b - 12a - 12b - - 13 - - If you did not get a W-2, see page 21. - Enclose, but do not attach, any payment. - 13 Unemployment compensation and Alaska Permanent Fund dividends. 14a 14b - - - Social security, tier 1 railroad retirement, and veterans disability and death benefits - - - - Taxable amount (see page 25). - 14b - - - - Social security benefits. - (cid:2) - 14a - - (cid:2) - - 15 - Add lines 7 through 14b (far right column). This is your total income. - 15 - - 16 17 18 - 16 17 18 - Educator expenses (see page 25). IRA deduction (see page 27). Student loan interest deduction (see page 29). - 19 - 19 20 - Tuition and fees deduction. Attach Form 8917. Add lines 16 through 19. These are your total adjustments. - - - 20 - Subtract line 20 from line 15. This is your adjusted gross income. (cid:2) - 21 - 21 - - - - Adjusted gross income - - For Disclosure, Privacy Act, and Paperwork Reduction Act Notice, see page 74. - Form 1040A (2007) - Cat. No. 11327A - - - 2 TO BE REMOVED BEFORE PRINTING - - I.R.S. SPECIFICATIONS INSTRUCTIONS TO PRINTERS FORM 1040A, PAGE 2 of 2 MARGINS: TOP 13 mm (1⁄2 ") (TO BLACK IMAGE), CENTER SIDES. PRINTS: HEAD to HEAD PAPER: WHITE WRITING, SUB. 20. INK: BLACK FLAT SIZE: 203 mm (8") x 279 mm (11") PERFORATE: (NONE) DO NOT PRINT — DO NOT PRINT — DO NOT PRINT — DO NOT PRINT - - - - - - - - Separation 1 of 2: Black - Separation 2 of 2: PMS 185 (RED) - - - Page 2 - Form 1040A (2007) - - - - 22 - Enter the amount from line 21 (adjusted gross income). - 22 - Tax, credits, and payments Standard Deduction for— ● People who checked any box on line 23a or 23b or who can be claimed as a dependent, see page 30. ● All others: - - (cid:2) - (cid:3) - - - - 23a - You were born before January 2, 1943, Spouse was born before January 2, 1943, - Blind Blind - checked (cid:2) If you are married filing separately and your spouse itemizes (cid:2) deductions, see page 30 and check here Enter your standard deduction (see left margin). Subtract line 24 from line 22. If line 24 is more than line 22, enter -0-. If line 22 is $117,300 or less, multiply $3,400 by the total number of exemptions claimed on line 6d. If line 22 is over $117,300, see the worksheet on page 32. Subtract line 26 from line 25. If line 26 is more than line 25, enter -0-. This is your taxable income. Tax, including any alternative minimum tax (see page 30). Credit for child and dependent care expenses. Attach Schedule 2. Credit for the elderly or the disabled. Attach Schedule 3. Education credits. Attach Form 8863. Child tax credit (see page 35). Attach Form 8901 if required. Retirement savings contributions credit. Attach Form 8880. Add lines 29 through 33. These are your total credits. Subtract line 34 from line 28. If line 34 is more than line 28, enter -0-. Advance earned income credit payments from Form(s) W-2, box 9. Add lines 35 and 36. This is your total tax. Federal income tax withheld from Forms W-2 and 1099. 2007 estimated tax payments and amount applied from 2006 return. Earned income credit (EIC). b Nontaxable combat pay election. 40b Total boxes Check -if: -Check -if: -checked (cid:2) -23a -23a -b -b -If you are married filing separately and your spouse itemizes -(cid:2) -deductions, see page 30 and check here -23b -23b -Enter your standard deduction (see left margin). -Subtract line 24 from line 22. If line 24 is more than line 22, enter -0-. -If line 22 is $117,300 or less, multiply $3,400 by the total number of exemptions -claimed on line 6d. If line 22 is over $117,300, see the worksheet on page 32. -Y -Y -Subtract line 26 from line 25. If line 26 is more than line 25, enter -0-. -This is your taxable income. -L -L -N -E O -T F I L -N -E O -T F I L -Tax, including any alternative minimum tax (see page 30). -Credit for child and dependent care expenses. -Attach Schedule 2. -29 -29 -E -E -Credit for the elderly or the disabled. Attach -L -L -Schedule 3. -30 -31 -30 -31 -A M P -O N -D -A M P -Form 8880. -O N -D -Education credits. Attach Form 8863. -Child tax credit (see page 35). Attach -Form 8901 if required. -32 -32 -O -O -Retirement savings contributions credit. Attach -33 -33 -Add lines 29 through 33. These are your total credits. -Subtract line 34 from line 28. If line 34 is more than line 28, enter -0-. -Advance earned income credit payments from Form(s) W-2, box 9. -Add lines 35 and 36. This is your total tax. -Federal income tax withheld from Forms W-2 and 1099. -38 -38 -2007 estimated tax payments and amount -applied from 2006 return. -39 -40a -39 -40a -Earned income credit (EIC). -b Nontaxable combat pay election. 40b - - - - - Single or Married filing separately, $5,350 - Married filing jointly or Qualifying widow(er), $10,700 - Head of household, $7,850 - - - - - - - 24 25 26 - - 24 25 - - - - - - - 26 - - - 27 - (cid:2) - 27 28 - - 28 29 - - - 30 - - 31 32 - - - 33 - X - E - - - - - 34 35 36 37 38 39 - 34 35 36 37 - - - - (cid:2) - - - - - - If you have a qualifying child, attach Schedule EIC. - - - 40a - - - - - - - - - - 41 - 41 42 43 - - - (cid:2) 42 - - Refund - - - number Additional child tax credit. Attach Form 8812. Add lines 38, 39, 40a, and 41. These are your total payments. If line 42 is more than line 37, subtract line 37 from line 42. This is the amount you overpaid. Amount of line 43 you want refunded to you. If Form 8888 is attached, check here (cid:2) Routing (cid:2) -(cid:2) -c -c -Type: -Type: -Checking -Checking -Savings -Savings -number - - - 43 44a - - - - - - - - 44a (cid:2) b - Direct deposit? See page 52 and fill in 44b, 44c, and 44d or Form 8888. - (cid:2) d - number Amount of line 43 you want applied to your 2008 estimated tax. Amount you owe. Subtract line 42 from line 37. For details on how to pay, see page 53. Estimated tax penalty (see page 53). Account number -Amount of line 43 you want applied to your -2008 estimated tax. -45 -45 -Amount you owe. Subtract line 42 from line 37. For details on how -to pay, see page 53. -Estimated tax penalty (see page 53). -47 -47 - - - - 45 - - - - 46 - Amount you owe - (cid:2) - 46 - - - 47 - - - - - - - - - Yes. Complete the following. - No - Do you want to allow another person to discuss this return with the IRS (see page 54)? - Third party designee - - - (cid:2) name Under penalties of perjury, I declare that I have examined this return and accompanying schedules and statements, and to the best of my knowledge and belief, they are true, correct, and accurately list all amounts and sources of income I received during the tax year. Declaration of preparer (other than the taxpayer) is based on all information of which the preparer has any knowledge. Your signature Designee’s Phone no. Personal identification number (PIN) (cid:2) -(cid:2) ( -(cid:2) ( -(cid:2) -(cid:2) -) -) -name -Under penalties of perjury, I declare that I have examined this return and accompanying schedules and statements, and to the best of my -knowledge and belief, they are true, correct, and accurately list all amounts and sources of income I received during the tax year. Declaration -of preparer (other than the taxpayer) is based on all information of which the preparer has any knowledge. -Your occupation -Your occupation -Your signature -Date -Date - - - - - - - - Sign here Joint return? See page 15. Keep a copy for your records. - (cid:2) - - - - Spouse’s occupation - - Daytime phone number - - ( - ) - - - Spouse’s signature. If a joint return, both must sign. - Date - (cid:2) - - - - - Date - Preparer’s SSN or PTIN - Paid preparer’s use only - Preparer’s signature - Check if self-employed - - - - - (cid:2) - - - - Firm’s name (or yours if self-employed), address, and ZIP code - EIN - - ( - ) - Phone no. - - Form 1040A (2007) - - - - - - - - - Printed on recycled paper - - diff --git a/tests/test_main.py b/tests/test_main.py index 515efb3..1f4007e 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,10 +1,3 @@ -# to run: -# python setup.py test -# -# to debug: -# pip install nose -# nosetests --pdb - import sys import tempfile @@ -13,14 +6,10 @@ from .utils import BaseTestCase -### helpers ### - - - class TestPDFQuery(BaseTestCase): """ - Various tests based on the IRS_1040A sample doc. + Various tests based on the IRS_1040A sample doc. """ @classmethod @@ -30,126 +19,145 @@ def setUpClass(cls): def test_xml_conversion(self): """ - Test that converted XML hasn't changed from saved version. + Test that converted XML hasn't changed from saved version. """ - if (sys.version_info[:2] <= (3, 5)): - # PDFMiner parses slightly different with Python 3.5 - self.assertValidOutput(self.pdf, "IRS_1040A_output_py35") - else: - self.assertValidOutput(self.pdf, "IRS_1040A_output") + self.assertValidOutput(self.pdf, "IRS_1040A_output") def test_selectors(self): """ - Test the :contains and :in_bbox selectors. + Test the :contains and :in_bbox selectors. """ - label = self.pdf.pq('LTTextLineHorizontal:contains("Your first name ' - 'and initial")') + label = self.pdf.pq( + 'LTTextLineHorizontal:contains("Your first name ' 'and initial")' + ) self.assertEqual(len(label), 1) - left_corner = float(label.attr('x0')) + left_corner = float(label.attr("x0")) self.assertEqual(left_corner, 143.651) - bottom_corner = float(label.attr('y0')) + bottom_corner = float(label.attr("y0")) self.assertEqual(bottom_corner, 714.694) - name = self.pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % - (left_corner, - bottom_corner - 30, - left_corner + 150, - bottom_corner) - ).text() + name = self.pdf.pq( + 'LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' + % (left_corner, bottom_corner - 30, left_corner + 150, bottom_corner) + ).text() self.assertEqual(name, "John E.") def test_extract(self): """ - Test the extract() function. + Test the extract() function. """ - values = self.pdf.extract([ - ('with_parent', 'LTPage[pageid="1"]'), - ('with_formatter', 'text'), - - ('last_name', 'LTTextLineHorizontal:in_bbox("315,680,395,700")'), - ('spouse', 'LTTextLineHorizontal:in_bbox("170,650,220,680")'), - - ('with_parent', 'LTPage[pageid="2"]'), - - ('oath', 'LTTextLineHorizontal:contains("perjury")', - lambda match: match.text()[:30] + "..."), - ('year', 'LTTextLineHorizontal:contains("Form 1040A (")', - lambda match: int(match.text()[-5:-1])) - ]) + values = self.pdf.extract( + [ + ("with_parent", 'LTPage[pageid="1"]'), + ("with_formatter", "text"), + ("last_name", 'LTTextLineHorizontal:in_bbox("315,680,395,700")'), + ("spouse", 'LTTextLineHorizontal:in_bbox("170,650,220,680")'), + ("with_parent", 'LTPage[pageid="2"]'), + ( + "oath", + 'LTTextLineHorizontal:contains("perjury")', + lambda match: match.text()[:30] + "...", + ), + ( + "year", + 'LTTextLineHorizontal:contains("Form 1040A (")', + lambda match: int(match.text()[-5:-1]), + ), + ] + ) - self.assertDictEqual(values, { - 'last_name': 'Michaels', - 'spouse': 'Susan R.', - 'oath': u'Under penalties of perjury, I ...', - 'year': 2007 - }) + self.assertDictEqual( + values, + { + "last_name": "Michaels", + "spouse": "Susan R.", + "oath": "Under penalties of perjury, I ...", + "year": 2007, + }, + ) def test_page_numbers(self): - self.assertEqual(self.pdf.tree.getroot()[0].get('page_label'), '1') + self.assertEqual(self.pdf.tree.getroot()[0].get("page_label"), "1") class TestDocInfo(BaseTestCase): - def test_docinfo(self): - doc_info_results = [ - ["tests/samples/bug11.pdf", - {'Producer': 'Mac OS X 10.9.3 Quartz PDFContext', - 'Title': u'\u262d\U0001f61c\U0001f4a9Unicode is fun!', - 'Author': 'Russkel', 'Creator': 'Firefox', - 'ModDate': "D:20140528141914+08'00'", - 'CreationDate': 'D:20140528061106Z', 'Subject': ''}], - ["tests/samples/bug15.pdf", - {'Producer': 'Mac OS X 10.9.3 Quartz PDFContext', - 'Author': 'Brepols Publishers', - 'Creator': 'PDFsharp 1.2.1269-g (www.pdfsharp.com)', - 'AAPL_Keywords': '["Brepols", "Publishers", "CTLO"]', - 'Title': 'Exporter', - 'ModDate': "D:20140614192741Z00'00'", - 'Keywords': 'Brepols, Publishers, CTLO', - 'CreationDate': "D:20140614192741Z00'00'", - 'Subject': 'Extrait de la Library of Latin Texts - Series A'}], - ["tests/samples/bug17.pdf", - {'CreationDate': 'D:20140328164512Z', - 'Creator': 'Adobe InDesign CC (Macintosh)', - 'ModDate': 'D:20140328164513Z', - 'Producer': 'Adobe PDF Library 10.0.1', 'Trapped': '/False'}] + [ + "tests/samples/bug11.pdf", + { + "Producer": "Mac OS X 10.9.3 Quartz PDFContext", + "Title": "\u262d\U0001f61c\U0001f4a9Unicode is fun!", + "Author": "Russkel", + "Creator": "Firefox", + "ModDate": "D:20140528141914+08'00'", + "CreationDate": "D:20140528061106Z", + "Subject": "", + }, + ], + [ + "tests/samples/bug15.pdf", + { + "Producer": "Mac OS X 10.9.3 Quartz PDFContext", + "Author": "Brepols Publishers", + "Creator": "PDFsharp 1.2.1269-g (www.pdfsharp.com)", + "AAPL_Keywords": '["Brepols", "Publishers", "CTLO"]', + "Title": "Exporter", + "ModDate": "D:20140614192741Z00'00'", + "Keywords": "Brepols, Publishers, CTLO", + "CreationDate": "D:20140614192741Z00'00'", + "Subject": "Extrait de la Library of Latin Texts - Series A", + }, + ], + [ + "tests/samples/bug17.pdf", + { + "CreationDate": "D:20140328164512Z", + "Creator": "Adobe InDesign CC (Macintosh)", + "ModDate": "D:20140328164513Z", + "Producer": "Adobe PDF Library 10.0.1", + "Trapped": "/False", + }, + ], ] for file_path, expected_results in doc_info_results: pdf = pdfquery.PDFQuery(file_path) pdf.load(None) docinfo = dict(pdf.tree.getroot().attrib) - self.assertDictEqual(docinfo,expected_results) + self.assertDictEqual(docinfo, expected_results) class TestUnicode(BaseTestCase): - def test_unicode_text(self): pdf = pdfquery.PDFQuery("tests/samples/bug18.pdf") pdf.load() self.assertEqual( pdf.pq('LTTextLineHorizontal:contains("Hop Hing Oils")').text(), - (u'5 Hop Hing Oils and Fats (Hong Kong) Ltd \uf06c ' - u'\u7279\u5bf6\u7cbe\u88fd\u8c6c\u6cb9') + ( + "5 Hop Hing Oils and Fats (Hong Kong) Ltd \uf06c " + "\u7279\u5bf6\u7cbe\u88fd\u8c6c\u6cb9" + ), ) def test_invalid_xml_characters(self): pdf = pdfquery.PDFQuery("tests/samples/bug39.pdf") - pdf.load(2) # throws error if we fail to strip ascii control characters -- see issue #39 + pdf.load( + 2 + ) # throws error if we fail to strip ascii control characters -- see issue #39 class TestAnnotations(BaseTestCase): """ - Ensure that annotations such as links are getting added to the PDFs - properly, as discussed in issue #28. + Ensure that annotations such as links are getting added to the PDFs + properly, as discussed in issue #28. """ def test_xml_conversion(self): """ - Test that converted XML hasn't changed from saved version. + Test that converted XML hasn't changed from saved version. """ pdf = pdfquery.PDFQuery("tests/samples/bug28.pdf") pdf.load() @@ -157,7 +165,7 @@ def test_xml_conversion(self): def test_annot_dereferencing(self): """ - See issues #37, #42. + See issues #37, #42. """ pdf = pdfquery.PDFQuery("tests/samples/bug37.pdf") pdf.load() @@ -167,30 +175,32 @@ def test_annot_dereferencing(self): class TestPageRange(BaseTestCase): """ - Test various page number parameters + Test various page number parameters """ @classmethod def setUpClass(cls): cache_dir = "{}/".format(tempfile.gettempdir()) - cls.pdf = pdfquery.PDFQuery("tests/samples/bug67.pdf", parse_tree_cacher=FileCache(cache_dir)) + cls.pdf = pdfquery.PDFQuery( + "tests/samples/bug67.pdf", parse_tree_cacher=FileCache(cache_dir) + ) def test_page_int(self): self.pdf.load(3) - self.assertEqual(len(self.pdf.pq('LTPage')), 1) + self.assertEqual(len(self.pdf.pq("LTPage")), 1) self.pdf.load(0, 10, 25, 49) - self.assertEqual(len(self.pdf.pq('LTPage')), 4) + self.assertEqual(len(self.pdf.pq("LTPage")), 4) def test_page_array(self): self.pdf.load([0, 7, 11]) - self.assertEqual(len(self.pdf.pq('LTPage')), 3) + self.assertEqual(len(self.pdf.pq("LTPage")), 3) self.pdf.load([10], [0, 12], [30, 40]) - self.assertEqual(len(self.pdf.pq('LTPage')), 5) + self.assertEqual(len(self.pdf.pq("LTPage")), 5) def test_page_mixed(self): self.pdf.load([0, 7, 11], [0, 44], 1) - self.assertEqual(len(self.pdf.pq('LTPage')), 6) + self.assertEqual(len(self.pdf.pq("LTPage")), 6) def test_page_range(self): self.pdf.load(range(0, 150)) - self.assertEqual(len(self.pdf.pq('LTPage')), 150) \ No newline at end of file + self.assertEqual(len(self.pdf.pq("LTPage")), 150) diff --git a/tests/utils.py b/tests/utils.py index 0ac5a6a..c959017 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,17 +1,16 @@ -import sys import unittest from lxml import etree from six import BytesIO # ignore index= attribute in xml comparison, as it is not stable between python versions -IGNORE_ATTRIBS = {'index'} +IGNORE_ATTRIBS = {"index"} -class BaseTestCase(unittest.TestCase): +class BaseTestCase(unittest.TestCase): def assertValidOutput(self, pdf, output_name): """ - Test that converted XML hasn't changed from saved version. + Test that converted XML hasn't changed from saved version. """ # get current XML for sample file tree_string = BytesIO() @@ -22,7 +21,7 @@ def assertValidOutput(self, pdf, output_name): # this varies by Python version, because the float handling isn't quite # the same comparison_file = "tests/saved_output/%s.xml" % (output_name,) - with open(comparison_file, 'rb') as f: + with open(comparison_file, "rb") as f: saved_string = f.read() # compare current to previous @@ -34,19 +33,30 @@ def assertValidOutput(self, pdf, output_name): out.write(tree_string) # for debugging: run `pytest --lf --pdb` and then use etree.dump(e1), etree.dump(e2) e1, e2 = e.args[1:3] - raise self.failureException("XML conversion of sample pdf has changed! Compare %s to %s" % (comparison_file, output_path)) from e + raise self.failureException( + "XML conversion of sample pdf has changed! Compare %s to %s" + % (comparison_file, output_path) + ) from e def xml_strings_equal(self, s1, s2, ignore_attribs=IGNORE_ATTRIBS): """ - Return true if two xml strings are semantically equivalent (ignoring attribute ordering and whitespace). + Return true if two xml strings are semantically equivalent (ignoring attribute ordering and whitespace). """ + # via http://stackoverflow.com/a/24349916/307769 def elements_equal(e1, e2): - if e1.tag != e2.tag: raise self.failureException("Mismatched tags", e1, e2) - if e1.text != e2.text: raise self.failureException("Mismatched text", e1, e2) - if e1.tail != e2.tail: raise self.failureException("Mismatched tail", e1, e2) - if set(e1.attrib) - ignore_attribs != set(e2.attrib) - ignore_attribs: raise self.failureException("Mismatched attributes %s and %s" % (e1.attrib, e2.attrib), e1, e2) - if len(e1) != len(e2): raise self.failureException("Mismatched children", e1, e2) + if e1.tag != e2.tag: + raise self.failureException("Mismatched tags", e1, e2) + if e1.text != e2.text: + raise self.failureException("Mismatched text", e1, e2) + if e1.tail != e2.tail: + raise self.failureException("Mismatched tail", e1, e2) + if set(e1.attrib) - ignore_attribs != set(e2.attrib) - ignore_attribs: + raise self.failureException( + "Mismatched attributes %s and %s" % (e1.attrib, e2.attrib), e1, e2 + ) + if len(e1) != len(e2): + raise self.failureException("Mismatched children", e1, e2) for c1, c2 in zip(e1, e2): elements_equal(c1, c2)