Skip to content

Commit

Permalink
api: exceptions are used instead of status codes
Browse files Browse the repository at this point in the history
* Make refextract more idiomatic, raising exceptions instead of
having (result, error) return values in functions.
* INCOMPATIBLE FullTextNotAvailable is renamed to
FullTextNotAvailableError.
* NEW There are two new exceptions, UnknownDocumentTypeError when the
file/URL is not a PDF or plain text and GarbageFullTextError when the
PDF fulltext extraction gives garbage.
* The exception raised when 'pdftotext' is not found is now
FileNotFoundError instead of Exception.
* Fix the utterly broken error handling in extract_references_from_url.
* Add tests for UnknownDocumentTypeError and
FullTextNotAvailableError.

Signed-off-by: Micha Moskovic <michamos@gmail.com>
  • Loading branch information
michamos committed Jan 5, 2017
1 parent 10f0ccf commit 350b4e8
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 39 deletions.
10 changes: 5 additions & 5 deletions refextract/documents/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from six import iteritems

from ..references.config import CFG_PATH_PDFTOTEXT
from ..references.errors import GarbageFullTextError

# a dictionary of undesirable characters and their replacements:
UNDESIRABLE_CHAR_REPLACEMENTS = {
Expand Down Expand Up @@ -480,18 +481,18 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
Take the path to a PDF file and run pdftotext for this file, capturing
the output.
It raises GarbageFullTextError when this output is garbage.
@param fpath: (string) path to the PDF file
@return: (list) of unicode strings (contents of the PDF file translated
into plaintext; each string is a line in the document.)
"""
if not os.path.isfile(CFG_PATH_PDFTOTEXT):
raise Exception('Missing pdftotext executable')
raise FileNotFoundError('Missing pdftotext executable')

if keep_layout:
layout_option = "-layout"
else:
layout_option = "-raw"
status = 0
doclines = []
# Pattern to check for lines with a leading page-break character.
# If this pattern is matched, we want to split the page-break into
Expand Down Expand Up @@ -525,7 +526,6 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):

# finally, check conversion result not bad:
if pdftotext_conversion_is_bad(doclines):
status = 2
doclines = []
raise GarbageFullTextError("Garbage fulltext in '{0}'".format(fpath))

return (doclines, status)
return doclines
51 changes: 25 additions & 26 deletions refextract/references/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@


import os
import sys
import requests
import magic

Expand All @@ -41,7 +42,7 @@
parse_reference_line,
parse_references,
parse_tagged_reference_line)
from .errors import FullTextNotAvailable
from .errors import FullTextNotAvailableError
from .find import (find_numeration_in_body,
get_reference_section_beginning)
from .pdf import extract_texkeys_from_pdf
Expand All @@ -55,7 +56,9 @@ def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
The first parameter is the URL of the file.
It returns a list of parsed references.
It raises FullTextNotAvailable if the file does not exist.
It raises FullTextNotAvailableError if the URL gives a 404,
UnknownDocumentTypeError if it is not a PDF or plain text
and GarbageFullTextError if the fulltext extraction gives garbage.
The standard reference format is: {title} {volume} ({year}) {page}.
Expand All @@ -71,32 +74,26 @@ def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
>>> extract_references_from_url(path, override_kbs_files={'journals': 'my/path/to.kb'})
It raises FullTextNotAvailable if the url gives a 404
"""
# Get temporary filepath to download to
filename, filepath = mkstemp(
suffix="_{0}".format(os.path.basename(url)),
)
os.close(filename)

req = requests.get(
url=url,
headers=headers,
stream=True
)
if req.status_code == 200:
try:
req = requests.get(
url=url,
headers=headers,
stream=True
)
req.raise_for_status()
with open(filepath, 'wb') as f:
for chunk in req.iter_content(chunk_size):
f.write(chunk)

try:
try:
references = extract_references_from_file(filepath, **kwargs)
except IOError as err:
if err.code == 404:
raise FullTextNotAvailable()
else:
raise
references = extract_references_from_file(filepath, **kwargs)
except requests.exceptions.HTTPError as e:
raise FullTextNotAvailableError("URL not found: '{0}'".format(url)), None, sys.exc_info()[2]
finally:
os.remove(filepath)
return references
Expand All @@ -111,7 +108,10 @@ def extract_references_from_file(path,
The first parameter is the path to the file.
It returns a list of parsed references.
It raises FullTextNotAvailable if the file does not exist.
It raises FullTextNotAvailableError if the file does not exist,
UnknownDocumentTypeError if it is not a PDF or plain text
and GarbageFullTextError if the fulltext extraction gives garbage.
The standard reference format is: {title} {volume} ({year}) {page}.
Expand All @@ -129,12 +129,12 @@ def extract_references_from_file(path,
"""
if not os.path.isfile(path):
raise FullTextNotAvailable()
raise FullTextNotAvailableError("File not found: '{0}'".format(path))

docbody, dummy = get_plaintext_document_body(path)
docbody = get_plaintext_document_body(path)
reflines, dummy, dummy = extract_references_from_fulltext(docbody)
if not len(reflines):
docbody, dummy = get_plaintext_document_body(path, keep_layout=True)
if not reflines:
docbody = get_plaintext_document_body(path, keep_layout=True)
reflines, dummy, dummy = extract_references_from_fulltext(docbody)

parsed_refs, stats = parse_references(
Expand Down Expand Up @@ -163,7 +163,6 @@ def extract_references_from_string(source,
The first parameter is the path to the file.
It returns a tuple (references, stats).
It raises FullTextNotAvailable if the file does not exist.
If the string does not only contain references, improve accuracy by
specifing ``is_only_references=False``.
Expand All @@ -172,15 +171,15 @@ def extract_references_from_string(source,
E.g. you can change that by passing the reference_format:
>>> extract_references_from_url(path, reference_format="{title},{volume},{page}")
>>> extract_references_from_string(path, reference_format="{title},{volume},{page}")
If you want to also link each reference to some other resource (like a record),
you can provide a linker_callback function to be executed for every reference
element found.
To override KBs for journal names etc., use ``override_kbs_files``:
>>> extract_references_from_url(path, override_kbs_files={'journals': 'my/path/to.kb'})
>>> extract_references_from_string(path, override_kbs_files={'journals': 'my/path/to.kb'})
"""
docbody = source.split('\n')
if not is_only_references:
Expand Down
14 changes: 8 additions & 6 deletions refextract/references/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
CFG_PATH_GFILE
)

from .errors import UnknownDocumentTypeError

from .tag import (
tag_reference_line,
sum_2_dictionaries,
def get_plaintext_document_body(fpath, keep_layout=False):
    """Given a file-path to a full-text, return a list of unicode strings
    whereby each string is a line of the fulltext.

    In the case of a plain-text document, this simply means reading the
    contents in from the file. In the case of a PDF however,
    this means converting the document to plaintext.

    It raises UnknownDocumentTypeError if the document is not a PDF or
    plain text.

    @param fpath: (string) - the path to the fulltext file
    @return: (list) of strings - each string being a line in the document.
    """
    textbody = []
    # Detect the real MIME type from the file content, not the extension.
    mime_type = magic.from_file(fpath, mime=True)

    if mime_type == "text/plain":
        with open(fpath, "r") as f:
            textbody = [line.decode("utf-8") for line in f.readlines()]

    elif mime_type == "application/pdf":
        # May raise GarbageFullTextError if the conversion output is garbage.
        textbody = convert_PDF_to_plaintext(fpath, keep_layout)

    else:
        # Unsupported format: signal it via an exception instead of the
        # previous (textbody, status) tuple convention.
        raise UnknownDocumentTypeError(mime_type)

    return textbody


def parse_references(reference_lines,
Expand Down
12 changes: 11 additions & 1 deletion refextract/references/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,16 @@
"""Custom exceptions."""


class FullTextNotAvailable(Exception):
class FullTextNotAvailableError(Exception):

    """Raised when we cannot access the document text.

    This happens when the input file does not exist, or when the
    download of the input URL fails (e.g. a 404 response).
    """


class GarbageFullTextError(Exception):

    """Raised when the fulltext extraction from the PDF gives garbage.

    Raised by the PDF-to-plaintext conversion when its output fails the
    sanity check on the converted lines.
    """


class UnknownDocumentTypeError(Exception):

    """Raised when we don't know how to handle the document's MIME type.

    Only PDF ("application/pdf") and plain text ("text/plain") are
    supported; the offending MIME type is passed as the exception argument.
    """
17 changes: 16 additions & 1 deletion tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
extract_references_from_file,
)

from refextract.references.errors import FullTextNotAvailableError


@pytest.fixture
def kbs_override():
Expand Down Expand Up @@ -130,8 +132,10 @@ def test_extract_references_from_file(pdf_files):
assert 'texkey' in r[0]
assert 'author' in r[0]
assert len(r) == 36
with pytest.raises(FullTextNotAvailableError):
extract_references_from_file(pdf_files[0] + "error")


@responses.activate
def test_extract_references_from_url(pdf_files):
with open(pdf_files[0], 'rb') as fd:
url = "http://arxiv.org/pdf/1503.07589v1.pdf"
Expand All @@ -144,3 +148,14 @@ def test_extract_references_from_url(pdf_files):

r = extract_references_from_url(url)
assert len(r) == 36

with pytest.raises(FullTextNotAvailableError):
url = "http://www.example.com"
responses.add(
responses.GET,
url,
body="File not found!",
status=404,
content_type='text/plain',
)
extract_references_from_url(url)
18 changes: 18 additions & 0 deletions tests/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,15 @@

"""The Refextract unit test suite"""

import pytest

from refextract.references.engine import (
get_plaintext_document_body,
parse_references,
)

from refextract.references.errors import UnknownDocumentTypeError

from refextract.references.text import wash_and_repair_reference_line


Expand Down Expand Up @@ -99,3 +104,16 @@ def test_extra_a_in_report_number():
u'ATLAS-CONF-2012-078',
]
assert references[0]['linemarker'] == [u'14']

def test_get_plaintext_document_body(tmpdir):
    """Check plain-text passthrough and rejection of unsupported MIME types."""
    # A plain-text file must be returned unchanged, line by line.
    # ('expected' rather than 'input' to avoid shadowing the builtin.)
    expected = [u"Some text\n", u"on multiple lines\n"]
    f = tmpdir.join("plain.txt")
    f.write("".join(expected))
    assert expected == get_plaintext_document_body(str(f))

    # An HTML file is neither a PDF nor plain text, so the engine must
    # raise UnknownDocumentTypeError carrying the detected MIME type.
    # Keep the file setup outside the raises-block so only the call
    # under test may trigger the exception.
    html = "<html><body>Some page</body></html>"
    f = tmpdir.join("page.html")
    f.write(html)
    with pytest.raises(UnknownDocumentTypeError) as excinfo:
        get_plaintext_document_body(str(f))
    # Compare against str(excinfo.value): the exception instance itself
    # does not support the `in` operator (TypeError in the original).
    assert 'text/html' in str(excinfo.value)

0 comments on commit 350b4e8

Please sign in to comment.