Skip to content

Commit

Permalink
api: exceptions are used instead of status codes
Browse files Browse the repository at this point in the history
* Make refextract more idiomatic, raising exceptions instead of
having (result, error) return values in functions.
* INCOMPATIBLE FullTextNotAvailable is renamed to
FullTextNotAvailableError.
* NEW There are two new exceptions, UnknownDocumentTypeError when the
file/URL is not a PDF or plain text and GarbageFullTextError when the
PDF fulltext extraction gives garbage.
* The exception raised when 'pdftotext' is not found is now
FileNotFoundError instead of Exception.
* Fix the utterly broken error handling in extract_references_from_url.
* Add tests for UnknownDocumentTypeError and
FullTextNotAvailableError.

Signed-off-by: Micha Moskovic <michamos@gmail.com>
  • Loading branch information
michamos committed Jan 5, 2017
1 parent 10f0ccf commit 350b4e8
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 39 deletions.
10 changes: 5 additions & 5 deletions refextract/documents/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from six import iteritems

from ..references.config import CFG_PATH_PDFTOTEXT
from ..references.errors import GarbageFullTextError

# a dictionary of undesirable characters and their replacements:
UNDESIRABLE_CHAR_REPLACEMENTS = {
Expand Down Expand Up @@ -480,18 +481,18 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
Take the path to a PDF file and run pdftotext for this file, capturing
the output.
It raises GarbageFullTextError when this output is garbage.
@param fpath: (string) path to the PDF file
@return: (list) of unicode strings (contents of the PDF file translated
into plaintext; each string is a line in the document.)
"""
if not os.path.isfile(CFG_PATH_PDFTOTEXT):
raise Exception('Missing pdftotext executable')
raise FileNotFoundError('Missing pdftotext executable')

if keep_layout:
layout_option = "-layout"
else:
layout_option = "-raw"
status = 0
doclines = []
# Pattern to check for lines with a leading page-break character.
# If this pattern is matched, we want to split the page-break into
Expand Down Expand Up @@ -525,7 +526,6 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):

# finally, check conversion result not bad:
if pdftotext_conversion_is_bad(doclines):
status = 2
doclines = []
raise GarbageFullTextError("Garbage fulltext in '{0}'".format(fpath))

return (doclines, status)
return doclines
51 changes: 25 additions & 26 deletions refextract/references/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@


import os
import sys
import requests
import magic

Expand All @@ -41,7 +42,7 @@
parse_reference_line,
parse_references,
parse_tagged_reference_line)
from .errors import FullTextNotAvailable
from .errors import FullTextNotAvailableError
from .find import (find_numeration_in_body,
get_reference_section_beginning)
from .pdf import extract_texkeys_from_pdf
Expand All @@ -55,7 +56,9 @@ def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
The first parameter is the URL of the file.
It returns a list of parsed references.
It raises FullTextNotAvailable if the file does not exist.
It raises FullTextNotAvailableError if the URL gives a 404,
UnknownDocumentTypeError if it is not a PDF or plain text
and GarbageFullTextError if the fulltext extraction gives garbage.
The standard reference format is: {title} {volume} ({year}) {page}.
Expand All @@ -71,32 +74,26 @@ def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
>>> extract_references_from_url(path, override_kbs_files={'journals': 'my/path/to.kb'})
It raises FullTextNotAvailable if the url gives a 404
"""
# Get temporary filepath to download to
filename, filepath = mkstemp(
suffix="_{0}".format(os.path.basename(url)),
)
os.close(filename)

req = requests.get(
url=url,
headers=headers,
stream=True
)
if req.status_code == 200:
try:
req = requests.get(
url=url,
headers=headers,
stream=True
)
req.raise_for_status()
with open(filepath, 'wb') as f:
for chunk in req.iter_content(chunk_size):
f.write(chunk)

try:
try:
references = extract_references_from_file(filepath, **kwargs)
except IOError as err:
if err.code == 404:
raise FullTextNotAvailable()
else:
raise
references = extract_references_from_file(filepath, **kwargs)
except requests.exceptions.HTTPError as e:
raise FullTextNotAvailableError("URL not found: '{0}'".format(url)), None, sys.exc_info()[2]
finally:
os.remove(filepath)
return references
Expand All @@ -111,7 +108,10 @@ def extract_references_from_file(path,
The first parameter is the path to the file.
It returns a list of parsed references.
It raises FullTextNotAvailable if the file does not exist.
It raises FullTextNotAvailableError if the file does not exist,
UnknownDocumentTypeError if it is not a PDF or plain text
and GarbageFullTextError if the fulltext extraction gives garbage.
The standard reference format is: {title} {volume} ({year}) {page}.
Expand All @@ -129,12 +129,12 @@ def extract_references_from_file(path,
"""
if not os.path.isfile(path):
raise FullTextNotAvailable()
raise FullTextNotAvailableError("File not found: '{0}'".format(path))

docbody, dummy = get_plaintext_document_body(path)
docbody = get_plaintext_document_body(path)
reflines, dummy, dummy = extract_references_from_fulltext(docbody)
if not len(reflines):
docbody, dummy = get_plaintext_document_body(path, keep_layout=True)
if not reflines:
docbody = get_plaintext_document_body(path, keep_layout=True)
reflines, dummy, dummy = extract_references_from_fulltext(docbody)

parsed_refs, stats = parse_references(
Expand Down Expand Up @@ -163,7 +163,6 @@ def extract_references_from_string(source,
The first parameter is the path to the file.
It returns a tuple (references, stats).
It raises FullTextNotAvailable if the file does not exist.
If the string does not only contain references, improve accuracy by
specifing ``is_only_references=False``.
Expand All @@ -172,15 +171,15 @@ def extract_references_from_string(source,
E.g. you can change that by passing the reference_format:
>>> extract_references_from_url(path, reference_format="{title},{volume},{page}")
>>> extract_references_from_string(path, reference_format="{title},{volume},{page}")
If you want to also link each reference to some other resource (like a record),
you can provide a linker_callback function to be executed for every reference
element found.
To override KBs for journal names etc., use ``override_kbs_files``:
>>> extract_references_from_url(path, override_kbs_files={'journals': 'my/path/to.kb'})
>>> extract_references_from_string(path, override_kbs_files={'journals': 'my/path/to.kb'})
"""
docbody = source.split('\n')
if not is_only_references:
Expand Down
14 changes: 8 additions & 6 deletions refextract/references/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
CFG_PATH_GFILE
)

from .errors import UnknownDocumentTypeError

from .tag import (
tag_reference_line,
sum_2_dictionaries,
def get_plaintext_document_body(fpath, keep_layout=False):
    """Given a file-path to a full-text, return a list of unicode strings
    whereby each string is a line of the fulltext.

    In the case of a plain-text document, this simply means reading the
    contents in from the file. In the case of a PDF however,
    this means converting the document to plaintext.

    It raises UnknownDocumentTypeError if the document is not a PDF or
    plain text.

    @param fpath: (string) - the path to the fulltext file
    @return: (list) of strings - each string being a line in the document.
    """
    textbody = []
    # Detect the real MIME type from the file content, not the extension.
    mime_type = magic.from_file(fpath, mime=True)

    if mime_type == "text/plain":
        with open(fpath, "r") as f:
            textbody = [line.decode("utf-8") for line in f.readlines()]

    elif mime_type == "application/pdf":
        # May raise GarbageFullTextError if the conversion output is garbage.
        textbody = convert_PDF_to_plaintext(fpath, keep_layout)

    else:
        # Unsupported format: signal it via an exception instead of the
        # previous (textbody, status) tuple convention.
        raise UnknownDocumentTypeError(mime_type)

    return textbody


def parse_references(reference_lines,
Expand Down
12 changes: 11 additions & 1 deletion refextract/references/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,16 @@
"""Custom exceptions."""


class FullTextNotAvailable(Exception):
class FullTextNotAvailableError(Exception):

    """Raised when we cannot access the document text.

    This happens when the input file does not exist, or when the
    download of the input URL fails (e.g. a 404 response).
    """


class GarbageFullTextError(Exception):

    """Raised when the fulltext extraction from the PDF gives garbage.

    Raised by the PDF-to-plaintext conversion when its output fails the
    sanity check on the converted lines.
    """


class UnknownDocumentTypeError(Exception):

    """Raised when we don't know how to handle the document's MIME type.

    Only PDF ("application/pdf") and plain text ("text/plain") are
    supported; the offending MIME type is passed as the exception argument.
    """
17 changes: 16 additions & 1 deletion tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
extract_references_from_file,
)

from refextract.references.errors import FullTextNotAvailableError


@pytest.fixture
def kbs_override():
Expand Down Expand Up @@ -130,8 +132,10 @@ def test_extract_references_from_file(pdf_files):
assert 'texkey' in r[0]
assert 'author' in r[0]
assert len(r) == 36
with pytest.raises(FullTextNotAvailableError):
extract_references_from_file(pdf_files[0] + "error")


@responses.activate
def test_extract_references_from_url(pdf_files):
with open(pdf_files[0], 'rb') as fd:
url = "http://arxiv.org/pdf/1503.07589v1.pdf"
Expand All @@ -144,3 +148,14 @@ def test_extract_references_from_url(pdf_files):

r = extract_references_from_url(url)
assert len(r) == 36

with pytest.raises(FullTextNotAvailableError):
url = "http://www.example.com"
responses.add(
responses.GET,
url,
body="File not found!",
status=404,
content_type='text/plain',
)
extract_references_from_url(url)
18 changes: 18 additions & 0 deletions tests/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,15 @@

"""The Refextract unit test suite"""

import pytest

from refextract.references.engine import (
get_plaintext_document_body,
parse_references,
)

from refextract.references.errors import UnknownDocumentTypeError

from refextract.references.text import wash_and_repair_reference_line


Expand Down Expand Up @@ -99,3 +104,16 @@ def test_extra_a_in_report_number():
u'ATLAS-CONF-2012-078',
]
assert references[0]['linemarker'] == [u'14']

def test_get_plaintext_document_body(tmpdir):
    """Check plain-text passthrough and rejection of unsupported MIME types."""
    # A plain-text file must be returned unchanged, line by line.
    # ('expected' rather than 'input' to avoid shadowing the builtin.)
    expected = [u"Some text\n", u"on multiple lines\n"]
    f = tmpdir.join("plain.txt")
    f.write("".join(expected))
    assert expected == get_plaintext_document_body(str(f))

    # An HTML file is neither a PDF nor plain text, so the engine must
    # raise UnknownDocumentTypeError carrying the detected MIME type.
    # Keep the file setup outside the raises-block so only the call
    # under test may trigger the exception.
    html = "<html><body>Some page</body></html>"
    f = tmpdir.join("page.html")
    f.write(html)
    with pytest.raises(UnknownDocumentTypeError) as excinfo:
        get_plaintext_document_body(str(f))
    # Compare against str(excinfo.value): the exception instance itself
    # does not support the `in` operator (TypeError in the original).
    assert 'text/html' in str(excinfo.value)

0 comments on commit 350b4e8

Please sign in to comment.