Extract texkeys from PDFs #18

Merged · 6 commits · Jan 5, 2017
76 changes: 24 additions & 52 deletions README.rst
@@ -76,32 +76,18 @@ To extract references from a publication full-text PDF:
from refextract import extract_references_from_file
reference = extract_references_from_file("some/fulltext/1503.07589v1.pdf")
print(reference)
-{
-    'references': [
-        {'author': [u'F. Englert and R. Brout'],
-         'doi': [u'10.1103/PhysRevLett.13.321'],
-         'journal_page': [u'321'],
-         'journal_reference': ['Phys.Rev.Lett.,13,1964'],
-         'journal_title': [u'Phys.Rev.Lett.'],
-         'journal_volume': [u'13'],
-         'journal_year': [u'1964'],
-         'linemarker': [u'1'],
-         'title': [u'Broken symmetry and the mass of gauge vector mesons'],
-         'year': [u'1964']}, ...
-    ],
-    'stats': {
-        'author': 15,
-        'date': '2016-01-12 10:52:58',
-        'doi': 1,
-        'misc': 0,
-        'old_stats_str': '0-1-1-15-0-1-0',
-        'reportnum': 1,
-        'status': 0,
-        'title': 1,
-        'url': 0,
-        'version': u'0.1.0.dev20150722'
-    }
-}
+[
+    {'author': [u'F. Englert and R. Brout'],
+     'doi': [u'10.1103/PhysRevLett.13.321'],
+     'journal_page': [u'321'],
+     'journal_reference': ['Phys.Rev.Lett.,13,1964'],
+     'journal_title': [u'Phys.Rev.Lett.'],
+     'journal_volume': [u'13'],
+     'journal_year': [u'1964'],
+     'linemarker': [u'1'],
+     'title': [u'Broken symmetry and the mass of gauge vector mesons'],
+     'year': [u'1964']}, ...
+]

You can also extract directly from a URL:

@@ -110,29 +96,15 @@
from refextract import extract_references_from_url
reference = extract_references_from_url("http://arxiv.org/pdf/1503.07589v1.pdf")
print(reference)
-{
-    'references': [
-        {'author': [u'F. Englert and R. Brout'],
-         'doi': [u'10.1103/PhysRevLett.13.321'],
-         'journal_page': [u'321'],
-         'journal_reference': ['Phys.Rev.Lett.,13,1964'],
-         'journal_title': [u'Phys.Rev.Lett.'],
-         'journal_volume': [u'13'],
-         'journal_year': [u'1964'],
-         'linemarker': [u'1'],
-         'title': [u'Broken symmetry and the mass of gauge vector mesons'],
-         'year': [u'1964']}, ...
-    ],
-    'stats': {
-        'author': 15,
-        'date': '2016-01-12 10:52:58',
-        'doi': 1,
-        'misc': 0,
-        'old_stats_str': '0-1-1-15-0-1-0',
-        'reportnum': 1,
-        'status': 0,
-        'title': 1,
-        'url': 0,
-        'version': u'0.1.0.dev20150722'
-    }
-}
+[
+    {'author': [u'F. Englert and R. Brout'],
+     'doi': [u'10.1103/PhysRevLett.13.321'],
+     'journal_page': [u'321'],
+     'journal_reference': ['Phys.Rev.Lett.,13,1964'],
+     'journal_title': [u'Phys.Rev.Lett.'],
+     'journal_volume': [u'13'],
+     'journal_year': [u'1964'],
+     'linemarker': [u'1'],
+     'title': [u'Broken symmetry and the mass of gauge vector mesons'],
+     'year': [u'1964']}, ...
+]
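A quick way for reviewers to exercise the new behaviour end to end; a minimal sketch, assuming the exception classes are importable from refextract.references.errors as the imports in this diff suggest:

from refextract import extract_references_from_url
from refextract.references.errors import (
    FullTextNotAvailableError,
    GarbageFullTextError,
    UnknownDocumentTypeError,
)

try:
    # After this PR the call returns a plain list of reference dicts
    # rather than the old {'references': ..., 'stats': ...} dict.
    references = extract_references_from_url("http://arxiv.org/pdf/1503.07589v1.pdf")
except FullTextNotAvailableError:
    references = []  # the URL gave a 404
except UnknownDocumentTypeError:
    references = []  # neither a PDF nor plain text
except GarbageFullTextError:
    references = []  # pdftotext output was garbage

for ref in references:
    # 'texkey' is only attached when the PDF metadata lined up
    print(ref.get('title'), ref.get('texkey'))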
10 changes: 5 additions & 5 deletions refextract/documents/pdf.py
@@ -43,6 +43,7 @@
from six import iteritems

from ..references.config import CFG_PATH_PDFTOTEXT
+from ..references.errors import GarbageFullTextError

# a dictionary of undesirable characters and their replacements:
UNDESIRABLE_CHAR_REPLACEMENTS = {
@@ -480,18 +481,18 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
Take the path to a PDF file and run pdftotext for this file, capturing
the output.
+It raises GarbageFullTextError when this output is garbage.
@param fpath: (string) path to the PDF file
@return: (list) of unicode strings (contents of the PDF file translated
into plaintext; each string is a line in the document.)
"""
if not os.path.isfile(CFG_PATH_PDFTOTEXT):
-raise Exception('Missing pdftotext executable')
+raise FileNotFoundError('Missing pdftotext executable')

if keep_layout:
layout_option = "-layout"
else:
layout_option = "-raw"
-status = 0
doclines = []
# Pattern to check for lines with a leading page-break character.
# If this pattern is matched, we want to split the page-break into
Expand Down Expand Up @@ -525,7 +526,6 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):

# finally, check conversion result not bad:
if pdftotext_conversion_is_bad(doclines):
-status = 2
-doclines = []
+raise GarbageFullTextError("Garbage fulltext in '{0}'".format(fpath))

-return (doclines, status)
+return doclines
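Worth noting for call sites: the status-flag protocol is gone from this function, so callers move from checking a numeric code to catching an exception. A minimal before/after sketch, assuming only what the hunks above show:

from refextract.documents.pdf import convert_PDF_to_plaintext
from refextract.references.errors import GarbageFullTextError

# Before: doclines, status = convert_PDF_to_plaintext(fpath)
#         if status != 0: <handle the failure via the flag>
# After:  the garbage case raises instead of signalling status == 2.
try:
    doclines = convert_PDF_to_plaintext("some/fulltext/1503.07589v1.pdf")
except GarbageFullTextError:
    doclines = []  # conversion produced garbage; treat as no fulltext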
77 changes: 45 additions & 32 deletions refextract/references/api.py
@@ -30,27 +30,35 @@


import os
+import sys
import requests
+import magic

from tempfile import mkstemp
+from itertools import izip

from .engine import (get_kbs,
get_plaintext_document_body,
parse_reference_line,
parse_references,
parse_tagged_reference_line)
-from .errors import FullTextNotAvailable
+from .errors import FullTextNotAvailableError
from .find import (find_numeration_in_body,
get_reference_section_beginning)
+from .pdf import extract_texkeys_from_pdf
from .tag import tag_reference_line
from .text import extract_references_from_fulltext, rebuild_reference_lines


def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
"""Extract references from the pdf specified in the url.

-The first parameter is the path to the file
-It raises FullTextNotAvailable if the file does not exist.
+The first parameter is the URL of the file.
+It returns a list of parsed references.
+
+It raises FullTextNotAvailableError if the URL gives a 404,
+UnknownDocumentTypeError if it is not a PDF or plain text
+and GarbageFullTextError if the fulltext extraction gives garbage.

The standard reference format is: {title} {volume} ({year}) {page}.

@@ -66,32 +74,26 @@ def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):

>>> extract_references_from_url(path, override_kbs_files={'journals': 'my/path/to.kb'})

-It raises FullTextNotAvailable if the url gives a 404
"""
# Get temporary filepath to download to
filename, filepath = mkstemp(
suffix="_{0}".format(os.path.basename(url)),
)
os.close(filename)

-    req = requests.get(
-        url=url,
-        headers=headers,
-        stream=True
-    )
-    if req.status_code == 200:
+    try:
+        req = requests.get(
+            url=url,
+            headers=headers,
+            stream=True
+        )
+        req.raise_for_status()
        with open(filepath, 'wb') as f:
            for chunk in req.iter_content(chunk_size):
                f.write(chunk)

-    try:
-        try:
-            references = extract_references_from_file(filepath, **kwargs)
-        except IOError as err:
-            if err.code == 404:
-                raise FullTextNotAvailable()
-            else:
-                raise
+        references = extract_references_from_file(filepath, **kwargs)
+    except requests.exceptions.HTTPError as e:
+        raise FullTextNotAvailableError("URL not found: '{0}'".format(url)), None, sys.exc_info()[2]
finally:
os.remove(filepath)
return references
@@ -104,8 +106,12 @@ def extract_references_from_file(path,
override_kbs_files=None):
"""Extract references from a local pdf file.

-The first parameter is the path to the file
-It raises FullTextNotAvailable if the file does not exist.
+The first parameter is the path to the file.
+It returns a list of parsed references.
+It raises FullTextNotAvailableError if the file does not exist,
+UnknownDocumentTypeError if it is not a PDF or plain text
+and GarbageFullTextError if the fulltext extraction gives garbage.


The standard reference format is: {title} {volume} ({year}) {page}.

@@ -121,25 +127,31 @@ def extract_references_from_file(path,

>>> extract_references_from_file(path, override_kbs_files={'journals': 'my/path/to.kb'})

-Returns a dictionary with extracted references and stats.
"""
if not os.path.isfile(path):
-raise FullTextNotAvailable()
+raise FullTextNotAvailableError("File not found: '{0}'".format(path))

-docbody = get_plaintext_document_body(path)
-docbody, dummy = get_plaintext_document_body(path)
+docbody = get_plaintext_document_body(path)
reflines, dummy, dummy = extract_references_from_fulltext(docbody)
-if not len(reflines):
-    docbody, dummy = get_plaintext_document_body(path, keep_layout=True)
+if not reflines:
+    docbody = get_plaintext_document_body(path, keep_layout=True)
reflines, dummy, dummy = extract_references_from_fulltext(docbody)

-return parse_references(
+parsed_refs, stats = parse_references(
reflines,
recid=recid,
reference_format=reference_format,
linker_callback=linker_callback,
override_kbs_files=override_kbs_files,
)

+    if magic.from_file(path, mime=True) == "application/pdf":
+        texkeys = extract_texkeys_from_pdf(path)
+        if len(texkeys) == len(parsed_refs):
+            parsed_refs = [dict(ref, texkey=[key]) for ref, key in izip(parsed_refs, texkeys)]
+
+    return parsed_refs


def extract_references_from_string(source,
is_only_references=True,
@@ -149,8 +161,8 @@
override_kbs_files=None):
"""Extract references from a raw string.

-The first parameter is the path to the file
-It raises FullTextNotAvailable if the file does not exist.
+The first parameter is the raw string containing the references.
+It returns a list of parsed references.

If the string does not only contain references, improve accuracy by
specifying ``is_only_references=False``.
@@ -159,15 +171,15 @@

E.g. you can change that by passing the reference_format:

->>> extract_references_from_url(path, reference_format="{title},{volume},{page}")
+>>> extract_references_from_string(source, reference_format="{title},{volume},{page}")

If you want to also link each reference to some other resource (like a record),
you can provide a linker_callback function to be executed for every reference
element found.

To override KBs for journal names etc., use ``override_kbs_files``:

->>> extract_references_from_url(path, override_kbs_files={'journals': 'my/path/to.kb'})
+>>> extract_references_from_string(source, override_kbs_files={'journals': 'my/path/to.kb'})
"""
docbody = source.split('\n')
if not is_only_references:
@@ -181,13 +193,14 @@

reflines = rebuild_reference_lines(
docbody, refs_info['marker_pattern'])
-return parse_references(
+parsed_refs, stats = parse_references(
reflines,
recid=recid,
reference_format=reference_format,
linker_callback=linker_callback,
override_kbs_files=override_kbs_files,
)
+return parsed_refs


def extract_journal_reference(line, override_kbs_files=None):
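To make the texkey merge in extract_references_from_file concrete, here is a small self-contained sketch of the pairing logic; the sample references and keys are invented for illustration:

from itertools import izip  # Python 2, as used in this branch

parsed_refs = [
    {'author': [u'F. Englert and R. Brout'], 'linemarker': [u'1']},
    {'author': [u'P. W. Higgs'], 'linemarker': [u'2']},
]
texkeys = ['Englert:1964et', 'Higgs:1964pj']  # hypothetical keys

# Attach texkeys only when the counts line up; a length mismatch means
# the PDF's metadata could not be aligned with the parsed list, in
# which case the references are returned without texkeys.
if len(texkeys) == len(parsed_refs):
    parsed_refs = [dict(ref, texkey=[key])
                   for ref, key in izip(parsed_refs, texkeys)]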
54 changes: 22 additions & 32 deletions refextract/references/engine.py
@@ -30,6 +30,8 @@
from datetime import datetime
from itertools import chain

+import magic

from .config import (
CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM,
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL,
@@ -44,6 +46,8 @@
CFG_PATH_GFILE
)

+from .errors import UnknownDocumentTypeError

from .tag import (
tag_reference_line,
sum_2_dictionaries,
@@ -1370,44 +1374,32 @@

# Tasks related to conversion of full-text to plain-text:


def get_plaintext_document_body(fpath, keep_layout=False):
"""Given a file-path to a full-text, return a list of unicode strings
whereby each string is a line of the fulltext.
In the case of a plain-text document, this simply means reading the
-contents in from the file. In the case of a PDF/PostScript however,
+contents in from the file. In the case of a PDF however,
this means converting the document to plaintext.
+It raises UnknownDocumentTypeError if the document is not a PDF or
+plain text.
@param fpath: (string) - the path to the fulltext file
@return: (list) of strings - each string being a line in the document.
[Review thread on this docstring]

Contributor: This does not correspond to the current return value, which also returns a status flag.

Contributor Author: I can fix this, but wrong docstrings are the norm rather than the exception here.
"""
textbody = []
-    status = 0
-    if os.access(fpath, os.F_OK | os.R_OK):
-        # filepath OK - attempt to extract references:
-        # get file type:
-        cmd_pdftotext = [CFG_PATH_GFILE, fpath]
-        pipe_pdftotext = subprocess.Popen(
-            cmd_pdftotext, stdout=subprocess.PIPE)
-        res_gfile = pipe_pdftotext.stdout.read()
-
-        if (res_gfile.lower().find("text") != -1) and \
-                (res_gfile.lower().find("pdf") == -1):
-            # plain-text file: don't convert - just read in:
-            f = open(fpath, "r")
-            try:
-                textbody = [line.decode("utf-8") for line in f.readlines()]
-            finally:
-                f.close()
-        elif (res_gfile.lower().find("pdf") != -1) or \
-                (res_gfile.lower().find("pdfa") != -1):
-            # convert from PDF
-            (textbody, status) = convert_PDF_to_plaintext(fpath, keep_layout)
-        else:
-            # invalid format
-            status = 1
+    mime_type = magic.from_file(fpath, mime=True)
+
+    if mime_type == "text/plain":
+        with open(fpath, "r") as f:
+            textbody = [line.decode("utf-8") for line in f.readlines()]
+
+    elif mime_type == "application/pdf":
+        textbody = convert_PDF_to_plaintext(fpath, keep_layout)
+
    else:
-        # filepath not OK
-        status = 1
-    return (textbody, status)
+        raise UnknownDocumentTypeError(mime_type)
+
+    return textbody
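Both changed files now lean on python-magic instead of shelling out to gfile; for reference, the detection call used above (the sample path is hypothetical):

import magic

# python-magic returns the MIME type as a string, e.g. "application/pdf"
# or "text/plain"; any other value now raises UnknownDocumentTypeError.
mime_type = magic.from_file("some/fulltext/1503.07589v1.pdf", mime=True)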


def parse_references(reference_lines,
Expand All @@ -1426,10 +1418,8 @@ def parse_references(reference_lines,
processed_references, counts, dummy_bad_titles_count = \
parse_references_elements(reference_lines, kbs, linker_callback)

-    return {
-        "references": build_references(processed_references, reference_format),
-        "stats": build_stats(counts)
-    }
+    return (build_references(processed_references, reference_format),
+            build_stats(counts))


def build_stats(counts):
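Finally, since parse_references now returns a pair instead of a dict, downstream callers unpack it; a short sketch of the new contract (the caller code is illustrative, not part of this diff):

from refextract.references.engine import parse_references

reference_lines = [
    u'[1] F. Englert and R. Brout, Phys. Rev. Lett. 13 (1964) 321.',
]

# parse_references now returns (references, stats) rather than
# {'references': ..., 'stats': ...}.
references, stats = parse_references(reference_lines)
for ref in references:
    print(ref.get('journal_title'), ref.get('journal_year'))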