Extract texkeys from PDFs #18

Merged · 6 commits · Jan 5, 2017
76 changes: 24 additions & 52 deletions README.rst
@@ -76,32 +76,18 @@ To extract references from a publication full-text PDF:
from refextract import extract_references_from_file
reference = extract_references_from_file("some/fulltext/1503.07589v1.pdf")
print(reference)
-{
-    'references': [
-        {'author': [u'F. Englert and R. Brout'],
-         'doi': [u'10.1103/PhysRevLett.13.321'],
-         'journal_page': [u'321'],
-         'journal_reference': ['Phys.Rev.Lett.,13,1964'],
-         'journal_title': [u'Phys.Rev.Lett.'],
-         'journal_volume': [u'13'],
-         'journal_year': [u'1964'],
-         'linemarker': [u'1'],
-         'title': [u'Broken symmetry and the mass of gauge vector mesons'],
-         'year': [u'1964']}, ...
-    ],
-    'stats': {
-        'author': 15,
-        'date': '2016-01-12 10:52:58',
-        'doi': 1,
-        'misc': 0,
-        'old_stats_str': '0-1-1-15-0-1-0',
-        'reportnum': 1,
-        'status': 0,
-        'title': 1,
-        'url': 0,
-        'version': u'0.1.0.dev20150722'
-    }
-}
+[
+    {'author': [u'F. Englert and R. Brout'],
+     'doi': [u'10.1103/PhysRevLett.13.321'],
+     'journal_page': [u'321'],
+     'journal_reference': ['Phys.Rev.Lett.,13,1964'],
+     'journal_title': [u'Phys.Rev.Lett.'],
+     'journal_volume': [u'13'],
+     'journal_year': [u'1964'],
+     'linemarker': [u'1'],
+     'title': [u'Broken symmetry and the mass of gauge vector mesons'],
+     'year': [u'1964']}, ...
+]

You can also extract directly from a URL:

@@ -110,29 +96,15 @@
from refextract import extract_references_from_url
reference = extract_references_from_url("http://arxiv.org/pdf/1503.07589v1.pdf")
print(reference)
-{
-    'references': [
-        {'author': [u'F. Englert and R. Brout'],
-         'doi': [u'10.1103/PhysRevLett.13.321'],
-         'journal_page': [u'321'],
-         'journal_reference': ['Phys.Rev.Lett.,13,1964'],
-         'journal_title': [u'Phys.Rev.Lett.'],
-         'journal_volume': [u'13'],
-         'journal_year': [u'1964'],
-         'linemarker': [u'1'],
-         'title': [u'Broken symmetry and the mass of gauge vector mesons'],
-         'year': [u'1964']}, ...
-    ],
-    'stats': {
-        'author': 15,
-        'date': '2016-01-12 10:52:58',
-        'doi': 1,
-        'misc': 0,
-        'old_stats_str': '0-1-1-15-0-1-0',
-        'reportnum': 1,
-        'status': 0,
-        'title': 1,
-        'url': 0,
-        'version': u'0.1.0.dev20150722'
-    }
-}
+[
+    {'author': [u'F. Englert and R. Brout'],
+     'doi': [u'10.1103/PhysRevLett.13.321'],
+     'journal_page': [u'321'],
+     'journal_reference': ['Phys.Rev.Lett.,13,1964'],
+     'journal_title': [u'Phys.Rev.Lett.'],
+     'journal_volume': [u'13'],
+     'journal_year': [u'1964'],
+     'linemarker': [u'1'],
+     'title': [u'Broken symmetry and the mass of gauge vector mesons'],
+     'year': [u'1964']}, ...
+]
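A quick way for reviewers to exercise the new behaviour end to end; a minimal sketch, assuming the exception classes are importable from refextract.references.errors as the imports in this diff suggest:

from refextract import extract_references_from_url
from refextract.references.errors import (
    FullTextNotAvailableError,
    GarbageFullTextError,
    UnknownDocumentTypeError,
)

try:
    # After this PR the call returns a plain list of reference dicts
    # rather than the old {'references': ..., 'stats': ...} dict.
    references = extract_references_from_url("http://arxiv.org/pdf/1503.07589v1.pdf")
except FullTextNotAvailableError:
    references = []  # the URL gave a 404
except UnknownDocumentTypeError:
    references = []  # neither a PDF nor plain text
except GarbageFullTextError:
    references = []  # pdftotext output was garbage

for ref in references:
    # 'texkey' is only attached when the PDF metadata lined up
    print(ref.get('title'), ref.get('texkey'))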
10 changes: 5 additions & 5 deletions refextract/documents/pdf.py
@@ -43,6 +43,7 @@
from six import iteritems

from ..references.config import CFG_PATH_PDFTOTEXT
+from ..references.errors import GarbageFullTextError

# a dictionary of undesirable characters and their replacements:
UNDESIRABLE_CHAR_REPLACEMENTS = {
@@ -480,18 +481,18 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
Take the path to a PDF file and run pdftotext for this file, capturing
the output.
+It raises GarbageFullTextError when this output is garbage.
@param fpath: (string) path to the PDF file
@return: (list) of unicode strings (contents of the PDF file translated
into plaintext; each string is a line in the document.)
"""
if not os.path.isfile(CFG_PATH_PDFTOTEXT):
-raise Exception('Missing pdftotext executable')
+raise FileNotFoundError('Missing pdftotext executable')

if keep_layout:
layout_option = "-layout"
else:
layout_option = "-raw"
-status = 0
doclines = []
# Pattern to check for lines with a leading page-break character.
# If this pattern is matched, we want to split the page-break into
Expand Down Expand Up @@ -525,7 +526,6 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):

# finally, check conversion result not bad:
if pdftotext_conversion_is_bad(doclines):
-status = 2
-doclines = []
+raise GarbageFullTextError("Garbage fulltext in '{0}'".format(fpath))

-return (doclines, status)
+return doclines
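Worth noting for call sites: the status-flag protocol is gone from this function, so callers move from checking a numeric code to catching an exception. A minimal before/after sketch, assuming only what the hunks above show:

from refextract.documents.pdf import convert_PDF_to_plaintext
from refextract.references.errors import GarbageFullTextError

# Before: doclines, status = convert_PDF_to_plaintext(fpath)
#         if status != 0: <handle the failure via the flag>
# After:  the garbage case raises instead of signalling status == 2.
try:
    doclines = convert_PDF_to_plaintext("some/fulltext/1503.07589v1.pdf")
except GarbageFullTextError:
    doclines = []  # conversion produced garbage; treat as no fulltext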
77 changes: 45 additions & 32 deletions refextract/references/api.py
@@ -30,27 +30,35 @@


import os
+import sys
import requests
+import magic

from tempfile import mkstemp
+from itertools import izip

from .engine import (get_kbs,
get_plaintext_document_body,
parse_reference_line,
parse_references,
parse_tagged_reference_line)
-from .errors import FullTextNotAvailable
+from .errors import FullTextNotAvailableError
from .find import (find_numeration_in_body,
get_reference_section_beginning)
+from .pdf import extract_texkeys_from_pdf
from .tag import tag_reference_line
from .text import extract_references_from_fulltext, rebuild_reference_lines


def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
"""Extract references from the pdf specified in the url.

-The first parameter is the path to the file
-It raises FullTextNotAvailable if the file does not exist.
+The first parameter is the URL of the file.
+It returns a list of parsed references.
+
+It raises FullTextNotAvailableError if the URL gives a 404,
+UnknownDocumentTypeError if it is not a PDF or plain text
+and GarbageFullTextError if the fulltext extraction gives garbage.

The standard reference format is: {title} {volume} ({year}) {page}.

@@ -66,32 +74,26 @@ def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):

>>> extract_references_from_url(path, override_kbs_files={'journals': 'my/path/to.kb'})

-It raises FullTextNotAvailable if the url gives a 404
"""
# Get temporary filepath to download to
filename, filepath = mkstemp(
suffix="_{0}".format(os.path.basename(url)),
)
os.close(filename)

-    req = requests.get(
-        url=url,
-        headers=headers,
-        stream=True
-    )
-    if req.status_code == 200:
+    try:
+        req = requests.get(
+            url=url,
+            headers=headers,
+            stream=True
+        )
+        req.raise_for_status()
        with open(filepath, 'wb') as f:
            for chunk in req.iter_content(chunk_size):
                f.write(chunk)

-    try:
-        try:
-            references = extract_references_from_file(filepath, **kwargs)
-        except IOError as err:
-            if err.code == 404:
-                raise FullTextNotAvailable()
-            else:
-                raise
+        references = extract_references_from_file(filepath, **kwargs)
+    except requests.exceptions.HTTPError as e:
+        raise FullTextNotAvailableError("URL not found: '{0}'".format(url)), None, sys.exc_info()[2]
finally:
os.remove(filepath)
return references
@@ -104,8 +106,12 @@ def extract_references_from_file(path,
override_kbs_files=None):
"""Extract references from a local pdf file.

-The first parameter is the path to the file
-It raises FullTextNotAvailable if the file does not exist.
+The first parameter is the path to the file.
+It returns a list of parsed references.
+It raises FullTextNotAvailableError if the file does not exist,
+UnknownDocumentTypeError if it is not a PDF or plain text
+and GarbageFullTextError if the fulltext extraction gives garbage.


The standard reference format is: {title} {volume} ({year}) {page}.

@@ -121,25 +127,31 @@ def extract_references_from_file(path,

>>> extract_references_from_file(path, override_kbs_files={'journals': 'my/path/to.kb'})

-Returns a dictionary with extracted references and stats.
"""
if not os.path.isfile(path):
-raise FullTextNotAvailable()
+raise FullTextNotAvailableError("File not found: '{0}'".format(path))

-docbody = get_plaintext_document_body(path)
-docbody, dummy = get_plaintext_document_body(path)
+docbody = get_plaintext_document_body(path)
reflines, dummy, dummy = extract_references_from_fulltext(docbody)
-if not len(reflines):
-    docbody, dummy = get_plaintext_document_body(path, keep_layout=True)
+if not reflines:
+    docbody = get_plaintext_document_body(path, keep_layout=True)
reflines, dummy, dummy = extract_references_from_fulltext(docbody)

-return parse_references(
+parsed_refs, stats = parse_references(
reflines,
recid=recid,
reference_format=reference_format,
linker_callback=linker_callback,
override_kbs_files=override_kbs_files,
)

+    if magic.from_file(path, mime=True) == "application/pdf":
+        texkeys = extract_texkeys_from_pdf(path)
+        if len(texkeys) == len(parsed_refs):
+            parsed_refs = [dict(ref, texkey=[key]) for ref, key in izip(parsed_refs, texkeys)]
+
+    return parsed_refs


def extract_references_from_string(source,
is_only_references=True,
@@ -149,8 +161,8 @@
override_kbs_files=None):
"""Extract references from a raw string.

-The first parameter is the path to the file
-It raises FullTextNotAvailable if the file does not exist.
+The first parameter is the raw string containing the references.
+It returns a list of parsed references.

If the string does not only contain references, improve accuracy by
specifying ``is_only_references=False``.
@@ -159,15 +171,15 @@

E.g. you can change that by passing the reference_format:

->>> extract_references_from_url(path, reference_format="{title},{volume},{page}")
+>>> extract_references_from_string(source, reference_format="{title},{volume},{page}")

If you want to also link each reference to some other resource (like a record),
you can provide a linker_callback function to be executed for every reference
element found.

To override KBs for journal names etc., use ``override_kbs_files``:

->>> extract_references_from_url(path, override_kbs_files={'journals': 'my/path/to.kb'})
+>>> extract_references_from_string(source, override_kbs_files={'journals': 'my/path/to.kb'})
"""
docbody = source.split('\n')
if not is_only_references:
@@ -181,13 +193,14 @@

reflines = rebuild_reference_lines(
docbody, refs_info['marker_pattern'])
-return parse_references(
+parsed_refs, stats = parse_references(
reflines,
recid=recid,
reference_format=reference_format,
linker_callback=linker_callback,
override_kbs_files=override_kbs_files,
)
+return parsed_refs


def extract_journal_reference(line, override_kbs_files=None):
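To make the texkey merge in extract_references_from_file concrete, here is a small self-contained sketch of the pairing logic; the sample references and keys are invented for illustration:

from itertools import izip  # Python 2, as used in this branch

parsed_refs = [
    {'author': [u'F. Englert and R. Brout'], 'linemarker': [u'1']},
    {'author': [u'P. W. Higgs'], 'linemarker': [u'2']},
]
texkeys = ['Englert:1964et', 'Higgs:1964pj']  # hypothetical keys

# Attach texkeys only when the counts line up; a length mismatch means
# the PDF's metadata could not be aligned with the parsed list, in
# which case the references are returned without texkeys.
if len(texkeys) == len(parsed_refs):
    parsed_refs = [dict(ref, texkey=[key])
                   for ref, key in izip(parsed_refs, texkeys)]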
54 changes: 22 additions & 32 deletions refextract/references/engine.py
@@ -30,6 +30,8 @@
from datetime import datetime
from itertools import chain

+import magic

from .config import (
CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM,
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL,
@@ -44,6 +46,8 @@
CFG_PATH_GFILE
)

+from .errors import UnknownDocumentTypeError

from .tag import (
tag_reference_line,
sum_2_dictionaries,
@@ -1370,44 +1374,32 @@

# Tasks related to conversion of full-text to plain-text:


def get_plaintext_document_body(fpath, keep_layout=False):
"""Given a file-path to a full-text, return a list of unicode strings
whereby each string is a line of the fulltext.
In the case of a plain-text document, this simply means reading the
-contents in from the file. In the case of a PDF/PostScript however,
+contents in from the file. In the case of a PDF however,
this means converting the document to plaintext.
+It raises UnknownDocumentTypeError if the document is not a PDF or
+plain text.
@param fpath: (string) - the path to the fulltext file
@return: (list) of strings - each string being a line in the document.
[Review thread on this docstring]

Contributor: This does not correspond to the current return value, which also returns a status flag.

Contributor Author: I can fix this, but wrong docstrings are the norm rather than the exception here.
"""
textbody = []
-    status = 0
-    if os.access(fpath, os.F_OK | os.R_OK):
-        # filepath OK - attempt to extract references:
-        # get file type:
-        cmd_pdftotext = [CFG_PATH_GFILE, fpath]
-        pipe_pdftotext = subprocess.Popen(
-            cmd_pdftotext, stdout=subprocess.PIPE)
-        res_gfile = pipe_pdftotext.stdout.read()
-
-        if (res_gfile.lower().find("text") != -1) and \
-                (res_gfile.lower().find("pdf") == -1):
-            # plain-text file: don't convert - just read in:
-            f = open(fpath, "r")
-            try:
-                textbody = [line.decode("utf-8") for line in f.readlines()]
-            finally:
-                f.close()
-        elif (res_gfile.lower().find("pdf") != -1) or \
-                (res_gfile.lower().find("pdfa") != -1):
-            # convert from PDF
-            (textbody, status) = convert_PDF_to_plaintext(fpath, keep_layout)
-        else:
-            # invalid format
-            status = 1
+    mime_type = magic.from_file(fpath, mime=True)
+
+    if mime_type == "text/plain":
+        with open(fpath, "r") as f:
+            textbody = [line.decode("utf-8") for line in f.readlines()]
+
+    elif mime_type == "application/pdf":
+        textbody = convert_PDF_to_plaintext(fpath, keep_layout)
+
    else:
-        # filepath not OK
-        status = 1
-    return (textbody, status)
+        raise UnknownDocumentTypeError(mime_type)
+
+    return textbody
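Both changed files now lean on python-magic instead of shelling out to gfile; for reference, the detection call used above (the sample path is hypothetical):

import magic

# python-magic returns the MIME type as a string, e.g. "application/pdf"
# or "text/plain"; any other value now raises UnknownDocumentTypeError.
mime_type = magic.from_file("some/fulltext/1503.07589v1.pdf", mime=True)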


def parse_references(reference_lines,
Expand All @@ -1426,10 +1418,8 @@ def parse_references(reference_lines,
processed_references, counts, dummy_bad_titles_count = \
parse_references_elements(reference_lines, kbs, linker_callback)

-    return {
-        "references": build_references(processed_references, reference_format),
-        "stats": build_stats(counts)
-    }
+    return (build_references(processed_references, reference_format),
+            build_stats(counts))


def build_stats(counts):
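Finally, since parse_references now returns a pair instead of a dict, downstream callers unpack it; a short sketch of the new contract (the caller code is illustrative, not part of this diff):

from refextract.references.engine import parse_references

reference_lines = [
    u'[1] F. Englert and R. Brout, Phys. Rev. Lett. 13 (1964) 321.',
]

# parse_references now returns (references, stats) rather than
# {'references': ..., 'stats': ...}.
references, stats = parse_references(reference_lines)
for ref in references:
    print(ref.get('journal_title'), ref.get('journal_year'))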