Skip to content

Commit

Permalink
Merge pull request #63 from monaawi/remove-print-statements
Browse files Browse the repository at this point in the history
Replaced print statements with logging calls
  • Loading branch information
drjova committed Oct 29, 2018
2 parents 1cb2bcc + 094a5f9 commit d70e378
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 39 deletions.
13 changes: 5 additions & 8 deletions refextract/authors/regexs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@

from __future__ import absolute_import, division, print_function

import logging
import re
import sys

from six.moves import xrange

from ..references.config import CFG_REFEXTRACT_KBS

LOGGER = logging.getLogger(__name__)


def get_author_affiliation_numeration_str(punct=None):
"""The numeration which can be applied to author names. Numeration
Expand Down Expand Up @@ -415,19 +417,14 @@ def add_to_auth_list(s):
fh = open(fpath, "r")
except IOError:
# problem opening KB for reading, or problem while reading from it:
emsg = """Error: Could not build knowledge base containing """ \
"""author patterns - failed """ \
"""to read from KB %(kb)s.\n""" \
% {'kb': fpath}
print(emsg, sys.stderr, verbose=0)
LOGGER.debug(u"Error: Could not build knowledge base containing author patterns - failed to read from KB %s s.\n", fpath)
raise IOError("Error: Unable to open collaborations kb '%s'" % fpath)

for line_num, rawline in enumerate(fh):
try:
rawline = rawline.decode("utf-8")
except UnicodeError:
print("*** Unicode problems in %s for line %d"
% (fpath, line_num), sys.stderr, verbose=0)
LOGGER.debug(u"Unicode problems in %s for line %d", fpath, line_num)
raise UnicodeError(
"Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
if rawline.strip() and rawline[0].strip() != '#':
Expand Down
9 changes: 6 additions & 3 deletions refextract/documents/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

from __future__ import absolute_import, division, print_function

import logging
import os
import re
import subprocess
Expand All @@ -44,6 +45,8 @@

from ..references.config import CFG_PATH_PDFTOTEXT

LOGGER = logging.getLogger(__name__)

# a dictionary of undesirable characters and their replacements:
UNDESIRABLE_CHAR_REPLACEMENTS = {
# Control characters not allowed in XML:
Expand Down Expand Up @@ -471,7 +474,8 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
# build pdftotext command:
cmd_pdftotext = [CFG_PATH_PDFTOTEXT, layout_option, "-q",
"-enc", "UTF-8", fpath, "-"]
print("* %s" % ' '.join(cmd_pdftotext))

LOGGER.debug(u"%s", ' '.join(cmd_pdftotext))
# open pipe to pdftotext:
pipe_pdftotext = subprocess.Popen(cmd_pdftotext, stdout=subprocess.PIPE)

Expand All @@ -490,7 +494,6 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
doclines.append(u"\f")
doclines.append(m_break_in_line.group(1))

print("* convert_PDF_to_plaintext found: "
"%s lines of text" % len(doclines))
LOGGER.debug(u"convert_PDF_to_plaintext found: %s lines of text", len(doclines))

return doclines
18 changes: 10 additions & 8 deletions refextract/references/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

from __future__ import absolute_import, division, print_function

import logging
import re

from datetime import datetime
Expand Down Expand Up @@ -71,6 +72,7 @@
re_hdl)
from ..version import __version__ as version

LOGGER = logging.getLogger(__name__)

description = """
Refextract tries to extract the reference section from a full-text document.
Expand Down Expand Up @@ -626,12 +628,12 @@ def look_for_hdl_urls(citation_elements):
# End of elements transformations

def print_citations(splitted_citations, line_marker):
print('* splitted_citations')
print(' * line marker %s' % line_marker)
LOGGER.debug(u'split_citations')
LOGGER.debug(u"line marker %s", line_marker)
for citation in splitted_citations:
print(" * elements")
LOGGER.debug(u"elements")
for el in citation:
print(' * %s %s' % (el['type'], repr(el)))
LOGGER.debug('%s %s', el['type'], repr(el))


def parse_reference_line(ref_line, kbs, bad_titles_count={}, linker_callback=None):
Expand All @@ -652,7 +654,7 @@ def parse_reference_line(ref_line, kbs, bad_titles_count={}, linker_callback=Non
bad_titles_count)

# Debug print tagging (authors, titles, volumes, etc.)
print('* tags %r' % tagged_line)
LOGGER.debug("tags %r", tagged_line)

# Using the recorded information, create a MARC XML representation
# of the rebuilt line:
Expand Down Expand Up @@ -736,7 +738,7 @@ def search_for_book_in_misc(citation, kbs):
"""
citation_year = year_from_citation(citation)
for citation_element in citation:
print('* Searching for book title in: %s' % citation_element['misc_txt'])
LOGGER.debug(u"Searching for book title in: %s", citation_element['misc_txt'])
for title in kbs['books']:
startIndex = find_substring_ignore_special_chars(citation_element['misc_txt'], title)
if startIndex != -1:
Expand All @@ -760,7 +762,7 @@ def search_for_book_in_misc(citation, kbs):
book_found = True

if book_found:
print('* Book found: %s' % title)
LOGGER.debug(u"Book found: %s", title)
book_element = {'type': 'BOOK',
'misc_txt': '',
'authors': book_authors,
Expand All @@ -772,7 +774,7 @@ def search_for_book_in_misc(citation, kbs):
citation_element['misc_txt'] = remove_year(citation_element['misc_txt'], book_year)
return True

print(' * Book not found!')
LOGGER.debug("Book not found!")

return False

Expand Down
13 changes: 8 additions & 5 deletions refextract/references/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

from __future__ import absolute_import, division, print_function

import logging
import re

from .regexs import \
Expand All @@ -38,6 +39,8 @@
re_reference_line_number_markers, \
re_num

LOGGER = logging.getLogger(__name__)


def find_reference_section(docbody):
"""Search in document body for its reference section.
Expand Down Expand Up @@ -500,10 +503,10 @@ def get_reference_section_beginning(fulltext):
sect_start['how_found_start'] = 4

if sect_start:
print('* title %r' % sect_start['title_string'])
print('* marker %r' % sect_start['marker'])
print('* title_marker_same_line %s'
% sect_start['title_marker_same_line'])
LOGGER.debug(u"title %r", sect_start['title_string'])
LOGGER.debug(u"marker %r", sect_start['marker'])
LOGGER.debug(u"title_marker_same_line %s", sect_start['title_marker_same_line'])

else:
print('* could not find references section')
LOGGER.debug(u"could not find references section")
return sect_start
19 changes: 10 additions & 9 deletions refextract/references/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,16 @@

from __future__ import absolute_import, division, print_function

import sys
import logging

from PyPDF2 import PdfFileReader

from .regexs import re_reference_in_dest


LOGGER = logging.getLogger(__name__)


class IncompleteCoordinatesError(Exception):
"""Exception raised when a named destination does not have all required
coordinates.
Expand All @@ -51,22 +54,21 @@ def extract_texkeys_from_pdf(pdf_file):
try:
pdf = PdfFileReader(pdf_stream, strict=False)
destinations = pdf.getNamedDestinations()
except Exception as exc:
print("* PDF: Internal PyPDF2 error, no TeXkeys returned.", exc,
file=sys.stderr)
except Exception:
LOGGER.debug(u"PDF: Internal PyPDF2 error, no TeXkeys returned.")
return []
# not all named destinations point to references
refs = [dest for dest in destinations.iteritems()
if re_reference_in_dest.match(dest[0])]
try:
if _destinations_in_two_columns(pdf, refs):
print("* PDF: Using two-column layout")
LOGGER.debug(u"PDF: Using two-column layout")

def sortfunc(dest_couple):
return _destination_position(pdf, dest_couple[1])

else:
print("* PDF: Using single-column layout")
LOGGER.debug(u"PDF: Using single-column layout")

def sortfunc(dest_couple):
(page, _, ypos, xpos) = _destination_position(
Expand All @@ -77,9 +79,8 @@ def sortfunc(dest_couple):
# extract the TeXkey from the named destination name
return [re_reference_in_dest.match(destname).group(1)
for (destname, _) in refs]
except Exception as exc:
print("* PDF: Impossible to determine layout, no TeXkeys returned",
exc, file=sys.stderr)
except Exception:
LOGGER.debug(u"PDF: Impossible to determine layout, no TeXkeys returned")
return []


Expand Down
11 changes: 6 additions & 5 deletions refextract/references/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

from __future__ import absolute_import, division, print_function

import logging
import re

from ..documents.pdf import replace_undesirable_characters
Expand All @@ -36,6 +37,8 @@
from .config import CFG_REFEXTRACT_MAX_LINES
from .find import find_end_of_reference_section, get_reference_section_beginning

LOGGER = logging.getLogger(__name__)


def extract_references_from_fulltext(fulltext):
"""Locate and extract the reference section from a fulltext document.
Expand All @@ -60,8 +63,7 @@ def extract_references_from_fulltext(fulltext):
# No References
refs = []
status = 4
print("* extract_references_from_fulltext: "
"ref_sect_start is None")
LOGGER.debug(u"extract_references_from_fulltext: ref_sect_start is None")
else:
# If a reference section was found, however weak
ref_sect_end = \
Expand All @@ -73,8 +75,7 @@ def extract_references_from_fulltext(fulltext):
# No End to refs? Not safe to extract
refs = []
status = 5
print("* extract_references_from_fulltext: "
"no end to refs!")
LOGGER.debug(u"extract_references_from_fulltext: no end to refs!")
else:
# If the end of the reference section was found.. start extraction
refs = get_reference_lines(fulltext,
Expand Down Expand Up @@ -202,7 +203,7 @@ def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn):
indentation_splitting = True
ref_line_marker_ptn = ur'^[^\s]'

print('* references separator %s' % ref_line_marker_ptn)
LOGGER.debug(u"references separator %s", ref_line_marker_ptn)
p_ref_line_marker = re.compile(ref_line_marker_ptn, re.I | re.UNICODE)

# Start from ref 1
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@ include = refextract/*.py
addopts = --cov=refextract --cov-report=term-missing:skip-covered

[flake8]
ignore = *.py E501 FI12 FI14 FI15 FI16 FI17 FI50 FI51 FI53
ignore = *.py E501 FI12 FI14 FI15 FI16 FI17 FI50 FI51 FI53 W504 W605

0 comments on commit d70e378

Please sign in to comment.