Merge pull request #63 from monaawi/remove-print-statements

Replaced print statements with logging calls
inspirehep · Oct 29, 2018 · d70e378 · d70e378
2 parents 1cb2bcc + 094a5f9
commit d70e378
Show file tree

Hide file tree

Showing 7 changed files with 46 additions and 39 deletions.
diff --git a/refextract/authors/regexs.py b/refextract/authors/regexs.py
@@ -23,13 +23,15 @@
 
 from __future__ import absolute_import, division, print_function
 
+import logging
 import re
-import sys
 
 from six.moves import xrange
 
 from ..references.config import CFG_REFEXTRACT_KBS
 
+LOGGER = logging.getLogger(__name__)
+
 
 def get_author_affiliation_numeration_str(punct=None):
     """The numeration which can be applied to author names. Numeration
@@ -415,19 +417,14 @@ def add_to_auth_list(s):
         fh = open(fpath, "r")
     except IOError:
         # problem opening KB for reading, or problem while reading from it:
-        emsg = """Error: Could not build knowledge base containing """ \
-               """author patterns - failed """ \
-               """to read from KB %(kb)s.\n""" \
-               % {'kb': fpath}
-        print(emsg, sys.stderr, verbose=0)
+        LOGGER.debug(u"Error: Could not build knowledge base containing author patterns - failed to read from KB %s s.\n", fpath)
         raise IOError("Error: Unable to open collaborations kb '%s'" % fpath)
 
     for line_num, rawline in enumerate(fh):
         try:
             rawline = rawline.decode("utf-8")
         except UnicodeError:
-            print("*** Unicode problems in %s for line %d"
-                  % (fpath, line_num), sys.stderr, verbose=0)
+            LOGGER.debug(u"Unicode problems in %s for line %d", fpath, line_num)
             raise UnicodeError(
                 "Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
         if rawline.strip() and rawline[0].strip() != '#':

diff --git a/refextract/documents/pdf.py b/refextract/documents/pdf.py
@@ -36,6 +36,7 @@
 
 from __future__ import absolute_import, division, print_function
 
+import logging
 import os
 import re
 import subprocess
@@ -44,6 +45,8 @@
 
 from ..references.config import CFG_PATH_PDFTOTEXT
 
+LOGGER = logging.getLogger(__name__)
+
 # a dictionary of undesirable characters and their replacements:
 UNDESIRABLE_CHAR_REPLACEMENTS = {
     # Control characters not allowed in XML:
@@ -471,7 +474,8 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
     # build pdftotext command:
     cmd_pdftotext = [CFG_PATH_PDFTOTEXT, layout_option, "-q",
                      "-enc", "UTF-8", fpath, "-"]
-    print("* %s" % ' '.join(cmd_pdftotext))
+
+    LOGGER.debug(u"%s", ' '.join(cmd_pdftotext))
     # open pipe to pdftotext:
     pipe_pdftotext = subprocess.Popen(cmd_pdftotext, stdout=subprocess.PIPE)
 
@@ -490,7 +494,6 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
             doclines.append(u"\f")
             doclines.append(m_break_in_line.group(1))
 
-    print("* convert_PDF_to_plaintext found: "
-          "%s lines of text" % len(doclines))
+    LOGGER.debug(u"convert_PDF_to_plaintext found: %s lines of text", len(doclines))
 
     return doclines
diff --git a/refextract/references/engine.py b/refextract/references/engine.py
@@ -25,6 +25,7 @@
 
 from __future__ import absolute_import, division, print_function
 
+import logging
 import re
 
 from datetime import datetime
@@ -71,6 +72,7 @@
     re_hdl)
 from ..version import __version__ as version
 
+LOGGER = logging.getLogger(__name__)
 
 description = """
 Refextract tries to extract the reference section from a full-text document.
@@ -626,12 +628,12 @@ def look_for_hdl_urls(citation_elements):
 # End of elements transformations
 
 def print_citations(splitted_citations, line_marker):
-    print('* splitted_citations')
-    print('  * line marker %s' % line_marker)
+    LOGGER.debug(u'split_citations')
+    LOGGER.debug(u"line marker %s", line_marker)
     for citation in splitted_citations:
-        print("  * elements")
+        LOGGER.debug(u"elements")
         for el in citation:
-            print('    * %s %s' % (el['type'], repr(el)))
+            LOGGER.debug('%s %s', el['type'], repr(el))
 
 
 def parse_reference_line(ref_line, kbs, bad_titles_count={}, linker_callback=None):
@@ -652,7 +654,7 @@ def parse_reference_line(ref_line, kbs, bad_titles_count={}, linker_callback=Non
                                                        bad_titles_count)
 
     # Debug print tagging (authors, titles, volumes, etc.)
-    print('* tags %r' % tagged_line)
+    LOGGER.debug("tags %r", tagged_line)
 
     # Using the recorded information, create a MARC XML representation
     # of the rebuilt line:
@@ -736,7 +738,7 @@ def search_for_book_in_misc(citation, kbs):
     """
     citation_year = year_from_citation(citation)
     for citation_element in citation:
-        print('* Searching for book title in: %s' % citation_element['misc_txt'])
+        LOGGER.debug(u"Searching for book title in: %s", citation_element['misc_txt'])
         for title in kbs['books']:
             startIndex = find_substring_ignore_special_chars(citation_element['misc_txt'], title)
             if startIndex != -1:
@@ -760,7 +762,7 @@ def search_for_book_in_misc(citation, kbs):
                             book_found = True
 
                     if book_found:
-                        print('* Book found: %s' % title)
+                        LOGGER.debug(u"Book found: %s", title)
                         book_element = {'type': 'BOOK',
                                         'misc_txt': '',
                                         'authors': book_authors,
@@ -772,7 +774,7 @@ def search_for_book_in_misc(citation, kbs):
                         citation_element['misc_txt'] = remove_year(citation_element['misc_txt'], book_year)
                         return True
 
-        print('  * Book not found!')
+        LOGGER.debug("Book not found!")
 
     return False
 

diff --git a/refextract/references/find.py b/refextract/references/find.py
@@ -25,6 +25,7 @@
 
 from __future__ import absolute_import, division, print_function
 
+import logging
 import re
 
 from .regexs import \
@@ -38,6 +39,8 @@
     re_reference_line_number_markers, \
     re_num
 
+LOGGER = logging.getLogger(__name__)
+
 
 def find_reference_section(docbody):
     """Search in document body for its reference section.
@@ -500,10 +503,10 @@ def get_reference_section_beginning(fulltext):
                     sect_start['how_found_start'] = 4
 
     if sect_start:
-        print('* title %r' % sect_start['title_string'])
-        print('* marker %r' % sect_start['marker'])
-        print('* title_marker_same_line %s'
-              % sect_start['title_marker_same_line'])
+        LOGGER.debug(u"title %r", sect_start['title_string'])
+        LOGGER.debug(u"marker %r", sect_start['marker'])
+        LOGGER.debug(u"title_marker_same_line %s", sect_start['title_marker_same_line'])
+
     else:
-        print('* could not find references section')
+        LOGGER.debug(u"could not find references section")
     return sect_start
diff --git a/refextract/references/pdf.py b/refextract/references/pdf.py
@@ -23,13 +23,16 @@
 
 from __future__ import absolute_import, division, print_function
 
-import sys
+import logging
 
 from PyPDF2 import PdfFileReader
 
 from .regexs import re_reference_in_dest
 
 
+LOGGER = logging.getLogger(__name__)
+
+
 class IncompleteCoordinatesError(Exception):
     """Exception raised when a named destination does not have all required
     coordinates.
@@ -51,22 +54,21 @@ def extract_texkeys_from_pdf(pdf_file):
         try:
             pdf = PdfFileReader(pdf_stream, strict=False)
             destinations = pdf.getNamedDestinations()
-        except Exception as exc:
-            print("* PDF: Internal PyPDF2 error, no TeXkeys returned.", exc,
-                  file=sys.stderr)
+        except Exception:
+            LOGGER.debug(u"PDF: Internal PyPDF2 error, no TeXkeys returned.")
             return []
         # not all named destinations point to references
         refs = [dest for dest in destinations.iteritems()
                 if re_reference_in_dest.match(dest[0])]
         try:
             if _destinations_in_two_columns(pdf, refs):
-                print("* PDF: Using two-column layout")
+                LOGGER.debug(u"PDF: Using two-column layout")
 
                 def sortfunc(dest_couple):
                     return _destination_position(pdf, dest_couple[1])
 
             else:
-                print("* PDF: Using single-column layout")
+                LOGGER.debug(u"PDF: Using single-column layout")
 
                 def sortfunc(dest_couple):
                     (page, _, ypos, xpos) = _destination_position(
@@ -77,9 +79,8 @@ def sortfunc(dest_couple):
             # extract the TeXkey from the named destination name
             return [re_reference_in_dest.match(destname).group(1)
                     for (destname, _) in refs]
-        except Exception as exc:
-            print("* PDF: Impossible to determine layout, no TeXkeys returned",
-                  exc, file=sys.stderr)
+        except Exception:
+            LOGGER.debug(u"PDF: Impossible to determine layout, no TeXkeys returned")
             return []
 
 

diff --git a/refextract/references/text.py b/refextract/references/text.py
@@ -23,6 +23,7 @@
 
 from __future__ import absolute_import, division, print_function
 
+import logging
 import re
 
 from ..documents.pdf import replace_undesirable_characters
@@ -36,6 +37,8 @@
 from .config import CFG_REFEXTRACT_MAX_LINES
 from .find import find_end_of_reference_section, get_reference_section_beginning
 
+LOGGER = logging.getLogger(__name__)
+
 
 def extract_references_from_fulltext(fulltext):
     """Locate and extract the reference section from a fulltext document.
@@ -60,8 +63,7 @@ def extract_references_from_fulltext(fulltext):
         # No References
         refs = []
         status = 4
-        print("* extract_references_from_fulltext: "
-              "ref_sect_start is None")
+        LOGGER.debug(u"extract_references_from_fulltext: ref_sect_start is None")
     else:
         # If a reference section was found, however weak
         ref_sect_end = \
@@ -73,8 +75,7 @@ def extract_references_from_fulltext(fulltext):
             # No End to refs? Not safe to extract
             refs = []
             status = 5
-            print("* extract_references_from_fulltext: "
-                  "no end to refs!")
+            LOGGER.debug(u"extract_references_from_fulltext: no end to refs!")
         else:
             # If the end of the reference section was found.. start extraction
             refs = get_reference_lines(fulltext,
@@ -202,7 +203,7 @@ def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn):
             indentation_splitting = True
             ref_line_marker_ptn = ur'^[^\s]'
 
-    print('* references separator %s' % ref_line_marker_ptn)
+    LOGGER.debug(u"references separator %s", ref_line_marker_ptn)
     p_ref_line_marker = re.compile(ref_line_marker_ptn, re.I | re.UNICODE)
 
     # Start from ref 1

diff --git a/setup.cfg b/setup.cfg
@@ -32,4 +32,4 @@ include = refextract/*.py
 addopts = --cov=refextract --cov-report=term-missing:skip-covered
 
 [flake8]
-ignore = *.py E501 FI12 FI14 FI15 FI16 FI17 FI50 FI51 FI53
+ignore = *.py E501 FI12 FI14 FI15 FI16 FI17 FI50 FI51 FI53 W504 W605