From a4bc794a70ec69e5b63b66b79157aae7e8157099 Mon Sep 17 00:00:00 2001 From: Travis Brooks Date: Tue, 30 Sep 2008 18:26:38 -0700 Subject: [PATCH 01/15] refextract: basic framework for author extraction * Passes tests for author extractions but no logic (i.e. extraction is no good) --- modules/bibedit/lib/refextract.py | 342 +++++++++++++++++++++++++----- 1 file changed, 288 insertions(+), 54 deletions(-) diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index 5bd83ccb33..52dc5bafe9 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -2031,6 +2031,26 @@ def _cmp_bystrlen_reverse(a, b): ## return the raw knowledge base: return (kb, standardised_titles, seek_phrases) + + +def get_affiliation_canonical_value(proposed_affil): + """Given a proposed affiliation, look for a canonical form in the + affils knowledge base + + @param proposed_affil the possible affiliation name to be looked for + @return canonical form returns none if no key matches + + """ + + try: + from invenio.bibformat_dblayer import get_kb_mapping_value + except ImportError: + def get_kb_mapping_value(kb_name, key): + """ if we have no kb, just accept affiliations as they are""" + return None #default + + + def standardize_and_markup_numeration_of_citations_in_line(line): """Given a reference line, attempt to locate instances of citation 'numeration' in the line. @@ -4677,6 +4697,88 @@ def perform_regex_search_upon_line_with_pattern_list(line, patterns): return m +def find_author_section(docbody, author_marker = None, first_author = None): + """Search in document body for its author section. + Looks top down for things that look like an author list. This will + work generally poorly unless one is using the LaTeX in some way, or + if one knows the first author. Both of these methods are tried + first, falling back to a default search for the first line + matching + [A-Z]\w+, [A-Z]\.?\s?[A-Z]?\.?\s?\d* + (i.e. a word starting with caps, followed by comma, space, one + or two initials with possible periods and then possibly a number. + + @param docbody: (list) of strings - the full document body. + @param author_marker: (string) optional (regexp) marker embedded by latex + for beginning and end of author section + @param first_author: (string) optional (regexp) first author to help find + beginning of section + @return: (dictionary) : + { 'start_line' : (integer) - index in docbody of 1st author line, + 'end_line' : (integer) - index of last author line + } + Much of this information is used by later functions to rebuild + a reference section. + -- OR -- + (None) - when the reference section could not be found. + """ + auth_start_line = None + auth_end_line = None + #A pattern to match author names + # demands name has a comma + # allows space or hyphen in family name + # allows only initials (capital letters) but allows many (3 or more if + # no . or spaces used...) + # allows a trailing number + # Aubert, F. I. 3 + author_pattern = re.compile('([A-Z]\w+\s?\w+)\s?([A-Z\.\s]{1,9})\.?\s?(\d*)') + # F. I. 
Aubert, 3 + author_pattern = re.compile('([A-Z])\.\s?([A-Z]?)\.?\s?([A-Z]\w+\s?\w*)\,?\s?(\d*)') + start_pattern = author_pattern + end_pattern = author_pattern + +# if author_marker is not None: +# start_pattern = re.compile(author_marker+'(.*)') +# end_pattern = re.compile('(.*)'+author_marker) +# if first_author is not None: +# start_pattern = re.compile(first_author) +# end_pattern = None; + + + for position in range(len(docbody)): + line = docbody[position] + if auth_start_line is None: + if cli_opts['verbosity'] > 2: + print "examining " + line.encode("utf8") + print "re -> " + start_pattern.pattern + if start_pattern.search(line): + auth_start_line = position + elif auth_end_line is None and end_pattern.search(line): + # this could be the last author or one of many + auth_end_line = position + elif auth_end_line is not None and end_pattern.search(line): + break + # leave when we have found a possible and, and the ending + # pattern no longer matches this will fail if there are + # affiliations interspersed, or othe corruptions of the list + + + if auth_start_line is not None: + ## return dictionary containing details of author section: + auth_sect_details = { + 'start_line' : auth_start_line, + 'end_line' : auth_end_line, + 'marker_pattern' : author_pattern, + 'title_string' : None, + 'marker' : None, + 'title_marker_same_line' : None, + } + else: + auth_sect_details = None + return auth_sect_details + + + def find_reference_section(docbody): """Search in document body for its reference section. More precisely, find the first line of the reference section. Effectively, the function starts @@ -5392,6 +5494,24 @@ def wash_and_repair_reference_line(line): line = re_multiple_space.sub(u' ', line) return line + +def rebuild_author_lines(author_lines, author_pattern): + """Given the lines that we think make up the author section reset + everything so that each author is on one line + """ + def found_author(matchobj): + """ given an author in the match obj, pushes it on the stack of lines + """ + authors.append(matchobj.group(0)) + if cli_opts['verbosity'] > 1: + print "Found author -> "+ matchobj.group(0)+ "\n" + return ' ' + authors = [] + author_string = ' '.join(author_lines) + author_pattern.sub(found_author, author_string) + return authors + + def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn): """Given a reference section, rebuild the reference lines. After translation from PDF to text, reference lines are often broken. This is because @@ -5490,6 +5610,63 @@ def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn): return rebuilt_references +def get_lines(docbody, + start_line, + end_line, + title, + marker_ptn, + title_marker_same_line, + section = 'references'): + """from a given section of a document extract the relevant lines, not + including the various markers. + @param start_line index of docbody on which sect starts + @param end_line index of docbody on which sect ends + @param title a string that signifies the beginning + @param marker_ptn pattern that ids start of a line + @param title_marker_same_line integer tells whether title and + marker are on same line + @param section[="references"] string denoting type of section + @return: (list) of strings. Each string is a reference line, extracted + from the document. """ + + start_idx = start_line + if title_marker_same_line: + ## Title on same line as 1st ref- take title out! 
+ title_start = docbody[start_idx].find(title) + if title_start != -1: + docbody[start_idx] = docbody[start_idx][title_start + \ + len(title):] + elif title is not None: + ## Pass title line + start_idx += 1 + + + + ## now rebuild reference lines: + if type(end_line) is int: + if section is 'references': + lines = \ + rebuild_reference_lines(docbody[start_idx:end_line+1], \ + marker_ptn) + elif section is 'authors': + print "ready to rebuild" + lines = \ + rebuild_author_lines(docbody[start_idx:end_line+1], \ + marker_ptn) + #lines = docbody[start_idx:end_line+1] + else: + if section is 'references': + lines = rebuild_reference_lines(docbody[start_idx:], \ + marker_ptn) + elif section is 'authors': + lines = \ + rebuild_author_lines(docbody[start_idx:], \ + marker_ptn) + #lines = docbody[start_idx:] + return lines + + + def get_reference_lines(docbody, ref_sect_start_line, ref_sect_end_line, @@ -5520,6 +5697,8 @@ def get_reference_lines(docbody, from the document. """ start_idx = ref_sect_start_line + + if title_marker_same_line: ## Title on same line as 1st ref- take title out! title_start = docbody[start_idx].find(ref_sect_title) @@ -5547,65 +5726,101 @@ def get_reference_lines(docbody, ## ----> Glue - logic for finding and extracting reference section: def extract_references_from_fulltext(fulltext): - """Locate and extract the reference section from a fulltext document. + """Locate and extract references from a fulltext document. Return the extracted reference section as a list of strings, whereby each string in the list is considered to be a single reference line. E.g. a string could be something like: '[19] Wilson, A. Unpublished (1986). + wrapper for more general extract_section_from_fulltext() + @param fulltext: (list) of strings, whereby each string is a line of the document. @return: (list) of strings, where each string is an extracted reference line. """ + return extract_section_from_fulltext(fulltext, 'references') + +def extract_section_from_fulltext(fulltext, section): + """Locate and extract a relevant named section from a fulltext document. + Return the extracted section as a list of strings, whereby each + string in the list is considered to be a single line (reference, + author, abstract etc). + E.g. a string could be something like: + '[19] Wilson, A. Unpublished (1986). + @param fulltext: (list) of strings, whereby each string is a line of the + document. + @param section: 'references', 'authors', or FIXME 'abstract' + @return: (list) of strings, where each string is an extracted line. + """ ## Try to remove pagebreaks, headers, footers fulltext = remove_page_boundary_lines(fulltext) status = 0 + sect_start = {'start_line' : None, + 'end_line' : None, + 'title_string': None, + 'marker_pattern': None, + 'marker' : None, + } + + sect_end = None #How ref section found flag how_found_start = 0 - ## Find start of refs section: - ref_sect_start = find_reference_section(fulltext) - if ref_sect_start is not None: how_found_start = 1 - if ref_sect_start is None: - ## No references found - try with no title option - ref_sect_start = find_reference_section_no_title_via_brackets(fulltext) - if ref_sect_start is not None: how_found_start = 2 - ## Try weaker set of patterns if needed - if ref_sect_start is None: - ## No references found - try with no title option (with weaker patterns..) 
- ref_sect_start = find_reference_section_no_title_via_dots(fulltext) - if ref_sect_start is not None: how_found_start = 3 - if ref_sect_start is None: - ## No references found - try with no title option (with even weaker patterns..) - ref_sect_start = find_reference_section_no_title_via_numbers(fulltext) - if ref_sect_start is not None: how_found_start = 4 - if ref_sect_start is None: + if section == 'references': + ## Find start of refs section: + sect_start = find_reference_section(fulltext) + if sect_start is not None: how_found_start = 1 + if sect_start is None: + ## No references found - try with no title option + sect_start = find_reference_section_no_title_via_brackets(fulltext) + if sect_start is not None: how_found_start = 2 + ## Try weaker set of patterns if needed + if sect_start is None: + ## No references found - try with no title option (with weaker patterns..) + sect_start = find_reference_section_no_title_via_dots(fulltext) + if sect_start is not None: how_found_start = 3 + if sect_start is None: + ## No references found - try with no title option (with even weaker patterns..) + sect_start = find_reference_section_no_title_via_numbers(fulltext) + if sect_start is not None: how_found_start = 4 + elif section == 'authors': + + sect_start = find_author_section(fulltext, first_author = cli_opts['first_author']) + + + if sect_start is None: ## No References - refs = [] + lines = [] status = 4 - write_message("-----extract_references_from_fulltext: " \ - "ref_sect_start is None\n", verbose=2) + write_message("-----extract_section_from_fulltext: " \ + "No section found\n", verbose=2) else: - ## If a reference section was found, however weak - ref_sect_end = \ - find_end_of_reference_section(fulltext, \ - ref_sect_start["start_line"], \ - ref_sect_start["marker"], \ - ref_sect_start["marker_pattern"]) - if ref_sect_end is None: + sect_end = None + if sect_start.has_key("end_line"): + sect_end = sect_start["end_line"] + if sect_end is None: + sect_end = \ + find_end_of_reference_section(fulltext, \ + sect_start["start_line"], \ + sect_start["marker"], \ + sect_start["marker_pattern"]) + + if sect_end is None: ## No End to refs? Not safe to extract - refs = [] + lines = [] status = 5 - write_message("-----extract_references_from_fulltext: " \ - "no end to refs!\n", verbose=2) + write_message("-----extract_section_from_fulltext: " \ + "No end to section!\n", verbose=2) else: - ## If the end of the reference section was found.. 
start extraction - refs = get_reference_lines(fulltext, \ - ref_sect_start["start_line"], \ - ref_sect_end, \ - ref_sect_start["title_string"], \ - ref_sect_start["marker_pattern"], \ - ref_sect_start["title_marker_same_line"]) - return (refs, status, how_found_start) + ## Extract + lines = get_lines(fulltext, \ + sect_start["start_line"], \ + sect_end, \ + sect_start["title_string"], \ + sect_start["marker_pattern"], \ + sect_start["title_marker_same_line"], + section, + ) + return (lines, status, how_found_start) ## Tasks related to conversion of full-text to plain-text: @@ -5783,7 +5998,7 @@ def get_cli_options(): """ global cli_opts ## dictionary of important flags and values relating to cli call of program: - cli_opts = { 'treat_as_reference_section' : 0, + cli_opts = { 'treat_as_raw_section' : 0, 'fulltext' : [], 'output_raw' : 0, 'verbosity' : 0, @@ -5792,6 +6007,8 @@ def get_cli_options(): 'inspire' : 0, 'kb-journal' : 0, 'kb-report-number' : 0, + 'authors' : 0, + 'first_author' : 0, } try: @@ -5801,12 +6018,15 @@ def get_cli_options(): "verbose=", "fulltext=", "raw-references", + "raw-authors", + "authors", "output-raw-refs", "xmlfile=", "dictfile=", "inspire", "kb-journal=", - "kb-report-number=",]) + "kb-report-number=", + "first_author",]) except getopt.GetoptError, err: if err.opt in ("c", "collection", "i", "recid", "e", "extraction-job"): ## These are arguments designed to be used for the daemon mode only @@ -5837,9 +6057,9 @@ def get_cli_options(): elif o[0] in ("-f", "--fulltext"): ## add a pdf/text file from where to extract references cli_opts['fulltext'].append(o[1]) - elif o[0] in ("-z", "--raw-references"): - ## treat input as pure reference lines: - cli_opts['treat_as_reference_section'] = 1 + elif o[0] in ("-z", "--raw-references", "--raw-authors"): + ## treat input as pure lines relevant to extraction: + cli_opts['treat_as_raw_section'] = 1 elif o[0] in ("-x", "--xmlfile"): ## Write out MARC XML references to the specified file cli_opts['xmlfile'] = o[1] @@ -5860,6 +6080,13 @@ def get_cli_options(): ## The location of the report number kb requested to override ## a 'configuration file'-specified kb cli_opts['kb-report-number'] = o[1] + elif o[0] in ("-a", "--authors"): + cli_opts['authors'] = 1; + # What journal title format are we using? + if cli_opts['verbosity'] > 0 and cli_opts['inspire']: + sys.stdout.write("--- Using inspire journal title form\n") + elif cli_opts['verbosity'] > 0: + sys.stdout.write("--- Using invenio journal title form\n") if len(myargs) >= 1: ## some standalone arguments are present, abort @@ -6110,15 +6337,22 @@ def begin_extraction(daemon_cli_options=None): if len(docbody) > 0: ## the document body is not empty: ## 2. 
If necessary, locate the reference section: - if cli_opts['treat_as_reference_section']: - ## don't search for citations in the document body: - ## treat it as a reference section: - reflines = docbody + if cli_opts['treat_as_raw_section']: + ## don't search for sections in the document body: + ## treat entire input as relevant section: + extract_lines = docbody + else: - ## launch search for the reference section in the document body: - (reflines, extract_error, how_found_start) = \ - extract_references_from_fulltext(docbody) - if len(reflines) == 0 and extract_error == 0: + + ## launch search for the relevant section in the document body: + if cli_opts['authors'] == 1: + section = 'authors' + else: + section = 'references' + + (extract_lines, extract_error, how_found_start) = \ + extract_section_from_fulltext(docbody, section) + if len(extract_lines) == 0 and extract_error == 0: extract_error = 6 write_message("-----extract_references_from_fulltext " \ "gave len(reflines): %s overall error: " \ @@ -6152,7 +6386,7 @@ def begin_extraction(daemon_cli_options=None): else: ## document body is empty, therefore the reference section is empty: - reflines = [] + extract_lines = [] processed_references = [] ## 4. Display the extracted references, status codes, etc: @@ -6161,7 +6395,7 @@ def begin_extraction(daemon_cli_options=None): raw_file = str(recid) + '.rawrefs' try: rawfilehdl = open(raw_file, 'w') - write_raw_references_to_stream(recid, reflines, rawfilehdl) + write_raw_references_to_stream(recid, extract_lines, rawfilehdl) rawfilehdl.close() except: write_message("***%s\n\n" % raw_file, \ From d32a3be6f6a307f3ed096c8f5b7ee18deab97899 Mon Sep 17 00:00:00 2001 From: Christopher Hayward Date: Wed, 15 Sep 2010 13:40:42 +0200 Subject: [PATCH 02/15] refextract: adds DOI recognition functionality * Refextract is now able to identify DOI numbers inside a citation and correctly markup a found DOI into the new 'a' subfield. --- modules/bibedit/lib/refextract.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index 52dc5bafe9..b594aa0ea4 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -318,6 +318,10 @@ def compress_subfields(out,subfield_code): J. von Delft and D.C. Ralph, Phys. Rep. 345 (2001) 61 + + - group together the contents of misc elements which belong to the same datafield element + - correctly manage mulitple references inside a single citation line (using multiple 'o' + tags to denote mulitple references) """ in_lines = out.split('\n') ## hold the subfield compressed version of the xml, line by line @@ -3593,6 +3597,7 @@ def convert_processed_reference_line_to_marc_xml(line_marker, elif tag_type == "URL": + ## This tag is an identified URL: ## From the "identified_urls" list, get this URL and its @@ -4095,6 +4100,11 @@ def create_marc_xml_reference_section(ref_sect, ## Strip the 'marker' (e.g. 
[1]) from this reference line: (line_marker, working_line1) = \ remove_reference_line_marker(ref_line) + + + ## Find DOI sections in citation + (working_line1, identified_dois) = identify_and_tag_doi(working_line1) + ## Find DOI sections in citation @@ -6562,5 +6572,3 @@ def test_get_reference_lines(): ] return reflines -if __name__ == '__main__': - main() From 3e3964d9beb0a6002ce349941ab2d89a4b590e3d Mon Sep 17 00:00:00 2001 From: Christopher Hayward Date: Fri, 24 Sep 2010 16:37:28 +0200 Subject: [PATCH 03/15] refextract: improvements * Improved some comments. Created checks against spaces at the end of raw reference lines which are being processed. * Will now add a space at the end of raw references, if there is a need to concatenate two reference lines as a result of bad pdf2txt parsing. The space will only be added if a space does not already exist for the base raw reference. --- modules/bibedit/lib/refextract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index b594aa0ea4..583c6ab719 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -4100,7 +4100,7 @@ def create_marc_xml_reference_section(ref_sect, ## Strip the 'marker' (e.g. [1]) from this reference line: (line_marker, working_line1) = \ remove_reference_line_marker(ref_line) - + ## Find DOI sections in citation (working_line1, identified_dois) = identify_and_tag_doi(working_line1) From 103aef0217c6e1e2ed40d5c3835ed876df88f911 Mon Sep 17 00:00:00 2001 From: Christopher Hayward Date: Mon, 11 Oct 2010 10:46:44 +0200 Subject: [PATCH 04/15] refextract: author identification * Identifies Authors in citations. Splits references based on the number of author groups found, and the presence of semi-colons. Completely refactored how the output MARC-XML is created, removed a lot of redundant methods. Authors are identified as 'groups' of authors within citations. Multiple groups may indicate that a reference is actually two citations. The accurate classification of author names is reliant upon a large and extensive titles knowledge base, since tagged titles will not be tagged as authors afterwards. Found authors also helps to identify useful semi-colons, since author tagging limits the text that is dumped into the misc-subfield. Author groups can include words such as 'and' and 'et al'. Also, if an 'and' is located at the start of an author group, then a weaker author pattern is applied to the preceeding misc text, which is likely to hold an author that was not correctly matched. On top of this, authors which look like editors (have an 'ed' phrase somewhere around the author, in some format, are not tagged as authors, since they do not dictate multiple citations inside a single reference line. The methods which control the conversion of a tagged reference line to MARC-XML has been completely re-written, using the same branches of execution that were applied to the previous methods. However, the new methods are not only much easier to understand, but also take into consideration all of the tagged elements in a citation line when making the decision to split a reference line into two or more citations. The management of IBID's has also been simplified, by attaching a list of IBID dictionaries to the starting title they apply to. 
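As a rough illustration of the splitting heuristic (a minimal sketch only:
the <cds.AUTH> tag format and the helper below are assumptions made for
this example, not the actual refextract internals), a new citation is
opened at each author group that is not preceded by editor notation:

    import re

    ## 'ed.'/'eds.' just before an author group marks it as an editor
    ## group, which must not open a new citation.
    re_editor_notation = re.compile(r'\b[Ee]ds?\.\s*$')

    def split_on_author_groups(tagged_line):
        """Split a tagged reference line into citations, opening a new
        citation at each non-editor author group."""
        citations = [[]]
        ## Author groups are assumed to have been wrapped in <cds.AUTH>
        ## tags by an earlier tagging pass.
        for token in re.split(r'(<cds\.AUTH>.*?</cds\.AUTH>)', tagged_line):
            misc_so_far = ''.join(citations[-1]).strip()
            if token.startswith('<cds.AUTH>') and misc_so_far \
               and not re_editor_notation.search(misc_so_far):
                ## A second author group signals a second citation
                citations.append([])
            citations[-1].append(token)
        return [''.join(parts) for parts in citations]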
--- modules/bibedit/lib/refextract.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index 583c6ab719..7e07162ad8 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -318,10 +318,6 @@ def compress_subfields(out,subfield_code): J. von Delft and D.C. Ralph, Phys. Rep. 345 (2001) 61 - - - group together the contents of misc elements which belong to the same datafield element - - correctly manage mulitple references inside a single citation line (using multiple 'o' - tags to denote mulitple references) """ in_lines = out.split('\n') ## hold the subfield compressed version of the xml, line by line @@ -3587,7 +3583,15 @@ def convert_processed_reference_line_to_marc_xml(line_marker, report_num = processed_line[tag_match_end:idx_closing_tag] ## now trim this matched institutional report-number and its tags from the start of the line: processed_line = processed_line[idx_closing_tag+len(CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM):] + + identified_citation_element = { 'type' : "REPORTNUMBER", + 'misc_txt' : "%s" % cur_misc_txt, + 'report_num' : "%s" % report_num, + } + count_reportnum += 1 + cur_misc_txt = u"" +<<<<<<< HEAD identified_citation_element = { 'type' : "REPORTNUMBER", 'misc_txt' : "%s" % cur_misc_txt, 'report_num' : "%s" % report_num, @@ -3595,9 +3599,10 @@ def convert_processed_reference_line_to_marc_xml(line_marker, count_reportnum += 1 cur_misc_txt = u"" +======= +>>>>>>> Identifies Authors in citations. Splits references based on the elif tag_type == "URL": - ## This tag is an identified URL: ## From the "identified_urls" list, get this URL and its @@ -4027,7 +4032,6 @@ def remove_reference_line_marker(line): else: marker_val = u" " return (marker_val, line) - def create_marc_xml_reference_section(ref_sect, preprint_repnum_search_kb, preprint_repnum_standardised_categs, @@ -4100,7 +4104,7 @@ def create_marc_xml_reference_section(ref_sect, ## Strip the 'marker' (e.g. [1]) from this reference line: (line_marker, working_line1) = \ remove_reference_line_marker(ref_line) - + ## Find DOI sections in citation (working_line1, identified_dois) = identify_and_tag_doi(working_line1) From 3a38615ce2d71e6090a1f6b5887307b66d8210d7 Mon Sep 17 00:00:00 2001 From: Christopher Hayward Date: Thu, 14 Oct 2010 16:55:46 +0200 Subject: [PATCH 05/15] refextract: further changes * Further changes made to refextract. Corrects problems noted by Annette. Including: - support for the unicode character 's' with a tilde - support for volume numbers separated with a hyphen - making sure that the first author in an author group, if starting with an 'A' initial, must have a full-stop after that first initial - support for hyphens between initials - support for a separated surname 'prefix' - looking for bad 'et al' placement (before an author group rather than after), which causes the author group to be ignored - arXiv suppression: removing 'arxiv' or 'e-print arxiv' before a report number and after a title - ed. ==> eds?. - corrected placement of 'et al' and the last 'ed.' 
pattern (reversed) - support for author groups in brackets --- modules/bibedit/lib/refextract.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index 7e07162ad8..2f6a2fb281 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -852,7 +852,7 @@ def get_bad_char_replacements(): ## \030 : cedilla u'\u0327c' : u'\u00E7', u'\u0327C' : u'\u00C7', - ## \02DC : tilde + ## \02DC : tilde (s with a tilde turns to just 's') u'\u02DCn' : u'\u00F1', u'\u02DCN' : u'\u00D1', u'\u02DCo' : u'\u00F5', @@ -4032,6 +4032,7 @@ def remove_reference_line_marker(line): else: marker_val = u" " return (marker_val, line) + def create_marc_xml_reference_section(ref_sect, preprint_repnum_search_kb, preprint_repnum_standardised_categs, @@ -4107,7 +4108,7 @@ def create_marc_xml_reference_section(ref_sect, ## Find DOI sections in citation - (working_line1, identified_dois) = identify_and_tag_doi(working_line1) + (working_line1, identified_dois) = identify_and_tag_DOI(working_line1) @@ -6555,7 +6556,7 @@ def test_get_reference_lines(): """[1] P. A. M. Dirac, Proc. R. Soc. London, Ser. A155, 447(1936); ibid, D24, 3333(1981).""", """[40] O.O. Vaneeva, R.O. Popovych and C. Sophocleous, Enhanced Group Analysis and Exact Solutions of Vari-able Coefficient Semilinear Diffusion Equations with a Power Source, Acta Appl. Math., doi:10.1007/s10440-008-9280-9, 46 p., arXiv:0708.3457.""", """[41] M. I. Trofimov, N. De Filippis and E. A. Smolenskii. Application of the electronegativity indices of organic molecules to tasks of chemical informatics. Russ. Chem. Bull., 54:2235-2246, 2005. http://dx.doi.org/10.1007/s11172-006-0105-6.""", - """[42] M. Gell-Mann, P. Ramon ans R. Slansky, in Supergravity, P. van Niewenhuizen and D. Freedman (North-Holland 1979); T. Yanagida, in Proceedings of the Workshop on the Unified Thoery and the Baryon Number in teh Universe, ed. O. Sawaga and A. Sugamoto (Tsukuba 1979); R.N. Mohapatra and G. Senjanovic, Phys. Rev. Lett. 44, 912, (1980). + """[42] M. Gell-Mann, P. Ramon ans R. Slansky, in Supergravity, P. van Niewenhuizen and D. Freedman (North-Holland 1979); T. Yanagida, in Proceedings of the Workshop on the Unified Thoery and the Baryon Number in teh Universe, ed. O. Sawaga and A. Sugamoto (Tsukuba 1979); R.N. Mohapatra and G. Senjanovic’, Phys. Rev. Lett. 44, 912, (1980). """, """[43] L.S. Durkin and P. Langacker, Phys. Lett B166, 436 (1986); Amaldi et al., Phys. Rev. D36, 1385 (1987); Hayward and Yellow et al., eds. Phys. Lett B245, 669 (1990); Nucl. Phys. B342, 15 (1990); """, From 1286b5a3bdae633e25e3e79599eaf6f3b48bddc5 Mon Sep 17 00:00:00 2001 From: Christopher Hayward Date: Mon, 18 Oct 2010 16:30:10 +0200 Subject: [PATCH 06/15] refextract: improved author recognition * Improves author recognition (increased the number of recognised author formats), added comments. * Added 'surname [and surname] et al' recognition (et al must be present) * Improved underscore author text validation (escapes all tags and all tagged content now, rather than just titles). Completely removes the change that part of tagged text (or a tag itself) is seen as an author. 
* Improved author split/dump heuristics (will dump into misc if two author groups are found in a row, with minimal misc text between them) * Added some more test reference lines * Added comments to some methods (still need to complete this) --- modules/bibedit/lib/refextract.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index 2f6a2fb281..2fb1400f5d 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -852,14 +852,14 @@ def get_bad_char_replacements(): ## \030 : cedilla u'\u0327c' : u'\u00E7', u'\u0327C' : u'\u00C7', - ## \02DC : tilde (s with a tilde turns to just 's') + ## \02DC : tilde u'\u02DCn' : u'\u00F1', u'\u02DCN' : u'\u00D1', u'\u02DCo' : u'\u00F5', u'\u02DCO' : u'\u00D5', u'\u02DCa' : u'\u00E3', u'\u02DCA' : u'\u00C3', - u'\u02DCs' : u'_', ## no valid 's with tilde' character + u'\u02DCs' : u'\u0303s', ## Combining tilde with 's' } return replacements @@ -3390,6 +3390,7 @@ def build_formatted_xml_citation(citation_elements,line_marker): xml_line += """ \n""" + return xml_line @@ -3429,6 +3430,7 @@ def convert_processed_reference_line_to_marc_xml(line_marker, # the last tag element found when working from left-to-right across the line identified_citation_element = None + while tag_match is not None: ## While there are tags inside this reference line... tag_match_start = tag_match.start() @@ -6556,7 +6558,11 @@ def test_get_reference_lines(): """[1] P. A. M. Dirac, Proc. R. Soc. London, Ser. A155, 447(1936); ibid, D24, 3333(1981).""", """[40] O.O. Vaneeva, R.O. Popovych and C. Sophocleous, Enhanced Group Analysis and Exact Solutions of Vari-able Coefficient Semilinear Diffusion Equations with a Power Source, Acta Appl. Math., doi:10.1007/s10440-008-9280-9, 46 p., arXiv:0708.3457.""", """[41] M. I. Trofimov, N. De Filippis and E. A. Smolenskii. Application of the electronegativity indices of organic molecules to tasks of chemical informatics. Russ. Chem. Bull., 54:2235-2246, 2005. http://dx.doi.org/10.1007/s11172-006-0105-6.""", - """[42] M. Gell-Mann, P. Ramon ans R. Slansky, in Supergravity, P. van Niewenhuizen and D. Freedman (North-Holland 1979); T. Yanagida, in Proceedings of the Workshop on the Unified Thoery and the Baryon Number in teh Universe, ed. O. Sawaga and A. Sugamoto (Tsukuba 1979); R.N. Mohapatra and G. Senjanovic’, Phys. Rev. Lett. 44, 912, (1980). + """[42] M. Gell-Mann, P. Ramon ans R. Slansky, in Supergravity, P. van Niewenhuizen and D. Freedman (North-Holland 1979); T. Yanagida, in Proceedings of the Workshop on the Unified Thoery and the Baryon Number in teh Universe, ed. O. Sawaga and A. Sugamoto (Tsukuba 1979); R.N. Mohapatra and G. Senjanovic, Phys. Rev. Lett. 44, 912, (1980). + """, + """[43] L.S. Durkin and P. Langacker, Phys. Lett B166, 436 (1986); Amaldi et al., Phys. Rev. D36, 1385 (1987); Hayward and Yellow et al., Phys. Lett B245, 669 (1990); Nucl. Phys. B342, 15 (1990); + """, + """[44] M. I. _________________________________________ Molinero, and J. C. Oller, Performance test of the CMS link alignment system """, """[43] L.S. Durkin and P. Langacker, Phys. Lett B166, 436 (1986); Amaldi et al., Phys. Rev. D36, 1385 (1987); Hayward and Yellow et al., eds. Phys. Lett B245, 669 (1990); Nucl. Phys. 
B342, 15 (1990); """, From 49a679e23b485ca37d744f7f10c5a800f6dc2e38 Mon Sep 17 00:00:00 2001 From: Christopher Hayward Date: Tue, 9 Nov 2010 14:49:07 +0100 Subject: [PATCH 07/15] refextract: corrects some issues raised by Annette * Standardises 'et al' and author editor notation. * Improvements made semi-colon splitting heuristics (check misc len > 60). * Re-formats the journal information inside $s. * A small change to IBID checking, which still needs to be implemented properly. --- modules/bibedit/lib/refextract.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index 2fb1400f5d..e6b944dcca 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -6580,6 +6580,8 @@ def test_get_reference_lines(): """, """[237] L.S. Durkin and P. Langacker, Phys. Lett B166, 436 (1986); Amaldi et al., Phys. Rev. D36, 1385 (1987); G. Altarelli et al., Phys. Lett B245, 669 (1990); Nucl. Phys. B342, 15 (1990); Phys. Lett. B261, 146 (1991); ibidem B263, 459 (1991); """, + """[15] Nucl. Phys., B372, 3 (1992); T.G. Rizzo, Phys. Rev. D40, 3035 (1989); Proceedings of the 1990 Summer Study on High Energy Physics. ed E. Berger, June 25-July 13, 1990, Snowmass Colorado (World Scientific, Singapore, 1992) p. 233; V. Barger, J.L. Hewett and T.G. Rizzo, Phys. Rev. D42, 152 (1990); J.L. Hewett, Phys. Lett. B238, 98 (1990); + """, ] return reflines From 66e1707ff7bf1eff6de265e4e94e77f3ef3b1078 Mon Sep 17 00:00:00 2001 From: Christopher Hayward Date: Wed, 1 Dec 2010 14:04:19 +0100 Subject: [PATCH 08/15] refextract: author replication for IBIDs * Added author replication for IBID's and generally improved author recognition and documentation. * Implemented author duplication for titles marked as IBID's * Improved the main author regex (added apostrophe between surname and surname prefix) * Removed the dependency on letter case when matching authors taken from the author knowledge base * Added configuration variables for author tags, for new 'title-ibid' tags, and for heuristic values (length of misc text deemed to be 'useful' within a citation ...) * Removed any persisting print statements * Improved comments, notably for the function which generates the author regular expression --- modules/bibedit/lib/refextract.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index e6b944dcca..7b697746d1 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -3430,7 +3430,6 @@ def convert_processed_reference_line_to_marc_xml(line_marker, # the last tag element found when working from left-to-right across the line identified_citation_element = None - while tag_match is not None: ## While there are tags inside this reference line... 
tag_match_start = tag_match.start() @@ -3926,7 +3925,8 @@ def add_tagged_title(reading_line, ## of the reading-line, up to the point of the matched TITLE: rebuilt_line = reading_line[startpos:true_replacement_index] ## Test to see whether a title or an "IBID" was matched: - if matched_title.upper().find("IBID") != -1: + if matched_title.upper().find("IBID") != -1 \ + or matched_title.upper().find("IBIDEM") != -1: ## This is an IBID ## Try to replace the IBID with a title: if len(previous_match) > 1: From 00cdd58a05b9931da5c3b266a41e08aff875c45f Mon Sep 17 00:00:00 2001 From: Christopher Hayward Date: Mon, 20 Dec 2010 13:14:46 +0100 Subject: [PATCH 09/15] refextract: removes leading whitespace characters * Removed all leading whitespace characters. * The proper series letter is now used, for IBID's explicitly marked as an IBID in the reference. This was an overlapping problem concerning the checking between the word 'IBID' and 'IBIDEM', but now rectified using negative lookahead assertion. --- modules/bibedit/lib/refextract.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index 7b697746d1..adccd4d936 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -3584,7 +3584,7 @@ def convert_processed_reference_line_to_marc_xml(line_marker, report_num = processed_line[tag_match_end:idx_closing_tag] ## now trim this matched institutional report-number and its tags from the start of the line: processed_line = processed_line[idx_closing_tag+len(CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM):] - + identified_citation_element = { 'type' : "REPORTNUMBER", 'misc_txt' : "%s" % cur_misc_txt, 'report_num' : "%s" % report_num, @@ -3925,8 +3925,7 @@ def add_tagged_title(reading_line, ## of the reading-line, up to the point of the matched TITLE: rebuilt_line = reading_line[startpos:true_replacement_index] ## Test to see whether a title or an "IBID" was matched: - if matched_title.upper().find("IBID") != -1 \ - or matched_title.upper().find("IBIDEM") != -1: + if matched_title.upper().find("IBID") != -1: ## This is an IBID ## Try to replace the IBID with a title: if len(previous_match) > 1: @@ -4107,7 +4106,7 @@ def create_marc_xml_reference_section(ref_sect, ## Strip the 'marker' (e.g. [1]) from this reference line: (line_marker, working_line1) = \ remove_reference_line_marker(ref_line) - + ## Find DOI sections in citation (working_line1, identified_dois) = identify_and_tag_DOI(working_line1) From 095c23eb729f8a6488c527942c70cad5b7f8f43e Mon Sep 17 00:00:00 2001 From: Christopher Hayward Date: Tue, 25 Jan 2011 10:23:58 +0100 Subject: [PATCH 10/15] refextract: attempt at integrating giva * Attemp at integrating Giva functionality into refextract. * Improvements made to '--authors' mode, but no LaTeX support yet. * Uses affiliations to support the identification of ambiguous authors. * Work still needed to improve the reliability of the seeking of the end of author section (uses likely ending keywords, such as 'Abstract') * Added '--affiliations' mode, which will try to extract affiliations from a document * Moved the authextract-specific tests into the new 'refextract_authextract_tests.py' file, since it the name was clashing with the current refextract_tests.py test suite. * Refextract-config holds some of the institution names which are used to find affiliations, and were also held inside Giva. 
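To illustrate the affiliation-supported author recovery (a sketch under
assumed names; the regular expression is the simplified ambiguous-author
pattern from this patch, not a complete implementation): when the strict
author patterns find nothing, the line directly above a recognised
affiliation is split on commas and 'and', and each piece is kept only if
it looks like a plain forename/initial/surname name:

    import re

    ## Simplified pattern for an ambiguous author such as
    ## "William J. Smith", optionally followed by an affiliation number.
    re_ambig_author = re.compile(
        r"\s*[A-Z][^\s_<>0-9]+\s+([^\s_<>0-9]{1,3}\.?\s+)?"
        r"[A-Z][^\s_<>0-9]+\s*\d*\s*$", re.UNICODE)

    def authors_above_affiliation(docbody, aff_position):
        """Return plausible author names taken from the line directly
        above a recognised affiliation line."""
        if aff_position < 1:
            return []
        ## Replace 'and'/'&' with commas, then test each piece separately
        line = re.sub(r"\s([Aa][Nn][Dd]|&)\s", ", ",
                      docbody[aff_position - 1])
        return [name.strip() for name in line.split(",")
                if re_ambig_author.match(name)]

For example, the names "Chris Hayward", "Tim Smith" and "Joe Harris"
would be recovered from the line above "University of Bath".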
--- modules/bibedit/lib/refextract.py | 461 +++++++++++++++--- .../lib/refextract_authextract_tests.py | 287 +++++++++++ modules/bibedit/lib/refextract_config.py | 3 + 3 files changed, 670 insertions(+), 81 deletions(-) create mode 100644 modules/bibedit/lib/refextract_authextract_tests.py diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index adccd4d936..d1c61d95f7 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -34,6 +34,7 @@ CFG_REFEXTRACT_KB_JOURNAL_TITLES, \ CFG_REFEXTRACT_KB_REPORT_NUMBERS, \ CFG_REFEXTRACT_KB_AUTHORS, \ + CFG_INSTITUTIONS, \ CFG_REFEXTRACT_CTRL_FIELD_RECID, \ CFG_REFEXTRACT_TAG_ID_REFERENCE, \ CFG_REFEXTRACT_IND1_REFERENCE, \ @@ -368,7 +369,7 @@ def restrict_m_subfields(reference_lines): """Remove complete datafields which hold ONLY a single 'm' subfield, AND where the misc content is too short or too long to be of use. Min and max lengths derived by inspection of actual data. """ - min_length = 12 + min_length = 4 max_length = 1024 m_tag=re.compile('\(.*?)\<\/subfield\>') filter_list = [] @@ -1233,6 +1234,81 @@ def get_bad_char_replacements(): [\w\-_;\(\)\/]) #any character excluding a full stop """, re.VERBOSE)) +def get_single_and_extra_author_pattern(): + """Generates a simple, one-hit-only, author name pattern, matching just one author + name, but ALSO INCLUDING author names generated from the knowledge base. The author + patterns are the same ones used inside the main 'author group' pattern generator. + This function is used not for reference extraction, but for author extraction.""" + return get_single_author_pattern()+"|"+make_extra_author_regex_str() + +def get_single_author_pattern(incl_numeration=True): + """Generates a simple, one-hit-only, author name pattern, matching just one author + name in either of the 'S I' or 'I S' formats. The author patterns are the same + ones used inside the main 'author group' pattern generator. This function is used + not for reference extraction, but for author extraction. Numeration is appended + to author patterns by default. + @return (string): Just the author name pattern designed to identify single author names + in both SI and IS formats. (NO 'et al', editors, 'and'... matching)""" + return "(?:"+get_initial_surname_author_pattern(incl_numeration)+"|"+\ + get_surname_initial_author_pattern(incl_numeration)+")" + +def get_initial_surname_author_pattern(incl_numeration=False): + """Return a standard author, with a maximum of 6 initials, and a surname. + The author pattern returned will match 'Initials Surname' formats only. + The Initials MUST be uppercase, and MUST have at least a dot, hypen or apostrophe between them. + @param incl_numeration: (boolean) Return an author pattern with optional numeration after authors. + @return (string): The 'Initials Surname' author pattern.""" + append_num_re = "" + ## Possible inclusion of superscript numeration at the end of author names + if incl_numeration: + append_num_re = "(?:\d*)" + return u""" + ( + (? 
0) and (rawline[0] != '#'): + if (len(rawline) > 0) and (rawline.strip()[0] != '#'): add_to_auth_list(rawline) ## Shorten collaboration to 'coll' if rawline.lower().endswith('collaboration\n'): @@ -4494,7 +4541,7 @@ def _create_regex_pattern_add_optional_spaces_to_word_characters(word): if ch.isspace(): new_word += ch else: - new_word += ch + unicode(r'\s*?') + new_word += ch + unicode(r'\s*') return new_word @@ -4712,6 +4759,155 @@ def perform_regex_search_upon_line_with_pattern_list(line, patterns): break return m +def standardise_line_affiliations(line): + ## Removes numeration, 'the'/'and', and replace titles + line = line.strip() + line = re.sub(r"^Livermore","LLNL, Livermore",line) + line = re.sub(r".*Stanford Linear Accelerator Center.*","SLAC",line) + line = re.sub(r"^Fermi National Accelerator Laboratory","Fermilab",line) + line = re.sub(r"[tT][hH][eE]"," ",line) + line = re.sub(r"[aA][nN][dD]"," ",line) + return line + +re_aff_num = re.compile(r"(^[\d]+[A-Z])") +re_aff_inst = re.compile(r"(univ|institut|laborator)",re.I) +re_aff_univ = re.compile(r"univ[a-z]+\s+(of)?\s+([a-z\s\-]+)|([a-z\s\-]+)\s+(?!univ[a-z]+\sof)univ[a-z]+",re.I) + +def find_author_affiliations(docbody,use_to_find_authors=False): + """ Given a possible author section, attempt to retrieve any affliations. + @param docbody: The document body as a list of lines. + @param use_to_find_authors: Boolean, whether or not the affiliations found + within this function should be used to support the identification of authors. + (This will be True in the case when '--authors' is selected, and no authors + have been found using the specific author regular expression during the first + method.) + @return (tuple): Affilations and the possibly improved author section. + """ + + ## Used to validate a set of words found above an affiliation + ## This is used when no authors have been found for a paper, but an affiliation has + ## Will try to match a single ambiguous author, such as "William J. Smith" + re_find_ambig_auth = re.compile(r"\s*[A-Z][^\s_<>0-9]+\s+([^\s_<>0-9]{1,3}\.?\s+)?[A-Z][^\s_<>0-9]+\s*(\d*)\s*$",re.UNICODE) + + end_of_section_position = find_end_of_auth_aff_section(docbody) + + start = 0 + if end_of_section_position is not None: + end = end_of_section_position + else: + if cli_opts['verbosity'] > 2: + print "no ending keyword, stopping affiliation search" + return False + + docbody = docbody[start:end] + + affiliations = [] + affiliation_positions = [] + + for position in range(len(docbody)): + second_try_authors = [] + line = standardise_line_affiliations(docbody[position]) + if cli_opts['verbosity'] > 2: + print "(find affiliations) examining " + line.encode("utf8") + + if re_aff_num.search(line) or re_aff_inst.search(line): + line = re.sub(r"[0-9]","",line) + ## Format the found affiliation + univ_name = re_aff_univ.search(line) + if univ_name: + ## Get the University name + line = (univ_name.group(2) or univ_name.group(3)) + " U." 
+ ## Check and set an institution + for inst in CFG_INSTITUTIONS: + if line.find(" "+inst) != -1: + line = inst + break + + ## And save the position within this affiliation section + affiliation_positions.append(position) + + if use_to_find_authors == True: + ## Use the found affiliation to try and help with author extraction + if ((position - 1) > 0) and not ((position - 1) in affiliation_positions): + ## Replace 'and' or '&' with a comma + tmp_line = re.sub(r"\s([Aa][Nn][Dd]|&)\s",", ",docbody[position-1]) + possible_authors = tmp_line.strip().split(",") + print "checking these possible authors:" + print possible_authors + ## Return the list of ok authors found in the split line, above the affiliation + second_try_authors = filter(lambda x: re_find_ambig_auth.match(x), possible_authors) + + ## Add the institution to the list of institutions for this document + affiliations.append((line,second_try_authors)) + + print "identified affiliations:" + print affiliations + return affiliations + +def get_post_author_section_keyword_patterns(): + """ Return a list of compiled regex's based on keywords used as an indication of the + end of a possible author section on the title page of a document. + """ + ptns = [] + ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('abstract')) + ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('introduction')) + ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('intro')) + ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('overview')) + ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('contents')) + ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('table of contents')) + ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('content')) + ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('overview')) + ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('objectives')) + #ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('copyright')) + ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('page')) + ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('preface')) + ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('summary')) + ## Page number 1 + ptns.append('\s*(page)?\s*(1|2|i)\s*\.?\s*$') + + ## Add an optional chapter numeration (1., 2, 1.1...) to the start of each pattern + ptns = map(lambda x:'\s*(\d\s*\.?\s*1?\s*\.?\s*)?'+x,ptns) + + compiled_patterns = [] + for p in ptns: + compiled_patterns.append(re.compile(p, re.I|re.UNICODE)) + return compiled_patterns + + +def check_for_end_of_author_section_match_keywords(line): + """ Given a lowercase, stripped line from the start of a document, try to find a match the + line exactly for a keyword. A match should indicate the end of the author section. + @param line: The line to be checked for ending section keywords. + @return (match object): The match object returned when a keyword match is found. + """ + found_keyword = perform_regex_match_upon_line_with_pattern_list(line, get_post_author_section_keyword_patterns()) + if found_keyword: + return found_keyword + +def find_end_of_auth_aff_section(docbody): + """ Return the ending position of the author/affiliation section. 
+ @param docbody: The full, line-by-line document + @return (int): The position of the found end-keyword + """ + + end_of_section_keyword_position = None + + ## Obtain the line numbers of lines which hold authors + for position in range(len(docbody)): + ## Skip the first line ##FIXME + if position == 0: + continue + line = docbody[position] + print '(find ending keyword) examining: %s' % line.strip() + ## Check for post-author keywords in the line which signifies the end of an author section + if check_for_end_of_author_section_match_keywords(line.strip().lower()): + end_of_section_keyword_position = position + print "ending keyword match, stopping auth/aff section search: %d" % position + print "on line: %s" % line.strip().lower() + break + + return end_of_section_keyword_position + def find_author_section(docbody, author_marker = None, first_author = None): """Search in document body for its author section. @@ -4740,6 +4936,7 @@ def find_author_section(docbody, author_marker = None, first_author = None): """ auth_start_line = None auth_end_line = None + #A pattern to match author names # demands name has a comma # allows space or hyphen in family name @@ -4747,11 +4944,30 @@ def find_author_section(docbody, author_marker = None, first_author = None): # no . or spaces used...) # allows a trailing number # Aubert, F. I. 3 - author_pattern = re.compile('([A-Z]\w+\s?\w+)\s?([A-Z\.\s]{1,9})\.?\s?(\d*)') + #author_pattern = re.compile('([A-Z]\w+\s?\w+)\s?([A-Z\.\s]{1,9})\.?\s?(\d*)') # F. I. Aubert, 3 - author_pattern = re.compile('([A-Z])\.\s?([A-Z]?)\.?\s?([A-Z]\w+\s?\w*)\,?\s?(\d*)') - start_pattern = author_pattern - end_pattern = author_pattern + #author_pattern = re.compile('([A-Z])\.\s?([A-Z]?)\.?\s?([A-Z]\w+\s?\w*)\,?\s?(\d*)') + + ## Obtain the compiled expression which includes the proper author numeration + ## (The pattern used to identify authors of papers) + total_author_pattern = (re.compile(make_auth_regex_str(re_etal,\ + get_initial_surname_author_pattern(incl_numeration=True),\ + get_surname_initial_author_pattern(incl_numeration=True)),re.VERBOSE|re.UNICODE)) + + ## Obtain the compiled expression which includes the user-specified 'extra' authors + extra_author_pattern = re_extra_auth + + ## Obtain the compiled expression which matches single author names (Initials Surname, or Surname Initials) + ## Also caters for the possible inclusion of superscript numeration at the end of author names + ## (e.g. 
W.H.Smith2) + single_author_pattern = re.compile(get_single_and_extra_author_pattern(),re.VERBOSE) + + ## 'initial surname', or 'surname initial' + #start_pattern = total_author_pattern + #end_pattern = total_author_pattern + + ## 'initial surname' only + #end_pattern = re.compile(get_initial_surname_author_pattern(),re.VERBOSE) # if author_marker is not None: # start_pattern = re.compile(author_marker+'(.*)') @@ -4759,42 +4975,81 @@ def find_author_section(docbody, author_marker = None, first_author = None): # if first_author is not None: # start_pattern = re.compile(first_author) # end_pattern = None; - + ## Obtain the line numbers of lines which hold authors for position in range(len(docbody)): + + ##FIXME + if position == 0: + continue + line = docbody[position] - if auth_start_line is None: - if cli_opts['verbosity'] > 2: - print "examining " + line.encode("utf8") - print "re -> " + start_pattern.pattern - if start_pattern.search(line): + if cli_opts['verbosity'] > 2: + print "(find authors) examining " + line.encode("utf8") + #print "re -> " + start_pattern.pattern + + ## Set the ending position to equal this line number + ## (This could be the last author or one of many) + if total_author_pattern.search(line) or extra_author_pattern.search(line): + print "author found -> " + line.encode("utf8") + ## Set the starting position, if it has not been set + if auth_start_line is None: auth_start_line = position - elif auth_end_line is None and end_pattern.search(line): - # this could be the last author or one of many + ## Always update the ending position auth_end_line = position - elif auth_end_line is not None and end_pattern.search(line): - break + + + #elif (auth_end_line is not None) and (not end_pattern.search(line))\ + #and (not kb_author_pattern.search(line)): # leave when we have found a possible and, and the ending # pattern no longer matches this will fail if there are # affiliations interspersed, or othe corruptions of the list + # if ((position+1) < len(docbody))\ + # and (not end_pattern.search(docbody[position+1]))\ + # and (not kb_author_pattern.search(docbody[position+1])): + ## Finish searching when a gap of two lines is found (between authors) + # break + + ## Check for post-author keywords in the line which signifies the end of an author section + elif check_for_end_of_author_section_match_keywords(line.strip().lower()): + print "ending keyword match, stopping author section search" + break + + ## If moved 10 lines beyond the last found author line, stop the search. + #elif auth_end_line and (position == auth_end_line + 10): + # print "maximum search line limit reached, stopping author section search" + # break + + #elif position > 0: + ## End the search when two adjacent lines, of the maximum page length are found + ## (High likelihood that these two lines indicate the start of a paragraph.) + # if (len(line) == prev_line_length) and prev_line_ends_with_new_line and line.endswith('\n'): + # break + #prev_line_length = len(line) + #prev_line_ends_with_new_line = line.endswith('\n') - if auth_start_line is not None: - ## return dictionary containing details of author section: - auth_sect_details = { - 'start_line' : auth_start_line, - 'end_line' : auth_end_line, - 'marker_pattern' : author_pattern, - 'title_string' : None, - 'marker' : None, - 'title_marker_same_line' : None, + ## Return dictionary containing details of author section: + ## (The pattern used is just a single name matching author pattern, + ## and not the full author pattern. 
This allows for each author group + ## to be split into separate author names, within the output xml.) + auth_sect_details = {'start_line' : auth_start_line, + 'end_line' : auth_end_line, + 'marker_pattern' : single_author_pattern,#just the single author matches, as opposed to re_auth + 'title_string' : None, + 'marker' : None, + 'title_marker_same_line': None } else: auth_sect_details = None - return auth_sect_details + ## Now attempt to get the affilations. This will also try to get authors once again, + ## if authors have not already been found... + return auth_sect_details + + def find_reference_section(docbody): """Search in document body for its reference section. More precisely, find the first line of the reference section. Effectively, the function starts @@ -5680,7 +5935,7 @@ def get_lines(docbody, marker_ptn) #lines = docbody[start_idx:] return lines - + def get_reference_lines(docbody, @@ -5770,14 +6025,18 @@ def extract_section_from_fulltext(fulltext, section): """ ## Try to remove pagebreaks, headers, footers fulltext = remove_page_boundary_lines(fulltext) + + #fulltext = ['Some title','Some date','Chris Hayward, Tim Smith, Joe Harris','University of Bath','','Abstract'] + status = 0 + lines = [] sect_start = {'start_line' : None, 'end_line' : None, 'title_string': None, 'marker_pattern': None, 'marker' : None, } - + sect_end = None #How ref section found flag how_found_start = 0 @@ -5798,22 +6057,52 @@ def extract_section_from_fulltext(fulltext, section): ## No references found - try with no title option (with even weaker patterns..) sect_start = find_reference_section_no_title_via_numbers(fulltext) if sect_start is not None: how_found_start = 4 - elif section == 'authors': + elif section == 'authors': sect_start = find_author_section(fulltext, first_author = cli_opts['first_author']) - + + elif section == 'affiliations': + sect_start = None + affiliations = find_author_affiliations(fulltext) + if affiliations: + return ([aff[0] for aff in affiliations], status, how_found_start) + else: + if cli_opts['verbosity'] >= 1: + sys.stdout.write("-----extract_section_from_fulltext: " \ + "No ending keyword found for affilation extraction!\n") if sect_start is None: - ## No References - lines = [] - status = 4 - write_message("-----extract_section_from_fulltext: " \ - "No section found\n", verbose=2) + ## Only if an ending keyword was found, look for affilations + if section == 'authors': + affiliations = find_author_affiliations(fulltext,use_to_find_authors=True) + + ## Found affiliations... there could be some new authors found too! 
+ if affiliations: + ## Append the affiliation supported authors, since the first method failed + for aff_auth_pair in affiliations: + lines.extend([auth for auth in aff_auth_pair[1]])#Authors + return (lines,status,how_found_start) + #lines.append(aff_auth_pair[0])#Affiliation + else: + ## No References + lines = [] + status = 4 + write_message("-----extract_section_from_fulltext: " \ + "No section found\n", verbose=2) else: + ## Only if an ending keyword was found, look for affilations + ## It's a bonus that authors were found + #if end_of_section_keyword: + # affiliations = find_author_affiliations(fulltext) + # lines.extend([i[0] for i in affiliations]) + sect_end = None if sect_start.has_key("end_line"): sect_end = sect_start["end_line"] - if sect_end is None: + + ## Attempt to find the end of the section in the case where references are being + ## extracted, and a first pass failed at finding the end of the reference section + if (sect_end is None) and (section == 'references'): sect_end = \ find_end_of_reference_section(fulltext, \ sect_start["start_line"], \ @@ -5821,8 +6110,7 @@ def extract_section_from_fulltext(fulltext, section): sect_start["marker_pattern"]) if sect_end is None: - ## No End to refs? Not safe to extract - lines = [] + ## No End to reference or author section? Not safe to extract... status = 5 write_message("-----extract_section_from_fulltext: " \ "No end to section!\n", verbose=2) @@ -5834,10 +6122,9 @@ def extract_section_from_fulltext(fulltext, section): sect_start["title_string"], \ sect_start["marker_pattern"], \ sect_start["title_marker_same_line"], - section, - ) - return (lines, status, how_found_start) + section) + return (lines, status, how_found_start) ## Tasks related to conversion of full-text to plain-text: @@ -6024,25 +6311,27 @@ def get_cli_options(): 'kb-journal' : 0, 'kb-report-number' : 0, 'authors' : 0, + 'affiliations' : 0, 'first_author' : 0, } try: - myoptions, myargs = getopt.getopt(sys.argv[1:], "hVv:f:zrx:d:pj:n:", \ + myoptions, myargs = getopt.getopt(sys.argv[1:], "hVv:f:zalrx:d:pj:n:", \ ["help", "version", "verbose=", "fulltext=", "raw-references", - "raw-authors", "authors", + "affiliations", "output-raw-refs", "xmlfile=", "dictfile=", "inspire", "kb-journal=", "kb-report-number=", - "first_author",]) + "first_author", + "raw-authors",]) except getopt.GetoptError, err: if err.opt in ("c", "collection", "i", "recid", "e", "extraction-job"): ## These are arguments designed to be used for the daemon mode only @@ -6098,6 +6387,14 @@ def get_cli_options(): cli_opts['kb-report-number'] = o[1] elif o[0] in ("-a", "--authors"): cli_opts['authors'] = 1; + elif o[0] in ("-f", "--affiliations"): + cli_opts['affiliations'] = 1; + elif o[0] in ("--first_author"): + cli_opts['first_author'] = 1; + if len(myargs) == 0: + ## no arguments: error message + usage(wmsg="Error: no full-text.") + # What journal title format are we using? 
if cli_opts['verbosity'] > 0 and cli_opts['inspire']:
         sys.stdout.write("--- Using inspire journal title form\n")
@@ -6363,6 +6660,8 @@ def begin_extraction(daemon_cli_options=None):
         ## launch search for the relevant section in the document body:
         if cli_opts['authors'] == 1:
             section = 'authors'
+        elif cli_opts['affiliations'] == 1:
+            section = 'affiliations'
         else:
             section = 'references'
diff --git a/modules/bibedit/lib/refextract_authextract_tests.py b/modules/bibedit/lib/refextract_authextract_tests.py
new file mode 100644
index 0000000000..52571069cf
--- /dev/null
+++ b/modules/bibedit/lib/refextract_authextract_tests.py
@@ -0,0 +1,287 @@
+# -*- coding: utf-8 -*-
+##
+## $Id$
+##
+## This file is part of CDS Invenio.
+## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
+##
+## CDS Invenio is free software; you can redistribute it and/or
+## modify it under the terms of the GNU General Public License as
+## published by the Free Software Foundation; either version 2 of the
+## License, or (at your option) any later version.
+##
+## CDS Invenio is distributed in the hope that it will be useful, but
+## WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+## General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
+## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+
+__revision__ = "$Id$"
+
+import unittest
+import sys
+import re
+from invenio.config import CFG_TMPDIR, CFG_ETCDIR
+
+try:
+    #try local version first
+    import refextract
+except ImportError:
+    #then get installed version
+    print "Using installed refextract\n"
+    import invenio.refextract
+
+from invenio.testutils import make_test_suite, run_test_suite
+
+
+# pylint: disable-msg=C0301
+
+def setup_files(self):
+    self.test_pdfname = CFG_TMPDIR + '/demoextract.pdf'
+    self.test_txtname = CFG_TMPDIR + '/demoextract.txt'
+    from os.path import exists, getsize
+    if (not exists(self.test_pdfname) or getsize(self.test_pdfname) == 0):
+        from urllib import urlretrieve
+        urlretrieve('http://arxiv.org/pdf/0809.4120', self.test_pdfname)
+    self.assert_(exists(self.test_pdfname) and getsize(self.test_pdfname) > 0)
+
+
+def set_test_cli_opts():
+    refextract.cli_opts = { 'treat_as_raw_section' : 0,
+                            'output_raw' : 0,
+                            'verbosity' : 1,
+                            'xmlfile' : 0,
+                            'dictfile' : 0,
+                            'authors' : 0,
+                            'first_author' : "",
+                          }
+
+
+class RefExtractPDFTest(unittest.TestCase):
+    """ refextract test pdf to text extraction"""
+
+    def setUp(self):
+        setup_files(self)
+        set_test_cli_opts()
+
+    def test_PDF_extraction(self):
+        """ refextract test basic pdf extraction ---necessary for some remaining tests"""
+        (docbody, extract_error) = refextract.get_plaintext_document_body(self.test_pdfname)
+        self.assert_(len(docbody) > 10)
+        self.assert_(len([1 for line in docbody if line.find('babar') > -1]) > 0)
+        from codecs import open
+        file = open(self.test_txtname, 'w', 'utf8')
+        for line in docbody:
+            file.write(line)
+        file.close()
+
+
+class RefExtractExtractSectionTest(unittest.TestCase):
+    """ refextract - test finding ref and auth sections """
+
+    def setUp(self):
+        # pylint: disable-msg=C0103
+        """Initialize stuff"""
+        setup_files(self)
+        set_test_cli_opts()
+        file = open(self.test_txtname, 'r')
+        self.textbody = []
+        for line in file.readlines():
+            self.textbody.append(line.decode("utf-8"))
+        file.close()
+
+    def 
test_reference_finding(self): + """ find a reference section """ + (references, extract_error, how_start) = refextract.extract_section_from_fulltext(self.textbody,'references') + self.assertEqual(extract_error, 0) +# for line in references: +# print "found -> %s\n" % line + self.assertEqual(len(references), 17) + + def test_author_finding(self): + """ find author section """ + + (authors, extract_error, how_start) = refextract.extract_section_from_fulltext(self.textbody,'authors') + for line in authors: + print "%s" % line.encode("utf8") + self.assertEqual(len(authors), 530) + + +class RefExtractAuthorParsingTest(unittest.TestCase): + def setUp(self): + self.authlines = [ + """B. Aubert,1""" + ,"""M. Bona,1""" + ,"""Y. Karyotakis,1""" + ,"""J. P. Lees,1""" + ,"""V. Poireau,1""" + ,"""E. Prencipe,1""" + ,"""X. Prudent,1""" + ,"""V. Tisserand,1""" + ,"""J. Garra Tico,2""" + ,"""E. Grauges,2""" + ,"""L. Lopezab + """ + ,"""A. Palanoab + """ + ,"""M. Pappagalloab + """ + ,"""N. L. Blount,56""" + ,"""J. Brau,56""" + ,"""R. Frey,56""" + ,"""O. Igonkina,56""" + ,"""J. A. Kolb,56""" + ,"""M. Lu,56""" + ,"""R. Rahmat,56""" + ,"""N. B. Sinev,56""" + ,"""D. Strom,56""" + ,"""J. Strube,56""" + ,"""E. Torrence,56""" + ,"""G. Castelliab + """ + ,"""N. Gagliardiab + """ + ,"""M. Margoniab + """ + ,"""M. Morandina + """ + ,"""M. Posoccoa + """ + ,"""M. Rotondoa + """ + ,"""F. Simonettoab + """ + ,"""R. Stroiliab + """ + ,"""C. Vociab + """ + ,"""E. Ben""" + ,"""H. Briand,58""" + ,"""G. Calderini,58""" + ,"""J. Chauveau,58""" + ,"""P. David,58""" + ,"""L. Del Buono,58""" + ,"""O. Hamon,58""" + ,"""J. Ocariz,58""" + ,"""A. Perez,58""" + ,"""J. Prendki,58""" + ,"""S. Sitt,58""" + ,"""L. Gladney,59""" + ,"""M. Biasiniab + """] + + + + def test_reference_parsing(self): + """Use a hardcoded set of authors to test the parsing""" + (processed_authors, count_author, \ + count_aff ) = \ + refextract.create_marc_xml_author_section(self.authlines) + self.assert_(re.search('Biasini, M.\s?',processed_authors[44])) + print processed_authors + + +class RefExtractReferenceParsingTest(unittest.TestCase): + """ Test the parsing of reference strings """ + def setUp(self): + self.reflines = ["""[1] CERN Document Server J. Maldacena, Adv. Theor. Math. Phys. 2 (1998) 231; hep-th/9711200. http://cdsweb.cern.ch/ then http://www.itp.ucsb.edu/online/susyc99/discussion/. ; L. Susskind, J. Math. Phys. 36 (1995) 6377; hep-th/9409089. hello world aYahoo!. Fin.""", + """[1] J. Maldacena, Adv. Theor. Math. Phys. 2 (1998) 231; hep-th/9711200. http://cdsweb.cern.ch/""", + """[2] S. Gubser, I. Klebanov and A. Polyakov, Phys. Lett. B428 (1998) 105; hep-th/9802109. http://cdsweb.cern.ch/search.py?AGE=hello-world&ln=en""", + """[3] E. Witten, Adv. Theor. Math. Phys. 2 (1998) 253; hep-th/9802150.""", + """[4] O. Aharony, S. Gubser, J. Maldacena, H. Ooguri and Y. Oz, hep-th/9905111.""", + """[5] L. Susskind, J. Math. Phys. 36 (1995) 6377; hep-th/9409089.""", + """[6] L. Susskind and E. Witten, hep-th/9805114.""", + """[7] W. Fischler and L. Susskind, hep-th/9806039; N. Kaloper and A. Linde, Phys. Rev. D60 (1999) 105509, hep-th/9904120.""", + """[8] R. Bousso, JHEP 9906:028 (1999); hep-th/9906022.""", + """[9] R. Penrose and W. Rindler, Spinors and Spacetime, volume 2, chapter 9 (Cambridge University Press, Cambridge, 1986).""", + """[10] R. Britto-Pacumio, A. Strominger and A. Volovich, JHEP 9911:013 (1999); hep-th/9905211. blah hep-th/9905211 blah hep-ph/9711200""", + """[11] V. Balasubramanian and P. Kraus, Commun. Math. Phys. 
208 (1999) 413; hep-th/9902121.""", + """[12] V. Balasubramanian and P. Kraus, Phys. Rev. Lett. 83 (1999) 3605; hep-th/9903190.""", + """[13] P. Kraus, F. Larsen and R. Siebelink, hep-th/9906127.""", + """[14] L. Randall and R. Sundrum, Phys. Rev. Lett. 83 (1999) 4690; hep-th/9906064. this is a test RN of a different type: CERN-LHC-Project-Report-2006-003. more text.""", + """[15] S. Gubser, hep-th/9912001.""", + """[16] H. Verlinde, hep-th/9906182; H. Verlinde, hep-th/9912018; J. de Boer, E. Verlinde and H. Verlinde, hep-th/9912012.""", + """[17] E. Witten, remarks at ITP Santa Barbara conference, "New dimensions in field theory and string theory": http://www.itp.ucsb.edu/online/susyc99/discussion/.""", + """[18] D. Page and C. Pope, Commun. Math. Phys. 127 (1990) 529.""", + """[19] M. Duff, B. Nilsson and C. Pope, Physics Reports 130 (1986), chapter 9.""", + """[20] D. Page, Phys. Lett. B79 (1978) 235.""", + """[21] M. Cassidy and S. Hawking, Phys. Rev. D57 (1998) 2372, hep-th/9709066; S. Hawking, Phys. Rev. D52 (1995) 5681.""", + """[22] K. Skenderis and S. Solodukhin, hep-th/9910023.""", + """[23] M. Henningson and K. Skenderis, JHEP 9807:023 (1998), hep-th/9806087.""", + """[24] C. Fefferman and C. Graham, "Conformal Invariants", in Elie Cartan et les Mathematiques d'aujourd'hui (Asterisque, 1985) 95.""", + """[25] C. Graham and J. Lee, Adv. Math. 87 (1991) 186. CERN Document Server""", + """[26] E. Witten and S.-T. Yau, hep-th/9910245.""", + """[27] R. Emparan, JHEP 9906:036 (1999); hep-th/9906040.""", + """[28] A. Chamblin, R. Emparan, C. Johnson and R. Myers, Phys. Rev. D59 (1999) 64010, hep-th/9808177; S. Hawking, C. Hunter and D. Page, Phys. Rev. D59 (1999) 44033, hep-th/9809035.""", + """[29] S. Sethi and L. Susskind, Phys. Lett. B400 (1997) 265, hep-th/9702101; T. Banks and N. Seiberg, Nucl. Phys. B497 (1997) 41, hep-th/9702187.""", + """[30] R. Emparan, C. Johnson and R. Myers, Phys. Rev. D60 (1999) 104001; hep-th/9903238.""", + """[31] S. Hawking, C. Hunter and M. Taylor-Robinson, Phys. Rev. D59 (1999) 064005; hep-th/9811056.""", + """[32] J. Dowker, Class. Quant. Grav. 16 (1999) 1937; hep-th/9812202.""", + """[33] J. Brown and J. York, Phys. Rev. D47 (1993) 1407.""", + """[34] D. Freedman, S. Mathur, A. Matsuis and L. Rastelli, Nucl. Phys. B546 (1999) 96; hep-th/9804058. More text, followed by an IBID A 546 (1999) 96""", + """[35] D. Freedman, S. Mathur, A. Matsuis and L. Rastelli, Nucl. Phys. B546 (1999) 96; hep-th/9804058. More text, followed by an IBID A""", + """[36] whatever http://cdsware.cern.ch/""", + """[37] some misc lkjslkdjlksjflksj [hep-th/9804058] lkjlkjlkjlkj [hep-th/0001567], hep-th/1212321, some more misc, Nucl. Phys. B546 (1999) 96""", + """[38] R. Emparan, C. Johnson and R.... Myers, Phys. Rev. D60 (1999) 104001; this is :: .... misc! hep-th/9903238. and some ...,.,.,.,::: more hep-ph/9912000""", + """[10] A. Ceresole, G. Dall Agata and R. D Auria, JHEP 11(1999) 009, [hep-th/9907216].""", + """[12] D.P. Jatkar and S. Randjbar-Daemi, Phys. Lett. B460, 281 (1999) [hep-th/9904187].""", + """[14] G. DallAgata, Phys. Lett. B460, (1999) 79, [hep-th/9904198].""", + """[13] S.M. Donaldson, Instantons and Geometric Invariant Theory, Comm. Math. Phys., 93, (1984), 453-460.""", + """[16] Becchi C., Blasi A., Bonneau G., Collina R., Delduc F., Commun. Math. Phys., 1988, 120, 121.""", + """[26]: N. Nekrasov, A. Schwarz, Instantons on noncommutative R4 and (2, 0) superconformal six-dimensional theory, Comm. Math. Phys., 198, (1998), 689-703.""", + """[2] H. J. 
Bhabha, Rev. Mod. Phys. 17, 200(1945); ibid, 21, 451(1949); S. Weinberg, Phys. Rev. 133, B1318(1964); ibid, 134, 882(1964); D. L. Pursey, Ann. Phys(N. Y)32, 157(1965); W. K. Tung, Phys, Rev. Lett. 16, 763(1966); Phys. Rev. 156, 1385(1967); W. J. Hurley, Phys. Rev. Lett. 29, 1475(1972).""", + """[21] E. Schrodinger, Sitzungsber. Preuss. Akad. Wiss. Phys. Math. Kl. 24, 418(1930); ibid, 3, 1(1931); K. Huang, Am. J. Phys. 20, 479(1952); H. Jehle, Phys, Rev. D3, 306(1971); G. A. Perkins, Found. Phys. 6, 237(1976); J. A. Lock, Am. J. Phys. 47, 797(1979); A. O. Barut et al, Phys. Rev. D23, 2454(1981); ibid, D24, 3333(1981); ibid, D31, 1386(1985); Phys. Rev. Lett. 52, 2009(1984).""", + """[1] P. A. M. Dirac, Proc. R. Soc. London, Ser. A155, 447(1936); ibid, D24, 3333(1981).""", + ] + (self.title_search_kb, \ + self.title_search_standardised_titles, \ + self.title_search_keys) = \ + refextract.build_titles_knowledge_base(refextract.CFG_REFEXTRACT_KB_JOURNAL_TITLES) + (self.preprint_reportnum_sre, \ + self.standardised_preprint_reportnum_categs) = \ + refextract.build_reportnum_knowledge_base(refextract.CFG_REFEXTRACT_KB_REPORT_NUMBERS) + + + def test_reference_parsing(self): + """Use a hardcoded set of refstrings to test the parsing""" + (processed_references, count_misc, \ + count_title, count_reportnum, \ + count_url, count_doi, record_titles_count) = \ + refextract.create_marc_xml_reference_section(self.reflines, + preprint_repnum_search_kb=\ + self.preprint_reportnum_sre, + preprint_repnum_standardised_categs=\ + self.standardised_preprint_reportnum_categs, + periodical_title_search_kb=\ + self.title_search_kb, + standardised_periodical_titles=\ + self.title_search_standardised_titles, + periodical_title_search_keys=\ + self.title_search_keys) + self.assertEqual(count_title, 56) + self.assertEqual(count_reportnum, 45) + + + + + + + +TEST_SUITE = make_test_suite(#RefExtractPDFTest, + #RefExtractExtractSectionTest, + RefExtractAuthorParsingTest, + RefExtractReferenceParsingTest, + ) + +if __name__ == '__main__': + run_test_suite(TEST_SUITE) diff --git a/modules/bibedit/lib/refextract_config.py b/modules/bibedit/lib/refextract_config.py index 8823566ca1..ba0ff34338 100644 --- a/modules/bibedit/lib/refextract_config.py +++ b/modules/bibedit/lib/refextract_config.py @@ -35,6 +35,9 @@ # authors which should be recognised as such CFG_REFEXTRACT_KB_AUTHORS = "%s/bibedit/refextract-authors.kb" % CFG_ETCDIR +## Institutions, paired with author and affiliation extraction +CFG_INSTITUTIONS = ['CERN','DESY','Rutherford','Fermilab','SLAC','TRIUMF','Brookhaven Livermore','Argonne'] + ## MARC Fields and subfields used by refextract: From 6b79ae83551ab0c677c4b2b1572d4e82fa3e35ba Mon Sep 17 00:00:00 2001 From: Christopher Hayward Date: Thu, 27 Jan 2011 18:07:58 +0100 Subject: [PATCH 11/15] refextract: improve author/affiliation extraction * Added extra 'end of author section keyword'. Added a new 'bad unicode' character, and made sure to check for bad characters in both author lines and affiliation lines. * Improve overall program flow of handling requests to extract authors or affiliations. * Use both author matching and keyword matching when locating the top section of a document, in both author extraction and affiliation extraction modes. Plus other improvements. * Enrich keyword kb. * Extend author name numeration to include brackets. * Update usage() to include '--affiliations' option. * Add a comment explaining why a top section would not be found, and remove some redundant spaces between statements. 
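To make the "numeration with brackets" change above concrete, here is a minimal,
self-contained sketch of the idea. The `numeration` sub-pattern is the one this
patch adds in get_initial_surname_author_pattern() when incl_numeration=True;
the surrounding `simple_author` name regex is a simplified stand-in for the full
refextract author pattern (an assumption for illustration only), and the sample
names are taken from the test data in refextract_authextract_tests.py:

    # -*- coding: utf-8 -*-
    import re

    ## Optional superscript numeration after a name, possibly wrapped in { } or ( )
    numeration = r"(?:\s*[\{\(]?\s*\d*\.?\s*[\}\)]?)"
    ## Simplified 'Initials Surname' stand-in, NOT the shipped refextract pattern
    simple_author = re.compile(r"([A-Z]\.\s?(?:[A-Z]\.\s?)?[A-Z]\w+)" + numeration)

    for candidate in ["V. Tisserand 1", "J. P. Lees (12)", "M. Biasini{3}"]:
        match = simple_author.search(candidate)
        if match:
            print "matched author: %s" % match.group(1)

Because every part of the numeration suffix is optional, plain names with no
trailing number still match; the brackets simply widen what counts as a
numeration marker.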
--- modules/bibedit/lib/refextract.py | 867 +++++++++++++++++------------- 1 file changed, 502 insertions(+), 365 deletions(-) diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index d1c61d95f7..b46385c5ae 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -178,14 +178,22 @@ def encode_for_xml(s): Output raw references, as extracted from the document. No MARC XML mark-up - just each extracted line, prefixed by the recid of the document that it came from. + -a, --authors extract authors, not references. most other options + work as expected + --first_author use the following regexp as the first author, helps for + author extraction, ignored otherwise + -l, --affiliations + extract affiliations from the document. -x, --xmlfile Write the extracted references, in xml form, to a file rather than standard output. -d, --dictfile Write statistics about all matched title abbreviations (i.e. LHS terms in the titles knowledge base) to a file. - -z, --raw-references Treat the input file as pure references. i.e. skip the - stage of trying to locate the reference section within a + -z, --raw-references, --raw_authors + Treat the input file as the search space. i.e. skip the + stage of trying to locate the reference/top section within a document and instead move to the stage of recognition - and standardisation of citations within lines. + and standardisation of citations within lines, and the + extraction of authors. -p, --inspire Output journal standard reference form in the INSPIRE recognised format: [series]volume,page. -j, --kb-journal Manually specify the location of a journal title @@ -522,7 +530,8 @@ def get_bad_char_replacements(): u'\uFFFB' : u"", u'\uFFFC' : u"", u'\uFEFF' : u"", - u'\uFFFF' : u"", #Unrecognised characters converted to this + ## Remove the result of an bad UTF-8 character + u'\uFFFF' : u"", ## Language Tag Code Points: u"\U000E0000" : u"", u"\U000E0001" : u"", @@ -1236,7 +1245,7 @@ def get_bad_char_replacements(): def get_single_and_extra_author_pattern(): """Generates a simple, one-hit-only, author name pattern, matching just one author - name, but ALSO INCLUDING author names generated from the knowledge base. The author + name, but ALSO INCLUDING author names generated from the knowledge base. The author patterns are the same ones used inside the main 'author group' pattern generator. This function is used not for reference extraction, but for author extraction.""" return get_single_author_pattern()+"|"+make_extra_author_regex_str() @@ -1260,8 +1269,9 @@ def get_initial_surname_author_pattern(incl_numeration=False): @return (string): The 'Initials Surname' author pattern.""" append_num_re = "" ## Possible inclusion of superscript numeration at the end of author names + ## Will match the empty string if incl_numeration: - append_num_re = "(?:\d*)" + append_num_re = "(?:\s*[\{\(]?\s*\d*\.?\s*[\}\)]?)" return u""" ( (? ## Look for editor notation after the author group... ((([Ee][Dd]s?|[Ee]dited|[Ee]ditors?)(([\.\,]{0,2}\s)|([\.\,]{1,2}((\s)|($))?))) ## 'eds?.' | 'ed. ' | 'ed ' @@ -1434,34 +1444,6 @@ def make_auth_regex_str(etal,initial_surname_author=None,surname_initial_author= 'i_s_author' : initial_surname_author, 's_i_author' : surname_initial_author } -## Finding an et. al, before author names indicates a bad match!!! -## I.e. could be a title match... ignore it -etal_matches = (' et al.,',' et. al.,',' et. 
al.',' et.al.,',' et al.',' et al') - -## Standard et al ('and others') pattern for author recognition -re_etal = u"""[Ee][Tt](?:(?:(?:,|\.)\s*)|(?:(?:,|\.)?\s+))[Aa][Ll][,\.]?[,\.]?""" - -## The pattern used to identify authors inside references -re_auth = (re.compile(make_auth_regex_str(re_etal),re.VERBOSE|re.UNICODE)) - - - -## Given an Auth hit, some misc text, and then another Auth hit straight after, -## (OR a bad_and was found) -## check the entire misc text to see if is 'looks' like an author group, which didn't match -## as a normal author. In which case, append it to the single author group. -## PLEASE use this pattern only against space stripped text. -## IF a bad_and was found (from above).. do re.search using this pattern -## ELIF an auth-misc-auth combo was hit, do re.match using this pattern - -re_weaker_author = """ - (?:([A-Z]((\.\s?)|(\.?\s+)|(\-))){1,5} ## look closely for initials, and less closely at the last name. - (?:[^\s_<>0-9]+(?:(?:[,\.]\s*)|(?:[,\.]?\s+)))+)""" - -## End of line MUST match, since the next string is definitely a portion of an author group (append '$') -re_auth_near_miss = (re.compile(make_auth_regex_str(re_etal,"("+re_weaker_author+")+$"),re.VERBOSE|re.UNICODE)) - - def make_extra_author_regex_str(): """ From the authors knowledge-base, construct a single regex holding the or'd possibilities of patterns which should be included in $h subfields. The word 'Collaboration' is also converted to 'Coll', and @@ -1508,10 +1490,39 @@ def add_to_auth_list(s): if len(auths) > 0: for a in auths: author_match_re = author_match_re + "(?:"+a+")|" - author_match_re = "(?:(?:[\(\"]?(?:"+author_match_re[:-1] + ")[\)\"]?[\,\.]?\s?(?:and\s)?)+)" + author_match_re = "(?:(?:[\(\"]?(?P"+author_match_re[:-1] + ")[\)\"]?[\,\.]?\s?(?:and\s)?)+)" return author_match_re + + +## Finding an et. al, before author names indicates a bad match!!! +## I.e. could be a title match... ignore it +etal_matches = (' et al.,',' et. al.,',' et. al.',' et.al.,',' et al.',' et al') + +## Standard et al ('and others') pattern for author recognition +re_etal = u"""[Ee][Tt](?:(?:(?:,|\.)\s*)|(?:(?:,|\.)?\s+))[Aa][Ll][,\.]?[,\.]?""" + +## The pattern used to identify authors inside references +re_auth = (re.compile(make_auth_regex_str(re_etal),re.VERBOSE|re.UNICODE)) + +## Given an Auth hit, some misc text, and then another Auth hit straight after, +## (OR a bad_and was found) +## check the entire misc text to see if is 'looks' like an author group, which didn't match +## as a normal author. In which case, append it to the single author group. +## PLEASE use this pattern only against space stripped text. +## IF a bad_and was found (from above).. do re.search using this pattern +## ELIF an auth-misc-auth combo was hit, do re.match using this pattern +re_weaker_author = """ + (?:([A-Z]((\.\s?)|(\.?\s+)|(\-))){1,5} ## look closely for initials, and less closely at the last name. 
+ (?:[^\s_<>0-9]+(?:(?:[,\.]\s*)|(?:[,\.]?\s+)))+)""" + +## End of line MUST match, since the next string is definitely a portion of an author group (append '$') +re_auth_near_miss = (re.compile(make_auth_regex_str(re_etal,"("+re_weaker_author+")+$"),re.VERBOSE|re.UNICODE)) + +## Targets single author names +re_single_author_pattern = re.compile(get_single_and_extra_author_pattern(), re.VERBOSE) + ## Create the regular expression used to find user-specified 'extra' authors ## (letter case is not concidered when matching) re_extra_auth = re.compile(make_extra_author_regex_str(), re.IGNORECASE) @@ -1542,8 +1553,7 @@ def get_recids_and_filepaths(args): jobs = [] for x in args: - ## Split a maximum of once so as to cater for file names with colons - ## (e.g. arXiv names) + ## Cater for arxiv notation, which includes a colon. e.g. 1010:/opt/examples/arxiv:23923.pdf items = x.split(":", 1) if len(items) != 2: write_message(u"W: Recid:filepath argument invalid. Skipping.\n", \ @@ -2420,12 +2430,14 @@ def identify_and_tag_extra_authors(line): extra_authors = re_extra_auth.finditer(line) positions = [] for match in extra_authors: - positions.append({ 'start' : match.start(), - 'end' : match.end()}) + + positions.append({ 'start' : match.start(), + 'end' : match.end(), + 'author' : match.group('extra_auth')}) positions.reverse() for p in positions: line = line[:p['start']] + "" \ - + line[p['start']:p['end']].strip(".,:;- []") + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL + line[p['end']:] + + p['author'].strip(".,:;- []") + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL + line[p['end']:] return line @@ -2472,7 +2484,7 @@ def identify_and_tag_extra_authors(line): ## Has the group with name 'ee' (for ed. after the author) been found in the pattern? matched_positions.append({ 'start' : match.start(), 'end' : match.end(), - 'etal' : match.group('et'), + 'etal' : match.group('et') or match.group('et2'), 'ed_start' : match.group('es'), 'ed_end' : match.group('ee'), 'multi_auth' : match.group('multi_auth'), @@ -4173,7 +4185,7 @@ def create_marc_xml_reference_section(ref_sect, ## Identify and standardise numeration in the line: working_line1 = \ - standardize_and_markup_numeration_of_citations_in_line(working_line1) + standardize_and_markup_numeration_of_citations_in_line(working_line1) ## Now that numeration has been marked-up, check for and remove any ## ocurrences of " bf ": @@ -4759,59 +4771,81 @@ def perform_regex_search_upon_line_with_pattern_list(line, patterns): break return m +def get_post_author_section_keyword_patterns(): + """ Return a list of compiled regex's based on keywords used as an indication of the + end of a possible author section on the title page of a document. + @return: (List of compiled keywords which denote a possible end of author + section) + """ + keywords = ['abstract', 'acknowledgements', 'introduction', 'intro', 'overview', + 'contents', 'content', 'context', 'table of contents', 'table', + 'objectives', 'page', 'preface', 'summary', 'copyright', 'keywords', + 'figure', 'fig'] + + ptns = map(_create_regex_pattern_add_optional_spaces_to_word_characters, keywords) + ## Add an optional chapter numeration (1., 1.1, i, A..) 
to the start of each pattern + ptns = ['\s*([ai1]\s*\.?\s*[1]?\s*\.?\s*)?'+x for x in ptns] + + ## Page number 1 + ptns.append('\s*(page)?\s*[i\d]\s*\.?\s*$') + ## Number one at the start of a possible chapter + ptns.append('\s*\d\.?\s*$') + + compiled_patterns = [] + for p in ptns: + compiled_patterns.append(re.compile(p, re.I|re.UNICODE)) + return compiled_patterns + def standardise_line_affiliations(line): ## Removes numeration, 'the'/'and', and replace titles line = line.strip() line = re.sub(r"^Livermore","LLNL, Livermore",line) - line = re.sub(r".*Stanford Linear Accelerator Center.*","SLAC",line) + line = re.sub(r".*?Stanford Linear Accelerator Center.*?","SLAC",line) line = re.sub(r"^Fermi National Accelerator Laboratory","Fermilab",line) - line = re.sub(r"[tT][hH][eE]"," ",line) - line = re.sub(r"[aA][nN][dD]"," ",line) + line = re.sub(r"\s[tT][hH][eE]\s"," ",line) + line = re.sub(r"\s[aA][nN][dD]\s"," ",line) return line re_aff_num = re.compile(r"(^[\d]+[A-Z])") -re_aff_inst = re.compile(r"(univ|institut|laborator)",re.I) +re_aff_name = re.compile(r"(univ|institut|laborator)", re.I) re_aff_univ = re.compile(r"univ[a-z]+\s+(of)?\s+([a-z\s\-]+)|([a-z\s\-]+)\s+(?!univ[a-z]+\sof)univ[a-z]+",re.I) +re_aff_email = re.compile(r"^.*?@.*?$") -def find_author_affiliations(docbody,use_to_find_authors=False): +## Used to validate a set of words found above an affiliation +## This is used when no authors have been found for a paper, but an affiliation has +## Will try to match a single ambiguous author, such as "William J. Smith" +re_ambig_auth = re.compile(r"\s*[A-Z][^\s_<>0-9]+\s+([^\s_<>0-9]{1,3}\.?\s+)?[A-Z][^\s_<>0-9]+\s*(\d*)\s*$",re.UNICODE) + +def find_affiliations(docbody, use_to_find_authors=False): """ Given a possible author section, attempt to retrieve any affliations. @param docbody: The document body as a list of lines. - @param use_to_find_authors: Boolean, whether or not the affiliations found + @param use_to_find_authors: Boolean, whether or not the affiliations found within this function should be used to support the identification of authors. (This will be True in the case when '--authors' is selected, and no authors - have been found using the specific author regular expression during the first + have been found using the specific author regular expression during the first method.) @return (tuple): Affilations and the possibly improved author section. """ + top_section = find_top_section(docbody) + ## Must find the top_section + if not top_section: + return None - ## Used to validate a set of words found above an affiliation - ## This is used when no authors have been found for a paper, but an affiliation has - ## Will try to match a single ambiguous author, such as "William J. 
Smith" - re_find_ambig_auth = re.compile(r"\s*[A-Z][^\s_<>0-9]+\s+([^\s_<>0-9]{1,3}\.?\s+)?[A-Z][^\s_<>0-9]+\s*(\d*)\s*$",re.UNICODE) - - end_of_section_position = find_end_of_auth_aff_section(docbody) - - start = 0 - if end_of_section_position is not None: - end = end_of_section_position - else: - if cli_opts['verbosity'] > 2: - print "no ending keyword, stopping affiliation search" - return False - - docbody = docbody[start:end] + top_lines = top_section['lines'] affiliations = [] affiliation_positions = [] - for position in range(len(docbody)): + for position in range(len(top_lines)): second_try_authors = [] - line = standardise_line_affiliations(docbody[position]) + line = standardise_line_affiliations(top_lines[position]) if cli_opts['verbosity'] > 2: print "(find affiliations) examining " + line.encode("utf8") - if re_aff_num.search(line) or re_aff_inst.search(line): - line = re.sub(r"[0-9]","",line) + ## Obtain either a single university/institution, or the entire line + ## Also look for the emails + if re_aff_num.search(line) or re_aff_name.search(line) or re_aff_email.search(line): + line = re.sub(r"[0-9]", "", line) ## Format the found affiliation univ_name = re_aff_univ.search(line) if univ_name: @@ -4819,97 +4853,191 @@ def find_author_affiliations(docbody,use_to_find_authors=False): line = (univ_name.group(2) or univ_name.group(3)) + " U." ## Check and set an institution for inst in CFG_INSTITUTIONS: - if line.find(" "+inst) != -1: + if line.find(inst) != -1: line = inst break - ## And save the position within this affiliation section + ## Save the line number of this identified affiliation affiliation_positions.append(position) + ## Try to obtain more authors, if needed if use_to_find_authors == True: ## Use the found affiliation to try and help with author extraction if ((position - 1) > 0) and not ((position - 1) in affiliation_positions): ## Replace 'and' or '&' with a comma - tmp_line = re.sub(r"\s([Aa][Nn][Dd]|&)\s",", ",docbody[position-1]) + tmp_line = re.sub(r"\s([Aa][Nn][Dd]|&)\s", ", ", top_lines[position-1]) possible_authors = tmp_line.strip().split(",") - print "checking these possible authors:" - print possible_authors - ## Return the list of ok authors found in the split line, above the affiliation - second_try_authors = filter(lambda x: re_find_ambig_auth.match(x), possible_authors) + ## Make a list of ok authors found in the split line, for this affiliation + second_try_authors = filter(lambda x: re_ambig_auth.match(x), possible_authors) ## Add the institution to the list of institutions for this document - affiliations.append((line,second_try_authors)) + affiliations.append((line, second_try_authors)) - print "identified affiliations:" - print affiliations return affiliations -def get_post_author_section_keyword_patterns(): - """ Return a list of compiled regex's based on keywords used as an indication of the - end of a possible author section on the title page of a document. 
- """ - ptns = [] - ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('abstract')) - ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('introduction')) - ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('intro')) - ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('overview')) - ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('contents')) - ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('table of contents')) - ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('content')) - ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('overview')) - ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('objectives')) - #ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('copyright')) - ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('page')) - ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('preface')) - ptns.append(_create_regex_pattern_add_optional_spaces_to_word_characters('summary')) - ## Page number 1 - ptns.append('\s*(page)?\s*(1|2|i)\s*\.?\s*$') +def find_top_section(docbody, first_author=None): + """From the lines of text of the document body, attempt to locate + a subset of lines which correspond to the top section of the docbody. + The top section is classed as that which encapsulates the authors and any + affiliations of the document. Author name and affiliation patterns + are used to feature-equipped top section finder """ + + def check_for_end_of_author_section_match_keywords(line): + """ Given a lowercase, stripped line from the start of a document, try to find a match the + line exactly for a keyword. A match should indicate the end of the author section. + @param line: The line to be checked for ending section keywords. + @return (match object): The match object returned when a keyword match is found. + """ + found_keyword = perform_regex_match_upon_line_with_pattern_list(line, \ + get_post_author_section_keyword_patterns()) + if found_keyword: + return found_keyword + else: + return False - ## Add an optional chapter numeration (1., 2, 1.1...) to the start of each pattern - ptns = map(lambda x:'\s*(\d\s*\.?\s*1?\s*\.?\s*)?'+x,ptns) + top_section = None + ## Author match positions (first/last) + first_matched_author_line = None + last_matched_author_line = None + ## Affiliation match position(last) + last_matched_aff_line = None + ## Keyword match position (first) + keyword_line = None - compiled_patterns = [] - for p in ptns: - compiled_patterns.append(re.compile(p, re.I|re.UNICODE)) - return compiled_patterns + ## In an unfortunate case, this line position will be treated + ## as the start of the top-section + start_line = 0 + ## The number of lines to jump forward, in the event of an author/aff match + ## together with a non-existant top-section keyword match + forward_jump = 30 -def check_for_end_of_author_section_match_keywords(line): - """ Given a lowercase, stripped line from the start of a document, try to find a match the - line exactly for a keyword. A match should indicate the end of the author section. - @param line: The line to be checked for ending section keywords. - @return (match object): The match object returned when a keyword match is found. 
- """ - found_keyword = perform_regex_match_upon_line_with_pattern_list(line, get_post_author_section_keyword_patterns()) - if found_keyword: - return found_keyword - -def find_end_of_auth_aff_section(docbody): - """ Return the ending position of the author/affiliation section. - @param docbody: The full, line-by-line document - @return (int): The position of the found end-keyword - """ + ## Obtain the compiled expression which includes the proper author numeration + ## (The pattern used to identify authors of papers) + total_author_pattern = (re.compile(make_auth_regex_str(re_etal, \ + get_initial_surname_author_pattern(incl_numeration=True), \ + get_surname_initial_author_pattern(incl_numeration=True)), \ + re.VERBOSE|re.UNICODE)) + + ## Obtain the compiled expression which includes the user-specified 'extra' authors + extra_author_pattern = re_extra_auth - end_of_section_keyword_position = None + ## Holds the matched authors/affiliations whilst finding the end of the top section + collected_authors = [] + collected_affiliations = [] - ## Obtain the line numbers of lines which hold authors + ## obtain the line numbers of lines which hold authors for position in range(len(docbody)): - ## Skip the first line ##FIXME - if position == 0: - continue line = docbody[position] - print '(find ending keyword) examining: %s' % line.strip() - ## Check for post-author keywords in the line which signifies the end of an author section - if check_for_end_of_author_section_match_keywords(line.strip().lower()): - end_of_section_keyword_position = position - print "ending keyword match, stopping auth/aff section search: %d" % position - print "on line: %s" % line.strip().lower() + if cli_opts['verbosity'] > 2: + print "looking for authors in: " + line.encode("utf8").strip() + #print "re -> " + start_pattern.pattern + + ## Check for post-author-section keywords in the line which signifies the end of an + ## author section + keyword_match = check_for_end_of_author_section_match_keywords(line.strip().lower()) + + if keyword_match: + if cli_opts['verbosity'] > 2: + print "! Keyword match on line: %s" % line + ## Always save this line position with the ending keyword + keyword_line = position + ## Stop the search immediately break - return end_of_section_keyword_position + ## Affiliation in line + elif re_aff_num.search(line) or re_aff_name.search(line) or re_aff_email.search(line): + if cli_opts['verbosity'] > 2: + print "! Affiliation match on line: %s" % line + last_matched_aff_line = position + collected_affiliations.append(line) + + ## Set the ending position to equal this line number + ## (This could be the last author or one of many) + else: + standard_author_pattern_match = total_author_pattern.search(line) + extra_author_pattern_match = extra_author_pattern.search(line) + if standard_author_pattern_match or extra_author_pattern_match: + if not first_matched_author_line: + first_matched_author_line = position + last_matched_author_line = position + + if standard_author_pattern_match: + ## Append the matched author string (standard pattern) + collected_authors.append(standard_author_pattern_match.group('author_names')) + if extra_author_pattern_match: + ## Append the matched author string (extra pattern) + collected_authors.append(extra_author_pattern_match.group('extra_auth')) + + if cli_opts['verbosity'] > 2: + print "! 
Author pattern match on line: %s" % line + + if cli_opts['verbosity'] > 2: + print "TOP SECTION COLLECTED AUTHORS" + print collected_authors + print "TOP SECTION COLLECTED AFFILIATIONS" + print collected_affiliations + + final_tagged_authors = [] + ## If the number of author LINES is equal to the affiliation count, + ## associate the two + if len(collected_authors) == len(collected_affiliations): + for x in range(len(collected_authors)): + rebuilt_collected_authors = rebuild_author_lines(list(collected_authors[x]), \ + re_single_author_pattern) + ## Associate authors with affiliations + tagged_authors = ["%s%s%s%s" % \ + ("", \ + an_author, \ + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \ + collected_affiliations[x]) for an_author in rebuilt_collected_authors] + ## Increase stength for this (when len(aff)=len(auth))? + else: + ## Assemble into a list, with one author name per line, without affiliations + rebuilt_collected_authors = rebuild_author_lines(collected_authors, re_single_author_pattern) + tagged_authors = ["%s%s%s" % ("", \ + an_author, \ + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND) for an_author in rebuilt_collected_authors] + + ## Only returns a top section if a solid indicator was found + if keyword_line or last_matched_aff_line or last_matched_author_line: + + ## Set the starting position of the start of the top section + if first_matched_author_line: + start_line = first_matched_author_line + + ## Set the end line position of the end of the top section + if keyword_line: + ## (Top-section keyword termination) Best case scenario + ## Ending was found using a keyword, safe to use from the word go + new_end = keyword_line + elif last_matched_aff_line: + ## Next best thing: use found affiliations + ## (Affiliation terminates the end of the top section) + new_end = last_matched_aff_line + forward_jump + else: + ## Cater for this author line, and an affiliation line afterwards + ## (Author terminates the end of the top section) + new_end = last_matched_author_line + forward_jump + + try: + ## Top section details + top_section = {'start' : start_line, + 'end' : new_end, + 'lines' : docbody[start_line:new_end], + 'authors' : tagged_authors, ## list of author strings! + 'affiliations' : collected_affiliations, + } + except IndexError, err: + ## Overshoots the length of the document body, completely abort. + pass + + print "TAGGED AUTHORS FROM FIND_TOP_SECTION" + print top_section['authors'] + return top_section -def find_author_section(docbody, author_marker = None, first_author = None): +def find_simple_authors(docbody, author_marker = None, first_author = None): """Search in document body for its author section. Looks top down for things that look like an author list. This will work generally poorly unless one is using the LaTeX in some way, or @@ -4919,10 +5047,10 @@ def find_author_section(docbody, author_marker = None, first_author = None): [A-Z]\w+, [A-Z]\.?\s?[A-Z]?\.?\s?\d* (i.e. a word starting with caps, followed by comma, space, one or two initials with possible periods and then possibly a number. - + @param docbody: (list) of strings - the full document body. 
@param author_marker: (string) optional (regexp) marker embedded by latex - for beginning and end of author section + for beginning and end of author section @param first_author: (string) optional (regexp) first author to help find beginning of section @return: (dictionary) : @@ -4934,121 +5062,22 @@ def find_author_section(docbody, author_marker = None, first_author = None): -- OR -- (None) - when the reference section could not be found. """ - auth_start_line = None - auth_end_line = None - - #A pattern to match author names - # demands name has a comma - # allows space or hyphen in family name - # allows only initials (capital letters) but allows many (3 or more if - # no . or spaces used...) - # allows a trailing number - # Aubert, F. I. 3 - #author_pattern = re.compile('([A-Z]\w+\s?\w+)\s?([A-Z\.\s]{1,9})\.?\s?(\d*)') - # F. I. Aubert, 3 - #author_pattern = re.compile('([A-Z])\.\s?([A-Z]?)\.?\s?([A-Z]\w+\s?\w*)\,?\s?(\d*)') - - ## Obtain the compiled expression which includes the proper author numeration - ## (The pattern used to identify authors of papers) - total_author_pattern = (re.compile(make_auth_regex_str(re_etal,\ - get_initial_surname_author_pattern(incl_numeration=True),\ - get_surname_initial_author_pattern(incl_numeration=True)),re.VERBOSE|re.UNICODE)) - - ## Obtain the compiled expression which includes the user-specified 'extra' authors - extra_author_pattern = re_extra_auth - - ## Obtain the compiled expression which matches single author names (Initials Surname, or Surname Initials) - ## Also caters for the possible inclusion of superscript numeration at the end of author names - ## (e.g. W.H.Smith2) - single_author_pattern = re.compile(get_single_and_extra_author_pattern(),re.VERBOSE) + ## Single author pattern (this is just used to split the output lines as 'markers') + ## Top section of the document (if any) + top_section = find_top_section(docbody, first_author) - ## 'initial surname', or 'surname initial' - #start_pattern = total_author_pattern - #end_pattern = total_author_pattern - - ## 'initial surname' only - #end_pattern = re.compile(get_initial_surname_author_pattern(),re.VERBOSE) - -# if author_marker is not None: -# start_pattern = re.compile(author_marker+'(.*)') -# end_pattern = re.compile('(.*)'+author_marker) -# if first_author is not None: -# start_pattern = re.compile(first_author) -# end_pattern = None; - - ## Obtain the line numbers of lines which hold authors - for position in range(len(docbody)): - - ##FIXME - if position == 0: - continue - - line = docbody[position] - if cli_opts['verbosity'] > 2: - print "(find authors) examining " + line.encode("utf8") - #print "re -> " + start_pattern.pattern - - ## Set the ending position to equal this line number - ## (This could be the last author or one of many) - if total_author_pattern.search(line) or extra_author_pattern.search(line): - print "author found -> " + line.encode("utf8") - ## Set the starting position, if it has not been set - if auth_start_line is None: - auth_start_line = position - ## Always update the ending position - auth_end_line = position - - - #elif (auth_end_line is not None) and (not end_pattern.search(line))\ - #and (not kb_author_pattern.search(line)): - # leave when we have found a possible and, and the ending - # pattern no longer matches this will fail if there are - # affiliations interspersed, or othe corruptions of the list - # if ((position+1) < len(docbody))\ - # and (not end_pattern.search(docbody[position+1]))\ - # and (not 
kb_author_pattern.search(docbody[position+1])): - ## Finish searching when a gap of two lines is found (between authors) - # break - - ## Check for post-author keywords in the line which signifies the end of an author section - elif check_for_end_of_author_section_match_keywords(line.strip().lower()): - print "ending keyword match, stopping author section search" - break - - ## If moved 10 lines beyond the last found author line, stop the search. - #elif auth_end_line and (position == auth_end_line + 10): - # print "maximum search line limit reached, stopping author section search" - # break - - #elif position > 0: - ## End the search when two adjacent lines, of the maximum page length are found - ## (High likelihood that these two lines indicate the start of a paragraph.) - # if (len(line) == prev_line_length) and prev_line_ends_with_new_line and line.endswith('\n'): - # break - #prev_line_length = len(line) - #prev_line_ends_with_new_line = line.endswith('\n') - - if auth_start_line is not None: + ## Top section was found, with basic authors + if top_section and top_section['authors']: ## Return dictionary containing details of author section: ## (The pattern used is just a single name matching author pattern, ## and not the full author pattern. This allows for each author group ## to be split into separate author names, within the output xml.) - auth_sect_details = {'start_line' : auth_start_line, - 'end_line' : auth_end_line, - 'marker_pattern' : single_author_pattern,#just the single author matches, as opposed to re_auth - 'title_string' : None, - 'marker' : None, - 'title_marker_same_line': None - } + auth_section = top_section else: - auth_sect_details = None - - - - ## Now attempt to get the affilations. This will also try to get authors once again, - ## if authors have not already been found... - return auth_sect_details + ## No basic author names were found when locating the top section. + auth_section = None + return auth_section def find_reference_section(docbody): """Search in document body for its reference section. More precisely, find @@ -5749,7 +5778,6 @@ def correct_rebuilt_lines(rebuilt_lines, p_refmarker): return fixed - def wash_and_repair_reference_line(line): """Wash a reference line of undesirable characters (such as poorly-encoded letters, etc), and repair any errors (such as broken URLs) if possible. @@ -5765,23 +5793,25 @@ def wash_and_repair_reference_line(line): line = re_multiple_space.sub(u' ', line) return line - def rebuild_author_lines(author_lines, author_pattern): """Given the lines that we think make up the author section reset - everything so that each author is on one line + everything so that each author is on one line """ def found_author(matchobj): """ given an author in the match obj, pushes it on the stack of lines """ - authors.append(matchobj.group(0)) + ## Append author and remove undesirable unicode characters for this author list + authors.append(replace_undesirable_characters(matchobj.group(0))) if cli_opts['verbosity'] > 1: print "Found author -> "+ matchobj.group(0)+ "\n" return ' ' authors = [] - author_string = ' '.join(author_lines) + ## Kill the new line characters in the author lines + ## FIXME Need to remove the numeration character for authors + author_string = ' '.join([x.strip() for x in author_lines]) author_pattern.sub(found_author, author_string) + return authors - def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn): """Given a reference section, rebuild the reference lines. 
After translation @@ -5886,9 +5916,8 @@ def get_lines(docbody, end_line, title, marker_ptn, - title_marker_same_line, - section = 'references'): - """from a given section of a document extract the relevant lines, not + title_marker_same_line): + """from a reference section a document extract the relevant lines, not including the various markers. @param start_line index of docbody on which sect starts @param end_line index of docbody on which sect ends @@ -5896,7 +5925,6 @@ def get_lines(docbody, @param marker_ptn pattern that ids start of a line @param title_marker_same_line integer tells whether title and marker are on same line - @param section[="references"] string denoting type of section @return: (list) of strings. Each string is a reference line, extracted from the document. """ @@ -5908,32 +5936,19 @@ def get_lines(docbody, docbody[start_idx] = docbody[start_idx][title_start + \ len(title):] elif title is not None: - ## Pass title line + ## Pass title line start_idx += 1 ## now rebuild reference lines: if type(end_line) is int: - if section is 'references': - lines = \ - rebuild_reference_lines(docbody[start_idx:end_line+1], \ - marker_ptn) - elif section is 'authors': - print "ready to rebuild" - lines = \ - rebuild_author_lines(docbody[start_idx:end_line+1], \ - marker_ptn) - #lines = docbody[start_idx:end_line+1] + lines = \ + rebuild_reference_lines(docbody[start_idx:end_line+1], \ + marker_ptn) else: - if section is 'references': - lines = rebuild_reference_lines(docbody[start_idx:], \ + lines = rebuild_reference_lines(docbody[start_idx:], \ marker_ptn) - elif section is 'authors': - lines = \ - rebuild_author_lines(docbody[start_idx:], \ - marker_ptn) - #lines = docbody[start_idx:] return lines @@ -5969,7 +5984,7 @@ def get_reference_lines(docbody, """ start_idx = ref_sect_start_line - + if title_marker_same_line: ## Title on same line as 1st ref- take title out! title_start = docbody[start_idx].find(ref_sect_title) @@ -6003,7 +6018,7 @@ def extract_references_from_fulltext(fulltext): E.g. a string could be something like: '[19] Wilson, A. Unpublished (1986). wrapper for more general extract_section_from_fulltext() - + @param fulltext: (list) of strings, whereby each string is a line of the document. @return: (list) of strings, where each string is an extracted reference @@ -6011,66 +6026,186 @@ def extract_references_from_fulltext(fulltext): """ return extract_section_from_fulltext(fulltext, 'references') -def extract_section_from_fulltext(fulltext, section): - """Locate and extract a relevant named section from a fulltext document. +## Custom weightings, the higher the value, the more valuable the method +context_method_weightings = {'numeration_accoc': 1, + 'strict_pattern' : 0.5, + 'line_above_aff' : 0.4, + 'keyword_body' : 0.3, + 'weak_pattern' : 0.2,} + +def choose_author_method(p_authors, c_authors): + """Decide which list of possible authors to return. + """ + + ## Immediately discard non-sets of authors (hold multiple entries of the same name) + if len(p_authors['matches']) != len(set(p_authors['matches'])): + p_authors['matches'] = [] + + if len(c_authors['matches']) != len(set(c_authors['matches'])): + c_authors['matches'] = [] + + if p_authors or c_authors: + ## Discard a list of authors if it is small (what is small?) 
+
+
+        ## Here we look at which set of authors to choose, depending on their perceived accuracy
+        pattern_data = (p_authors['method'], p_authors['strength'], p_authors['authors'])
+        context_data = (c_authors['method'], c_authors['strength'], c_authors['authors'])
+
+        ## Scores for each method
+        for_pattern = 0
+        for_context = 0
+
+        ## Compare the list of authors found by both method groups
+#        for a in p_authors['authors']:
+#            if a in c_authors['authors']:
+
+        for n in range(3):
+            p = pattern_data[n]
+            c = context_data[n]
+            try:
+                int_p = 1/int(p)
+                int_c = 1/int(c)
+            except ValueError:
+                int_p = context_method_weightings[p]
+                int_c = context_method_weightings[c]
+
+            ## indicating a very poor match
+            if int_p == 0 or int_c == 0:
+                if int_p == 0:
+                    for_pattern -= int_p
+                if int_c == 0:
+                    for_context -= int_c
+            else:
+                ## Here we have a value between 0 and 1, for a feature
+                ## Bias is always placed on the context
+                if int_p > int_c:
+                    for_pattern += 1
+                else:
+                    for_context += 1
+
+        ## Make the decision, depending on the larger pattern or context value
+        if for_pattern > for_context:
+            chosen_author_section = p_authors
+        else:
+            ## Again, bias is placed on using context
+            chosen_author_section = c_authors
+
+    else:
+        ## Both lists are empty
+        chosen_author_section = []
+
+    return chosen_author_section
+
+def extract_authors_from_fulltext(fulltext):
+    """Locate and extract authors of a paper, from a fulltext document.
+    Return the extracted authors section as a list of strings, whereby each
+    string in the list is considered to be a line holding authors.
+    E.g. a string could be something like:
+    'Wilson, A., T Wells. A. Einstein ...'
+    wrapper for more general extract_section_from_fulltext()
+
+    @param fulltext: (list) of strings, whereby each string is a line of the
+    document.
+    @return: (list) of strings, where each string is an extracted author
+    line.
+    """
+
+    status = how_found_start = 0
+    author_section = []
+
+    ## EXTRACTION 1
+    ## This will attempt to find the top section of the document using
+    ## author pattern matches as boundaries only (single author names are markers)
+
+    ## For author extraction, this entire encapsulating function will only
+    ## return authors which are found using the pattern (initials surname, etc...)
+ authors_using_pattern = find_simple_authors(fulltext, first_author=cli_opts['first_author']) + +# fulltext = ['Some title', 'Some date', 'Chris Hayward, Tim Smith, Joe Harris', 'University of Bath', '', 'Abstract'] +# authors_using_pattern = extract_section_from_fulltext(fulltext, 'authors') + + ## EXTRACTION 2 + ## Now attempt to find authors in the context of nearby affiliations + aff_auth_pairs = find_affiliations(fulltext, use_to_find_authors=True) + + ## Append the affiliation-supported authors, if affiliations were found + affiliation_supported_authors = [] + if aff_auth_pairs is not None: + for pair in aff_auth_pairs: + ## For each (affiliation, author list) pair, add tagged authors, and associated affiliation + affiliation_supported_authors.append("%s%s%s%s" % \ + ("", \ + pair[1], \ + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \ + pair[0])) + + authors_using_affiliations = map(replace_undesirable_characters, affiliation_supported_authors) + + ## VERIFICATION 1 + ## Compare possible authors against a list of authors in the references (confirmed) + + ## VERIFICATION 2 + ## Compare long words in a line against the rest of the document body (negative) + + ## Given two lists of authors, which have been 'extracted' using two different methods + ## decide which list to take as a set of reliable authors (if any) +# author_section = choose_author_method(authors_using_pattern, authors_using_affiliations) + + ## temp + if authors_using_affiliations: + author_lines = authors_using_affiliations + elif authors_using_pattern: + author_lines = authors_using_pattern['authors'] + + return (author_lines, status, how_found_start) + +def extract_affiliations_from_fulltext(fulltext): + """Locate and extract affiliations of a paper, from a fulltext document. + Return the extracted affiliations section as a list of strings, whereby each + string in the list is considered to be line holding affiliations. + E.g. a string could be something like: + 'U. Bath, CERN ...' + This function does not involve itself with extract_section_from_fulltext() + + @param fulltext: (list) of strings, whereby each string is a line of the + document. + @return: (list) of strings, where each string is an extracted affiliations + line. + """ + aff_auth_pairs = find_affiliations(fulltext) + return ([aff[0] for aff in aff_auth_pairs], 0, 0) + +def extract_references_from_fulltext(fulltext): + """Locate and extract the reference section from a fulltext document. Return the extracted section as a list of strings, whereby each - string in the list is considered to be a single line (reference, - author, abstract etc). + string in the list is considered to be a single line. E.g. a string could be something like: '[19] Wilson, A. Unpublished (1986). @param fulltext: (list) of strings, whereby each string is a line of the document. - @param section: 'references', 'authors', or FIXME 'abstract' @return: (list) of strings, where each string is an extracted line. 
""" ## Try to remove pagebreaks, headers, footers fulltext = remove_page_boundary_lines(fulltext) - #fulltext = ['Some title','Some date','Chris Hayward, Tim Smith, Joe Harris','University of Bath','','Abstract'] - status = 0 lines = [] - sect_start = {'start_line' : None, - 'end_line' : None, - 'title_string': None, - 'marker_pattern': None, - 'marker' : None, + sect_start = {'start_line' : None, + 'end_line' : None, + 'title_string' : None, + 'marker_pattern' : None, + 'marker' : None, } sect_end = None #How ref section found flag how_found_start = 0 - if section == 'references': - ## Find start of refs section: - sect_start = find_reference_section(fulltext) - if sect_start is not None: how_found_start = 1 - if sect_start is None: - ## No references found - try with no title option - sect_start = find_reference_section_no_title_via_brackets(fulltext) - if sect_start is not None: how_found_start = 2 - ## Try weaker set of patterns if needed - if sect_start is None: - ## No references found - try with no title option (with weaker patterns..) - sect_start = find_reference_section_no_title_via_dots(fulltext) - if sect_start is not None: how_found_start = 3 - if sect_start is None: - ## No references found - try with no title option (with even weaker patterns..) - sect_start = find_reference_section_no_title_via_numbers(fulltext) - if sect_start is not None: how_found_start = 4 - - elif section == 'authors': - sect_start = find_author_section(fulltext, first_author = cli_opts['first_author']) - - elif section == 'affiliations': - sect_start = None - affiliations = find_author_affiliations(fulltext) - if affiliations: - return ([aff[0] for aff in affiliations], status, how_found_start) - else: - if cli_opts['verbosity'] >= 1: - sys.stdout.write("-----extract_section_from_fulltext: " \ - "No ending keyword found for affilation extraction!\n") + ## Find start of refs section: + sect_start = find_reference_section(fulltext) + if sect_start is not None: how_found_start = 1 if sect_start is None: ## Only if an ending keyword was found, look for affilations if section == 'authors': @@ -6081,7 +6216,7 @@ def extract_section_from_fulltext(fulltext, section): ## Append the affiliation supported authors, since the first method failed for aff_auth_pair in affiliations: lines.extend([auth for auth in aff_auth_pair[1]])#Authors - return (lines,status,how_found_start) + return (map(replace_undesirable_characters,lines),status,how_found_start) #lines.append(aff_auth_pair[0])#Affiliation else: ## No References @@ -6090,12 +6225,6 @@ def extract_section_from_fulltext(fulltext, section): write_message("-----extract_section_from_fulltext: " \ "No section found\n", verbose=2) else: - ## Only if an ending keyword was found, look for affilations - ## It's a bonus that authors were found - #if end_of_section_keyword: - # affiliations = find_author_affiliations(fulltext) - # lines.extend([i[0] for i in affiliations]) - sect_end = None if sect_start.has_key("end_line"): sect_end = sect_start["end_line"] @@ -6121,8 +6250,7 @@ def extract_section_from_fulltext(fulltext, section): sect_end, \ sect_start["title_string"], \ sect_start["marker_pattern"], \ - sect_start["title_marker_same_line"], - section) + sect_start["title_marker_same_line"]) return (lines, status, how_found_start) @@ -6559,38 +6687,44 @@ def begin_extraction(daemon_cli_options=None): ## no files provided for reference extraction - error message usage(wmsg="Error: No valid input file specified (-f id:file [-f id:file ...])") - ## Read the journal titles 
knowledge base, creating the search - ## patterns and replace terms. Check for user-specified journal kb. - if cli_opts['kb-journal'] != 0: - titles_kb_file = cli_opts['kb-journal'] - else: - titles_kb_file = CFG_REFEXTRACT_KB_JOURNAL_TITLES - - ## Do a quick test to see if the specified kb file exists. - ## If it does not, assume name and append onto etc directory. - if not os.path.exists(titles_kb_file): - titles_kb_file = os.path.join(CFG_ETCDIR, 'bibedit', os.path.basename(titles_kb_file)) - - (title_search_kb, \ - title_search_standardised_titles, \ - title_search_keys) = \ - build_titles_knowledge_base(titles_kb_file) - - ## Read the report numbers knowledge base, creating the search - ## patterns and replace terms. Check for user-specified rep-no kb. - if cli_opts['kb-report-number'] != 0: - repno_kb_file = cli_opts['kb-report-number'] - else: - repno_kb_file = CFG_REFEXTRACT_KB_REPORT_NUMBERS + ## Don't parse the knowledge bases if authors/affiliations are being extracted + if not cli_opts['authors'] and not cli_opts['affiliations']: + + ## Read the journal titles knowledge base, creating the search + ## patterns and replace terms. Check for user-specified journal kb. + if cli_opts['kb-journal'] != 0: + titles_kb_file = cli_opts['kb-journal'] + else: + titles_kb_file = CFG_REFEXTRACT_KB_JOURNAL_TITLES + + ## Do a quick test to see if the specified kb file exists. + ## If it does not, assume name and append onto etc directory. + if not os.path.exists(titles_kb_file): + titles_kb_file = os.path.join(CFG_ETCDIR, 'bibedit', os.path.basename(titles_kb_file)) - ## Do a quick test to see if the specified kb file exists. - ## If it does not, assume name and append onto etc directory. - if not os.path.exists(repno_kb_file): - repno_kb_file = os.path.join(CFG_ETCDIR, 'bibedit', os.path.basename(repno_kb_file)) - (preprint_reportnum_sre, \ - standardised_preprint_reportnum_categs) = \ - build_reportnum_knowledge_base(repno_kb_file) + ## Read the journal titles knowledge base, creating the search + ## patterns and replace terms. Check for user-specified journal kb. + (title_search_kb, \ + title_search_standardised_titles, \ + title_search_keys) = \ + build_titles_knowledge_base(titles_kb_file) + + ## Read the report numbers knowledge base, creating the search + ## patterns and replace terms. Check for user-specified rep-no kb. + if cli_opts['kb-report-number'] != 0: + repno_kb_file = cli_opts['kb-report-number'] + else: + repno_kb_file = CFG_REFEXTRACT_KB_REPORT_NUMBERS + + ## Do a quick test to see if the specified kb file exists. + ## If it does not, assume name and append onto etc directory. 
+ if not os.path.exists(repno_kb_file): + repno_kb_file = os.path.join(CFG_ETCDIR, 'bibedit', os.path.basename(repno_kb_file)) + + (preprint_reportnum_sre, \ + standardised_preprint_reportnum_categs) = \ + build_reportnum_knowledge_base(repno_kb_file) done_coltags = 0 ## flag to signal that the starting XML collection ## tags have been output to either an xml file or stdout @@ -6656,17 +6790,20 @@ def begin_extraction(daemon_cli_options=None): extract_lines = docbody else: - + ## launch search for the relevant section in the document body: if cli_opts['authors'] == 1: - section = 'authors' + (tagged_lines, extract_error, how_found_start) = \ + extract_authors_from_fulltext(docbody) elif cli_opts['affiliations'] == 1: - section = 'affiliations' + (tagged_lines, extract_error, how_found_start) = \ + extract_affiliations_from_fulltext(docbody) else: - section = 'references' + (extract_lines, extract_error, how_found_start) = \ + extract_references_from_fulltext(docbody) + + if not cli_opts['authors'] and not cli_opts['affiliations']: - (extract_lines, extract_error, how_found_start) = \ - extract_section_from_fulltext(docbody, section) if len(extract_lines) == 0 and extract_error == 0: extract_error = 6 write_message("-----extract_references_from_fulltext " \ @@ -6680,7 +6817,7 @@ def begin_extraction(daemon_cli_options=None): count_title, count_reportnum, \ count_url, count_doi, count_auth_group, \ record_titles_count) = \ - create_marc_xml_reference_section(reflines, + create_marc_xml_reference_section(extract_lines, preprint_repnum_search_kb=\ preprint_reportnum_sre, preprint_repnum_standardised_categs=\ @@ -6743,9 +6880,9 @@ def begin_extraction(daemon_cli_options=None): ## since filter_processed_references expects the ## original xml format. ## Compress mulitple 'm' subfields in a datafield - out = compress_subfields(out,CFG_REFEXTRACT_SUBFIELD_MISC) + out = compress_subfields(out, CFG_REFEXTRACT_SUBFIELD_MISC) ## Compress multiple 'h' subfields in a datafield - out = compress_subfields(out,CFG_REFEXTRACT_SUBFIELD_AUTH) + out = compress_subfields(out, CFG_REFEXTRACT_SUBFIELD_AUTH) lines = out.split('\n') write_message("-----display_xml_record gave: %s significant " \ From b48c5ea50500d18bfc852bf3cb9b1629f8d98342 Mon Sep 17 00:00:00 2001 From: Christopher Hayward Date: Tue, 12 Jul 2011 19:14:49 +0200 Subject: [PATCH 12/15] refextract: include author choice heuristics * Improve ambiguous author handling. * Implement strength values for line-above affiliated authors. * Add choice heuristics (set operations etc) when making desisions based on which set of authors to take, given that a different method was used to obtain each one. 
--- modules/bibedit/lib/refextract.py | 1490 ++++++++++++++++++++--------- 1 file changed, 1014 insertions(+), 476 deletions(-) diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index b46385c5ae..a3caa5acde 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -414,9 +414,9 @@ def filter_processed_references(out): for i in range(len(ref_lines)): ## Checks to see that the datafield has the attribute ind2="6", ## Before looking to see if the subfield code attribute is 'a' - if ref_lines[i].find('') <> -1 and (len(ref_lines)-1) > i: + if ref_lines[i].find('') != -1 and (len(ref_lines)-1) > i: ## For each line in this datafield element, try to find the subfield whose code attribute is 'a' - while ref_lines[i].find('') <> -1 and (len(ref_lines)-1) > i: + while ref_lines[i].find('') != -1 and (len(ref_lines)-1) > i: i+=1 ## Invenio/X.XX.X refextract/X.XX.X-timestamp-err-repnum-title-URL-misc if a_tag.search(ref_lines[i]): ## remake the "a" tag for new numbe of "m" tags @@ -1230,7 +1230,6 @@ def get_bad_char_replacements(): '(\\g<2>) ' \ '\\g<6> ')) - ## Pattern used to locate references of a doi inside a citation ## This pattern matches both url (http) and 'doi:' or 'DOI' formats re_doi = (re.compile(""" @@ -1243,23 +1242,54 @@ def get_bad_char_replacements(): [\w\-_;\(\)\/]) #any character excluding a full stop """, re.VERBOSE)) +def get_author_affiliation_numeration_str(punct=None): + """The numeration which can be applied to author names. Numeration + is sometimes found next to authors of papers. + @return: (string), which can be compiled into a regex; identifies + numeration next to an author name. + """ + ## Number to look for, either general or specific + re_number = '(?:\d\d?)' + re_chained_numbers = "(?:(?:[,;]\s*%s\.?\s*))*" % re_number + ## Punctuation surrounding the number, either general or specific again + if punct is None: + re_punct = "(?:[\{\(\]]?)" + else: + re_punct = re.escape(punct) + + ## Generic number finder (MUST NOT INCLUDE NAMED GROUPS!!!) + numeration_str = """ + (?:\s*(%(punct)s)\s* ## Left numeration punctuation + (%(num)s\s* ## Core numeration item, either specific or generic + %(num_chain)s ## Extra numeration, either generic or empty + ) + (?:(%(punct)s)|[^\d]) ## Right numeration punctuation + )""" % {'num' : re_number, + 'num_chain' : re_chained_numbers, + 'punct' : re_punct} + return numeration_str + def get_single_and_extra_author_pattern(): """Generates a simple, one-hit-only, author name pattern, matching just one author name, but ALSO INCLUDING author names generated from the knowledge base. The author patterns are the same ones used inside the main 'author group' pattern generator. - This function is used not for reference extraction, but for author extraction.""" + This function is used not for reference extraction, but for author extraction. + @return: (string) the union of the built-in author pattern, with the kb defined + patterns.""" return get_single_author_pattern()+"|"+make_extra_author_regex_str() -def get_single_author_pattern(incl_numeration=True): +def get_single_author_pattern(): """Generates a simple, one-hit-only, author name pattern, matching just one author name in either of the 'S I' or 'I S' formats. The author patterns are the same ones used inside the main 'author group' pattern generator. This function is used not for reference extraction, but for author extraction. Numeration is appended to author patterns by default. 
@return (string): Just the author name pattern designed to identify single author names - in both SI and IS formats. (NO 'et al', editors, 'and'... matching)""" - return "(?:"+get_initial_surname_author_pattern(incl_numeration)+"|"+\ - get_surname_initial_author_pattern(incl_numeration)+")" + in both SI and IS formats. (NO 'et al', editors, 'and'... matching) + @return: (string) the union of 'initial surname' and 'surname initial' + authors""" + return "(?:"+get_initial_surname_author_pattern(incl_numeration=True)+"|"+\ + get_surname_initial_author_pattern(incl_numeration=True)+")" def get_initial_surname_author_pattern(incl_numeration=False): """Return a standard author, with a maximum of 6 initials, and a surname. @@ -1271,7 +1301,7 @@ def get_initial_surname_author_pattern(incl_numeration=False): ## Possible inclusion of superscript numeration at the end of author names ## Will match the empty string if incl_numeration: - append_num_re = "(?:\s*[\{\(]?\s*\d*\.?\s*[\}\)]?)" + append_num_re = get_author_affiliation_numeration_str()+'?' return u""" ( (?0-9]+\s+([^\s_<>0-9]{1,3}\.?\s+)?[A-Z][^\s_<>0-9]+\s*$", \ + re.UNICODE) + +## Obtain the compiled expression which includes the proper author numeration +## (The pattern used to identify authors of papers) +## This pattern will match groups of authors, from the start of the line +re_auth_with_number = (re.compile(make_auth_regex_str(re_etal, \ + get_initial_surname_author_pattern(incl_numeration=True), \ + get_surname_initial_author_pattern(incl_numeration=True)), \ + re.VERBOSE | re.UNICODE)) + +## Used to obtain authors chained by connectives across multiple lines +re_comma_or_and_at_start = re.compile("^(,|((,\s*)?[Aa][Nn][Dd]|&))\s", re.UNICODE) + +## Given an Auth hit, some misc text, and then another Auth hit straight after, +## (OR a bad_and was found) +## check the entire misc text to see if is 'looks' like an author group, which didn't match +## as a normal author. In which case, append it to the single author group. +## PLEASE use this pattern only against space stripped text. +## IF a bad_and was found (from above).. do re.search using this pattern +## ELIF an auth-misc-auth combo was hit, do re.match using this pattern + +re_weaker_author = """ + (?:([A-Z]((\.\s?)|(\.?\s+)|(\-))){1,5} ## look closely for initials, and less closely at the last name. + (?:[^\s_<>0-9]+(?:(?:[,\.]\s*)|(?:[,\.]?\s+)))+)""" + +## End of line MUST match, since the next string is definitely a portion of an author group (append '$') +re_auth_near_miss = (re.compile(make_auth_regex_str(re_etal,"("+re_weaker_author+")+$"),re.VERBOSE|re.UNICODE)) + + def make_extra_author_regex_str(): """ From the authors knowledge-base, construct a single regex holding the or'd possibilities of patterns which should be included in $h subfields. 
The word 'Collaboration' is also converted to 'Coll', and @@ -1479,7 +1551,7 @@ def add_to_auth_list(s): % (fpath, str(line_num)), sys.stderr, verbose=0) halt(err=UnicodeError, \ msg="Error: Unable to parse author kb (line: %s)" % str(line_num), exit_code=1) - if (len(rawline) > 0) and (rawline.strip()[0] != '#'): + if (len(rawline.strip()) > 0) and (rawline.strip()[0] != '#'): add_to_auth_list(rawline) ## Shorten collaboration to 'coll' if rawline.lower().endswith('collaboration\n'): @@ -1532,6 +1604,13 @@ def add_to_auth_list(s): (arxiv)|(e[\-\s]?print:?\s*arxiv) """, re.VERBOSE) +## Targets single author names +re_single_author_pattern_with_numeration = re.compile(get_single_and_extra_author_pattern(), re.VERBOSE) + +re_author_tag = \ + re.compile(r"^\s*((prepared|(?Pedited)|written)\sby|authors?)\s*[,:;]?", \ + re.UNICODE | re.IGNORECASE) + # et. al. before J. /// means J is a journal ## a list of patterns used to try to repair broken URLs within reference lines: @@ -2089,7 +2168,6 @@ def _cmp_bystrlen_reverse(a, b): return (kb, standardised_titles, seek_phrases) - def get_affiliation_canonical_value(proposed_affil): """Given a proposed affiliation, look for a canonical form in the affils knowledge base @@ -2107,7 +2185,6 @@ def get_kb_mapping_value(kb_name, key): return None #default - def standardize_and_markup_numeration_of_citations_in_line(line): """Given a reference line, attempt to locate instances of citation 'numeration' in the line. @@ -2440,7 +2517,6 @@ def identify_and_tag_extra_authors(line): + p['author'].strip(".,:;- []") + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL + line[p['end']:] return line - output_line = line tmp_line = line ## Firstly, go through and change ALL TAGS and their contents to underscores @@ -2539,7 +2615,7 @@ def identify_and_tag_extra_authors(line): if m['etal'] and not(m['ed_start'] or m['ed_end'] or dump_in_misc): ## Insert the etal tag... ## Replace the found 'et al' phrase with the standardised version - tmp_stnd_etal_line = re.sub(re_etal,'et al',output_line[start:end].strip(".,:;- []()"), re.IGNORECASE) + tmp_stnd_etal_line = re.sub(re_etal, 'et al', output_line[start:end].strip(".,:;- []()"), re.IGNORECASE) output_line = output_line[:start] + "" \ + tmp_stnd_etal_line \ + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL + add_to_misc + output_line[end:] @@ -2767,9 +2843,6 @@ def account_for_stripped_whitespace(spaces_keys, ## whitespace accounted for: return (true_replacement_index, extras) - - - def create_marc_xml_reference_line(line_marker, working_line, found_title_len, @@ -3651,17 +3724,6 @@ def convert_processed_reference_line_to_marc_xml(line_marker, count_reportnum += 1 cur_misc_txt = u"" -<<<<<<< HEAD - identified_citation_element = { 'type' : "REPORTNUMBER", - 'misc_txt' : "%s" % cur_misc_txt, - 'report_num' : "%s" % report_num, - } - count_reportnum += 1 - cur_misc_txt = u"" - -======= ->>>>>>> Identifies Authors in citations. 
Splits references based on the - elif tag_type == "URL": ## This tag is an identified URL: @@ -3709,20 +3771,20 @@ def convert_processed_reference_line_to_marc_xml(line_marker, count_doi += 1 cur_misc_txt = u"" - elif tag_type.find("AUTH") <> -1: + elif tag_type.find("AUTH") != -1: ## This tag is an identified Author: auth_type = "" ## extract the title from the line: - if tag_type.find("stnd") <> -1: + if tag_type.find("stnd") != -1: auth_type = "stnd" idx_closing_tag_nearest = processed_line.find(\ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, tag_match_end) - elif tag_type.find("etal") <> -1: + elif tag_type.find("etal") != -1: auth_type = "etal" idx_closing_tag_nearest = processed_line.find(\ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL, tag_match_end) - elif tag_type.find("incl") <> -1: + elif tag_type.find("incl") != -1: auth_type = "incl" idx_closing_tag_nearest = processed_line.find(\ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL, tag_match_end) @@ -3787,7 +3849,7 @@ def convert_processed_reference_line_to_marc_xml(line_marker, CFG_REFEXTRACT_MARKER_CLOSING_PAGE) identified_citation_element = None - if identified_citation_element <> None: + if identified_citation_element != None: ## Append the found tagged data and current misc text citation_elements.append(identified_citation_element) identified_citation_element = None @@ -4677,6 +4739,7 @@ def get_first_reference_line_numeration_marker_patterns_via_numbers(): for p in patterns: compiled_patterns.append(re.compile(p, re.I|re.UNICODE)) return compiled_patterns + def get_post_reference_section_title_patterns(): """Return a list of compiled regex patterns used to search for the title of the section after the reference section in a full-text document. @@ -4771,6 +4834,8 @@ def perform_regex_search_upon_line_with_pattern_list(line, patterns): break return m +re_aff_email = re.compile(r"^.*?@.*?$") + def get_post_author_section_keyword_patterns(): """ Return a list of compiled regex's based on keywords used as an indication of the end of a possible author section on the title page of a document. @@ -4785,299 +4850,948 @@ def get_post_author_section_keyword_patterns(): ptns = map(_create_regex_pattern_add_optional_spaces_to_word_characters, keywords) ## Add an optional chapter numeration (1., 1.1, i, A..) to the start of each pattern ptns = ['\s*([ai1]\s*\.?\s*[1]?\s*\.?\s*)?'+x for x in ptns] - - ## Page number 1 - ptns.append('\s*(page)?\s*[i\d]\s*\.?\s*$') + ## Page number 1 ... 
must force a 'page' match, sometimes numeration is broken + ptns.append('\s*page\s*[i\d]\s*\.?\s*$') ## Number one at the start of a possible chapter - ptns.append('\s*\d\.?\s*$') - + #ptns.append('\s*\d\.?\s*$') compiled_patterns = [] for p in ptns: - compiled_patterns.append(re.compile(p, re.I|re.UNICODE)) + compiled_patterns.append(re.compile(p, re.I | re.UNICODE)) return compiled_patterns -def standardise_line_affiliations(line): +re_aff_num = re.compile(r"(^[\d]+[A-Z])") +re_aff_name = re.compile(r"(univ|institut|laborator)", re.I) +re_aff_univ = re.compile(r"univ[a-z]+\s+(of)?\s+([a-z\s\-]+)|([a-z\s\-]+)\s+(?!univ[a-z]+\sof)univ[a-z]+", re.I) + +re_splitting_comma = re.compile(",[^\d]", re.UNICODE) + +def arrange_possible_authors(line): + ## Replace and's with commas + comma_split_line = re.sub(r"(^\s*|\s)([Aa][Nn][Dd]|&)\s", ", ", line) + ## Split by commas + possible_authors = re_splitting_comma.split(comma_split_line.strip()) + ## Remove empty stuff + possible_authors = filter(lambda x: x.strip(), possible_authors) + return possible_authors + +def gather_numerated_authors_affiliations(lines, aff_positions, number_to_find): + """Use the found affiliation to try and help with author extraction""" + def has_number(possible_auth, number_to_find): + """Does this possible author have the numeration I want?""" + (auth_nums, auth_num_match) = obtain_author_affiliation_numeration_list(possible_auth) + return number_to_find in auth_nums + + def remove_excess_numeration(author_match): + return re.sub("^\d+|\d+$", "", author_match) + +# def make_numerated_author_pattern(list_of_numerated_authors): +# patterns = [] +# for num_auth in list_of_numerated_authors: +# num_auth = re.sub("[a-z]", "[a-z]", num_auth) +# num_auth = re.sub("(?<=%s)+?%s" % (re.escape("[a-z][a-z]"), re.escape("[a-z]")), "+", num_auth) +# +# num_auth = re.sub("[A-Z]", "[A-Z]", num_auth) +# num_auth = re.sub("(?<=%s)+?%s" % (re.escape("[A-Z][A-Z]"), re.escape("[A-Z]")), "+", num_auth) +# +# num_auth = re.sub("[0-9]", "[0-9]", num_auth) +# num_auth = re.sub("(?<=%s)+?%s" % (re.escape("[0-9][0-9]"), re.escape("[0-9]")), "+", num_auth) +# +# num_auth = re.sub("[\-]", "", num_auth) +# num_auth = re.sub("\s", "\\s", num_auth) +# num_auth = re.escape(num_auth) +# patterns.append(num_auth) +# return patterns + + ## Holds numerated authors. + numerated_authors = [] + all_split_authors = [] + ## Make a copy of the list of above lines [must be a copy due to pop()] + lines_to_check = lines[:] + while lines_to_check: + line = lines_to_check.pop().strip() + position = len(lines_to_check) + if aff_positions and (position in aff_positions): + continue + ## Split according to commas/'and's + possible_authors = arrange_possible_authors(line) + ## Make a list of ok authors found in the split line, for this affiliation + numerated_authors.extend(filter(lambda a: has_number(a, number_to_find), possible_authors)) + ## So, on this line, a numerated author was found. So, + ## make sure to save the rest of the split authors in this line. 
+ if numerated_authors: + all_split_authors.extend(possible_authors) +# numerated_author_patterns.extend(make_numerated_author_pattern(numerated_authors)) + + return (map(remove_excess_numeration, numerated_authors), \ + map(remove_excess_numeration, all_split_authors)) + +def initiate_affiliated_author_search(affiliations, top_lines, aff_positions): + """ Using obtained affiliation details, try to find authors, using primarily the + numeration-associated method (pairing numerated authors with numerated affiliations, + and as a fall-back, the 'lines-above' affiliation. + @param affiliations: (dictionary) Already collected affiliations, with their possible + numeration too. + @param top_lines: (list) The top lines (search space) of the document + @param aff_positions: (list) A numeric list of positions where known affiliations + exist (ignores these lines; prevents returning an affiliation as a possible author) + @return: (tuple) affiliation data, and loose authors (all authors found) + """ + ## Used to validate a set of words found above an affiliation + ## This is used when no authors have been found for a paper, but an affiliation has + ## Will try to match a single ambiguous author, such as "William J. Smith" + tried_numeration = [] + ## Holds all split items in a line where numerated authors were found! + loose_authors = [] + + for cur_aff in affiliations: + position_above = cur_aff['position']-1 + ## Using numerated affiliations + if cur_aff['aff_nums']: + numerated_authors = [] + for num in cur_aff['aff_nums']: + if not num in tried_numeration: + ## For this single, new, affiliation numeration val + ## use it to find authors, given: + ## 1. Lines above the affiliation + ## 2. The already identified affiliation positions + ## 3. The affiliation number, for authors, to look for + (numerated_authors_single_num, all_split_authors) = \ + gather_numerated_authors_affiliations(top_lines, \ + aff_positions, \ + number_to_find=num) + numerated_authors.extend(numerated_authors_single_num) + ## Save all split authors, if at least one numerated author was found! + ## Otherwise, this is just an empty addition + loose_authors.extend(all_split_authors) + tried_numeration.append(num) + + ## Substantially reliable + cur_aff['author_data'] = {'authors' : numerated_authors, + 'strength' : 1} + + else: + ## Using (line-above) NON-numerated affiliations to look for authors + ## This method is far less accurate than using numeration, but nonetheless + ## correct in a wide variety of situations. + ## Get the next non-empty line above the affiliation + while (position_above >= assumed_top_section_start) and \ + (position_above >= 0) and \ + (not top_lines[position_above].strip()): + position_above -= 1 + + ## The position above is a line which is another affiliation + ##i.e. 
no 'non-blank' possible author line inbetween + if position_above in aff_positions: + position_above = -1 + + ## If a valid line (not empty & not another affiliation) was found above the affiliation + if position_above >= 0: + lines_above = [top_lines[position_above]] + ## For each line, look for an 'and' start and collect them up + while re_comma_or_and_at_start.search(top_lines[position_above]) and \ + (not position_above in aff_positions): + try: + lines_above.append(top_lines[position_above-1]) + except IndexError: + break + position_above -= 1 + ## For each 'possible author' line above the affiliation + ## Build a list of weakly-matched authors + for line_above in lines_above: + ## Insert commas over 'and's and split using commas + split_line_above = arrange_possible_authors(line_above) + ## If the list of comma separated line elements in the above line + ## is longer than 1 (i.e. it has commas separating components) + if len(split_line_above) > 1: + ## This makes for a more reliable match (comma separated line above aff) + strength_for_this_line_above = 1 + else: + ## This match isnt so reliable + strength_for_this_line_above = 0 + + ## Far less reliable than the numerated version + cur_aff['author_data'] = {'authors' : filter(lambda a: re_ambig_auth.search(a), split_line_above), + 'strength' : strength_for_this_line_above,} + +# ## Check all numerated authors which were found +# all_numerated_authors = [] +# all_numerated_authors.extend([a for a in cur_aff['author_data'] if a not in all_numerated_authors]) +# if all_numerated_authors: +# ## Extend the standard set of authors, in the event numerated authors are found +# topline_standard_authors = collect_standard_authors(top_lines, 0) + + return (affiliations, loose_authors) + +def build_start_end_numeration_str(predefined_punct=None): + """Pieces together the leading and trailing numeration strings, + for affiliations and authors. + @param predefined_number: (int) punctuation which surrounds numeration. + (e.g. brackets) + @return: (regex) The regex which will match both starting and ending + numeration on a line, with any additional punctuation included.""" + numeration_str = "^"+get_author_affiliation_numeration_str(predefined_punct) \ + +"|"+get_author_affiliation_numeration_str(predefined_punct) \ + +"$" + return numeration_str + +def obtain_author_affiliation_numeration_list(line, punct=None): + """Extract the leading or trailing numeration from the line. + @param line: (string) a line of text (possibly holding an affiliation) + @param punct: (string) the punctuation known to surround numeration + elements. (makes the search more strict) + @return: (list) stripped and raw integer numeration""" + ## List of integer numeration associated with this author/affiliation + i_stripped_nums = [] + ## Given a line with an affiliation, see if numeration is on the line + re_numeration = \ + re.compile(build_start_end_numeration_str(punct), re.UNICODE|re.VERBOSE) + num_match = re.search(re_numeration, line.strip()) + ## Numeration exists for this affiliation + if num_match: + ## Get the start/end number match (or string of separated numbers)! 
+ str_num = num_match.group(2) or num_match.group(5) + ## Split if multiple numbers + if ";" in str_num: + stripped_nums = str_num.split(";") + elif "," in str_num: + stripped_nums = str_num.split(",") + else: + stripped_nums = [str_num] + ## Attempt to convert each numeration value to an integer + try: + i_stripped_nums = map(lambda n: int(n.strip()), stripped_nums) + except ValueError: + pass + ## num_match is used to obtain punctuation around the numeration + return (i_stripped_nums, num_match) + +def standardise_affiliation_names(line): + """ Standardise some affiliations. Convert some + domain specific HEP names to a standard form. + This will very likely be moved out into a kb soon. + @param line: (string) Line from the document holding a + possibly unstandardised affiliation. + @return: the line holding now standardised affiliations + """ ## Removes numeration, 'the'/'and', and replace titles line = line.strip() - line = re.sub(r"^Livermore","LLNL, Livermore",line) - line = re.sub(r".*?Stanford Linear Accelerator Center.*?","SLAC",line) - line = re.sub(r"^Fermi National Accelerator Laboratory","Fermilab",line) - line = re.sub(r"\s[tT][hH][eE]\s"," ",line) - line = re.sub(r"\s[aA][nN][dD]\s"," ",line) + line = re.sub(r"^Livermore", "LLNL, Livermore", line) + line = re.sub(r".*?Stanford Linear Accelerator Center.*?", "SLAC", line) + line = re.sub(r"^Fermi National Accelerator Laboratory", "Fermilab", line) + line = re.sub(r"\s[tT][hH][eE]\s", " ", line) + line = re.sub(r"\s[aA][nN][dD]\s", " ", line) return line -re_aff_num = re.compile(r"(^[\d]+[A-Z])") -re_aff_name = re.compile(r"(univ|institut|laborator)", re.I) -re_aff_univ = re.compile(r"univ[a-z]+\s+(of)?\s+([a-z\s\-]+)|([a-z\s\-]+)\s+(?!univ[a-z]+\sof)univ[a-z]+",re.I) -re_aff_email = re.compile(r"^.*?@.*?$") +def standardise_affiliation_formats(line): + """ Standardise some affiliations. This will remove numeration, + and will convert university names into a standard format. + @param line: (string) Line from the document holding a + possibly unstandardised affiliation. + @return: the line holding now standardised formats of affiliations + """ + ## Kill numeration + line = re.sub(r"[0-9]","",line) + ## Format the found affiliation + univ_name = re_aff_univ.search(line) + if univ_name: + ## Get the University name + line = (univ_name.group(2) or univ_name.group(3)) + " U." + ## Check and set an institution + for inst in CFG_INSTITUTIONS: + if line.find(inst) != -1: + line = inst + break + return line + +def extract_numerated_affiliations(num_position, num_section, num_find, num_punct): + """ Collect numerated affiliations, using a section of the document, and + the number which to search for. The punctuation surrounding any numeration (the + first number found) (if any) is used to improve the strictness of the search. + @param num_position: (int) position in section from where to look + for numerated affiliations + @param num_section: (list) section holding numerated affiliations + @param num_find: (int) number to find, paired with affiliations + @param num_punct: (string) punctuation around affiliation numeration (if any) + @return: (list) of dictionary elements corresponding to the position, + content and numeration data of an affiliation. + """ + affs = [] + if num_section: + ## First line + line = num_section[0].strip() + ## A number has been found before this iteration + ## Use previous number, and previous punctuation! 
+ (aff_nums, specific_num_match) = obtain_author_affiliation_numeration_list(line, num_punct) + if num_find in aff_nums: + ## Attempt to get numeration for this affiliation + try: + num_find = num_find + 1 + except ValueError: + sys.stderr.write("Error: Unable to obtain integer affiliation numeration.") + sys.exit(1) + ## Save the punctuation surrounding the numeration + affs.append({'position' : num_position, + 'line' : standardise_affiliation_formats(line), + 'aff_nums' : aff_nums, + 'author_data' : None}) + + ## Do until end of docbody section (num_section) + affs.extend(extract_numerated_affiliations(num_position+1, \ + num_section[1:], \ + num_find, + num_punct)) + return affs + +## Numeration at the start of the line +re_start_numeration = re.compile("^%s$" % get_author_affiliation_numeration_str(), \ + re.VERBOSE|re.UNICODE) + +def realign_numeration(docbody): + """ Create a duplicate document body, but with starting numeration + replicated on the next line. This is to do with the reparation + of numeration across multiple lines, from the pdftottext conversion. + Both of these docbody's are later checked, and the one which makes + sense in terms of numeration positioning is used from then onwards. + Essentially means that the conversion of pdf to text is less likely + to hinder numeration searching. + @param docbody: (list) List of lines of the entire input document. + @return: (list) The list of lines of the entire input document, + with any start-line numeration shifted accordingly. + """ + docbody_alternate = docbody[:] + ## Get the positions of all single '1's + starting_numeration = [] + for position, line in enumerate(docbody): + num_match = re_start_numeration.search(line) + if num_match: + try: + i_num = int(num_match.group(2)) + if i_num == 1: + ## If this number found is + starting_numeration.append(position) + except ValueError: + continue + ## Now, using the positions of the '1's, go forward and locate + ## subsequent numeration, and replicate on the following line if need be + num = 1 + for start in starting_numeration: + for position, line in enumerate(docbody[start:]): + num_match = re_start_numeration.search(line) + if num_match: + try: + i_num = int(num_match.group(2)) + if i_num == num: + ## If this number found is + docbody_alternate[start+position] = "\n" + docbody_alternate[start+position+1] = num_match.group(0).strip() + docbody_alternate[start+position+1] + num = num + 1 + except IndexError: + break + except ValueError: + continue -## Used to validate a set of words found above an affiliation -## This is used when no authors have been found for a paper, but an affiliation has -## Will try to match a single ambiguous author, such as "William J. Smith" -re_ambig_auth = re.compile(r"\s*[A-Z][^\s_<>0-9]+\s+([^\s_<>0-9]{1,3}\.?\s+)?[A-Z][^\s_<>0-9]+\s*(\d*)\s*$",re.UNICODE) + return docbody_alternate -def find_affiliations(docbody, use_to_find_authors=False): +def find_affiliations(lines, start, end=None, use_to_find_authors=False): """ Given a possible author section, attempt to retrieve any affliations. - @param docbody: The document body as a list of lines. - @param use_to_find_authors: Boolean, whether or not the affiliations found + @param docbody: (list) The entire document body as a list of lines. + @param start: (int) The start position, from where to start finding + affiliations. + @param end: (int) The boundary position: Stop searching here. 
+ @param use_to_find_authors: (boolean) whether or not the affiliations found within this function should be used to support the identification of authors. (This will be True in the case when '--authors' is selected, and no authors have been found using the specific author regular expression during the first method.) @return (tuple): Affilations and the possibly improved author section. """ - top_section = find_top_section(docbody) - ## Must find the top_section - if not top_section: - return None - - top_lines = top_section['lines'] + def get_smaller(x, y): + if x < y: + return x + return y affiliations = [] - affiliation_positions = [] - - for position in range(len(top_lines)): - second_try_authors = [] - line = standardise_line_affiliations(top_lines[position]) - if cli_opts['verbosity'] > 2: - print "(find affiliations) examining " + line.encode("utf8") - - ## Obtain either a single university/institution, or the entire line - ## Also look for the emails - if re_aff_num.search(line) or re_aff_name.search(line) or re_aff_email.search(line): - line = re.sub(r"[0-9]", "", line) - ## Format the found affiliation - univ_name = re_aff_univ.search(line) - if univ_name: - ## Get the University name - line = (univ_name.group(2) or univ_name.group(3)) + " U." - ## Check and set an institution - for inst in CFG_INSTITUTIONS: - if line.find(inst) != -1: - line = inst - break + starting_num_position = None + numerated_aff_num_punct_ptn = None + top = None - ## Save the line number of this identified affiliation - affiliation_positions.append(position) + if not start: + start = 0 - ## Try to obtain more authors, if needed - if use_to_find_authors == True: - ## Use the found affiliation to try and help with author extraction - if ((position - 1) > 0) and not ((position - 1) in affiliation_positions): - ## Replace 'and' or '&' with a comma - tmp_line = re.sub(r"\s([Aa][Nn][Dd]|&)\s", ", ", top_lines[position-1]) - possible_authors = tmp_line.strip().split(",") - ## Make a list of ok authors found in the split line, for this affiliation - second_try_authors = filter(lambda x: re_ambig_auth.match(x), possible_authors) + ## If a keyword was found, then use it to limit the search space + if end: + top_lines_orig = lines[start:end] + else: + top_lines_orig = lines[start:] + + ## Get an alternative version of the top section, of the same length + ## but with some alone numeration replicated on the next line! + top_lines_alt = realign_numeration(top_lines_orig) + + for position in range(len(top_lines_orig)): + ## Standardise some affiliations + line = standardise_affiliation_names(top_lines_orig[position].strip()) + line_alt = standardise_affiliation_names(top_lines_alt[position].strip()) + + ## If a previous numeration value was found in the previous iteration + ## check for the increment of this value on this line + if re_aff_num.search(line) or re_aff_name.search(line): + ## Check numeration in replica docbody + (aff_nums, num_match) = obtain_author_affiliation_numeration_list(line) + ## Check numeration in the numeration-realigned docbody + (aff_nums_alt, num_match_alt) = obtain_author_affiliation_numeration_list(line_alt) + ## Set the information to the correct top_section, depending on + ## if the numeration was found split across lines or not. 
+ if aff_nums or not aff_nums_alt: + top = top_lines_orig + elif aff_nums_alt: + top = top_lines_alt + aff_nums = aff_nums_alt + num_match = num_match_alt + + ## Aff number '1' numeration found + if aff_nums and num_match and 1 in aff_nums: + starting_num_position = position + numerated_aff_num_punct_ptn = num_match.group(1) + ## So, an AFFILIATION KEYWORD was found on this line, but this is not a '1'! + ## Move up lines to get the starting affiliation position, using NUMERATION + elif aff_nums and num_match: + ## Get the smallest affiliation number, and minus 1 from it + find_num = reduce(lambda x, y: get_smaller(x, y), aff_nums) - 1 + reversed_position = position - 1 + ## Attempt to go back and find the start of this numeration section + ## Get numeration for this line + while (reversed_position >= 0) and (starting_num_position is None): + ## Check numeration in the numeration-realigned docbody + (rev_aff_nums, rev_num_match) = \ + obtain_author_affiliation_numeration_list(top[reversed_position]) + ## Check for numeration n, n = 1 + if find_num == 1 and (find_num in rev_aff_nums): + starting_num_position = reversed_position + numerated_aff_num_punct_ptn = rev_num_match.group(1) + ## Check for numeration n, 1 < n < last found + elif find_num in rev_aff_nums: + find_num = find_num - 1 + ## Move position up one line + reversed_position = reversed_position - 1 + + ## Starting numeration was found..! + if not starting_num_position: + ## Could not find start. Abort everything. + break + else: + ## The normal way of appending lines with affiliation names + affiliations.append({'position' : position, + 'line' : standardise_affiliation_formats(line), + 'aff_nums' : None, + 'author_data' : None,}) + + ## Stop searching if a keyworded and numerated affiliation has been found + if starting_num_position is not None: + break - ## Add the institution to the list of institutions for this document - affiliations.append((line, second_try_authors)) + ## In the situation where numeration has been found for an affiliation + ## Collect up all of the following numerated affiliations, + ## or go backwards and obtain them + if starting_num_position: + affiliations = extract_numerated_affiliations(starting_num_position, \ + top[starting_num_position:], \ + 1, \ + numerated_aff_num_punct_ptn) + + loose_authors = [] + + ## Try to obtain more authors, if needed + if use_to_find_authors: + aff_positions = [aff['position'] for aff in affiliations] + ## Then, if the above didn't work, do the 'line above' method + (affiliations, loose_authors) = initiate_affiliated_author_search(affiliations, \ + top, \ + aff_positions) + + for tmp_aff in affiliations: + tmp_aff['line'] = replace_undesirable_characters(standardise_affiliation_formats(tmp_aff['line']).strip(".,:;- []()*\\")) + + return (affiliations, loose_authors) + +def collect_standard_authors(top_lines, position): + """Obtain standard authors [recursive] + @param top_lines: (list) top lines of document + @param position: (int) position in top lines + @return: list holding the list of collected authors, + and the position of the last author line + """ + authors_on_line = [] + if position < len(top_lines): + line = top_lines[position] + ## Get all standard author matches for this line + total_author_matches = re_auth_with_number.search(line) + if total_author_matches: + ## Save the matching strings in a list + authors_on_line = total_author_matches.group('author_names') + (position, more_authors) = collect_standard_authors(top_lines, position+1) + ## Recurse on the 
next position + authors_on_line.extend(more_authors) + ## Authors for this line + return (position, authors_on_line) + +def collect_tagged_authors(top_section, position, first_line=None, \ + orig_blank_lines=None, cur_blank_lines=None): + """Recursively try to obtain authors after an 'author tag' has been + found. + @param top_section: (list) Lines corresponding to the document's top section + @param position: (integer) Current position in the top_section + @param first_line: (string) An optional, over-riding line to be processed on the + first iteration + @param orig_blank_lines: (integer) The static gap width, calculated when finding the + first non-empty line, before collecting subsequent lines. The blank line count is + reset to this value for each tagged author collected. + @param cur_blank_lines: (integer) An optional, blank line count, which is calculated + after traversing lines after a tag, before iterating. This is then used to find possible + subsequent authors. + @return: list holding the list of collected tagged authors, + and the position of the last author line + """ + def leading_comma(line): + return line.rstrip().endswith(",") - return affiliations + line_parts = [] + if position < len(top_section): + if first_line: + line = first_line.strip() + else: + line = top_section[position].strip() -def find_top_section(docbody, first_author=None): - """From the lines of text of the document body, attempt to locate - a subset of lines which correspond to the top section of the docbody. - The top section is classed as that which encapsulates the authors and any - affiliations of the document. Author name and affiliation patterns - are used to feature-equipped top section finder """ + if orig_blank_lines and not cur_blank_lines: + dec_blank_lines = orig_blank_lines - 1 + elif cur_blank_lines: + dec_blank_lines = cur_blank_lines - 1 + else: + dec_blank_lines = orig_blank_lines + + comma_subd_line = re.sub(r"\s([Aa][Nn][Dd]|&)\s", ", ", line) + line_has_leading_comma = leading_comma(comma_subd_line) + line_parts = comma_subd_line.split(",") + + #FIXME possibly generate a pattern from the tagged author match, to be used to verify other authors! (pattern from FIRST match) + + ## Check to see if this line starts with an 'and' + ## or a comma, or has an author form. In either case it's likely + ## that more author names preceed it. + author_match = re_single_author_pattern_with_numeration.search(line) + if line_has_leading_comma or author_match: + if line_has_leading_comma: + ## Reset and reuse the blank line count (comma found at the end of the line) + dec_blank_lines = orig_blank_lines + else: + ## Do not consider any more blank lines when searching + dec_blank_lines = 0 + (position, more_line_parts) = collect_tagged_authors(top_section, \ + position+1, \ + first_line=None, \ + orig_blank_lines=orig_blank_lines, \ + cur_blank_lines=dec_blank_lines) + ## Extend the parts found on this line, with the parts found + ## in previous iterations. 
(going backwards) + line_parts.extend(more_line_parts) + ## Or if it is known that there exists blank lines between tagged authors, + ## and this line has a leading comma (evidence for more authors somewhere), + ## or is blank then continue to look for authors, until the gap width + ## (blank line count) is reached + elif cur_blank_lines > 0 and ((not line) or line_has_leading_comma): + (position, more_line_parts) = collect_tagged_authors(top_section, \ + position+1, \ + first_line=None, \ + orig_blank_lines=orig_blank_lines, \ + cur_blank_lines=dec_blank_lines) + ## Nothing gets added from this line, just pass the line to the next iteration + line_parts = more_line_parts + + return (position, line_parts) + +## Used in the event that no keyword is found (max length of top section) +assumed_top_section_length = 100 + +## Used to force the validity of found keywords +## (Valid if they appear after this position) +assumed_top_section_start = 1 + +## Was extract_authors_from_fulltext +def extract_top_document_information_from_fulltext(docbody, first_author=None): + """ Given a list of lines from a document body, obtain author/affiliation + information of the document. This is done via the examination of the top + section of the document, via similar regex's used to identify authors in + references, and also author tags and the use of affiliation information. + Tagged authors always have the highest level of precedence when deciding + which group of authors to output for this document. In general, + affiliated authors have a higher precedence level than standard authors, + however, this can change depending on the method used to identify + the affiliated authors, and whether or not the affiliated authors is a + smaller subset of the standard author list. + + The number of lines which will constitute the search-space, is + identified through two configuration values, and more importantly, by + using usual 'start-of-document-body' keyword (abstract, introduction etc..) + + Author identification is completed by running through three, partially- + separated steps: + + 1. Obtain authors which are explicitly tagged as such + + 2. Collect up standard-authors, using the re_auth comprehensive regex. + + 3. Attempt to collect authors using affiliation matches. + 3.1 Lean on numerated affiliations to allow for the accurate extraction + of paired numerated authors. + 3.2 Look at the lines above affiliations, and check for possible authors. + @param docbody: (list) List of lines corresponding to the entire text version + of the input document. + @param first_author: (string) An optional first author from where to start + @return: (tuple) The top document information, holding affiliations + and the chosen set of author names. Also holds two status values. + """ - def check_for_end_of_author_section_match_keywords(line): + def check_for_end_of_author_section_match_keywords(docbody): """ Given a lowercase, stripped line from the start of a document, try to find a match the line exactly for a keyword. A match should indicate the end of the author section. @param line: The line to be checked for ending section keywords. @return (match object): The match object returned when a keyword match is found. 
""" - found_keyword = perform_regex_match_upon_line_with_pattern_list(line, \ - get_post_author_section_keyword_patterns()) - if found_keyword: - return found_keyword - else: - return False - - top_section = None - ## Author match positions (first/last) - first_matched_author_line = None - last_matched_author_line = None - ## Affiliation match position(last) - last_matched_aff_line = None - ## Keyword match position (first) - keyword_line = None - - ## In an unfortunate case, this line position will be treated - ## as the start of the top-section - start_line = 0 - - ## The number of lines to jump forward, in the event of an author/aff match - ## together with a non-existant top-section keyword match - forward_jump = 30 - - ## Obtain the compiled expression which includes the proper author numeration - ## (The pattern used to identify authors of papers) - total_author_pattern = (re.compile(make_auth_regex_str(re_etal, \ - get_initial_surname_author_pattern(incl_numeration=True), \ - get_surname_initial_author_pattern(incl_numeration=True)), \ - re.VERBOSE|re.UNICODE)) - - ## Obtain the compiled expression which includes the user-specified 'extra' authors - extra_author_pattern = re_extra_auth - - ## Holds the matched authors/affiliations whilst finding the end of the top section - collected_authors = [] - collected_affiliations = [] - - ## obtain the line numbers of lines which hold authors - for position in range(len(docbody)): - line = docbody[position] - if cli_opts['verbosity'] > 2: - print "looking for authors in: " + line.encode("utf8").strip() - #print "re -> " + start_pattern.pattern - - ## Check for post-author-section keywords in the line which signifies the end of an - ## author section - keyword_match = check_for_end_of_author_section_match_keywords(line.strip().lower()) - - if keyword_match: - if cli_opts['verbosity'] > 2: - print "! Keyword match on line: %s" % line - ## Always save this line position with the ending keyword - keyword_line = position - ## Stop the search immediately - break - ## Affiliation in line - elif re_aff_num.search(line) or re_aff_name.search(line) or re_aff_email.search(line): - if cli_opts['verbosity'] > 2: - print "! 
Affiliation match on line: %s" % line - last_matched_aff_line = position - collected_affiliations.append(line) + found_ending_keyword = None + found_author_tag = None + ending_keyword_ptns = get_post_author_section_keyword_patterns() + for position, line in enumerate(docbody): + ## Find top section ending keywords + ## Must exceed the first 3 lines + keyword_hit = perform_regex_match_upon_line_with_pattern_list(line, ending_keyword_ptns) + if keyword_hit and not found_ending_keyword and (position > 3): + if cli_opts['verbosity'] > 7: + print "--- ending keyword on line: %s, position: %d" % (line.strip(), position) + found_ending_keyword = position + + ## Look for author tags + author_tag_hit = re_author_tag.search(line) + if author_tag_hit and not found_author_tag: + if cli_opts['verbosity'] > 7: + print "--- author tag on line: %s, position: %d" % (line.strip(), position) + found_author_tag= position + + ## Only in the top X lines + if (found_ending_keyword and found_author_tag) \ + or position >= assumed_top_section_length: + break - ## Set the ending position to equal this line number - ## (This could be the last author or one of many) - else: - standard_author_pattern_match = total_author_pattern.search(line) - extra_author_pattern_match = extra_author_pattern.search(line) - if standard_author_pattern_match or extra_author_pattern_match: - if not first_matched_author_line: - first_matched_author_line = position - last_matched_author_line = position - - if standard_author_pattern_match: - ## Append the matched author string (standard pattern) - collected_authors.append(standard_author_pattern_match.group('author_names')) - if extra_author_pattern_match: - ## Append the matched author string (extra pattern) - collected_authors.append(extra_author_pattern_match.group('extra_auth')) - - if cli_opts['verbosity'] > 2: - print "! Author pattern match on line: %s" % line - - if cli_opts['verbosity'] > 2: - print "TOP SECTION COLLECTED AUTHORS" - print collected_authors - print "TOP SECTION COLLECTED AFFILIATIONS" - print collected_affiliations - - final_tagged_authors = [] - ## If the number of author LINES is equal to the affiliation count, - ## associate the two - if len(collected_authors) == len(collected_affiliations): - for x in range(len(collected_authors)): - rebuilt_collected_authors = rebuild_author_lines(list(collected_authors[x]), \ - re_single_author_pattern) - ## Associate authors with affiliations - tagged_authors = ["%s%s%s%s" % \ - ("", \ - an_author, \ - CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \ - collected_affiliations[x]) for an_author in rebuilt_collected_authors] - ## Increase stength for this (when len(aff)=len(auth))? 
- else: - ## Assemble into a list, with one author name per line, without affiliations - rebuilt_collected_authors = rebuild_author_lines(collected_authors, re_single_author_pattern) - tagged_authors = ["%s%s%s" % ("", \ - an_author, \ - CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND) for an_author in rebuilt_collected_authors] - - ## Only returns a top section if a solid indicator was found - if keyword_line or last_matched_aff_line or last_matched_author_line: - - ## Set the starting position of the start of the top section - if first_matched_author_line: - start_line = first_matched_author_line - - ## Set the end line position of the end of the top section - if keyword_line: - ## (Top-section keyword termination) Best case scenario - ## Ending was found using a keyword, safe to use from the word go - new_end = keyword_line - elif last_matched_aff_line: - ## Next best thing: use found affiliations - ## (Affiliation terminates the end of the top section) - new_end = last_matched_aff_line + forward_jump - else: - ## Cater for this author line, and an affiliation line afterwards - ## (Author terminates the end of the top section) - new_end = last_matched_author_line + forward_jump + return (found_ending_keyword, found_author_tag) - try: - ## Top section details - top_section = {'start' : start_line, - 'end' : new_end, - 'lines' : docbody[start_line:new_end], - 'authors' : tagged_authors, ## list of author strings! - 'affiliations' : collected_affiliations, - } - except IndexError, err: - ## Overshoots the length of the document body, completely abort. - pass + ## Example docbody + #docbody = ['Some title', 'Some date', 'Chris Hayward, Tim Smith, Joe Harris', 'University of Bath', '', 'Abstract'] + #docbody = ['Some title', 'Some date', \ + # 'Authors:', 'Chris Hayward,', '', 'Tim Smith,', '', 'Joe Harris', 'University of Bath', '', 'Abstract'] - print "TAGGED AUTHORS FROM FIND_TOP_SECTION" - print top_section['authors'] + affiliations = [] + ## end-of-top-section-keyword position, if any + (pre_ending_keyword, pre_author_tag) = check_for_end_of_author_section_match_keywords(docbody) + ## Default return values + status = how_found_start = 0 - return top_section + if pre_ending_keyword: + top_section = docbody[:pre_ending_keyword] + elif len(docbody) < assumed_top_section_length: + ## Half total length + top_section = docbody + else: + ## First 100 lines? + top_section = docbody[:assumed_top_section_length] + + tagged_author_information = [] + just_tagged_authors = [] + first_author_tag_position = None + + ## METHOD 1 -------------- Look for tagged authors. + for position in range(len(top_section)): + line = top_section[position] + ## 'Prepared/edited/written by:', or 'authors:' + if re_author_tag.search(line): + ## We know there's a tag, save this position + first_author_tag_position = last_tagged_author_position = position + ## Remove the tag, and check content + detagged_line = re_author_tag.sub('', line, 1).strip() + if detagged_line: + ## From this point, go on and collect tagged authors + (last_tagged_author_position, tagged_authors) = \ + collect_tagged_authors(top_section, \ + position, \ + first_line=detagged_line) + else: + ## So, in this situation, there is nothing following the + ## author tag on the same line, but a tag is present, meaning + ## that authors are below the tag somewhere! + ## Get the next non-empty line, and look at that. 
+ position_find = position + 1 + tagged_authors = None + ## From this point + while (position_find < len(top_section)) and (not tagged_authors): + ## Hit a non-blank line after a tag, start searching recursively from here + if top_section[position_find].strip() != '': + gap_width = position_find - position + (last_tagged_author_position, tagged_authors) = \ + collect_tagged_authors(top_section, \ + position_find, \ + orig_blank_lines=gap_width) + ## Save the position of the last author line + position_find += 1 + + if tagged_authors: + tagged_author_information = [{'authors' : tagged_authors, + 'affiliation' : None,}] + just_tagged_authors = tagged_authors + ## Break with whatever was collected from the author tag. + break -def find_simple_authors(docbody, author_marker = None, first_author = None): - """Search in document body for its author section. - Looks top down for things that look like an author list. This will - work generally poorly unless one is using the LaTeX in some way, or - if one knows the first author. Both of these methods are tried - first, falling back to a default search for the first line - matching - [A-Z]\w+, [A-Z]\.?\s?[A-Z]?\.?\s?\d* - (i.e. a word starting with caps, followed by comma, space, one - or two initials with possible periods and then possibly a number. + ## METHOD 2 -------------- look for standard authors (basic pattern) + ## Look for standard (initials surname, or surname initials) authors. + ## This is done before affiliation-assisted author-search is initiated + ## since the positions can be used to improve affiliation detection. + first_standard_author_position = None + standard_authors = [] + for position in range(len(top_section)): + ## An author tag was found, delay the search until the tag position is reached + if first_author_tag_position and (position < first_author_tag_position): + continue - @param docbody: (list) of strings - the full document body. - @param author_marker: (string) optional (regexp) marker embedded by latex - for beginning and end of author section - @param first_author: (string) optional (regexp) first author to help find - beginning of section - @return: (dictionary) : - { 'start_line' : (integer) - index in docbody of 1st author line, - 'end_line' : (integer) - index of last author line - } - Much of this information is used by later functions to rebuild - a reference section. - -- OR -- - (None) - when the reference section could not be found. 
+ line = top_section[position] + ## 'Initial Surname' / 'Surname Initials' authors or + ## Knowledge-base specified authors +# if re_auth_with_number.search(line) or re_extra_auth.search(line): + ## Keep a list of standard authors, and their positions +# (last_standard_author_position, standard_authors) = collect_standard_authors(top_section, \ +# position) + ## Use the first matched author from WHERE TO START +# first_standard_author_position = position + + standard_author_pattern_match = re_auth_with_number.search(line) + extra_author_pattern_match = re_extra_auth.search(line) + if standard_author_pattern_match or extra_author_pattern_match: + if not first_standard_author_position: + first_standard_author_position = position + last_standard_author_position = position + if standard_author_pattern_match: + ## Append the matched author string (standard pattern) + standard_authors.append(standard_author_pattern_match.group('author_names')) + if extra_author_pattern_match: + ## Append the matched author string (extra pattern) + standard_authors.append(extra_author_pattern_match.group('extra_auth')) + + if cli_opts['verbosity'] > 7: + print "--- author pattern match on line: %s, position: %d" % (line.strip(), position) + + ## By this point, we've managed to try and get tagged authors, + ## as well as anything in the top section that looks like an author + ## according to the main author regex. + + ## Attempt to obtain authors using affiliation positions. + ## A tagged author position is considered the best. + ## Otherwise start from the top of the section. + ## Always attempt to find authors too. + (affiliations, loose_authors) = find_affiliations(top_section, \ + start=first_author_tag_position, \ + use_to_find_authors=True) + + ## METHOD 3 -------------- Look for authors using affiliations + ## and handle the assembly of standard authors too + + affiliation_associated_affiliated_authors = [] + affiliation_associated_standard_authors = [] + + just_standard_authors = [] + just_affiliated_authors = [] + + if affiliations is not None: + ## Attempt to pair together standard authors with identified affiliations. + ## If the number of affiliation is equal to the number of author lines + if len(affiliations) == len(standard_authors): + ## Increase stength for this (when len(aff)=len(auth))? + for x in range(len(standard_authors)): + rebuilt_standard_authors = rebuild_author_lines([standard_authors[x]], \ + re_single_author_pattern_with_numeration) + ## Associate authors with affiliations + affiliation_associated_standard_authors.append({'authors' : rebuilt_standard_authors, + 'affiliation' : affiliations[x]['line']}) + just_standard_authors.extend(rebuilt_standard_authors) + ## Now assemble affiliated authors, with their affiliations + for aff in affiliations: + ## Append any affiliation supported authors. 
+ ## [!Do not include repeated authors, may be repeated from double numeration]
+ if aff['author_data']:
+ author_list_for_affiliation = aff['author_data']['authors']
+ affiliated_author_strength = aff['author_data']['strength']
+ else:
+ author_list_for_affiliation = []
+ affiliated_author_strength = None
+ affiliation_associated_affiliated_authors.append( \
+ {'authors' : [auth for auth in author_list_for_affiliation if auth not in just_affiliated_authors],
+ 'affiliation' : aff['line'],
+ 'strength' : affiliated_author_strength,})
+ just_affiliated_authors.extend([auth for auth in author_list_for_affiliation])
+
+ ## In the event that standard authors were not paired with affiliations
+ ## then just make a list of dictionaries of authors without affiliations
+ if standard_authors and not affiliation_associated_standard_authors:
+ rebuilt_standard_authors = \
+ [rebuild_author_lines([std_auth_line], re_single_author_pattern_with_numeration) \
+ for std_auth_line in standard_authors]
+ for r in rebuilt_standard_authors:
+ affiliation_associated_standard_authors.append({'authors' : r,
+ 'affiliation' : None,})
+ just_standard_authors.extend(r)
+
+ if cli_opts['verbosity'] > 7:
+ sys.stdout.write("--- Author extraction results:\n")
+ sys.stdout.write("---- tagged: %s\n" % tagged_author_information)
+ sys.stdout.write("---- standard: %s\n" % affiliation_associated_standard_authors)
+ sys.stdout.write("---- affiliated: %s\n" % affiliation_associated_affiliated_authors)
+
+ ## Given three lists of authors, which have been 'extracted' using three different methods,
+ ## decide which list to return as a set of reliable authors (if any)
+ final_authors = choose_author_method(tagged_author_information, \
+ affiliation_associated_standard_authors, \
+ affiliation_associated_affiliated_authors, \
+ just_tagged_authors, \
+ just_standard_authors, \
+ just_affiliated_authors)
+
+ if cli_opts['verbosity'] > 7:
+ sys.stdout.write("--- Selected authors info: %s\n" % final_authors)
+
+ marked_up_authors = mark_up_authors_with_affiliations(final_authors)
+
+ document_information = {'authors' : final_authors,
+ 'affiliations' : affiliations,
+ 'marked_up_authors' : marked_up_authors,}
+
+ return (document_information, status, how_found_start)
+
+def mark_up_authors_with_affiliations(final_authors):
+ """ Prepare authors and any possible associated affiliations
+ into marked-up (tagged) lines according to identified authors.
+ @param final_authors: (list) Dictionary items, each holding
+ a list of authors and an optional affiliation.
+ @return: (list) A list of lines, holding marked-up authors
+ and their affiliations.
 """
- ## Single author pattern (this is just used to split the output lines as 'markers')
- ## Top section of the document (if any)
- top_section = find_top_section(docbody, first_author)
-
- ## Top section was found, with basic authors
- if top_section and top_section['authors']:
- ## Return dictionary containing details of author section:
- ## (The pattern used is just a single name matching author pattern,
- ## and not the full author pattern. This allows for each author group
- ## to be split into separate author names, within the output xml.)
- auth_section = top_section
- else:
- ## No basic author names were found when locating the top section.
- auth_section = None
+ ## Pair authors and affiliations together, in the event that
+ ## affiliation-supported authors were found!
+ tagged_authors = []
+
+ def process_auth(a):
+ ## Also remove numeration
+ a = re.sub('\d+', '', a)
+ a = replace_undesirable_characters(a).strip(".,:;- []()*\\")
+ return a
+
+ def process_aff(a):
+ a = replace_undesirable_characters(a).strip(".,:;- []()*\\")
+ return a
+
+ for aff_auth_dict in final_authors:
+ for auth in aff_auth_dict['authors']:
+ ## Skip empty authors, otherwise the closing element tag disappears (!?)
+ if auth:
+ if not aff_auth_dict['affiliation']:
+ aff_for_auth = ''
+ else:
+ aff_for_auth = aff_auth_dict['affiliation']
+ tagged_authors.append("%s%s%s%s" % ("", \
+ process_auth(auth), \
+ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \
+ process_aff(aff_for_auth)))
+ return tagged_authors
+
+def choose_author_method(tagged_info, std_info, aff_info, \
+ tagged_authors, std_authors, aff_authors):
+ """Decide which list of authors to treat as the most accurate and
+ reliable list of authors for a document. This is accomplished
+ primarily through set operations of the author values, and the methods
+ by which they were extracted.
+ @param tagged_info: (dict) Affiliation and author information for authors
+ obtained using explicit in-document author notation (tags).
+ @param std_info: (dict) Affiliation and author information for authors
+ obtained using the comprehensive author pattern.
+ @param aff_info: (dict) Affiliation and author information for authors
+ obtained using two types of affiliation-context (numeration, and positioning).
+ @param tagged_authors: (list) List of purely tagged authors.
+ @param std_authors: (list) List of purely standard-matched authors.
+ @param aff_authors: (list) List of purely affiliated authors.
+ @return: (dict) Affiliation and author information which is deemed to be
+ the most accurate for the document.
+ """
+
+ ## Immediately discard non-sets of authors (hold duplicate entries)
+ if len(tagged_authors) != len(set(tagged_authors)):
+ tagged_info = []
+ if len(std_authors) != len(set(std_authors)):
+ std_info = []
+ if len(aff_authors) != len(set(aff_authors)):
+ aff_info = []
+
+ tagged_authors = map(lambda x: x.strip(" ,"), tagged_authors)
+ std_authors = map(lambda y: y.strip(" ,"), std_authors)
+ aff_authors = map(lambda z: z.strip(" ,"), aff_authors)
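Aside (illustrative only, not part of the patch): the 'non-set' test above reduces to a tiny standalone predicate. The helper name below is hypothetical; nothing beyond the Python standard library is assumed.

    def has_duplicates(author_list):
        ## A genuine author list never repeats a name, so any duplicate
        ## entry is taken as evidence of an unreliable extraction method.
        return len(author_list) != len(set(author_list))

    ## e.g. has_duplicates([u"J. Smith", u"A. Jones", u"J. Smith"]) -> True
    ##      has_duplicates([u"J. Smith", u"A. Jones"]) -> False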
+
+ ## False if there is a 'weak' affiliation-supported author match
+ ## AND none of them are found in the list of standard authors
+ weak_affiliated_authors = False
+
+ ## True if 'weak' affiliated authors are present, and at least one of
+ ## those authors has, as a subset, an author from the standard author list
+ author_match_with_standard_authors = False
+
+ ## If standard authors and affiliated authors exist
+ ## Otherwise there's no point in deciding which to take
+ if std_authors and aff_authors:
+ ## Is there a 'line-above' author line?
+ weak_affiliated_authors = filter(lambda tmp_aff: \
+ ((tmp_aff['strength'] == 0) and tmp_aff['authors']), aff_info)
+
+ for f in aff_info:
+ if (f['strength'] == 0) and f['authors']:
+ ## Given that there exists at least one 'line above' set of authors
+ ## See if any of these so-called weak authors also exist in the
+ ## set of standard authors (even as substrings)
+ for auth in f['authors']:
+ ## If there exists a standard author which is a substring
+ ## of at least one affiliated author
+ author_match_with_standard_authors = filter(lambda tmp_std: \
+ auth.find(tmp_std) != -1, std_authors)
+ ## In that case, the standard authors must not take precedence
+ if author_match_with_standard_authors:
+ break
+ ## Do not give precedence to standard authors when there exists
+ ## a line-above author in the list of standard authors
+ if author_match_with_standard_authors:
+ weak_affiliated_authors = False
+ break
- return auth_section
+ aff_authors_is_a_subset_of_std_authors = False
+ if set(aff_authors).issubset(std_authors) and set(std_authors).difference(aff_authors):
+ ## The affiliated authors are a strict subset of the standard
+ ## authors (contained within them, but the two sets differ)
+ aff_authors_is_a_subset_of_std_authors = True
+
+ ## The situations where std_info has precedence over aff_info
+ ## 1. There exists at least one 'line above affiliation' (weakly) found author
+ ## 2. Affiliated authors are a subset of standard authors
+ ## 3. Standard authors number more than double the affiliated authors
+ standard_over_affiliated = weak_affiliated_authors or \
+ aff_authors_is_a_subset_of_std_authors or \
+ ((len(aff_authors) * 2) < len(std_authors))
+
+ ## Make the choice, with the appropriate precedence
+ if standard_over_affiliated:
+ return tagged_info or std_info or aff_info
+ else:
+ return tagged_info or aff_info or std_info

 def find_reference_section(docbody):
 """Search in document body for its reference section. More precisely, find
@@ -5801,16 +6515,12 @@
 def found_author(matchobj):
 """ given an author in the match obj, pushes it on the stack of lines
 """
 ## Append author and remove undesirable unicode characters for this author list
- authors.append(replace_undesirable_characters(matchobj.group(0)))
- if cli_opts['verbosity'] > 1:
- print "Found author -> "+ matchobj.group(0)+ "\n"
+ authors.append(matchobj.group(0))
 return ' '
 authors = []
 ## Kill the new line characters in the author lines
- ## FIXME Need to remove the numeration character for authors
 author_string = ' '.join([x.strip() for x in author_lines])
 author_pattern.sub(found_author, author_string)
- return authors

 def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn):
@@ -5864,7 +6574,7 @@
 if m_ref_line_marker is not None:
 ## Reference line marker found! : Append this reference to the
 ## list of fixed references and reset the working_line to 'blank'
- if current_string <> '':
+ if current_string != '':
 ## If it's not a blank line to separate refs .
if current_string[len(current_string) - 1] in (u'-', u' '): ## space or hyphenated word at the end of the @@ -5939,8 +6649,6 @@ def get_lines(docbody, ## Pass title line start_idx += 1 - - ## now rebuild reference lines: if type(end_line) is int: lines = \ @@ -5951,8 +6659,6 @@ def get_lines(docbody, marker_ptn) return lines - - def get_reference_lines(docbody, ref_sect_start_line, ref_sect_end_line, @@ -6011,172 +6717,6 @@ def get_reference_lines(docbody, ## ----> Glue - logic for finding and extracting reference section: -def extract_references_from_fulltext(fulltext): - """Locate and extract references from a fulltext document. - Return the extracted reference section as a list of strings, whereby each - string in the list is considered to be a single reference line. - E.g. a string could be something like: - '[19] Wilson, A. Unpublished (1986). - wrapper for more general extract_section_from_fulltext() - - @param fulltext: (list) of strings, whereby each string is a line of the - document. - @return: (list) of strings, where each string is an extracted reference - line. - """ - return extract_section_from_fulltext(fulltext, 'references') - -## Custom weightings, the higher the value, the more valuable the method -context_method_weightings = {'numeration_accoc': 1, - 'strict_pattern' : 0.5, - 'line_above_aff' : 0.4, - 'keyword_body' : 0.3, - 'weak_pattern' : 0.2,} - -def choose_author_method(p_authors, c_authors): - """Decide which list of possible authors to return. - """ - - ## Immediately discard non-sets of authors (hold multiple entries of the same name) - if len(p_authors['matches']) != len(set(p_authors['matches'])): - p_authors['matches'] = [] - - if len(c_authors['matches']) != len(set(c_authors['matches'])): - c_authors['matches'] = [] - - if p_authors or c_authors: - ## Discard a list of authors if it is small (what is small?) - - - ## Here we look at which set of authors to choose, depending on their perceived accuracy - pattern_data = (p_authors['method'], p_authors['strength'], p_authors['authors']) - context_data = (c_authors['method'], c_authors['strength'], c_authors['authors']) - - ## - for_pattern = 0 - for_context = 0 - - ## Compare the list of authors found by both method groups -# for a in p_authors['authors']: -# if a in c_authors['authors']: - - - for n in range(3): - p = pattern_data[n] - c = context_data[n] - try: - int_p = 1/int(p) - int_c = 1/int(c) - except ValueError: - int_p = method_weightings[p] - int_c = method_weightings[c] - - ## indicating a very poor match - if int_p == 0 or int_c == 0: - if int_p == 0: - for_pattern -= int_p - if int_c == 0: - for_context -= int_c - else: - ## Here we have a value between 0 and 1, for a feature - ## Bias is always placed on the context - if int_p > int_c: - for_pattern += 1 - else: - for_context += 1 - - ## Make the desision, depending on the larger pattern or context value - if for_pattern > for_context: - chosen_author_section = p_authors - else: - ## Again, bias is placed on using context - chosen_author_section = c_authors - - else: - ## Both lists are empty - chosen_author_section = [] - - return chosen_author_section - -def extract_authors_from_fulltext(fulltext): - """Locate and extract authors of a paper, from a fulltext document. - Return the extracted authors section as a list of strings, whereby each - string in the list is considered to be line holding authors. - E.g. a string could be something like: - 'Wilson, A., T Wells. A. Einstein ...' 
- wrapper for more general extract_section_from_fulltext() - - @param fulltext: (list) of strings, whereby each string is a line of the - document. - @return: (list) of strings, where each string is an extracted author - line. - """ - - status = how_found_start = 0 - author_section = [] - - ## EXTRACTION 1 - ## This will attempt to find the top section of the document using - ## author pattern matches as boundaries only (single author names are markers) - - ## For author extraction, this entire encapsulating function will only - ## return authors which are found using the pattern (initials surname, etc...) - authors_using_pattern = find_simple_authors(fulltext, first_author=cli_opts['first_author']) - -# fulltext = ['Some title', 'Some date', 'Chris Hayward, Tim Smith, Joe Harris', 'University of Bath', '', 'Abstract'] -# authors_using_pattern = extract_section_from_fulltext(fulltext, 'authors') - - ## EXTRACTION 2 - ## Now attempt to find authors in the context of nearby affiliations - aff_auth_pairs = find_affiliations(fulltext, use_to_find_authors=True) - - ## Append the affiliation-supported authors, if affiliations were found - affiliation_supported_authors = [] - if aff_auth_pairs is not None: - for pair in aff_auth_pairs: - ## For each (affiliation, author list) pair, add tagged authors, and associated affiliation - affiliation_supported_authors.append("%s%s%s%s" % \ - ("", \ - pair[1], \ - CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \ - pair[0])) - - authors_using_affiliations = map(replace_undesirable_characters, affiliation_supported_authors) - - ## VERIFICATION 1 - ## Compare possible authors against a list of authors in the references (confirmed) - - ## VERIFICATION 2 - ## Compare long words in a line against the rest of the document body (negative) - - ## Given two lists of authors, which have been 'extracted' using two different methods - ## decide which list to take as a set of reliable authors (if any) -# author_section = choose_author_method(authors_using_pattern, authors_using_affiliations) - - ## temp - if authors_using_affiliations: - author_lines = authors_using_affiliations - elif authors_using_pattern: - author_lines = authors_using_pattern['authors'] - - return (author_lines, status, how_found_start) - -def extract_affiliations_from_fulltext(fulltext): - """Locate and extract affiliations of a paper, from a fulltext document. - Return the extracted affiliations section as a list of strings, whereby each - string in the list is considered to be line holding affiliations. - E.g. a string could be something like: - 'U. Bath, CERN ...' - This function does not involve itself with extract_section_from_fulltext() - - @param fulltext: (list) of strings, whereby each string is a line of the - document. - @return: (list) of strings, where each string is an extracted affiliations - line. - """ - aff_auth_pairs = find_affiliations(fulltext) - return ([aff[0] for aff in aff_auth_pairs], 0, 0) - def extract_references_from_fulltext(fulltext): """Locate and extract the reference section from a fulltext document. 
Return the extracted section as a list of strings, whereby each @@ -6231,7 +6771,7 @@ def extract_references_from_fulltext(fulltext): ## Attempt to find the end of the section in the case where references are being ## extracted, and a first pass failed at finding the end of the reference section - if (sect_end is None) and (section == 'references'): + if sect_end is None: sect_end = \ find_end_of_reference_section(fulltext, \ sect_start["start_line"], \ @@ -6514,11 +7054,11 @@ def get_cli_options(): ## a 'configuration file'-specified kb cli_opts['kb-report-number'] = o[1] elif o[0] in ("-a", "--authors"): - cli_opts['authors'] = 1; + cli_opts['authors'] = 1 elif o[0] in ("-f", "--affiliations"): - cli_opts['affiliations'] = 1; + cli_opts['affiliations'] = 1 elif o[0] in ("--first_author"): - cli_opts['first_author'] = 1; + cli_opts['first_author'] = 1 if len(myargs) == 0: ## no arguments: error message usage(wmsg="Error: no full-text.") @@ -6788,22 +7328,20 @@ def begin_extraction(daemon_cli_options=None): ## don't search for sections in the document body: ## treat entire input as relevant section: extract_lines = docbody - else: - ## launch search for the relevant section in the document body: - if cli_opts['authors'] == 1: - (tagged_lines, extract_error, how_found_start) = \ - extract_authors_from_fulltext(docbody) - elif cli_opts['affiliations'] == 1: - (tagged_lines, extract_error, how_found_start) = \ - extract_affiliations_from_fulltext(docbody) + if cli_opts['authors'] == 1 or cli_opts['affiliations'] == 1: + (document_info, extract_error, how_found_start) = \ + extract_top_document_information_from_fulltext(docbody, first_author=cli_opts['first_author']) + if cli_opts['authors']: + extract_lines = document_info['authors'] + elif cli_opts['affiliations']: + extract_lines = document_info['affiliations'] else: (extract_lines, extract_error, how_found_start) = \ extract_references_from_fulltext(docbody) if not cli_opts['authors'] and not cli_opts['affiliations']: - if len(extract_lines) == 0 and extract_error == 0: extract_error = 6 write_message("-----extract_references_from_fulltext " \ From f12b25aa95c41e1a3e25e238c93aa62b24f5893e Mon Sep 17 00:00:00 2001 From: Christopher Hayward Date: Fri, 12 Aug 2011 23:36:01 +0200 Subject: [PATCH 13/15] refextract: improve realign numeration * Improve the realigning of numeration, across badly split affiliation lines (added config variable to specify an acceptable numeric gap between numerated affiliations, in the event of bad pdftotext conv.). * Remove -raw from pdftotext conversion, to do: use -layout instead. * Strip reference section from document when looking for auths/affs. 
* partially able to obtain standard authors through comma and numeration placement --- modules/bibedit/lib/refextract.py | 461 ++++++++++++++++++----- modules/bibedit/lib/refextract_config.py | 3 +- 2 files changed, 363 insertions(+), 101 deletions(-) diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index a3caa5acde..58f8a1df5d 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -34,7 +34,8 @@ CFG_REFEXTRACT_KB_JOURNAL_TITLES, \ CFG_REFEXTRACT_KB_REPORT_NUMBERS, \ CFG_REFEXTRACT_KB_AUTHORS, \ - CFG_INSTITUTIONS, \ + CFG_REFEXTRACT_INSTITUTIONS, \ + CFG_REFEXTRACT_AFFILIATION_NUMERATION_ALLOWABLE_GAP, \ CFG_REFEXTRACT_CTRL_FIELD_RECID, \ CFG_REFEXTRACT_TAG_ID_REFERENCE, \ CFG_REFEXTRACT_IND1_REFERENCE, \ @@ -71,6 +72,8 @@ CFG_REFEXTRACT_KB_JOURNAL_TITLES = "%s/etc/refextract-journal-titles.kb" % '..' CFG_REFEXTRACT_KB_REPORT_NUMBERS = "%s/etc/refextract-report-numbers.kb" % '..' CFG_REFEXTRACT_KB_AUTHORS = "%s/etc/refextract-authors.kb" % '..' + CFG_REFEXTRACT_INSTITUTIONS = [] + CFG_REFEXTRACT_AFFILIATION_NUMERATION_ALLOWABLE_GAP = 2 CFG_REFEXTRACT_CTRL_FIELD_RECID = "001" ## control-field recid CFG_REFEXTRACT_TAG_ID_REFERENCE = "999" ## ref field tag CFG_REFEXTRACT_IND1_REFERENCE = "C" ## ref field ind1 @@ -1253,7 +1256,7 @@ def get_author_affiliation_numeration_str(punct=None): re_chained_numbers = "(?:(?:[,;]\s*%s\.?\s*))*" % re_number ## Punctuation surrounding the number, either general or specific again if punct is None: - re_punct = "(?:[\{\(\]]?)" + re_punct = "(?:[\{\(\[]?)" else: re_punct = re.escape(punct) @@ -1482,7 +1485,7 @@ def make_auth_regex_str(etal, initial_surname_author=None, surname_initial_autho re_etal = u"""[Ee][Tt](?:(?:(?:,|\.)\s*)|(?:(?:,|\.)?\s+))[Aa][Ll][,\.]?[,\.]?""" ## The pattern used to identify authors inside references -re_auth = (re.compile(make_auth_regex_str(re_etal),re.VERBOSE|re.UNICODE)) +re_auth = (re.compile(make_auth_regex_str(re_etal), re.VERBOSE|re.UNICODE)) ## Used as a weak mechanism to classify possible authors above identified affiliations ## (start) Firstname SurnamePrefix Surname (end) @@ -5097,13 +5100,13 @@ def standardise_affiliation_formats(line): ## Get the University name line = (univ_name.group(2) or univ_name.group(3)) + " U." ## Check and set an institution - for inst in CFG_INSTITUTIONS: + for inst in CFG_REFEXTRACT_INSTITUTIONS: if line.find(inst) != -1: line = inst break return line -def extract_numerated_affiliations(num_position, num_section, num_find, num_punct): +def extract_numerated_affiliations(num_position, num_section, num_find, num_punct, missing): """ Collect numerated affiliations, using a section of the document, and the number which to search for. The punctuation surrounding any numeration (the first number found) (if any) is used to improve the strictness of the search. @@ -5117,7 +5120,7 @@ def extract_numerated_affiliations(num_position, num_section, num_find, num_punc """ affs = [] if num_section: - ## First line + ## First line, holding first affiliation with the number 1 line = num_section[0].strip() ## A number has been found before this iteration ## Use previous number, and previous punctuation! 
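Aside (illustrative only, not part of the patch): the hunk above threads a list of missing numbers into extract_numerated_affiliations(), which walks forward through the top section collecting affiliation lines whose leading numeration increments. A simplified, standalone sketch of that walk follows; the function name and regex are hypothetical reductions, and the real code also records numeration punctuation and per-affiliation author data.

    import re

    ## A leading number, optionally bracketed/punctuated, followed by text
    re_leading_num = re.compile(r"^\s*[\[\(\{]?(\d{1,3})[\]\)\}]?[\.,;]?\s+(\S.*)$")

    def collect_numerated_affiliations(lines, missing=()):
        affs = {}
        expected = 1
        for line in lines:
            ## Jump over numbers known to be absent from the document
            while expected in missing:
                expected += 1
            m = re_leading_num.match(line)
            if m and int(m.group(1)) == expected:
                affs[expected] = m.group(2).strip()
                expected += 1
        return affs

    ## e.g. collect_numerated_affiliations(
    ##          [u"1 CERN, Geneva", u"unrelated text", u"3 DESY, Hamburg"],
    ##          missing=(2,))
    ## -> {1: u"CERN, Geneva", 3: u"DESY, Hamburg"}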
@@ -5125,6 +5128,7 @@ def extract_numerated_affiliations(num_position, num_section, num_find, num_punc if num_find in aff_nums: ## Attempt to get numeration for this affiliation try: +# print "num with aff: %d" % num_find num_find = num_find + 1 except ValueError: sys.stderr.write("Error: Unable to obtain integer affiliation numeration.") @@ -5135,18 +5139,24 @@ def extract_numerated_affiliations(num_position, num_section, num_find, num_punc 'aff_nums' : aff_nums, 'author_data' : None}) + elif num_find in missing: + ## Get the next non missing number and use that + while num_find in missing: + num_find += 1 + ## Do until end of docbody section (num_section) affs.extend(extract_numerated_affiliations(num_position+1, \ - num_section[1:], \ - num_find, - num_punct)) + num_section[1:], \ + num_find, \ + num_punct, \ + missing)) return affs ## Numeration at the start of the line re_start_numeration = re.compile("^%s$" % get_author_affiliation_numeration_str(), \ re.VERBOSE|re.UNICODE) -def realign_numeration(docbody): +def realign_numeration(toplines): """ Create a duplicate document body, but with starting numeration replicated on the next line. This is to do with the reparation of numeration across multiple lines, from the pdftottext conversion. @@ -5158,10 +5168,12 @@ def realign_numeration(docbody): @return: (list) The list of lines of the entire input document, with any start-line numeration shifted accordingly. """ - docbody_alternate = docbody[:] + + toplines_alternate = toplines[:] ## Get the positions of all single '1's + ## These positions will denote the start of each realignment process starting_numeration = [] - for position, line in enumerate(docbody): + for position, line in enumerate(toplines): num_match = re_start_numeration.search(line) if num_match: try: @@ -5171,30 +5183,87 @@ def realign_numeration(docbody): starting_numeration.append(position) except ValueError: continue + ## Now, using the positions of the '1's, go forward and locate ## subsequent numeration, and replicate on the following line if need be - num = 1 + missing_nums = [] for start in starting_numeration: - for position, line in enumerate(docbody[start:]): + alignment_error = 0 + num = 1 + for position, line in enumerate(toplines[start:]): +# print "start pos: %d, now on num: %d" % (start, num) num_match = re_start_numeration.search(line) + if num_match: + ## Sanity check, make sure the match is an integer try: i_num = int(num_match.group(2)) - if i_num == num: - ## If this number found is - docbody_alternate[start+position] = "\n" - docbody_alternate[start+position+1] = num_match.group(0).strip() + docbody_alternate[start+position+1] - num = num + 1 - except IndexError: - break except ValueError: continue - return docbody_alternate +# print "got num %d, position %d, on line %s" % (i_num, start+position, line.strip()) + + ## Hit a number which is not expected, and is not just 2 ahead + if (i_num != num) and ((i_num < num) or ((i_num - num) > \ + CFG_REFEXTRACT_AFFILIATION_NUMERATION_ALLOWABLE_GAP)): + #alignment_error = 1 + ## Skipping can occur, but only twice in a row before exiting + continue + else: + ## When there exists an acceptable missed number, for whatever reason + if (i_num > num) and ((i_num - num) <= CFG_REFEXTRACT_AFFILIATION_NUMERATION_ALLOWABLE_GAP): + ## Append all the missing numbers between the gap + missing_num = num + while missing_num != i_num: + missing_nums.append(missing_num) + missing_num += 1 + num += (i_num - num) + + try: + ## Otherwise, if this number found is equal to the 
incremented number
+ toplines_alternate[start+position] = "\n"
+ except IndexError:
+ alignment_error = 3
+ else:
+ lookahead = start+position
+
+ ## Now place the number on the next NON-EMPTY line
+ while not alignment_error:
+ lookahead += 1
+ try:
+ line_ahead = toplines_alternate[lookahead].strip()
+ int_val_line = int(line_ahead)
+ except ValueError:
+ ## ValueError is good
+ if line_ahead:
+ toplines_alternate[lookahead] = \
+ num_match.group(0).strip() + line_ahead
+# print "new line: %s" % toplines_alternate[lookahead]
+ ## Increment the next number to look for
+ num += 1
+ break
+ except IndexError:
+ alignment_error = 4
+ else:
+ ## A line following a found lone number is also a number
+ ## Too dangerous to continue.
+ alignment_error = 5
+
+# elif line.strip():
+# print "Stopping, num: %d, line: %s" % (num, line.strip())
+# break
+
+ if alignment_error:
+ print "numeration alignment error: %d" % alignment_error
+ ## Scrap the alternate version
+ toplines_alternate = toplines
+ break
+
+ return (toplines_alternate, missing_nums)

 def find_affiliations(lines, start, end=None, use_to_find_authors=False):
 """ Given a possible author section, attempt to retrieve any affliations.
- @param docbody: (list) The entire document body as a list of lines.
+ @param lines: (list) The entire document body as a list of lines.
 @param start: (int) The start position, from where to start finding
 affiliations.
 @param end: (int) The boundary position: Stop searching here.
@@ -5224,9 +5293,15 @@ def get_smaller(x, y):
 else:
 top_lines_orig = lines[start:]

+# print "top_lines"
+# print top_lines_orig
+
 ## Get an alternative version of the top section, of the same length
 ## but with some alone numeration replicated on the next line!
- top_lines_alt = realign_numeration(top_lines_orig)
+ (top_lines_alt, missing_nums) = realign_numeration(top_lines_orig)
+
+# print "top_lines_alt:"
+# print top_lines_alt

 for position in range(len(top_lines_orig)):
 ## Standardise some affiliations
@@ -5236,6 +5311,9 @@ def get_smaller(x, y):
 ## If a previous numeration value was found in the previous iteration
 ## check for the increment of this value on this line
 if re_aff_num.search(line) or re_aff_name.search(line):
+
+ print "AFF match on line: %s" % line
+
 ## Check numeration in replica docbody
 (aff_nums, num_match) = obtain_author_affiliation_numeration_list(line)
 ## Check numeration in the numeration-realigned docbody
@@ -5243,13 +5321,15 @@ def get_smaller(x, y):
 ## Set the information to the correct top_section, depending on
 ## if the numeration was found split across lines or not.
 if aff_nums or not aff_nums_alt:
+ print "using orig lines"
 top = top_lines_orig
 elif aff_nums_alt:
+ print "using alt lines"
 top = top_lines_alt
 aff_nums = aff_nums_alt
 num_match = num_match_alt

- ## Aff number '1' numeration found
+ ## Aff number '1' numeration found, save position and punctuation
 if aff_nums and num_match and 1 in aff_nums:
 starting_num_position = position
 numerated_aff_num_punct_ptn = num_match.group(1)
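Aside (illustrative only, not part of the patch): realign_numeration() above repairs a pdftotext artefact in which an affiliation's numeration lands alone on its own line. The core move, blanking the lone number and replicating it at the head of the next non-empty line, can be sketched standalone as follows; names are hypothetical and only the standard library is assumed.

    import re

    re_lone_num = re.compile(r"^\s*(\d{1,3})\s*$")

    def realign_lone_numbers(lines):
        out = lines[:]
        for i in range(len(out)):
            m = re_lone_num.match(out[i])
            if not m:
                continue
            ## Move the stranded number onto the next non-empty line
            for j in range(i + 1, len(out)):
                if out[j].strip():
                    out[i] = ""
                    out[j] = m.group(1) + " " + out[j]
                    break
        return out

    ## e.g. realign_lone_numbers([u"1", u"CERN, Geneva", u"", u"2", u"DESY"])
    ## -> [u"", u"1 CERN, Geneva", u"", u"", u"2 DESY"]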
@@ -5294,10 +5374,12 @@ def get_smaller(x, y):
 ## Collect up all of the following numerated affiliations,
 ## or go backwards and obtain them
 if starting_num_position:
+ print "Going to extract numerated affiliations..."
 affiliations = extract_numerated_affiliations(starting_num_position, \
- top[starting_num_position:], \
- 1, \
- numerated_aff_num_punct_ptn)
+ top[starting_num_position:], \
+ 1, \
+ numerated_aff_num_punct_ptn, \
+ missing_nums)

 loose_authors = []

@@ -5314,26 +5396,34 @@ def get_smaller(x, y):

 return (affiliations, loose_authors)

-def collect_standard_authors(top_lines, position):
+def collect_standard_authors(top_lines, position=0, first=None):
 """Obtain standard authors [recursive]
 @param top_lines: (list) top lines of document
 @param position: (int) position in top lines
 @return: list holding the list of collected authors, and the position
 of the last author line
 """
- authors_on_line = []
+ author_matches = []
 if position < len(top_lines):
 line = top_lines[position]
 ## Get all standard author matches for this line
- total_author_matches = re_auth_with_number.search(line)
- if total_author_matches:
- ## Save the matching strings in a list
- authors_on_line = total_author_matches.group('author_names')
- (position, more_authors) = collect_standard_authors(top_lines, position+1)
+ author_matches = [m for m in re_single_author_pattern_with_numeration.finditer(line)]
+ (shifted_line, _) = realign_shifted_line_numeration_around_commas(line)
+ author_matches_alt = \
+ [m for m in re_single_author_pattern_with_numeration.finditer(shifted_line)]
+ if author_matches or author_matches_alt:
+ if first is None:
+ first = position
+
+ ## Recurse on the next position
- authors_on_line.extend(more_authors)
+ (more_author_matches, first, position) = collect_standard_authors(top_lines, position+1, first)
+
+ ## Keep whichever arrangement of the line matched more authors
+ if len(author_matches) < len(author_matches_alt):
+ author_matches = author_matches_alt
+ author_matches.extend(more_author_matches)
 ## Authors for this line
- return (position, authors_on_line)
+ return (author_matches, first, position-1)

 def collect_tagged_authors(top_section, position, first_line=None, \
 orig_blank_lines=None, cur_blank_lines=None):
@@ -5409,8 +5499,21 @@ def leading_comma(line):
 return (position, line_parts)

+re_misaligned_numeration_around_comma = re.compile("(.*?)(?P<delim>[,;])\s*(\d{1,3})")
+
+def realign_shifted_line_numeration_around_commas(line):
+ ## First see how many swap substitutions will take place, before-hand.
+ swaps = [x for x in re_misaligned_numeration_around_comma.finditer(line)]
+ delimiter = None
+ if len(swaps) >= 1:
+ ## Get the first match's delimiter, which can be reused to split a line later.
+ delimiter = swaps[0].group("delim")
+ ## Do the swapping.
+ line = re_misaligned_numeration_around_comma.sub(r"\g<1>\g<3>,", line).strip(",")
+ return (line, delimiter)
+
 ## Used in the event that no keyword is found (max length of top section)
-assumed_top_section_length = 100
+assumed_top_section_length = 300

 ## Used to force the validity of found keywords
 ## (Valid if they appear after this position)
@@ -5495,14 +5598,19 @@ def check_for_end_of_author_section_match_keywords(docbody):

 ## Default return values
 status = how_found_start = 0
+ print "Top Section Ending:"
 if pre_ending_keyword:
+ print "---keyword!"
 top_section = docbody[:pre_ending_keyword]
 elif len(docbody) < assumed_top_section_length:
 ## Half total length
 top_section = docbody
+ print "--whole!"
 else:
 ## First 100 lines?
 top_section = docbody[:assumed_top_section_length]
+ print "--first %d" % assumed_top_section_length

 tagged_author_information = []
 just_tagged_authors = []
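Aside (illustrative only, not part of the patch): realign_shifted_line_numeration_around_commas(), defined above, moves numeration back to the author it belongs to, so that 'Smith, 1 Jones, 2' is seen as 'Smith1, Jones2'. A standalone sketch mirroring that regex, with hypothetical names and only the standard library assumed:

    import re

    re_misaligned_num = re.compile(r"(.*?)(?P<delim>[,;])\s*(\d{1,3})")

    def swap_numeration(line):
        ## Returns the swapped line plus the delimiter that separated the
        ## authors, which can later be reused to split the line.
        swaps = re_misaligned_num.findall(line)
        delimiter = None
        if swaps:
            delimiter = swaps[0][1]
        swapped = re_misaligned_num.sub(r"\g<1>\g<3>,", line).strip(",")
        return (swapped, delimiter)

    ## e.g. swap_numeration(u"J. Smith, 1 A. Jones, 2")
    ## -> (u"J. Smith1, A. Jones2", u",")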
@@ -5553,14 +5661,15 @@
 ## Look for standard (initials surname, or surname initials) authors.
 ## This is done before affiliation-assisted author-search is initiated
 ## since the positions can be used to improve affiliation detection.
+
 first_standard_author_position = None
 standard_authors = []
+ standard_author_matches = []
 for position in range(len(top_section)):
 ## An author tag was found, delay the search until the tag position is reached
 if first_author_tag_position and (position < first_author_tag_position):
 continue
- line = top_section[position]
 ## 'Initial Surname' / 'Surname Initials' authors or
 ## Knowledge-base specified authors
# if re_auth_with_number.search(line) or re_extra_auth.search(line):
 ## Keep a list of standard authors, and their positions
# (last_standard_author_position, standard_authors) = collect_standard_authors(top_section, \
# position)
 ## Use the first matched author from WHERE TO START
# first_standard_author_position = position

- standard_author_pattern_match = re_auth_with_number.search(line)
- extra_author_pattern_match = re_extra_auth.search(line)
- if standard_author_pattern_match or extra_author_pattern_match:
+# (first_standard_author_position, std_authors, \
+# last_standard_author_position) = collect_standard_authors(top_section)
+
+# line = top_section[position]
+# line_alt = realign_shifted_line_numeration_around_commas(line)
+
+# print line_alt
+# print position
+
+ ## Match on non-rearranged lines
+# standard_author_pattern_match = re_auth_with_number.search(line)
+# extra_author_pattern_match = re_extra_auth.search(line)
+
+ line = top_section[position]
+ (shifted_line, numeration_delimiter) = realign_shifted_line_numeration_around_commas(line)
+
+ author_matches = \
+ [x for x in re_single_author_pattern_with_numeration.finditer(line)]
+ author_matches_alt = \
+ [y for y in re_single_author_pattern_with_numeration.finditer(shifted_line)]
+
+ if author_matches or author_matches_alt:
 if not first_standard_author_position:
 first_standard_author_position = position
 last_standard_author_position = position
- if standard_author_pattern_match:
+
+ print "from line: %s" % line
+
+ use_line = "orig"
+
+ if author_matches and author_matches_alt:
+ ## Prefer the line arrangement that matched more authors
+ if len(author_matches) <= len(author_matches_alt):
+ use_line = "alt"
+ elif author_matches_alt:
+ use_line = "alt"
+
+ if use_line == "alt":
+ which_line = re.sub("[Aa][Nn][Dd]|&", ",", shifted_line.strip())
+ standard_author_matches = author_matches_alt
+ print "USING ALT"
+ else:
+ which_line = re.sub("[Aa][Nn][Dd]|&", ",", line.strip())
+ standard_author_matches = author_matches
+ print "USING NORM"
+
+ ## Remove matches from string
+# for s in standard_author_matches:
+# m_start = s.start()
+# m_end = s.end()
+# esc_line = esc_line[:m_start]+((m_end - m_start) * "_")+esc_line[m_end:]
+
+# print "esc_line: %s" % esc_line
+
+ new_standard_authors = [x.group(0) for x in standard_author_matches]
+
+ ## Split the line based on a common delimiter,
+ ## if at least two authors were matched on this line
+ if len(standard_author_matches) >= 2:
+ first_matched_auth = standard_author_matches[0].group(0)
+ second_matched_auth = standard_author_matches[1].group(0)
+
+ try:
+ delimiter = None
+ ## Take either the delimiter from two author matches
+ ## or from the result of swapping numeration earlier.
+ ## Use this information to split a line, thus maximising the author count.
+ if first_matched_auth.strip()[-1] == second_matched_auth.strip()[-1]:
+ delimiter = first_matched_auth.strip()[-1]
+ elif numeration_delimiter:
+ delimiter = numeration_delimiter
+ if delimiter:
+ split_authors = [n for n in which_line.split(delimiter) if n.strip(", ")]
+ if len(split_authors) >= len(new_standard_authors):
+ new_standard_authors = split_authors
+ sys.stdout.write("adding missed authors: %s\n" % new_standard_authors)
+ else:
+ print "no delimiter found" ## FIXME: delimiter detection fails for the first author line
+ except IndexError:
+ pass
+# else:
+# skipped_auths = [y.group(0) for y in \
+# re.finditer("[^\d,;&_]\s*([,;&]|[Aa][Nn][Dd]|$)\s*\d{0,3}", esc_line)]
+
+ ## Standard author strings
+ sys.stdout.write("appending std authors: %s\n" % new_standard_authors)
+ standard_authors.append(new_standard_authors)
+
+
+ ## Match on rearranged lines
+# standard_author_pattern_match_alt = re_auth_with_number.search(line_alt)
+# extra_author_pattern_match_alt = re_extra_auth.search(line_alt)
+
+# if standard_author_pattern_match or extra_author_pattern_match \
+# or standard_author_pattern_match_alt or extra_author_pattern_match_alt:
+
+# if not first_standard_author_position:
+# first_standard_author_position = position
+# last_standard_author_position = position
+
+# if standard_author_pattern_match or standard_author_pattern_match_alt:
+# if standard_author_pattern_match and standard_author_pattern_match_alt:
+# if len(standard_author_pattern_match) > len(standard_author_pattern_match_alt):
+# std_match = standard_author_pattern_match
+# else:
+# std_match = standard_author_pattern_match_alt
+# else:
+# std_match = standard_author_pattern_match or standard_author_pattern_match_alt

 ## Append the matched author string (standard pattern)
- standard_authors.append(standard_author_pattern_match.group('author_names'))
- if extra_author_pattern_match:
+# print "from line: %s" % line
+# print "appending std authors: %s" % std_match.group('author_names')
+# standard_authors.append(std_match.group('author_names'))
+
+# if extra_author_pattern_match or extra_author_pattern_match_alt:
+# if extra_author_pattern_match and extra_author_pattern_match_alt:
+# if len(extra_author_pattern_match) > len(extra_author_pattern_match_alt):
+# ext_match = extra_author_pattern_match
+# else:
+# ext_match = extra_author_pattern_match_alt
+# else:
+# ext_match = extra_author_pattern_match or extra_author_pattern_match_alt
 ## Append the matched author string (extra pattern)
- standard_authors.append(extra_author_pattern_match.group('extra_auth'))
+# standard_authors.append(ext_match.group('extra_auth'))

- if cli_opts['verbosity'] > 7:
- print "--- author pattern match on line: %s, position: %d" % (line.strip(), position)
+# if cli_opts['verbosity'] > 7:
+# print "--- author pattern match on line: %s, position: %d" % (line.strip(), position)

 ## By this point, we've managed to try and get tagged authors,
 ## as well as anything in the top section that looks like an author
 ## according to the main author regex.

+# print "STANDARD AUTHORS: %d" % len(standard_authors)
+# print standard_authors
+
 ## Attempt to obtain authors using affiliation positions.
 ## A tagged author position is considered the best.
 ## Otherwise start from the top of the section.
@@ -5611,14 +5835,16 @@
 ## Attempt to pair together standard authors with identified affiliations.
## If the number of affiliation is equal to the number of author lines if len(affiliations) == len(standard_authors): - ## Increase stength for this (when len(aff)=len(auth))? + ## Increase strength for this (when len(aff)=len(auth))? for x in range(len(standard_authors)): - rebuilt_standard_authors = rebuild_author_lines([standard_authors[x]], \ - re_single_author_pattern_with_numeration) + +# rebuilt_standard_authors = rebuild_author_lines([standard_authors[x]], \ +# re_single_author_pattern_with_numeration) + ## Associate authors with affiliations - affiliation_associated_standard_authors.append({'authors' : rebuilt_standard_authors, + affiliation_associated_standard_authors.append({'authors' : standard_authors[x], 'affiliation' : affiliations[x]['line']}) - just_standard_authors.extend(rebuilt_standard_authors) + just_standard_authors.extend(standard_authors[x]) ## Now assemble affiliated authors, with their affiliations for aff in affiliations: ## Append any affiliation supported authors. @@ -5638,13 +5864,13 @@ def check_for_end_of_author_section_match_keywords(docbody): ## In the event that standard authors were not paired with affiliations ## then just make a list of dictionaries of authors without affiliations if standard_authors and not affiliation_associated_standard_authors: - rebuilt_standard_authors = \ - [rebuild_author_lines([std_auth_line], re_single_author_pattern_with_numeration) \ - for std_auth_line in standard_authors] - for r in rebuilt_standard_authors: - affiliation_associated_standard_authors.append({'authors' : r, +# rebuilt_standard_authors = \ +# [rebuild_author_lines([std_auth_line], re_single_author_pattern_with_numeration) \ +# for std_auth_line in standard_authors] + for s in standard_authors: + affiliation_associated_standard_authors.append({'authors' : s, 'affiliation' : None,}) - just_standard_authors.extend(r) + just_standard_authors.extend(s) if cli_opts['verbosity'] > 7: sys.stdout.write("--- Author extraction results:\n") @@ -5739,6 +5965,11 @@ def choose_author_method(tagged_info, std_info, aff_info, \ std_authors = map(lambda y: y.strip(" ,"), std_authors) aff_authors = map(lambda z: z.strip(" ,"), aff_authors) + sys.stdout.write("tagged authors: %d\n\n" % len(tagged_authors)) + sys.stdout.write("std authors: %d\n\n" % len(std_authors)) + sys.stdout.write("aff authors: %d\n\n" % len(aff_authors)) + + ## False if there is a 'weak' affiliation-supported author match ## AND none of them are found in the list of standard authors weak_affiliated_authors = False @@ -6714,10 +6945,38 @@ def get_reference_lines(docbody, ref_line_marker_ptn) return ref_lines +def get_reference_section_beginning(fulltext): + + sect_start = {'start_line' : None, + 'end_line' : None, + 'title_string' : None, + 'marker_pattern' : None, + 'marker' : None, + } + + ## Find start of refs section: + sect_start = find_reference_section(fulltext) + if sect_start is not None: + how_found_start = 1 + else: + ## No references found - try with no title option + sect_start = find_reference_section_no_title_via_brackets(fulltext) + if sect_start is not None: how_found_start = 2 + ## Try weaker set of patterns if needed + if sect_start is None: + ## No references found - try with no title option (with weaker patterns..) + sect_start = find_reference_section_no_title_via_dots(fulltext) + if sect_start is not None: how_found_start = 3 + if sect_start is None: + ## No references found - try with no title option (with even weaker patterns..) 
+ sect_start = find_reference_section_no_title_via_numbers(fulltext) + if sect_start is not None: how_found_start = 4 + + return sect_start ## ----> Glue - logic for finding and extracting reference section: -def extract_references_from_fulltext(fulltext): +def extract_references_from_fulltext(fulltext, sect_start): """Locate and extract the reference section from a fulltext document. Return the extracted section as a list of strings, whereby each string in the list is considered to be a single line. @@ -6732,38 +6991,19 @@ def extract_references_from_fulltext(fulltext): status = 0 lines = [] - sect_start = {'start_line' : None, - 'end_line' : None, - 'title_string' : None, - 'marker_pattern' : None, - 'marker' : None, - } sect_end = None #How ref section found flag how_found_start = 0 - ## Find start of refs section: - sect_start = find_reference_section(fulltext) - if sect_start is not None: how_found_start = 1 + #sect_start = get_reference_section_beginning(fulltext) + if sect_start is None: - ## Only if an ending keyword was found, look for affilations - if section == 'authors': - affiliations = find_author_affiliations(fulltext,use_to_find_authors=True) - - ## Found affiliations... there could be some new authors found too! - if affiliations: - ## Append the affiliation supported authors, since the first method failed - for aff_auth_pair in affiliations: - lines.extend([auth for auth in aff_auth_pair[1]])#Authors - return (map(replace_undesirable_characters,lines),status,how_found_start) - #lines.append(aff_auth_pair[0])#Affiliation - else: - ## No References - lines = [] - status = 4 - write_message("-----extract_section_from_fulltext: " \ - "No section found\n", verbose=2) + ## No lines + status = 4 + if cli_opts['verbosity'] >= 1: + sys.stdout.write("-----extract_section_from_fulltext: " \ + "No section found!\n") else: sect_end = None if sect_start.has_key("end_line"): @@ -6821,7 +7061,7 @@ def _pdftotext_conversion_is_bad(txtlines): else: return 0 -def convert_PDF_to_plaintext(fpath): +def convert_PDF_to_plaintext(fpath, for_top_section): """Take the path to a PDF file and run pdftotext for this file, capturing the output. @param fpath: (string) path to the PDF file @@ -6836,10 +7076,18 @@ def convert_PDF_to_plaintext(fpath): ## and footers, and for some other pattern matching. p_break_in_line = re.compile(unicode(r'^\s*?(\f)(?!$)(.*?)$'), re.UNICODE) ## build pdftotext command: - cmd_pdftotext = """%(pdftotext)s -raw -q -enc UTF-8 '%(filepath)s' -""" \ - % { 'pdftotext' : CFG_PATH_PDFTOTEXT, - 'filepath' : fpath.replace("'", "\\'") - } + + if for_top_section: + text_representation = "" #FIXME -layout + else: + text_representation = "-raw" + + cmd_pdftotext = """%(pdftotext)s %(rep)s -q -enc UTF-8 '%(filepath)s' -""" \ + % { 'pdftotext' : CFG_PATH_PDFTOTEXT, + 'rep' : text_representation, + 'filepath' : fpath.replace("'", "\\'") + } + write_message("%s\n" % cmd_pdftotext, verbose=2) ## open pipe to pdftotext: pipe_pdftotext = os.popen("%s" % cmd_pdftotext, 'r') @@ -6849,7 +7097,7 @@ def convert_PDF_to_plaintext(fpath): unicodeline = docline.decode("utf-8") ## Check for a page-break in this line: m_break_in_line = p_break_in_line.match(unicodeline) - if m_break_in_line is None: + if (m_break_in_line is None) or for_top_section: ## There was no page-break in this line. 
Just add the line: doclines.append(unicodeline) count += 1 @@ -6866,12 +7114,13 @@ def convert_PDF_to_plaintext(fpath): "%s lines of text\n" % str(count), verbose=2) ## finally, check conversion result not bad: - if _pdftotext_conversion_is_bad(doclines): + if _pdftotext_conversion_is_bad(doclines): #FIXME must not run this with -layout status = 2 doclines = [] + return (doclines, status) -def get_plaintext_document_body(fpath): +def get_plaintext_document_body(fpath, for_top_section): """Given a file-path to a full-text, return a list of unicode strings whereby each string is a line of the fulltext. In the case of a plain-text document, this simply means reading the @@ -6900,7 +7149,7 @@ def get_plaintext_document_body(fpath): elif (res_gfile.lower().find("pdf") != -1) or \ (res_gfile.lower().find("pdfa") != -1): ## convert from PDF - (textbody, status) = convert_PDF_to_plaintext(fpath) + (textbody, status) = convert_PDF_to_plaintext(fpath, for_top_section) else: ## filepath not OK status = 1 @@ -7283,8 +7532,11 @@ def begin_extraction(daemon_cli_options=None): write_message("--- processing RecID: %s pdffile: %s; %s\n" \ % (str(curitem[0]), curitem[1], ctime()), verbose=2) + extract_top_section_metadata = cli_opts['authors'] or cli_opts['affiliations'] + ## 1. Get this document body as plaintext: - (docbody, extract_error) = get_plaintext_document_body(curitem[1]) + (docbody, extract_error) = get_plaintext_document_body(curitem[1], \ + extract_top_section_metadata) if extract_error == 1: ## Non-existent or unreadable pdf/text directory. write_message("***%s\n\n" % curitem[1], sys.stderr, verbose=0) @@ -7329,19 +7581,28 @@ def begin_extraction(daemon_cli_options=None): ## treat entire input as relevant section: extract_lines = docbody else: - ## launch search for the relevant section in the document body: - if cli_opts['authors'] == 1 or cli_opts['affiliations'] == 1: + ## Always find the position of the start of the reference section. + ## This can be worked on, or stripped, later. + refs_start = get_reference_section_beginning(docbody) + + ## launch search for the relevant section in the document body + if extract_top_section_metadata: + ## Strip references. This will prevent analysing this section for authors. 
+ if refs_start:
+ docbody = docbody[:refs_start['start_line']]
+
 (document_info, extract_error, how_found_start) = \
 extract_top_document_information_from_fulltext(docbody, first_author=cli_opts['first_author'])
+
 if cli_opts['authors']:
 extract_lines = document_info['authors']
 elif cli_opts['affiliations']:
 extract_lines = document_info['affiliations']
 else:
 (extract_lines, extract_error, how_found_start) = \
- extract_references_from_fulltext(docbody)
+ extract_references_from_fulltext(docbody, refs_start)

- if not cli_opts['authors'] and not cli_opts['affiliations']:
+ if not extract_top_section_metadata:
 if len(extract_lines) == 0 and extract_error == 0:
 extract_error = 6
 write_message("-----extract_references_from_fulltext " \
diff --git a/modules/bibedit/lib/refextract_config.py b/modules/bibedit/lib/refextract_config.py
index ba0ff34338..2054b05334 100644
--- a/modules/bibedit/lib/refextract_config.py
+++ b/modules/bibedit/lib/refextract_config.py
@@ -36,8 +36,9 @@
 CFG_REFEXTRACT_KB_AUTHORS = "%s/bibedit/refextract-authors.kb" % CFG_ETCDIR

 ## Institutions, paired with author and affiliation extraction
-CFG_INSTITUTIONS = ['CERN','DESY','Rutherford','Fermilab','SLAC','TRIUMF','Brookhaven Livermore','Argonne']
+CFG_REFEXTRACT_INSTITUTIONS = ['CERN','DESY','Rutherford','Fermilab','SLAC','TRIUMF','Brookhaven Livermore','Argonne']

+CFG_REFEXTRACT_AFFILIATION_NUMERATION_ALLOWABLE_GAP = 2

 ## MARC Fields and subfields used by refextract:

From 49989b5342368a7d5c2bffa36f78827b140d5680 Mon Sep 17 00:00:00 2001
From: Christopher Hayward
Date: Mon, 15 Aug 2011 09:56:51 +0200
Subject: [PATCH 14/15] refextract: improve affiliated author search

* Include delimiters when arranging affiliated
 authors.
* Preserve the realigned numeration when searching
 for authors.
* Reuse the 'around-comma' numeration swapping when
 looking for affiliated authors.
* Add another config variable for the replacement
 of affiliation terms. Rename the original
 other affiliation config variable to include the
 word 'reduction'.
* Improve the numeration obtaining regular expressions;
 only match numeration on lines which hold other content too.
* Collect numerated affiliation data together when searching.
* Show the list of affiliated authors per affiliation
 when searching for affiliated authors. Control with verbosity
 cli option.
* Change the flag associated with the extraction of affiliations
 from -f to -l, avoiding the issue of the forthcoming fulltext
 api change to Refextract (-f, --fulltext for providing fulltext
 input)
* Fix the mechanism of adding to the list of affiliated author
 info, by only appending a new affiliated author item if authors
 actually exist for that item. This prevents an invalid selection
 of a set of affiliated authors (over a set of standard authors),
 in the event that no actual authors exist, just affiliation/strength
 data.
* Add cli verbosity-controlled messages, depicting the current
 status of the author extraction process.
* Repair the cli arguments used inside get_cli_opts.
* Change the returning document information from
 extract_top_document_information_from_fulltext. Now returns
 a list of dictionaries containing author data with possible
 affiliations, and a list of affiliation data.
* This excludes a list of 'marked-up' author data, which is now
 assembled outside of this function call.
* Relocate the act of locating a document's reference section
 into the functions concerned with either extracting references
 or authors/affiliations.
* Rename variables relating to lines holding either
 reference or top-section data, away from reference
 specific names.
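Aside (illustrative only, not part of the patch): the delimiter handling named in the first bullet shows up in arrange_possible_authors() in the diff below; its essence reduces to normalising 'and'/'&' to the detected delimiter and splitting on it. A standalone sketch, with a hypothetical name and only the standard library assumed:

    import re

    def split_author_line(line, delimiter=","):
        ## Normalise 'and'/'&' to the delimiter, split, and drop empty fragments
        normalised = re.sub(r"(^\s*|\s)([Aa][Nn][Dd]|&)\s", delimiter, line)
        return [a.strip() for a in normalised.split(delimiter) if a.strip()]

    ## e.g. split_author_line(u"J. Smith, A. Jones and B. Brown")
    ## -> [u"J. Smith", u"A. Jones", u"B. Brown"]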
---
 modules/bibedit/lib/refextract.py | 693 ++++++++++++-----------
 modules/bibedit/lib/refextract_config.py | 12 +-
 2 files changed, 361 insertions(+), 344 deletions(-)

diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py
index 58f8a1df5d..8e44462ea1 100644
--- a/modules/bibedit/lib/refextract.py
+++ b/modules/bibedit/lib/refextract.py
@@ -34,7 +34,8 @@
 CFG_REFEXTRACT_KB_JOURNAL_TITLES, \
 CFG_REFEXTRACT_KB_REPORT_NUMBERS, \
 CFG_REFEXTRACT_KB_AUTHORS, \
- CFG_REFEXTRACT_INSTITUTIONS, \
+ CFG_REFEXTRACT_INSTITUTION_REPLACEMENTS, \
+ CFG_REFEXTRACT_INSTITUTION_REDUCTIONS, \
 CFG_REFEXTRACT_AFFILIATION_NUMERATION_ALLOWABLE_GAP, \
 CFG_REFEXTRACT_CTRL_FIELD_RECID, \
 CFG_REFEXTRACT_TAG_ID_REFERENCE, \
@@ -72,7 +73,8 @@
 CFG_REFEXTRACT_KB_JOURNAL_TITLES = "%s/etc/refextract-journal-titles.kb" % '..'
 CFG_REFEXTRACT_KB_REPORT_NUMBERS = "%s/etc/refextract-report-numbers.kb" % '..'
 CFG_REFEXTRACT_KB_AUTHORS = "%s/etc/refextract-authors.kb" % '..'
- CFG_REFEXTRACT_INSTITUTIONS = []
+ CFG_REFEXTRACT_INSTITUTION_REDUCTIONS = []
+ CFG_REFEXTRACT_INSTITUTION_REPLACEMENTS = {}
 CFG_REFEXTRACT_AFFILIATION_NUMERATION_ALLOWABLE_GAP = 2
 CFG_REFEXTRACT_CTRL_FIELD_RECID = "001" ## control-field recid
 CFG_REFEXTRACT_TAG_ID_REFERENCE = "999" ## ref field tag
@@ -181,8 +183,9 @@ def encode_for_xml(s):
 Output raw references, as extracted from the document.
 No MARC XML mark-up - just each extracted line, prefixed by the
 recid of the document that it came from.
- -a, --authors extract authors, not references. most other options
- work as expected
+ -a, --authors extract the authors of the document. Attempt to
+ return associations between authors and affiliations
+ whenever possible.
 --first_author use the following regexp as the first author, helps
 for author extraction, ignored otherwise
 -l, --affiliations
@@ -304,7 +307,7 @@ def get_subfield_content(line,subfield_code):
 content = content.split('</subfield>')[0]
 return content

-def compress_subfields(out,subfield_code):
+def compress_subfields(out, subfield_code):
 """ For each datafield, compress multiple subfields of type 'subfield_code'
 into a single one e.g.
for MISC text, change xml format from:
@@ -404,13 +407,12 @@ def restrict_m_subfields(reference_lines):
 new_reference_lines.append(reference_lines[i])
 return m_restricted,new_reference_lines

-def filter_processed_references(out):
-
+def filter_processed_lines(out):
 """ apply filters to reference lines found - to remove junk"""
- reference_lines = out.split('\n')
+ processed_lines = out.split('\n')
 ## Remove too long and too short m tags
- (m_restricted,ref_lines) = restrict_m_subfields(reference_lines)
+ (m_restricted, ref_lines) = restrict_m_subfields(processed_lines)

 if m_restricted:
 a_tag=re.compile('\<subfield code=\"a\"\>(.*?)\<\/subfield\>')
@@ -440,9 +442,9 @@ def filter_processed_references(out):
 if rec:
 new_out += rec + '\n'
 len_filtered += 1
- if len(reference_lines) != len_filtered:
- write_message("-----Filter results: unfilter references line length is %d and filtered length is %d\n" \
- % (len(reference_lines),len_filtered), verbose=2)
+ if len(processed_lines) != len_filtered:
+ write_message("-----Filter results: unfiltered section line length is %d and filtered length is %d\n" \
+ % (len(processed_lines),len_filtered), verbose=2)
 return new_out

 def get_url_repair_patterns():
@@ -1251,6 +1253,9 @@ def get_author_affiliation_numeration_str(punct=None):
 @return: (string), which can be compiled into a regex; identifies
 numeration next to an author name.
 """
+
+ ## FIXME: cater for start or end numeration (i.e. two punctuation marks)
+
 ## Number to look for, either general or specific
 re_number = '(?:\d\d?)'
 re_chained_numbers = "(?:(?:[,;]\s*%s\.?\s*))*" % re_number
@@ -1266,7 +1271,7 @@
 (%(num)s\s* ## Core numeration item, either specific or generic
 %(num_chain)s ## Extra numeration, either generic or empty
 )
- (?:(%(punct)s)|[^\d]) ## Right numeration punctuation
+ (?:(%(punct)s)) ## Right numeration punctuation
 )""" % {'num' : re_number,
 'num_chain' : re_chained_numbers,
 'punct' : re_punct}
@@ -3005,7 +3010,7 @@ def create_marc_xml_reference_line(line_marker,
 count_url, \
 count_doi, \
 count_auth_group) = \
- convert_processed_reference_line_to_marc_xml(line_marker, \
+ convert_processed_line_to_marc_xml(line_marker, \
 tagged_line.replace('\n',''), \
 identified_dois, \
 identified_urls)
@@ -3530,7 +3535,7 @@ def build_formatted_xml_citation(citation_elements,line_marker):

-def convert_processed_reference_line_to_marc_xml(line_marker,
+def convert_processed_line_to_marc_xml(line_marker,
 line,
 identified_dois,
 identified_urls):
@@ -4235,12 +4240,6 @@ def create_marc_xml_reference_section(ref_sect,
 ## Find DOI sections in citation
 (working_line1, identified_dois) = identify_and_tag_DOI(working_line1)
-
-
- ## Find DOI sections in citation
- (working_line1, identified_dois) = identify_and_tag_DOI(working_line1)
-
-
 ## Identify and replace URLs in the line:
 (working_line1, identified_urls) = identify_and_tag_URLs(working_line1)
@@ -4868,16 +4867,19 @@ def get_post_author_section_keyword_patterns():

 re_splitting_comma = re.compile(",[^\d]", re.UNICODE)

-def arrange_possible_authors(line):
- ## Replace and's with commas
- comma_split_line = re.sub(r"(^\s*|\s)([Aa][Nn][Dd]|&)\s", ", ", line)
+def arrange_possible_authors(line, delimiter=None):
+ if not delimiter:
+ delimiter = ","
+ ## Replace and's with delimiter (comma as standard)
+ delimited_line = re.sub(r"(^\s*|\s)([Aa][Nn][Dd]|&)\s", delimiter, line)
 ## Split by commas
-# possible_authors = 
re_splitting_comma.split(comma_split_line.strip()) + possible_authors = delimited_line.split(delimiter) ## Remove empty stuff possible_authors = filter(lambda x: x.strip(), possible_authors) return possible_authors -def gather_numerated_authors_affiliations(lines, aff_positions, number_to_find): +def gather_affiliated_authors_by_numeration(lines, aff_positions, number_to_find): """Use the found affiliation to try and help with author extraction""" def has_number(possible_auth, number_to_find): """Does this possible author have the numeration I want?""" @@ -4912,11 +4914,14 @@ def remove_excess_numeration(author_match): lines_to_check = lines[:] while lines_to_check: line = lines_to_check.pop().strip() - position = len(lines_to_check) - if aff_positions and (position in aff_positions): + popped_position = len(lines_to_check) + if aff_positions and (popped_position in aff_positions): continue - ## Split according to commas/'and's - possible_authors = arrange_possible_authors(line) + ## Shift numeration around delimiters if needed + ##FIXME shouldnt have to do this again here.. was done earlier for finding std authors + (shifted_line, num_delimiter) = realign_shifted_line_numeration_around_commas(line) + ## Split according to delimiter (default is comma) + possible_authors = arrange_possible_authors(shifted_line, num_delimiter) ## Make a list of ok authors found in the split line, for this affiliation numerated_authors.extend(filter(lambda a: has_number(a, number_to_find), possible_authors)) ## So, on this line, a numerated author was found. So, @@ -4929,7 +4934,7 @@ def remove_excess_numeration(author_match): map(remove_excess_numeration, all_split_authors)) def initiate_affiliated_author_search(affiliations, top_lines, aff_positions): - """ Using obtained affiliation details, try to find authors, using primarily the + """Using obtained affiliation details, try to find authors, using primarily the numeration-associated method (pairing numerated authors with numerated affiliations, and as a fall-back, the 'lines-above' affiliation. @param affiliations: (dictionary) Already collected affiliations, with their possible @@ -4947,21 +4952,22 @@ def initiate_affiliated_author_search(affiliations, top_lines, aff_positions): loose_authors = [] for cur_aff in affiliations: - position_above = cur_aff['position']-1 + if cli_opts['verbosity'] >= 2: + sys.stdout.write("---Finding affiliated authors for %s...\n" % cur_aff['line'].encode("UTF-8").strip()) ## Using numerated affiliations if cur_aff['aff_nums']: numerated_authors = [] for num in cur_aff['aff_nums']: if not num in tried_numeration: ## For this single, new, affiliation numeration val - ## use it to find authors, given: + ## use it to find authors, using: ## 1. Lines above the affiliation ## 2. The already identified affiliation positions ## 3. The affiliation number, for authors, to look for (numerated_authors_single_num, all_split_authors) = \ - gather_numerated_authors_affiliations(top_lines, \ - aff_positions, \ - number_to_find=num) + gather_affiliated_authors_by_numeration(top_lines, \ + aff_positions, \ + number_to_find=num) numerated_authors.extend(numerated_authors_single_num) ## Save all split authors, if at least one numerated author was found! 
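The numeration pairing that gather_affiliated_authors_by_numeration performs can be reduced to a short, self-contained sketch. The helper names and the trailing-number regex below are illustrative simplifications, not the source's exact patterns:

    import re

    def trailing_number(candidate):
        ## The candidate author's affiliation number, if it carries one.
        m = re.search(r'(\d{1,2})\s*$', candidate)
        return int(m.group(1)) if m else None

    def pair_authors_with_affiliation(lines, aff_positions, number_to_find):
        ## Walk the lines above an affiliation, skip lines that are
        ## themselves affiliations, normalise 'and'/'&' to a comma,
        ## split, and keep candidates whose numeration matches the
        ## affiliation number being searched for.
        matched = []
        for position, line in enumerate(lines):
            if position in aff_positions:
                continue
            normalised = re.sub(r'(^\s*|\s)([Aa][Nn][Dd]|&)\s', ',', line)
            candidates = [c.strip() for c in normalised.split(',') if c.strip()]
            matched.extend(c for c in candidates
                           if trailing_number(c) == number_to_find)
        ## Strip the numeration before returning the names themselves.
        return [re.sub(r'\d+$', '', m).strip() for m in matched]

    print(pair_authors_with_affiliation(
        ['J. Smith 1, A. Jones 2 and B. Brown 1', '1 CERN', '2 DESY'],
        aff_positions=[1, 2], number_to_find=1))
    ## -> ['J. Smith', 'B. Brown']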
## Otherwise, this is just an empty addition @@ -4971,12 +4977,14 @@ def initiate_affiliated_author_search(affiliations, top_lines, aff_positions): ## Substantially reliable cur_aff['author_data'] = {'authors' : numerated_authors, 'strength' : 1} - + if cli_opts['verbosity'] >= 7: + sys.stdout.write("----Found %d strong affiliated authors.\n" % len(numerated_authors)) else: ## Using (line-above) NON-numerated affiliations to look for authors ## This method is far less accurate than using numeration, but nonetheless ## correct in a wide variety of situations. ## Get the next non-empty line above the affiliation + position_above = cur_aff['position']-1 while (position_above >= assumed_top_section_start) and \ (position_above >= 0) and \ (not top_lines[position_above].strip()): @@ -4998,6 +5006,8 @@ def initiate_affiliated_author_search(affiliations, top_lines, aff_positions): except IndexError: break position_above -= 1 + + collected_line_above_authors = [] ## For each 'possible author' line above the affiliation ## Build a list of weakly-matched authors for line_above in lines_above: @@ -5011,10 +5021,14 @@ def initiate_affiliated_author_search(affiliations, top_lines, aff_positions): else: ## This match isnt so reliable strength_for_this_line_above = 0 + collected_line_above_authors = filter(lambda a: re_ambig_auth.search(a), split_line_above) + + ## Far less reliable than the numerated version + cur_aff['author_data'] = {'authors' : collected_line_above_authors, + 'strength' : strength_for_this_line_above,} - ## Far less reliable than the numerated version - cur_aff['author_data'] = {'authors' : filter(lambda a: re_ambig_auth.search(a), split_line_above), - 'strength' : strength_for_this_line_above,} + if cli_opts['verbosity'] >= 7: + sys.stdout.write("----Found %d weak affiliated authors.\n" % len(collected_line_above_authors)) # ## Check all numerated authors which were found # all_numerated_authors = [] @@ -5032,8 +5046,9 @@ def build_start_end_numeration_str(predefined_punct=None): (e.g. brackets) @return: (regex) The regex which will match both starting and ending numeration on a line, with any additional punctuation included.""" - numeration_str = "^"+get_author_affiliation_numeration_str(predefined_punct) \ - +"|"+get_author_affiliation_numeration_str(predefined_punct) \ + ## It is important that the regex matches the number AND something else relevant on the line + numeration_str = "^"+get_author_affiliation_numeration_str(predefined_punct)+r"[^\d](?:(?:.*?)[^\d\s\.,\:;\-\[\]\(\)\*\\](?:.*?))" \ + +"|"+r"(?:(?:.*?)[^\s\.,\:;\-\[\]\(\)\*\\](?:.*?))[^\d]"+get_author_affiliation_numeration_str(predefined_punct) \ +"$" return numeration_str @@ -5068,7 +5083,7 @@ def obtain_author_affiliation_numeration_list(line, punct=None): ## num_match is used to obtain punctuation around the numeration return (i_stripped_nums, num_match) -def standardise_affiliation_names(line): +def replace_affiliation_names(line): """ Standardise some affiliations. Convert some domain specific HEP names to a standard form. This will very likely be moved out into a kb soon. 
@@ -5078,14 +5093,13 @@ def standardise_affiliation_names(line): """ ## Removes numeration, 'the'/'and', and replace titles line = line.strip() - line = re.sub(r"^Livermore", "LLNL, Livermore", line) - line = re.sub(r".*?Stanford Linear Accelerator Center.*?", "SLAC", line) - line = re.sub(r"^Fermi National Accelerator Laboratory", "Fermilab", line) + for term, repl in CFG_REFEXTRACT_INSTITUTION_REPLACEMENTS.items(): + line = re.sub(term, repl, line) line = re.sub(r"\s[tT][hH][eE]\s", " ", line) line = re.sub(r"\s[aA][nN][dD]\s", " ", line) return line -def standardise_affiliation_formats(line): +def reduce_affiliation_names(line): """ Standardise some affiliations. This will remove numeration, and will convert university names into a standard format. @param line: (string) Line from the document holding a @@ -5100,13 +5114,13 @@ def standardise_affiliation_formats(line): ## Get the University name line = (univ_name.group(2) or univ_name.group(3)) + " U." ## Check and set an institution - for inst in CFG_REFEXTRACT_INSTITUTIONS: + for inst in CFG_REFEXTRACT_INSTITUTION_REDUCTIONS: if line.find(inst) != -1: line = inst break return line -def extract_numerated_affiliations(num_position, num_section, num_find, num_punct, missing): +def extract_numerated_affiliations(num_data, num_find, missing): """ Collect numerated affiliations, using a section of the document, and the number which to search for. The punctuation surrounding any numeration (the first number found) (if any) is used to improve the strictness of the search. @@ -5119,12 +5133,13 @@ def extract_numerated_affiliations(num_position, num_section, num_find, num_punc content and numeration data of an affiliation. """ affs = [] - if num_section: + + if num_data['position'] < len(num_data['top']): ## First line, holding first affiliation with the number 1 - line = num_section[0].strip() + line = num_data['top'][num_data['position']].strip() ## A number has been found before this iteration ## Use previous number, and previous punctuation! 
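Taken together, replace_affiliation_names and reduce_affiliation_names form a two-stage normalisation driven by the two new configuration structures. A minimal sketch, using a subset of the shipped defaults (the source additionally strips stray 'the'/'and' tokens and normalises 'University of X' forms):

    import re

    INSTITUTION_REPLACEMENTS = {r'^Livermore': 'LLNL, Livermore',
                                r'^Fermi National Accelerator Laboratory': 'Fermilab'}
    INSTITUTION_REDUCTIONS = ['CERN', 'DESY', 'SLAC', 'Fermilab']

    def standardise(line):
        ## Stage 1: rewrite known institution spellings to a canonical form.
        for term, repl in INSTITUTION_REPLACEMENTS.items():
            line = re.sub(term, repl, line)
        ## Stage 2: if a known institution appears anywhere, reduce the
        ## whole line to just that institution name.
        for inst in INSTITUTION_REDUCTIONS:
            if inst in line:
                return inst
        return line

    print(standardise('Fermi National Accelerator Laboratory, Batavia'))
    ## -> 'Fermilab'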
- (aff_nums, specific_num_match) = obtain_author_affiliation_numeration_list(line, num_punct) + (aff_nums, specific_num_match) = obtain_author_affiliation_numeration_list(line, num_data['punc']) if num_find in aff_nums: ## Attempt to get numeration for this affiliation try: @@ -5134,21 +5149,22 @@ def extract_numerated_affiliations(num_position, num_section, num_find, num_punc sys.stderr.write("Error: Unable to obtain integer affiliation numeration.") sys.exit(1) ## Save the punctuation surrounding the numeration - affs.append({'position' : num_position, - 'line' : standardise_affiliation_formats(line), + affs.append({'position' : num_data['position'], + 'line' : reduce_affiliation_names(line), 'aff_nums' : aff_nums, 'author_data' : None}) +# print "--Found aff: %s" % line elif num_find in missing: ## Get the next non missing number and use that while num_find in missing: num_find += 1 + ## Increment position and remove top line + num_data['position'] += 1 ## Do until end of docbody section (num_section) - affs.extend(extract_numerated_affiliations(num_position+1, \ - num_section[1:], \ + affs.extend(extract_numerated_affiliations(num_data, \ num_find, \ - num_punct, \ missing)) return affs @@ -5184,6 +5200,8 @@ def realign_numeration(toplines): except ValueError: continue + numeration_swaps = 0 + ## Now, using the positions of the '1's, go forward and locate ## subsequent numeration, and replicate on the following line if need be missing_nums = [] @@ -5191,7 +5209,6 @@ def realign_numeration(toplines): alignment_error = 0 num = 1 for position, line in enumerate(toplines[start:]): -# print "start pos: %d, now on num: %d" % (start, num) num_match = re_start_numeration.search(line) if num_match: @@ -5201,13 +5218,10 @@ def realign_numeration(toplines): except ValueError: continue -# print "got num %d, position %d, on line %s" % (i_num, start+position, line.strip()) - ## Hit a number which is not expected, and is not just 2 ahead if (i_num != num) and ((i_num < num) or ((i_num - num) > \ CFG_REFEXTRACT_AFFILIATION_NUMERATION_ALLOWABLE_GAP)): - #alignment_error = 1 - ## Skipping can occur, but only twice in a row before exiting + ## Skipping can occur, but only whilst the number is within the allowable gap continue else: ## When there exists an acceptable missed number, for whatever reason @@ -5241,6 +5255,7 @@ def realign_numeration(toplines): # print "new line: %s" % toplines_alternate[lookahead] ## Increment the next number to look for num += 1 + numeration_swaps += 1 break except IndexError: alignment_error = 4 @@ -5249,16 +5264,16 @@ def realign_numeration(toplines): ## To dangerous to continue. 
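The recursion in extract_numerated_affiliations amounts to walking down from the affiliation numbered '1' and expecting each following affiliation to carry the next number, with numbers flagged as missing during realignment skipped rather than awaited. An equivalent iterative sketch, where startswith stands in for the punctuation-aware numeration matching:

    def collect_numerated_affiliations(top, start, missing):
        affiliations = []
        expected = 1
        for position in range(start, len(top)):
            ## Skip over numbers known to be absent from the document.
            while expected in missing:
                expected += 1
            line = top[position].strip()
            if line.startswith(str(expected)):
                affiliations.append({'position': position,
                                     'line': line,
                                     'aff_num': expected})
                expected += 1
        return affiliations

    print(collect_numerated_affiliations(
        ['1 CERN, Geneva', '2 DESY, Hamburg', '4 SLAC, Stanford'],
        start=0, missing=[3]))
    ## -> three affiliations; the missing number 3 is skipped over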
alignment_error = 5 -# elif line.strip(): -# print "Stopping, num: %d, line: %s" % (num, line.strip()) -# break - if alignment_error: - print "alignment error: %d (1 or 2 is bad)" % alignment_error + if cli_opts['verbosity'] >= 1: + sys.stdout.write("---Warning: Realign numeration problem #%d.\n" % alignment_error) ## Scrap the alternate version toplines_alternate = toplines break + if cli_opts['verbosity'] >= 8: + sys.stdout.write("---realign numeration made %d changes.\n" % numeration_swaps) + return (toplines_alternate, missing_nums) def find_affiliations(lines, start, end=None, use_to_find_authors=False): @@ -5280,10 +5295,9 @@ def get_smaller(x, y): return y affiliations = [] - starting_num_position = None - numerated_aff_num_punct_ptn = None - top = None - + numerated_aff_data = {'position' : None, + 'punc' : None, + 'top' : None,} if not start: start = 0 @@ -5293,46 +5307,42 @@ def get_smaller(x, y): else: top_lines_orig = lines[start:] -# print "top_lines" -# print top_lines_orig - ## Get an alternative version of the top section, of the same length ## but with some alone numeration replicated on the next line! (top_lines_alt, missing_nums) = realign_numeration(top_lines_orig) -# print "top_lines_alt:" -# print top_lines_alt - for position in range(len(top_lines_orig)): ## Standardise some affiliations - line = standardise_affiliation_names(top_lines_orig[position].strip()) - line_alt = standardise_affiliation_names(top_lines_alt[position].strip()) + line = replace_affiliation_names(top_lines_orig[position].strip()) + line_alt = replace_affiliation_names(top_lines_alt[position].strip()) ## If a previous numeration value was found in the previous iteration ## check for the increment of this value on this line if re_aff_num.search(line) or re_aff_name.search(line): - print "AFF match on line: %s" % line - - ## Check numeration in replica docbody + ## Check numeration in line from original & realigned docbodies (aff_nums, num_match) = obtain_author_affiliation_numeration_list(line) - ## Check numeration in the numeration-realigned docbody (aff_nums_alt, num_match_alt) = obtain_author_affiliation_numeration_list(line_alt) + ## Set the information to the correct top_section, depending on ## if the numeration was found split across lines or not. if aff_nums or not aff_nums_alt: - print "using orig lines" - top = top_lines_orig - elif aff_nums_alt: - print "using alt lines" - top = top_lines_alt + top_version_for_line = top_lines_orig + else: + top_version_for_line = top_lines_alt aff_nums = aff_nums_alt num_match = num_match_alt + if cli_opts['verbosity'] >= 4: + sys.stdout.write("---Affiliation match on line: %s\n" % \ + top_version_for_line[position].encode("UTF-8").strip()) + ## Aff number '1' numeration found, save position and punctuation - if aff_nums and num_match and 1 in aff_nums: - starting_num_position = position - numerated_aff_num_punct_ptn = num_match.group(1) + if aff_nums and num_match and (1 in aff_nums): + ## Set the top version to use, depending on how this initial aff was found + numerated_aff_data = {'position' : position, + 'top' : top_version_for_line, + 'punc' : num_match.group(1),} ## So, an AFFILIATION KEYWORD was found on this line, but this is not a '1'! 
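realign_numeration's core move, replicating a number that sits alone on its own line onto the line below it, can be shown in isolation. The real function additionally tolerates gaps up to CFG_REFEXTRACT_AFFILIATION_NUMERATION_ALLOWABLE_GAP, records missing numbers, and scraps the alternate copy on alignment errors; this sketch keeps only the replication step:

    import re

    ## A line holding nothing but a (possibly punctuated) number,
    ## typically pdftotext output that split '1 CERN' in two.
    re_lone_number = re.compile(r'^\s*[\[\(]?(\d{1,3})[\]\)\.,]?\s*$')

    def realign(lines):
        out = lines[:]
        for i, line in enumerate(lines[:-1]):
            m = re_lone_number.match(line)
            if m and out[i + 1].strip():
                ## Push the lone number down onto the next line.
                out[i + 1] = m.group(1) + ' ' + out[i + 1].strip()
                out[i] = ''
        return out

    print(realign(['1', 'CERN, Geneva', '2', 'DESY, Hamburg']))
    ## -> ['', '1 CERN, Geneva', '', '2 DESY, Hamburg']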
## Move up lines to get the starting affiliation position, using NUMERATION elif aff_nums and num_match: @@ -5341,58 +5351,71 @@ def get_smaller(x, y): reversed_position = position - 1 ## Attempt to go back and find the start of this numeration section ## Get numeration for this line - while (reversed_position >= 0) and (starting_num_position is None): + while (reversed_position >= 0) and (numerated_aff_data['position'] is None): ## Check numeration in the numeration-realigned docbody (rev_aff_nums, rev_num_match) = \ - obtain_author_affiliation_numeration_list(top[reversed_position]) + obtain_author_affiliation_numeration_list(top_version_for_line[reversed_position]) ## Check for numeration n, n = 1 if find_num == 1 and (find_num in rev_aff_nums): - starting_num_position = reversed_position - numerated_aff_num_punct_ptn = rev_num_match.group(1) + ## Set the top version to use, depending on how this initial aff was found + numerated_aff_data = {'position' : reversed_position, + 'top' : top_version_for_line, + 'punc' : rev_num_match.group(1),} ## Check for numeration n, 1 < n < last found elif find_num in rev_aff_nums: find_num = find_num - 1 ## Move position up one line reversed_position = reversed_position - 1 - ## Starting numeration was found..! - if not starting_num_position: + if not numerated_aff_data['position']: ## Could not find start. Abort everything. break else: - ## The normal way of appending lines with affiliation names + ## No numeration -- append affiliation normally affiliations.append({'position' : position, - 'line' : standardise_affiliation_formats(line), + 'line' : reduce_affiliation_names(line), 'aff_nums' : None, 'author_data' : None,}) - ## Stop searching if a keyworded and numerated affiliation has been found - if starting_num_position is not None: + ## Stop searching if a starting numerated affiliation has been found + if numerated_aff_data['position'] is not None: break - ## In the situation where numeration has been found for an affiliation - ## Collect up all of the following numerated affiliations, - ## or go backwards and obtain them - if starting_num_position: - print "Going to extract numerated affiliations..." 
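The backward scan used when the first keyword-matched affiliation carries a number greater than one, as a sketch (startswith again standing in for the punctuation-aware matcher):

    def find_numeration_start(top, position, current_num):
        ## Affiliation N (N > 1) was hit first; walk upwards looking for
        ## N-1, N-2, ... until the line numbered 1 marks the true start.
        find_num = current_num - 1
        pos = position - 1
        while pos >= 0:
            if top[pos].strip().startswith(str(find_num)):
                if find_num == 1:
                    return pos
                find_num -= 1
            pos -= 1
        return None

    print(find_numeration_start(['1 CERN', '2 DESY', '3 SLAC'],
                                position=2, current_num=3))
    ## -> 0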
- affiliations = extract_numerated_affiliations(starting_num_position, \ - top[starting_num_position:], \ + starting_aff_position = None + ## Collect all numerated affiliations, using the starting affiliation + ## Missing numeration was detected during the realignment process + if numerated_aff_data['position'] is not None: + ## Need to save this, since finding numerated affs will change it + starting_aff_position = numerated_aff_data['position'] + + affiliations = extract_numerated_affiliations(numerated_aff_data, \ 1, \ - numerated_aff_num_punct_ptn, \ missing_nums) + if cli_opts['verbosity'] >= 4: + sys.stdout.write("---The collection of numerated affiliations returned %d affiliations.\n" % len(affiliations)) loose_authors = [] - - ## Try to obtain more authors, if needed + ## Get affiliated authors, if specified if use_to_find_authors: + top_to_use_to_find_authors = numerated_aff_data['top'] or top_lines_orig + + ## If numerated affiliations are being used, only look at the lines above + ## the first numerated affiliation + if numerated_aff_data['position']: + top_to_use_to_find_authors = top_to_use_to_find_authors[:starting_aff_position] aff_positions = [aff['position'] for aff in affiliations] - ## Then, if the above didn't work, do the 'line above' method + ## Look for authors associated with obtained affiliations (affiliations, loose_authors) = initiate_affiliated_author_search(affiliations, \ - top, \ - aff_positions) + top_to_use_to_find_authors, \ + aff_positions) + if cli_opts['verbosity'] >= 4: + sys.stdout.write("---The collection of affiliated authors returned authors for %d affiliations.\n" % \ + len([x['author_data']['authors'] for x in affiliations if (x['author_data'] and x['author_data']['authors'])])) + ## Remove undesirable characters for tmp_aff in affiliations: - tmp_aff['line'] = replace_undesirable_characters(standardise_affiliation_formats(tmp_aff['line']).strip(".,:;- []()*\\")) + tmp_aff['line'] = replace_undesirable_characters( \ + reduce_affiliation_names(tmp_aff['line']).strip(".,:;- []()*\\")) return (affiliations, loose_authors) @@ -5500,14 +5523,13 @@ def leading_comma(line): return (position, line_parts) re_misaligned_numeration_around_comma = re.compile("(.*?)(?P[,;])\s*(\d{1,3})") - def realign_shifted_line_numeration_around_commas(line): ## First see how many swap substitutions will take place, before-hand. swaps = [x for x in re_misaligned_numeration_around_comma.finditer(line)] delimiter = None if len(swaps) >= 1: ## Get the first matches' delimiter, which can be reused to split a line later. - delimeter = swaps[0].group("delim") + delimiter = swaps[0].group("delim") ## Do the swapping. 
line = re_misaligned_numeration_around_comma.sub(r"\g<1>\g<3>,", line).strip(",") return (line, delimiter) @@ -5569,16 +5591,18 @@ def check_for_end_of_author_section_match_keywords(docbody): ## Must exceed the first 3 lines keyword_hit = perform_regex_match_upon_line_with_pattern_list(line, ending_keyword_ptns) if keyword_hit and not found_ending_keyword and (position > 3): - if cli_opts['verbosity'] > 7: - print "--- ending keyword on line: %s, position: %d" % (line.strip(), position) + if cli_opts['verbosity'] >= 7: + sys.stdout.write("---Ending keyword match: %s, position: %d\n" % \ + (keyword_hit.group(0).strip(), position)) found_ending_keyword = position ## Look for author tags author_tag_hit = re_author_tag.search(line) if author_tag_hit and not found_author_tag: - if cli_opts['verbosity'] > 7: - print "--- author tag on line: %s, position: %d" % (line.strip(), position) - found_author_tag= position + if cli_opts['verbosity'] >= 7: + sys.stdout.write("---Author tag match: %s, position: %d\n" % \ + (author_tag_hit.group(0).strip(), position)) + found_author_tag = position ## Only in the top X lines if (found_ending_keyword and found_author_tag) \ @@ -5593,24 +5617,36 @@ def check_for_end_of_author_section_match_keywords(docbody): # 'Authors:', 'Chris Hayward,', '', 'Tim Smith,', '', 'Joe Harris', 'University of Bath', '', 'Abstract'] affiliations = [] - ## end-of-top-section-keyword position, if any - (pre_ending_keyword, pre_author_tag) = check_for_end_of_author_section_match_keywords(docbody) + + ## Always find the position of the start of the reference section. + ## Strip from document body. + refs_start = get_reference_section_beginning(docbody) + + ## Strip references. This will prevent analysing this section for authors. + if refs_start: + docbody = docbody[:refs_start['start_line']] + ## Default return values status = how_found_start = 0 - print "Top Section Ending:" + ## end-of-top-section-keyword position, if any + (pre_ending_keyword, pre_author_tag) = check_for_end_of_author_section_match_keywords(docbody) if pre_ending_keyword: - print "---keyword!" + if cli_opts['verbosity'] >= 4: + sys.stdout.write("---Using top section keyword as a delimiter.\n") top_section = docbody[:pre_ending_keyword] elif len(docbody) < assumed_top_section_length: ## Half total length + if cli_opts['verbosity'] >= 4: + sys.stdout.write("---Using entire document body as top section.\n") top_section = docbody - print "--whole!" else: - ## First 100 lines? 
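The comma-shift repair is compact enough to demonstrate end to end. Note that the compiled pattern as printed above lost its named group to markup stripping; swaps[0].group("delim") implies it read (?P<delim>[,;]):

    import re

    re_misaligned = re.compile(r'(.*?)(?P<delim>[,;])\s*(\d{1,3})')

    def realign_around_commas(line):
        ## 'Smith, 1 Jones, 2' really means 'Smith1, Jones2': pdftotext
        ## shifted each author's affiliation number past the delimiter.
        swaps = list(re_misaligned.finditer(line))
        delimiter = swaps[0].group('delim') if swaps else None
        line = re_misaligned.sub(r'\g<1>\g<3>,', line).strip(',')
        return line, delimiter

    print(realign_around_commas('J. Smith, 1 A. Jones, 2'))
    ## -> ('J. Smith1, A. Jones2', ',')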
+ ## First N lines + if cli_opts['verbosity'] >= 4: + sys.stdout.write("---Using first %d lines as top section.\n" % \ + (assumed_top_section_length)) top_section = docbody[:assumed_top_section_length] - print "--first 100" tagged_author_information = [] just_tagged_authors = [] @@ -5665,36 +5701,21 @@ def check_for_end_of_author_section_match_keywords(docbody): first_standard_author_position = None standard_authors = [] standard_author_matches = [] + + ## Either original, or shifted, using numeration + top_section_version = None + for position in range(len(top_section)): ## An author tag was found, delay the search until the tag position is reached if first_author_tag_position and (position < first_author_tag_position): continue - ## 'Initial Surname' / 'Surname Initials' authors or - ## Knowledge-base specified authors -# if re_auth_with_number.search(line) or re_extra_auth.search(line): - ## Keep a list of standard authors, and their positions -# (last_standard_author_position, standard_authors) = collect_standard_authors(top_section, \ -# position) - ## Use the first matched author from WHERE TO START -# first_standard_author_position = position - -# (first_standard_author_position, std_authors, \ -# last_standard_author_position) = collect_standard_authors(top_section) - -# line = top_section[position] -# line_alt = realign_shifted_line_numeration_around_commas(line) - -# print line_alt -# print position - - ## Match on non-rearranged lines -# standard_author_pattern_match = re_auth_with_number.search(line) -# extra_author_pattern_match = re_extra_auth.search(line) - line = top_section[position] - (shifted_line, numeration_delimiter) = realign_shifted_line_numeration_around_commas(line) + (shifted_line, numeration_delimiter) = \ + realign_shifted_line_numeration_around_commas(line) + ## Look for authors (with or without numeration) on the original line + ## and on the augmented line (with numeration shifted) author_matches = \ [x for x in re_single_author_pattern_with_numeration.finditer(line)] author_matches_alt = \ @@ -5705,10 +5726,6 @@ def check_for_end_of_author_section_match_keywords(docbody): first_standard_author_position = position last_standard_author_position = position - print "from line: %s" % line - - use_line = "orig" - if author_matches and author_matches_alt: ## Save the list of matching authors in a list if len(author_matches) <= len(author_matches_alt): @@ -5717,21 +5734,11 @@ def check_for_end_of_author_section_match_keywords(docbody): use_line = "alt" if use_line == "alt": - which_line = re.sub("[Aa][Nn][Dd]|&", ",", shifted_line.strip()) + which_line = re.sub("\s([Aa][Nn][Dd]|&)\s", ",", shifted_line.strip()) standard_author_matches = author_matches_alt - print "USING ALT" else: - which_line = re.sub("[Aa][Nn][Dd]|&", ",", line.strip()) + which_line = re.sub("\s([Aa][Nn][Dd]|&)\s", ",", line.strip()) standard_author_matches = author_matches - print "USING NORM" - - ## Remove matches from string -# for s in standard_author_matches: -# m_start = s.start() -# m_end = s.end() -# esc_line = esc_line[:m_start]+((m_end - m_start) * "_")+esc_line[m_end:] - -# print "esc_line: %s" % esc_line new_standard_authors = [x.group(0) for x in standard_author_matches] @@ -5740,7 +5747,6 @@ def check_for_end_of_author_section_match_keywords(docbody): if len(standard_author_matches) >= 2: first_matched_auth = standard_author_matches[0].group(0) second_matched_auth = standard_author_matches[1].group(0) - try: delimiter = None ## Take either the delimiter from two author matches @@ 
-5752,75 +5758,18 @@ def check_for_end_of_author_section_match_keywords(docbody): delimiter = numeration_delimiter if delimiter: split_authors = [n for n in which_line.split(delimiter) if n.strip(", ")] + ## Take the authors obtained from splitting the line if len(split_authors) >= len(new_standard_authors): new_standard_authors = split_authors - sys.stdout.write("WOOP! adding missed authors: %s\n" % new_standard_authors) - else: - print "NO DELIMITER :(" ## WHY FOR THE FIRST AUTHOR LINE??? FIX THIS FIRST except IndexError: pass -# else: -# skipped_auths = [y.group(0) for y in \ -# re.finditer("[^\d,;&_]\s*([,;&]|[Aa][Nn][Dd]|$)\s*\d{0,3}", esc_line)] ## Standard author strings - sys.stdout.write("appending std authors: %s\n" % new_standard_authors) standard_authors.append(new_standard_authors) - - ## Match on rearranged lines -# standard_author_pattern_match_alt = re_auth_with_number.search(line_alt) -# extra_author_pattern_match_alt = re_extra_auth.search(line_alt) - -# if standard_author_pattern_match or extra_author_pattern_match \ -# or standard_author_pattern_match_alt or extra_author_pattern_match_alt: - -# if not first_standard_author_position: -# first_standard_author_position = position -# last_standard_author_position = position - -# if standard_author_pattern_match or standard_author_pattern_match_alt: -# if standard_author_pattern_match and standard_author_pattern_match_alt: -# if len(standard_author_pattern_match) > len(standard_author_pattern_match_alt): -# std_match = standard_author_pattern_match -# else: -# std_match = standard_author_pattern_match_alt -# else: -# std_match = standard_author_pattern_match or standard_author_pattern_match_alt - - ## Append the matched author string (standard pattern) -# print "from line: %s" % line -# print "appending std authors: %s" % std_match.group('author_names') -# standard_authors.append(std_match.group('author_names')) - -# if extra_author_pattern_match or extra_author_pattern_match_alt: -# if extra_author_pattern_match and extra_author_pattern_match_alt: -# if len(extra_author_pattern_match) > len(extra_author_pattern_match_alt): -# ext_match = extra_author_pattern_match -# else: -# ext_match = extra_author_pattern_match_alt -# else: -# ext_match = extra_author_pattern_match or extra_author_pattern_match_alt - ## Append the matched author string (extra pattern) -# standard_authors.append(ext_match.group('extra_auth')) - -# if cli_opts['verbosity'] > 7: -# print "--- author pattern match on line: %s, position: %d" % (line.strip(), position) - - ## By this point, we've managed to try and get tagged authors, - ## as well as anything in the top section that looks like an author - ## according to the main author regex. - -# print "STANDARD AUTHORS: %d" % len(standard_authors) -# print standard_authors - - ## Attempt to obtain authors using affiliation positions. - ## A tagged author position is considered the best. - ## Otherwise start from the top of the section. - ## Always attempt to find authors too. - (affiliations, loose_authors) = find_affiliations(top_section, \ - start=first_author_tag_position, \ - use_to_find_authors=True) + ## By this point, we've managed to try and get tagged authors, as well + ## as anything in the top section that looks like an author using the standard + ## author pattern. 
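The original-versus-realigned comparison, and the delimiter fallback that recovers authors the pattern alone misses, as a single sketch (re_author is a simplified stand-in for re_single_author_pattern_with_numeration):

    import re

    re_author = re.compile(r'[A-Z]\.\s?[A-Z][a-z]+\s?\d{0,3}')

    def authors_from_line(line, shifted_line):
        ## Run the author pattern over both the raw line and the
        ## numeration-realigned one; keep whichever yields more matches.
        hits = list(re_author.finditer(line))
        hits_alt = list(re_author.finditer(shifted_line))
        if len(hits_alt) > len(hits):
            hits, line = hits_alt, shifted_line
        authors = [h.group(0) for h in hits]
        ## With two or more matches, the text between the first two
        ## exposes the delimiter, and splitting on it can recover
        ## authors the pattern itself failed to match.
        if len(hits) >= 2:
            between = line[hits[0].end():hits[1].start()].strip()
            if between:
                split = [p.strip() for p in line.split(between[0])
                         if p.strip(', ')]
                if len(split) >= len(authors):
                    authors = split
        return authors

    line = 'J. Smith1, A. Jones2, M. el-Baz3'
    print(authors_from_line(line, line))
    ## -> ['J. Smith1', 'A. Jones2', 'M. el-Baz3']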
## METHOD 3 -------------- Look for authors using affiliations ## and handle the assembly of standard authors too @@ -5831,72 +5780,85 @@ def check_for_end_of_author_section_match_keywords(docbody): just_standard_authors = [] just_affiliated_authors = [] + ## Now, attempt to obtain authors using affiliation positions. + ## A tagged author position is considered the best starting point. + ## Otherwise start from the top of the section. + ## Always attempt to find authors too. + (affiliations, loose_authors) = find_affiliations(top_section, \ + start=first_author_tag_position, \ + use_to_find_authors=True) + if affiliations is not None: ## Attempt to pair together standard authors with identified affiliations. ## If the number of affiliation is equal to the number of author lines if len(affiliations) == len(standard_authors): ## Increase strength for this (when len(aff)=len(auth))? for x in range(len(standard_authors)): - -# rebuilt_standard_authors = rebuild_author_lines([standard_authors[x]], \ -# re_single_author_pattern_with_numeration) - ## Associate authors with affiliations affiliation_associated_standard_authors.append({'authors' : standard_authors[x], 'affiliation' : affiliations[x]['line']}) just_standard_authors.extend(standard_authors[x]) ## Now assemble affiliated authors, with their affiliations for aff in affiliations: - ## Append any affiliation supported authors. - ## [!Do not include repeated authors, may be repeated from double numeration] + ## Append any affiliation supported authors, but only if authors exist. if aff['author_data']: author_list_for_affiliation = aff['author_data']['authors'] affiliated_author_strength = aff['author_data']['strength'] - else: - author_list_for_affiliation = [] - affiliated_author_strength = None - affiliation_associated_affiliated_authors.append( \ - {'authors' : [auth for auth in author_list_for_affiliation if auth not in just_affiliated_authors], - 'affiliation' : aff['line'], - 'strength' : affiliated_author_strength,}) - just_affiliated_authors.extend([auth for auth in author_list_for_affiliation]) + affiliation_associated_affiliated_authors.append( \ + {'authors' : [auth for auth in author_list_for_affiliation if auth not in just_affiliated_authors], + 'affiliation' : aff['line'], + 'strength' : affiliated_author_strength,}) + just_affiliated_authors.extend([auth for auth in author_list_for_affiliation]) ## In the event that standard authors were not paired with affiliations ## then just make a list of dictionaries of authors without affiliations if standard_authors and not affiliation_associated_standard_authors: -# rebuilt_standard_authors = \ -# [rebuild_author_lines([std_auth_line], re_single_author_pattern_with_numeration) \ -# for std_auth_line in standard_authors] for s in standard_authors: affiliation_associated_standard_authors.append({'authors' : s, 'affiliation' : None,}) just_standard_authors.extend(s) - if cli_opts['verbosity'] > 7: - sys.stdout.write("--- Author extraction results:\n") - sys.stdout.write("---- tagged: %s\n" % tagged_author_information) - sys.stdout.write("---- standard: %s\n" % affiliation_associated_standard_authors) - sys.stdout.write("---- affilated: %s\n" % affiliation_associated_affiliated_authors) - - ## Given three lists of authors, which have been 'extracted' using three different methods - ## decide which list to return as a set of reliable authors (if any) - final_authors = choose_author_method(tagged_author_information, \ - affiliation_associated_standard_authors, \ - 
affiliation_associated_affiliated_authors, \ - just_tagged_authors, \ - just_standard_authors, \ - just_affiliated_authors) - - if cli_opts['verbosity'] > 7: - sys.stdout.write("--- Selected authors info: %s\n" % final_authors) - - marked_up_authors = mark_up_authors_with_affiliations(final_authors) + ## Print the extracted author counts + if cli_opts['verbosity'] >= 4: + sys.stdout.write("---Author counts for each extraction type:\n") + sys.stdout.write("----1. Tagged, count %d.\n" % \ + (len([l for l in just_tagged_authors if l.strip()]))) + sys.stdout.write("----2. Standard, count %d.\n" % \ + (len([k for k in just_standard_authors if k.strip()]))) + sys.stdout.write("----3. Affiliated, count %d.\n" % \ + (len([j for j in just_affiliated_authors if j.strip()]))) + + ## Print the physical author matches + if cli_opts['verbosity'] == 9: + sys.stdout.write("---Author extraction type contents:\n") + sys.stdout.write("----1. Tagged authors:\n%s\n" % \ + tagged_author_information) + sys.stdout.write("----2. Standard authors:\n%s\n" % \ + affiliation_associated_standard_authors) + sys.stdout.write("----3. Affiliated authors:\n%s\n" % \ + [x['authors'] for x in affiliation_associated_affiliated_authors]) + + ## Given three lists of authors, which have been extracted using + ## three different methods decide which list to return as the most + ## reliable representation of this paper's authors (if any) + (final_authors, chosen_type) = choose_author_method(tagged_author_information, \ + affiliation_associated_standard_authors, \ + affiliation_associated_affiliated_authors, \ + just_tagged_authors, \ + just_standard_authors, \ + just_affiliated_authors) + + ## Display the results of choosing the set of authors + if cli_opts['verbosity'] >= 8: + sys.stdout.write("---Chosen author-type: %d\n" % chosen_type) + sys.stdout.write("\n********************\nExtracted top data:\n" % final_authors) + sys.stdout.write(" Authors:\n\t%s\n" % final_authors) + sys.stdout.write(" Affiliations:\n\t%s\n********************\n\n" % affiliations) document_information = {'authors' : final_authors, - 'affiliations' : affiliations, - 'marked_up_authors' : marked_up_authors,} + 'affiliations' : affiliations} - return (document_information, status, how_found_start) + return (document_information, status, chosen_type) def mark_up_authors_with_affiliations(final_authors): """ Prepare authors and any possible associated affiliations @@ -5910,7 +5872,7 @@ def mark_up_authors_with_affiliations(final_authors): ## affiliated supported authors were found! tagged_authors = [] - def process_auth(a): + def process_authors(a): ## Also remove numeration a = re.sub('\d+', '', a) a = replace_undesirable_characters(a).strip(".,:;- []()*\\") @@ -5921,17 +5883,17 @@ def process_aff(a): return a for aff_auth_dict in final_authors: - for auth in aff_auth_dict['authors']: + for authors in aff_auth_dict['authors']: ## Otherwise the closing element tag dissappears (!?) 
- if auth: + if authors: if not aff_auth_dict['affiliation']: - aff_for_auth = '' + aff_for_authors = '' else: - aff_for_auth = aff_auth_dict['affiliation'] + aff_for_authors = aff_auth_dict['affiliation'] tagged_authors.append("%s%s%s%s" % ("", \ - process_auth(auth), \ + process_authors(authors), \ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \ - process_aff(aff_for_auth))) + process_aff(aff_for_authors))) return tagged_authors def choose_author_method(tagged_info, std_info, aff_info, \ @@ -5955,21 +5917,19 @@ def choose_author_method(tagged_info, std_info, aff_info, \ ## Immediately discard non-sets of authors (hold duplicate entries) if len(tagged_authors) != len(set(tagged_authors)): + sys.stdout.write("Warning: tagged authors list has duplicates.\n") tagged_info = [] if len(std_authors) != len(set(std_authors)): + sys.stdout.write("Warning: standard authors list has duplicates.\n") std_info = [] if len(aff_authors) != len(set(aff_authors)): + sys.stdout.write("Warning: affiliated authors list has duplicates.\n") aff_info = [] tagged_authors = map(lambda x: x.strip(" ,"), tagged_authors) std_authors = map(lambda y: y.strip(" ,"), std_authors) aff_authors = map(lambda z: z.strip(" ,"), aff_authors) - sys.stdout.write("tagged authors: %d\n\n" % len(tagged_authors)) - sys.stdout.write("std authors: %d\n\n" % len(std_authors)) - sys.stdout.write("aff authors: %d\n\n" % len(aff_authors)) - - ## False if there is a 'weak' affiliation-supported author match ## AND none of them are found in the list of standard authors weak_affiliated_authors = False @@ -6018,11 +5978,31 @@ def choose_author_method(tagged_info, std_info, aff_info, \ aff_authors_is_a_subset_of_std_authors or \ ((len(aff_authors) * 2) < len(std_authors)) + if not (tagged_authors or std_authors or aff_authors): + if cli_opts['verbosity'] >= 4: + sys.stdout.write("---Unable to find any authors.\n") + return ([], 0) + elif tagged_authors: + if cli_opts['verbosity'] >= 4: + sys.stdout.write("---Choosing tagged authors.\n") + return (tagged_info, 1) ## Make the choice, with the appropriate precedence - if standard_over_affiliated: - return tagged_info or std_info or aff_info + elif standard_over_affiliated: + if cli_opts['verbosity'] >= 4: + sys.stdout.write("---Choosing standard over affiliated authors.\n") + if std_info: + return (std_info, 2) + else: + return (aff_info, 3) +# return ((std_info or aff_info), 2) else: - return tagged_info or aff_info or std_info + if cli_opts['verbosity'] >= 4: + sys.stdout.write("---Choosing affiliated over standard authors.\n") + if aff_info: + return (aff_info, 3) + else: + return (std_info, 2) +# return ((aff_info or std_info), 3) def find_reference_section(docbody): """Search in document body for its reference section. More precisely, find @@ -6976,7 +6956,7 @@ def get_reference_section_beginning(fulltext): ## ----> Glue - logic for finding and extracting reference section: -def extract_references_from_fulltext(fulltext, sect_start): +def extract_references_from_fulltext(fulltext): """Locate and extract the reference section from a fulltext document. Return the extracted section as a list of strings, whereby each string in the list is considered to be a single line. 
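The precedence that choose_author_method implements, together with its numeric chosen_type codes, reduces to a few lines once the subset and weak-author heuristics are folded into a single prefer_standard flag (a simplification; the source derives that flag from several conditions):

    def choose_authors(tagged, standard, affiliated, prefer_standard):
        ## chosen_type codes: 0 none, 1 tagged, 2 standard, 3 affiliated.
        ## Tagged authors always win; the heuristics only decide between
        ## the standard and affiliation-supported sets.
        if not (tagged or standard or affiliated):
            return [], 0
        if tagged:
            return tagged, 1
        if prefer_standard:
            return (standard, 2) if standard else (affiliated, 3)
        return (affiliated, 3) if affiliated else (standard, 2)

    print(choose_authors([], ['Smith, J.'], [], prefer_standard=True))
    ## -> (['Smith, J.'], 2)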
@@ -6996,7 +6976,7 @@ def extract_references_from_fulltext(fulltext, sect_start): #How ref section found flag how_found_start = 0 - #sect_start = get_reference_section_beginning(fulltext) + sect_start = get_reference_section_beginning(fulltext) if sect_start is None: ## No lines @@ -7304,14 +7284,14 @@ def get_cli_options(): cli_opts['kb-report-number'] = o[1] elif o[0] in ("-a", "--authors"): cli_opts['authors'] = 1 - elif o[0] in ("-f", "--affiliations"): + elif o[0] in ("-l", "--affiliations"): cli_opts['affiliations'] = 1 elif o[0] in ("--first_author"): cli_opts['first_author'] = 1 if len(myargs) == 0: ## no arguments: error message usage(wmsg="Error: no full-text.") - + # What journal title format are we using? if cli_opts['verbosity'] > 0 and cli_opts['inspire']: sys.stdout.write("--- Using inspire journal title form\n") @@ -7535,8 +7515,9 @@ def begin_extraction(daemon_cli_options=None): extract_top_section_metadata = cli_opts['authors'] or cli_opts['affiliations'] ## 1. Get this document body as plaintext: - (docbody, extract_error) = get_plaintext_document_body(curitem[1], \ - extract_top_section_metadata) + (docbody, extract_error) = \ + get_plaintext_document_body(curitem[1], \ + extract_top_section_metadata) if extract_error == 1: ## Non-existent or unreadable pdf/text directory. write_message("***%s\n\n" % curitem[1], sys.stderr, verbose=0) @@ -7555,9 +7536,9 @@ def begin_extraction(daemon_cli_options=None): try: ofilehdl = open(cli_opts['xmlfile'], 'w') ofilehdl.write("%s\n" \ - % CFG_REFEXTRACT_XML_VERSION.encode("utf-8")) + % CFG_REFEXTRACT_XML_VERSION.encode("utf-8")) ofilehdl.write("%s\n" \ - % CFG_REFEXTRACT_XML_COLLECTION_OPEN.encode("utf-8")) + % CFG_REFEXTRACT_XML_COLLECTION_OPEN.encode("utf-8")) ofilehdl.flush() except Exception, err: write_message("***%s\n%s\n" % (cli_opts['xmlfile'], err), \ @@ -7568,9 +7549,9 @@ def begin_extraction(daemon_cli_options=None): ## else, write the xml lines to the stdout else: sys.stdout.write("%s\n" \ - % CFG_REFEXTRACT_XML_VERSION.encode("utf-8")) + % CFG_REFEXTRACT_XML_VERSION.encode("utf-8")) sys.stdout.write("%s\n" \ - % CFG_REFEXTRACT_XML_COLLECTION_OPEN.encode("utf-8")) + % CFG_REFEXTRACT_XML_COLLECTION_OPEN.encode("utf-8")) done_coltags = 1 if len(docbody) > 0: @@ -7581,64 +7562,89 @@ def begin_extraction(daemon_cli_options=None): ## treat entire input as relevant section: extract_lines = docbody else: - ## Always find the position of the start of the reference section. - ## This can be worked on, or stripped, later. - refs_start = get_reference_section_beginning(docbody) - ## launch search for the relevant section in the document body if extract_top_section_metadata: - ## Strip references. This will prevent analysing this section for authors. - if refs_start: - docbody = docbody[:refs_start['start_line']] - - (document_info, extract_error, how_found_start) = \ + (document_info, extract_error, author_type) = \ extract_top_document_information_from_fulltext(docbody, first_author=cli_opts['first_author']) - - if cli_opts['authors']: - extract_lines = document_info['authors'] - elif cli_opts['affiliations']: - extract_lines = document_info['affiliations'] else: (extract_lines, extract_error, how_found_start) = \ - extract_references_from_fulltext(docbody, refs_start) + extract_references_from_fulltext(docbody) + ## I want references! 
if not extract_top_section_metadata: if len(extract_lines) == 0 and extract_error == 0: extract_error = 6 write_message("-----extract_references_from_fulltext " \ "gave len(reflines): %s overall error: " \ "%s\n" \ - % (str(len(reflines)), str(extract_error)), verbose=2) - - ## 3. Standardise the reference lines: - #reflines = test_get_reference_lines() - (processed_references, count_misc, \ - count_title, count_reportnum, \ - count_url, count_doi, count_auth_group, \ - record_titles_count) = \ - create_marc_xml_reference_section(extract_lines, - preprint_repnum_search_kb=\ - preprint_reportnum_sre, - preprint_repnum_standardised_categs=\ - standardised_preprint_reportnum_categs, - periodical_title_search_kb=\ - title_search_kb, - standardised_periodical_titles=\ - title_search_standardised_titles, - periodical_title_search_keys=\ - title_search_keys) - - ## Add the count of 'bad titles' found in this line to the total - ## for the reference section: - all_found_titles_count = \ - sum_2_dictionaries(all_found_titles_count, \ - record_titles_count) - + % (str(len(extract_lines)), str(extract_error))) + if cli_opts['verbosity'] >= 4: + sys.stdout.write('-----reference lines extracted:\n%s\n\n' % extract_lines) + + ## 3. Standardise the reference lines: + (processed_lines, count_misc, \ + count_title, count_reportnum, \ + count_url, count_doi, count_auth_group, \ + record_titles_count) = \ + create_marc_xml_reference_section(extract_lines, + preprint_repnum_search_kb=\ + preprint_reportnum_sre, + preprint_repnum_standardised_categs=\ + standardised_preprint_reportnum_categs, + periodical_title_search_kb=\ + title_search_kb, + standardised_periodical_titles=\ + title_search_standardised_titles, + periodical_title_search_keys=\ + title_search_keys) + + ## Add the count of 'bad titles' found in this line to the total + ## for the reference section: + all_found_titles_count = \ + sum_2_dictionaries(all_found_titles_count, \ + record_titles_count) + ## I want authors/affiliations! + else: + ## Handle the xml processing separately, in the case that authors/ + ## affiliations are being extracted + if cli_opts['authors']: + extract_lines = document_info['authors'] + ## Assoiciate authors with their affiliations if possible + out_lines = mark_up_authors_with_affiliations(extract_lines) + else: + extract_lines = document_info['affiliations'] + ## Just the list of affiliations + out_lines = set([aff['line'] for aff in extract_lines]) + if not document_info and extract_error == 0: + extract_error = 6 + elif extract_error == 2: + extract_lines = [] + + if cli_opts['verbosity'] >= 1: + sys.stdout.write("-----author/affiliation extraction " \ + "gave len(extract_lines): %s overall error: " \ + "%s\n" \ + % (str(len(extract_lines)), str(extract_error))) + + processed_lines = [] + for l in out_lines: + (xml_line, \ + count_misc, \ + count_title, \ + count_reportnum, \ + count_url, \ + count_doi, \ + count_auth_group) = \ + convert_processed_line_to_marc_xml("", \ + l.replace('\n',''), \ + None, \ + None) + processed_lines.append(xml_line) else: ## document body is empty, therefore the reference section is empty: extract_lines = [] - processed_references = [] + processed_lines = [] ## 4. 
Display the extracted references, status codes, etc: if cli_opts['output_raw']: @@ -7672,12 +7678,13 @@ def begin_extraction(daemon_cli_options=None): count_misc, \ count_auth_group, \ recid, \ - processed_references) + processed_lines) ## Filter the processed reference lines to remove junk - out = filter_processed_references(out) ## Be sure to call this BEFORE compress_subfields - ## since filter_processed_references expects the - ## original xml format. + out = filter_processed_lines(out) ## Be sure to call this BEFORE compress_subfields + ## since filter_processed_lines expects the + ## original xml format. + ## Compress mulitple 'm' subfields in a datafield out = compress_subfields(out, CFG_REFEXTRACT_SUBFIELD_MISC) ## Compress multiple 'h' subfields in a datafield @@ -7698,13 +7705,13 @@ def begin_extraction(daemon_cli_options=None): if done_coltags: if (cli_opts['xmlfile']): ofilehdl.write("%s\n" \ - % CFG_REFEXTRACT_XML_COLLECTION_CLOSE.encode("utf-8")) + % CFG_REFEXTRACT_XML_COLLECTION_CLOSE.encode("utf-8")) ofilehdl.close() ## limit m tag data to something less than infinity limit_m_tags(cli_opts['xmlfile'], 2024) else: sys.stdout.write("%s\n" \ - % CFG_REFEXTRACT_XML_COLLECTION_CLOSE.encode("utf-8")) + % CFG_REFEXTRACT_XML_COLLECTION_CLOSE.encode("utf-8")) ## If the option to write the statistics about all periodical titles matched ## during the extraction-job was selected, do so using the specified file. @@ -7819,3 +7826,5 @@ def test_get_reference_lines(): ] return reflines +if __name__ == '__main__': + main() diff --git a/modules/bibedit/lib/refextract_config.py b/modules/bibedit/lib/refextract_config.py index 2054b05334..76277d7fe7 100644 --- a/modules/bibedit/lib/refextract_config.py +++ b/modules/bibedit/lib/refextract_config.py @@ -35,9 +35,17 @@ # authors which should be recognised as such CFG_REFEXTRACT_KB_AUTHORS = "%s/bibedit/refextract-authors.kb" % CFG_ETCDIR -## Institutions, paired with author and affiliation extraction -CFG_REFEXTRACT_INSTITUTIONS = ['CERN','DESY','Rutherford','Fermilab','SLAC','TRIUMF','Brookhaven Livermore','Argonne'] +## Lines holding key matches will be replaced with the value at extraction time +CFG_REFEXTRACT_INSTITUTION_REPLACEMENTS = {r'^Livermore' : 'LLNL, Livermore', \ + r'.*?Stanford Linear Accelerator Center.*?' : 'SLAC', \ + r'^Fermi National Accelerator Laboratory' : 'Fermilab'} +## Lines holding these institutions will be reduced solely to the institution at extraction time +CFG_REFEXTRACT_INSTITUTION_REDUCTIONS = ['CERN', 'DESY', 'Rutherford', 'Fermilab', 'SLAC', \ + 'TRIUMF', 'Brookhaven Livermore', 'Argonne'] + +## The allowable distance between consecutively numerated affiliations +## A small distance value could limit the number of numerated affiliations obtained (default: 2) CFG_REFEXTRACT_AFFILIATION_NUMERATION_ALLOWABLE_GAP = 2 ## MARC Fields and subfields used by refextract: From 95f6df158b3bc2ec8ada7ccb036138a97f7975fb Mon Sep 17 00:00:00 2001 From: Christopher Hayward Date: Tue, 30 Aug 2011 19:04:43 +0200 Subject: [PATCH 15/15] refextract: use appropriate marc tags * Places author and affiliation information into $100/$700, into subfields $a and $u respectively. * Cleans up comments and includes author-extraction related functions for the processing of xml output. 
* Updates refextract_tests with the correct call to the 'display xml lines' function (for references as opposed to authors/affiliations) --- modules/bibedit/lib/refextract.py | 484 ++++++++++++++++------- modules/bibedit/lib/refextract_config.py | 9 +- modules/bibedit/lib/refextract_tests.py | 20 +- 3 files changed, 365 insertions(+), 148 deletions(-) diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index 8e44462ea1..572fbb2c55 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -63,11 +63,16 @@ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL, \ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL, \ + CFG_REFEXTRACT_MARKER_CLOSING_AFFILIATION, \ CFG_REFEXTRACT_XML_VERSION, \ CFG_REFEXTRACT_XML_COLLECTION_OPEN, \ CFG_REFEXTRACT_XML_COLLECTION_CLOSE, \ CFG_REFEXTRACT_XML_RECORD_OPEN, \ - CFG_REFEXTRACT_XML_RECORD_CLOSE + CFG_REFEXTRACT_XML_RECORD_CLOSE, \ + CFG_REFEXTRACT_AE_SUBFIELD_AUTHOR, \ + CFG_REFEXTRACT_AE_SUBFIELD_AFFILIATION, \ + CFG_REFEXTRACT_AE_TAG_ID_HEAD_AUTHOR, \ + CFG_REFEXTRACT_AE_TAG_ID_TAIL_AUTHOR except ImportError: CFG_REFEXTRACT_VERSION = "Invenio/%s refextract/%s" % ('standalone', 'standalone') CFG_REFEXTRACT_KB_JOURNAL_TITLES = "%s/etc/refextract-journal-titles.kb" % '..' @@ -102,6 +107,7 @@ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND= r"" CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL= r"" CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL= r"" + CFG_REFEXTRACT_MARKER_CLOSING_AFFILIATION= r"" CFG_REFEXTRACT_XML_VERSION = u"""""" CFG_REFEXTRACT_XML_COLLECTION_OPEN = u"""""" CFG_REFEXTRACT_XML_COLLECTION_CLOSE = u"""\n""" @@ -980,7 +986,16 @@ def get_bad_char_replacements(): (\s\/)? ## optional / \> ## closing of tag (>) """, \ - re.UNICODE|re.VERBOSE) + re.UNICODE|re.VERBOSE) + +re_tagged_author_aff_line = re.compile(r""" + \ ## closing of tag (>) + """, \ + re.UNICODE|re.VERBOSE) ## is there pre-recognised numeration-tagging within a @@ -3010,10 +3025,10 @@ def create_marc_xml_reference_line(line_marker, count_url, \ count_doi, \ count_auth_group) = \ - convert_processed_line_to_marc_xml(line_marker, \ - tagged_line.replace('\n',''), \ - identified_dois, \ - identified_urls) + convert_processed_reference_line_to_marc_xml(line_marker, \ + tagged_line.replace('\n',''), \ + identified_dois, \ + identified_urls) return (xml_line, count_misc, count_title, \ count_reportnum, count_url, count_doi, count_auth_group) @@ -3112,7 +3127,7 @@ def check_author_for_ibid(line_elements,author): ## If an author does not need to be replicated for an ibid, append nothing to the xml line return "", author -def append_datafield_element(line_marker,citation_structure,line_elements,author,xml_line): +def append_datafield_element(line_marker, citation_structure, line_elements, author, xml_line): """ Finish the current datafield element and start a new one, with a new marker subfield. @param line_marker: (string) The line marker which will be the sole @@ -3145,6 +3160,30 @@ def append_datafield_element(line_marker,citation_structure,line_elements,author return xml_line, author +def start_auth_aff_datafield_element(first_author): + """ Construct the first line of the XML datafield element, + with the relevant datafield tag (depending on if it's + the first author-aff pair or not). + @param first_author: (boolean) Use the HEAD author tag + or the TAIL author tag. + @return: (string) The starting datafield line with the + appropriate tag. + """ + ## First author/affiliation? 
(use $100) + if first_author: + auth_tag = CFG_REFEXTRACT_AE_TAG_ID_HEAD_AUTHOR + + ## use $700 + else: + auth_tag = CFG_REFEXTRACT_AE_TAG_ID_TAIL_AUTHOR + + new_datafield = """ """ \ + % { 'df-tag-auth' : auth_tag, + } + + return new_datafield + + def start_datafield_element(line_marker): """ Start a brand new datafield element with a marker subfield. @param line_marker: (string) The line marker which will be the sole @@ -3153,13 +3192,18 @@ def start_datafield_element(line_marker): @return: (string) The string holding the relevant datafield and subfield tags. """ - new_datafield = """ + + marker_subfield = """ %(marker-val)s""" \ + % { 'sf-code-ref-marker' : CFG_REFEXTRACT_SUBFIELD_MARKER, + 'marker-val' : encode_for_xml(line_marker) + } + + new_datafield = """ %(marker-subfield)s""" \ % { 'df-tag-ref' : CFG_REFEXTRACT_TAG_ID_REFERENCE, 'df-ind1-ref' : CFG_REFEXTRACT_IND1_REFERENCE, 'df-ind2-ref' : CFG_REFEXTRACT_IND2_REFERENCE, - 'sf-code-ref-marker' : CFG_REFEXTRACT_SUBFIELD_MARKER, - 'marker-val' : encode_for_xml(line_marker) + 'marker-subfield' : marker_subfield } return new_datafield @@ -3239,8 +3283,53 @@ def dump_or_split_author(misc_txt,line_elements): return "" +def build_formatted_xml_author_affiliation_line(author_elements, first_author): + """ Given a single line, of either: + 1. auth + 2. auth, aff + 3. aff + Mark up into an xml form. No splitting heuristics are required, since all + authors and associated affiliations will form single lines. + @param author_elements: (list) The type of the item (affiliation or author) + and the items content (the author or affiliation string) + @param first_author: (boolean) Whether or not this is the first author-aff + pair to mark up, for this document. This will influence the datafield tag used + (100 or 700) + @return: (string) The XML version of the passed in author-aff elements. + """ + ## Begin the datafield element (no line marker) + xml_line = start_auth_aff_datafield_element(first_author) + + line_elements = [] + citation_structure = [] + elements_processed = 0 + + for element in author_elements: + + if element['type'] == "AUTH": + ## Add the author subfield with the author text + xml_line += """ + %(content)s""" \ + % { 'content' : encode_for_xml(element['content']).strip('()'), + 'sf-code-ref-auth' : CFG_REFEXTRACT_AE_SUBFIELD_AUTHOR, + } + elif element['type'] == "AFF": + ## Add the affiliation subfield with the affiliation text + xml_line += """ + %(content)s""" \ + % { 'content' : encode_for_xml(element['content']).strip('()'), + 'sf-code-ref-auth' : CFG_REFEXTRACT_AE_SUBFIELD_AFFILIATION, + } + line_elements.append(element) + + ## Close the ending datafield element + xml_line += """ + \n""" + + return xml_line + -def build_formatted_xml_citation(citation_elements,line_marker): +def build_formatted_xml_citation(citation_elements, line_marker): """ Create the MARC-XML string of the found reference information which was taken from a tagged reference line. 
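The commit message pins down the target MARC layout: the first author-affiliation pair goes into datafield 100 and the rest into 700, with the author in subfield $a and the affiliation in subfield $u. A sketch of the XML being assembled (indicator values are assumed blank here, since the template above was stripped of its markup):

    from xml.sax.saxutils import escape

    def author_aff_datafield(author, affiliation=None, first_author=False):
        ## First author -> datafield 100, subsequent authors -> 700.
        tag = '100' if first_author else '700'
        xml = ['<datafield tag="%s" ind1=" " ind2=" ">' % tag,
               '  <subfield code="a">%s</subfield>' % escape(author)]
        if affiliation:
            xml.append('  <subfield code="u">%s</subfield>' % escape(affiliation))
        xml.append('</datafield>')
        return '\n'.join(xml)

    print(author_aff_datafield('Smith, J.', 'CERN', first_author=True))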
@param citation_elements: (list) an ordered list of dictionary elements, @@ -3339,7 +3428,6 @@ def build_formatted_xml_citation(citation_elements,line_marker): ##TITLE if element['type'] == "TITLE": - ## If a report number has been marked up, and there's misc text before this title and the last tag if is_in_line_elements("REPORTNUMBER",line_elements) and \ (len(re.sub(re_arxiv_notation,"",(element['misc_txt'].lower().strip(".,:;- []")))) > 0): @@ -3481,7 +3569,7 @@ def build_formatted_xml_citation(citation_elements,line_marker): elif element['type'] == "AUTH": if element['auth_type'] != 'incl': - auth_choice = dump_or_split_author(element['misc_txt'],line_elements) + auth_choice = dump_or_split_author(element['misc_txt'], line_elements) if auth_choice == "dump": ## This author is no good, place it into misc text xml_line += """ @@ -3533,9 +3621,102 @@ def build_formatted_xml_citation(citation_elements,line_marker): return xml_line +def convert_processed_auth_aff_line_to_marc_xml(line, first_author): + """ Given a line holding either tagged authors, affiliations or both, convert it to its + MARC-XML representation. + + @param line: (string) The tagged author-affiliation line. The line may hold a + single author, an author and an affiliation, or an affiliation. + @return xml_line: (string) the MARC-XML representation of the tagged author/aff line + @return count_*: (integer) the number of * (pieces of info) found in the author/aff line. + """ + + count_auth = count_aff = 0 + xml_line = "" + processed_line = line + cur_misc_txt = u"" + + tag_match = re_tagged_author_aff_line.search(processed_line) + + # contains a list of dictionary entries of previously cited items + author_elements = [] + # the last tag element found when working from left-to-right across the line + identified_author_element = None + + while tag_match is not None: + + ## While there are tags inside this reference line... 
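The scanning loop itself is a left-to-right cursor over the tagged line. The marker strings below are illustrative (the real opening and closing tags come from the refextract config and were stripped from this listing), but the cut-and-advance logic is the same:

    import re

    re_open_tag = re.compile(r'<cds\.(AUTH|AFF)>')

    def walk_tagged_line(line):
        ## Find an opening tag, look for its closing counterpart, cut
        ## the content out, then continue scanning the remainder.
        elements = []
        m = re_open_tag.search(line)
        while m:
            closing = '</cds.%s>' % m.group(1)
            end = line.find(closing, m.end())
            if end == -1:
                line = line[m.end():]   ## no closing tag: drop the opener
            else:
                elements.append({'type': m.group(1),
                                 'content': line[m.end():end]})
                line = line[end + len(closing):]
            m = re_open_tag.search(line)
        return elements

    print(walk_tagged_line('<cds.AUTH>Smith, J.</cds.AUTH> <cds.AFF>CERN</cds.AFF>'))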
+ tag_match_start = tag_match.start() + tag_match_end = tag_match.end() + tag_type = tag_match.group(1) + cur_misc_txt += processed_line[0:tag_match_start] + + if tag_type.find("AUTH") != -1: + ## This tag is an identified Author: + ## extract the author from the line: + idx_closing_tag_nearest = processed_line.find(\ + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, tag_match_end) + + if idx_closing_tag_nearest == -1: + ## no closing tag found - strip the opening tag + ## and move past it + processed_line = processed_line[tag_match_end:] + identified_citation_element = None + else: + auth_txt = processed_line[tag_match_end:idx_closing_tag_nearest] + ## Now move past the ending tag in the line: + processed_line = processed_line[idx_closing_tag_nearest + \ + len(""):] + #SAVE the current misc text + identified_author_element = { 'type' : "AUTH", + 'content' : "%s" % auth_txt, + } + ## Increment the stats counters: + count_auth += 1 + cur_misc_txt = u"" + + elif tag_type.find("AFF") != -1: + ## This tag is an identified affiliation: + ## extract the affiliation from the line: + idx_closing_tag_nearest = processed_line.find(\ + CFG_REFEXTRACT_MARKER_CLOSING_AFFILIATION, tag_match_end) + + if idx_closing_tag_nearest == -1: + ## no closing tag found - strip the opening tag + ## and move past it + processed_line = processed_line[tag_match_end:] + identified_citation_element = None + else: + aff_txt = processed_line[tag_match_end:idx_closing_tag_nearest] + ## Now move past the ending tag in the line: + processed_line = processed_line[idx_closing_tag_nearest + \ + len(CFG_REFEXTRACT_MARKER_CLOSING_AFFILIATION):] + #SAVE the current misc text + identified_author_element = { 'type' : "AFF", + 'content' : "%s" % aff_txt, + } + ## Increment the stats counters: + count_aff += 1 + cur_misc_txt = u"" + + if identified_author_element != None: + ## Append the found tagged data and current misc text + author_elements.append(identified_author_element) + identified_author_element = None + ## Look for the next tag in the processed line: + tag_match = re_tagged_author_aff_line.search(processed_line) -def convert_processed_line_to_marc_xml(line_marker, + ## Now, run the method which will take as input: + ## 1. A list of dictionaries, where each dictionary is a author or an + ## affiliation. + xml_line = build_formatted_xml_author_affiliation_line(author_elements, first_author) + + ## return the reference-line as MARC XML: + return (xml_line, count_auth, count_aff) + + +def convert_processed_reference_line_to_marc_xml(line_marker, line, identified_dois, identified_urls): @@ -4864,49 +5045,49 @@ def get_post_author_section_keyword_patterns(): re_aff_num = re.compile(r"(^[\d]+[A-Z])") re_aff_name = re.compile(r"(univ|institut|laborator)", re.I) re_aff_univ = re.compile(r"univ[a-z]+\s+(of)?\s+([a-z\s\-]+)|([a-z\s\-]+)\s+(?!univ[a-z]+\sof)univ[a-z]+", re.I) - re_splitting_comma = re.compile(",[^\d]", re.UNICODE) def arrange_possible_authors(line, delimiter=None): + """Break a line according to a delimiter. Replace 'and' phrases + with the delimiter before splitting. + @param line: (string) The line containing possible authors. + @param delimiter: (char) A delimiter found when rearranging + numeration around characters. This rearranging took place + prior to this, and was used to partially repair pdftotext issues. + @return: (list) Broken up line. 
+ """ if not delimiter: delimiter = "," ## Replace and's with delimiter (comma as standard) delimited_line = re.sub(r"(^\s*|\s)([Aa][Nn][Dd]|&)\s", delimiter, line) - ## Split by commas -# possible_authors = re_splitting_comma.split(comma_split_line.strip()) + ## Split by delimiter possible_authors = delimited_line.split(delimiter) ## Remove empty stuff possible_authors = filter(lambda x: x.strip(), possible_authors) return possible_authors def gather_affiliated_authors_by_numeration(lines, aff_positions, number_to_find): - """Use the found affiliation to try and help with author extraction""" + """ Use the found affiliation to try and help with author extraction. + Using affiliation positions, and the number to find, look for authors above + the affiliations, by comparing the numeration found adjacent to authors. + An extraction procedure tends to spend the majority of its time inside this + function, if the number of numerated, affiliated authors is high. + @param lines: (list) The search space. + @param aff_positions: (list) Positions of already found affiliations. + @param number_to_find: (int) The number to find against authors. + @return: (tuple) of two lists, one holding numerated author matches, + and the other holding authors which resided on a line holding a numerated + author, and were split using some common, found delimiter. + """ def has_number(possible_auth, number_to_find): """Does this possible author have the numeration I want?""" (auth_nums, auth_num_match) = obtain_author_affiliation_numeration_list(possible_auth) return number_to_find in auth_nums def remove_excess_numeration(author_match): + """See function signature.""" return re.sub("^\d+|\d+$", "", author_match) -# def make_numerated_author_pattern(list_of_numerated_authors): -# patterns = [] -# for num_auth in list_of_numerated_authors: -# num_auth = re.sub("[a-z]", "[a-z]", num_auth) -# num_auth = re.sub("(?<=%s)+?%s" % (re.escape("[a-z][a-z]"), re.escape("[a-z]")), "+", num_auth) -# -# num_auth = re.sub("[A-Z]", "[A-Z]", num_auth) -# num_auth = re.sub("(?<=%s)+?%s" % (re.escape("[A-Z][A-Z]"), re.escape("[A-Z]")), "+", num_auth) -# -# num_auth = re.sub("[0-9]", "[0-9]", num_auth) -# num_auth = re.sub("(?<=%s)+?%s" % (re.escape("[0-9][0-9]"), re.escape("[0-9]")), "+", num_auth) -# -# num_auth = re.sub("[\-]", "", num_auth) -# num_auth = re.sub("\s", "\\s", num_auth) -# num_auth = re.escape(num_auth) -# patterns.append(num_auth) -# return patterns - ## Holds numerated authors. numerated_authors = [] all_split_authors = [] @@ -4928,7 +5109,6 @@ def remove_excess_numeration(author_match): ## make sure to save the rest of the split authors in this line. 
            if numerated_authors:
                all_split_authors.extend(possible_authors)
-#                numerated_author_patterns.extend(make_numerated_author_pattern(numerated_authors))
 
     return (map(remove_excess_numeration, numerated_authors), \
             map(remove_excess_numeration, all_split_authors))
@@ -5030,13 +5210,6 @@ def initiate_affiliated_author_search(affiliations, top_lines, aff_positions):
             if cli_opts['verbosity'] >= 7:
                 sys.stdout.write("----Found %d weak affiliated authors.\n" \
                                  % len(collected_line_above_authors))
 
-#        ## Check all numerated authors which were found
-#        all_numerated_authors = []
-#        all_numerated_authors.extend([a for a in cur_aff['author_data'] if a not in all_numerated_authors])
-#        if all_numerated_authors:
-#            ## Extend the standard set of authors, in the event numerated authors are found
-#            topline_standard_authors = collect_standard_authors(top_lines, 0)
-
     return (affiliations, loose_authors)
 
 def build_start_end_numeration_str(predefined_punct=None):
@@ -5143,7 +5316,6 @@ def extract_numerated_affiliations(num_data, num_find, missing):
         if num_find in aff_nums:
             ## Attempt to get numeration for this affiliation
             try:
-#                print "num with aff: %d" % num_find
                 num_find = num_find + 1
             except ValueError:
                 sys.stderr.write("Error: Unable to obtain integer affiliation numeration.")
@@ -5153,7 +5325,6 @@ def extract_numerated_affiliations(num_data, num_find, missing):
                               'line'        : reduce_affiliation_names(line),
                               'aff_nums'    : aff_nums,
                               'author_data' : None})
-#            print "--Found aff: %s" % line
 
         elif num_find in missing:
             ## Get the next non missing number and use that
@@ -5252,7 +5423,7 @@ def realign_numeration(toplines):
                     if line_ahead:
                         toplines_alternate[lookahead] = \
                             num_match.group(0).strip() + line_ahead
-#                        print "new line: %s" % toplines_alternate[lookahead]
+
                     ## Increment the next number to look for
                     num += 1
                     numeration_swaps += 1
@@ -5419,35 +5590,6 @@ def get_smaller(x, y):
     return (affiliations, loose_authors)
 
-def collect_standard_authors(top_lines, position=0, first=None):
-    """Obtain standard authors [recursive]
-    @param top_lines: (list) top lines of document
-    @param position: (int) position in top lines
-    @return: list holding the list of collected authors,
-    and the position of the last author line
-    """
-    author_matches = []
-    if position < len(top_lines):
-        line = top_lines[position]
-        ## Get all standard author matches for this line
-        author_matches = re_single_author_pattern_with_numeration.search(line)
-        author_matches_alt = \
-            re_single_author_pattern_with_numeration.finditer(realign_shifted_line_numeration_around_commas(line))
-        if author_matches or author_matches_alt:
-            if first is None:
-                first = position
-
-            ## Recurse on the next position
-            (more_author_matches, first, position) = collect_standard_authors(top_lines, position+1, first)
-
-            if len(author_matches) > len(author_matches_alt):
-                ## Save the matching strings in a list
-                author_matches.extend(more_author_matches)
-            else:
-                author_matches_alt.extent(more_author_matches)
-    ## Authors for this line
-    return (author_matches, first, position-1)
-
 def collect_tagged_authors(top_section, position, first_line=None, \
                            orig_blank_lines=None, cur_blank_lines=None):
     """Recursively try to obtain authors after an 'author tag' has been
@@ -5860,6 +6002,44 @@ def check_for_end_of_author_section_match_keywords(docbody):
 
     return (document_information, status, chosen_type)
 
+def mark_up_affiliation(affiliation):
+    """ Tag a string with the affiliation markers.
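+        e.g. (an editorial illustration, assuming the <cds.AFF> marker pair):
+        " CERN, Geneva. " -> "<cds.AFF>CERN, Geneva</cds.AFF>"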
+ """ + def process_aff(a): + """Remove unacceptable end characters.""" + a = replace_undesirable_characters(a).strip(".,:;- []()*\\") + return a + + processed_aff = process_aff(affiliation) + + tagged_aff = "" + if processed_aff: + tagged_aff = "%s%s%s" % ("", \ + processed_aff, \ + CFG_REFEXTRACT_MARKER_CLOSING_AFFILIATION) + return tagged_aff + +def mark_up_affiliations(affiliations): + """ Tag a set of lines as affiliations. Note the first + affiliation too. + @param affiliations: (list) Strings which should be marked up + as affiliations. + @return: (list) of tuples. Holding a boolean, as to whether or not + this affiliation or author is the first one in the list. + """ + + tagged_affiliations = [] + + is_first_aff = True + for a in affiliations: + marked_up_aff = mark_up_affiliation(a) + if marked_up_aff: + tagged_affiliations.append((is_first_aff, marked_up_aff)) + if is_first_aff: + is_first_aff = False + + return tagged_affiliations + def mark_up_authors_with_affiliations(final_authors): """ Prepare authors and any possible associated affiliations into marked-up (tagged) lines according to identified authors. @@ -5878,22 +6058,25 @@ def process_authors(a): a = replace_undesirable_characters(a).strip(".,:;- []()*\\") return a - def process_aff(a): - a = replace_undesirable_characters(a).strip(".,:;- []()*\\") - return a + is_first_author = True for aff_auth_dict in final_authors: for authors in aff_auth_dict['authors']: ## Otherwise the closing element tag dissappears (!?) if authors: if not aff_auth_dict['affiliation']: - aff_for_authors = '' + aff_for_authors = "" else: - aff_for_authors = aff_auth_dict['affiliation'] - tagged_authors.append("%s%s%s%s" % ("", \ - process_authors(authors), \ - CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \ - process_aff(aff_for_authors))) + ## Use the affiliation tags to tag this affiliation + aff_for_authors = mark_up_affiliation(aff_auth_dict['affiliation']) + ## Tag authors, and any of their associated affiliations + tagged_authors.append((is_first_author, "%s%s%s%s" % ("", \ + process_authors(authors), \ + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \ + aff_for_authors))) + if is_first_author: + is_first_author = False + return tagged_authors def choose_author_method(tagged_info, std_info, aff_info, \ @@ -5911,8 +6094,9 @@ def choose_author_method(tagged_info, std_info, aff_info, \ @param tagged_authors: (list) List of purely tagged authors. @param std_authors: (list) List of purely standard-matched authors. @param aff_authors: (list) List of purely affiliated authors. - @return: (dict) Affiliation and author information which is deemed to be - the most accurate for the document. + @return: (tuple) Affiliation and author information which is deemed to be + the most accurate for the document, and the type used -- + (standard [2] or affiliated [3]). """ ## Immediately discard non-sets of authors (hold duplicate entries) @@ -5994,7 +6178,6 @@ def choose_author_method(tagged_info, std_info, aff_info, \ return (std_info, 2) else: return (aff_info, 3) -# return ((std_info or aff_info), 2) else: if cli_opts['verbosity'] >= 4: sys.stdout.write("---Choosing affiliated over standard authors.\n") @@ -6002,7 +6185,6 @@ def choose_author_method(tagged_info, std_info, aff_info, \ return (aff_info, 3) else: return (std_info, 2) -# return ((aff_info or std_info), 3) def find_reference_section(docbody): """Search in document body for its reference section. 
       More precisely, find
@@ -7288,9 +7470,6 @@ def get_cli_options():
             cli_opts['affiliations'] = 1
         elif o[0] in ("--first_author"):
             cli_opts['first_author'] = 1
-    if len(myargs) == 0:
-        ## no arguments: error message
-        usage(wmsg="Error: no full-text.")
 
     # What journal title format are we using?
     if cli_opts['verbosity'] > 0 and cli_opts['inspire']:
@@ -7311,8 +7490,38 @@ def get_cli_options():
     return (cli_opts, myargs)
 
-def display_xml_record(status_code, count_reportnum, count_title, count_url,
-                       count_doi, count_misc, count_auth_group, recid, xml_lines):
+def display_auth_aff_xml_record(recid, xml_lines):
+    """ Wraps XML lines holding extracted authors and affiliations
+        with the necessary record and controlfield elements.
+        @param recid: (int) record id of the document being extracted.
+        @param xml_lines: (list) of xml holding annotated authors
+        and affiliation information.
+        @return: (string) the XML lines wrapped in the surrounding
+        record elements.
+    """
+    ## Start with the opening record tag:
+    out = u"%(record-open)s\n" \
+          % { 'record-open' : CFG_REFEXTRACT_XML_RECORD_OPEN, }
+
+    ## Display the record-id controlfield:
+    out += \
+        u"""   <controlfield tag="%(cf-tag-recid)s">%(recid)s</controlfield>\n""" \
+        % { 'cf-tag-recid' : CFG_REFEXTRACT_CTRL_FIELD_RECID,
+            'recid'        : encode_for_xml(recid),
+          }
+
+    ## Loop through all xml lines and add them to the output string:
+    for line in xml_lines:
+        out += line
+
+    ## Now add the closing tag to the record:
+    out += u"%(record-close)s\n" \
+           % { 'record-close' : CFG_REFEXTRACT_XML_RECORD_CLOSE, }
+
+    return out
+
+
+def display_references_xml_record(status_code, count_reportnum, count_title, count_url,
+                                  count_doi, count_misc, count_auth_group, recid, xml_lines):
     """Given a series of MARC XML-ized reference lines and a record-id, write a
        MARC XML record to the stdout stream. Include in the record some stats
        for the extraction job.
@@ -7456,9 +7665,11 @@ def begin_extraction(daemon_cli_options=None):
         ## no files provided for reference extraction - error message
         usage(wmsg="Error: No valid input file specified (-f id:file [-f id:file ...])")
 
-    ## Don't parse the knowledge bases if authors/affiliations are being extracted
-    if not cli_opts['authors'] and not cli_opts['affiliations']:
+    ## Is top-section (author/affiliation) metadata wanted?
+    extract_top_section_metadata = cli_opts['authors'] or cli_opts['affiliations']
 
+    ## Don't parse the knowledge bases if authors/affiliations are being extracted
+    if not extract_top_section_metadata:
         ## Read the journal titles knowledge base, creating the search
         ## patterns and replace terms. Check for user-specified journal kb.
         if cli_opts['kb-journal'] != 0:
@@ -7512,8 +7723,6 @@ def begin_extraction(daemon_cli_options=None):
         write_message("--- processing RecID: %s pdffile: %s; %s\n" \
                       % (str(curitem[0]), curitem[1], ctime()), verbose=2)
 
-        extract_top_section_metadata = cli_opts['authors'] or cli_opts['affiliations']
-
        ## 1. Get this document body as plaintext:
        (docbody, extract_error) = \
                  get_plaintext_document_body(curitem[1], \
@@ -7574,7 +7783,7 @@ def begin_extraction(daemon_cli_options=None):
            if not extract_top_section_metadata:
                if len(extract_lines) == 0 and extract_error == 0:
                    extract_error = 6
-                write_message("-----extract_references_from_fulltext " \
+                write_message("-----extract_references_from_fulltext " \
                              "gave len(reflines): %s overall error: " \
                              "%s\n" \
                              % (str(len(extract_lines)), str(extract_error)))
@@ -7609,12 +7818,12 @@ def begin_extraction(daemon_cli_options=None):
                ## affiliations are being extracted
                if cli_opts['authors']:
                    extract_lines = document_info['authors']
-                    ## Assoiciate authors with their affiliations if possible
+                    ## Associate authors with their affiliations if possible
                    out_lines = mark_up_authors_with_affiliations(extract_lines)
                else:
                    extract_lines = document_info['affiliations']
                    ## Just the list of affiliations
-                    out_lines = set([aff['line'] for aff in extract_lines])
+                    out_lines = mark_up_affiliations(set([aff['line'] for aff in extract_lines]))
 
                if not document_info and extract_error == 0:
                    extract_error = 6
@@ -7628,19 +7837,14 @@ def begin_extraction(daemon_cli_options=None):
                              % (str(len(extract_lines)), str(extract_error)))
 
                processed_lines = []
-                for l in out_lines:
+                for first_auth_aff, l in out_lines:
                    (xml_line, \
-                     count_misc, \
-                     count_title, \
-                     count_reportnum, \
-                     count_url, \
-                     count_doi, \
-                     count_auth_group) = \
-                     convert_processed_line_to_marc_xml("", \
-                                                        l.replace('\n',''), \
-                                                        None, \
-                                                        None)
+                     count_auth, \
+                     count_aff) = \
+                     convert_processed_auth_aff_line_to_marc_xml(l.replace('\n',''), \
+                                                                 first_auth_aff)
                    processed_lines.append(xml_line)
+
        else:
            ## document body is empty, therefore the reference section is empty:
            extract_lines = []
@@ -7664,31 +7868,37 @@ def begin_extraction(daemon_cli_options=None):
            ## urls found greatly increases the level of rubbish accepted..
            if count_reportnum + count_title == 0 and how_found_start > 2:
                count_misc = count_url = count_doi = count_auth_group = 0
-                processed_references = []
-                write_message("-----Found ONLY miscellaneous/Urls so removed it how_found_start= %d\n" \
-                              % (how_found_start), verbose=2)
-            elif count_reportnum + count_title > 0 and how_found_start > 2:
-                write_message("-----Found journals/reports with how_found_start= %d\n" % (how_found_start), verbose=2)
 
-            ## Display the processed reference lines:
-            out = display_xml_record(extract_error, \
-                                     count_reportnum, \
-                                     count_title, \
-                                     count_url, \
-                                     count_doi, \
-                                     count_misc, \
-                                     count_auth_group, \
-                                     recid, \
-                                     processed_lines)
-
-            ## Filter the processed reference lines to remove junk
-            out = filter_processed_lines(out) ## Be sure to call this BEFORE compress_subfields
-                                              ## since filter_processed_lines expects the
-                                              ## original xml format.
-
-            ## Compress mulitple 'm' subfields in a datafield
-            out = compress_subfields(out, CFG_REFEXTRACT_SUBFIELD_MISC)
-            ## Compress multiple 'h' subfields in a datafield
-            out = compress_subfields(out, CFG_REFEXTRACT_SUBFIELD_AUTH)
+                processed_lines = []
+                if cli_opts['verbosity'] >= 1:
+                    sys.stdout.write("-----Found ONLY miscellaneous/Urls so removed it how_found_start= %d\n" % (how_found_start))
+            elif count_reportnum + count_title > 0 and how_found_start > 2:
+                if cli_opts['verbosity'] >= 1:
+                    sys.stdout.write("-----Found journals/reports with how_found_start= %d\n" % (how_found_start))
+
+            if extract_top_section_metadata:
+                out = display_auth_aff_xml_record(recid, \
+                                                  processed_lines)
+            else:
+                ## Display the processed reference lines:
+                out = display_references_xml_record(extract_error, \
+                                                    count_reportnum, \
+                                                    count_title, \
+                                                    count_url, \
+                                                    count_doi, \
+                                                    count_misc, \
+                                                    count_auth_group, \
+                                                    recid, \
+                                                    processed_lines)
+
+                ## Filter the processed reference lines to remove junk
+                out = filter_processed_lines(out) ## Be sure to call this BEFORE compress_subfields
+                                                  ## since filter_processed_lines expects the
+                                                  ## original xml format.
+
+                ## Compress multiple 'm' subfields in a datafield
+                out = compress_subfields(out, CFG_REFEXTRACT_SUBFIELD_MISC)
+                ## Compress multiple 'h' subfields in a datafield
+                out = compress_subfields(out, CFG_REFEXTRACT_SUBFIELD_AUTH)
 
        lines = out.split('\n')
        write_message("-----display_xml_record gave: %s significant " \
diff --git a/modules/bibedit/lib/refextract_config.py b/modules/bibedit/lib/refextract_config.py
index 76277d7fe7..8142db363f 100644
--- a/modules/bibedit/lib/refextract_config.py
+++ b/modules/bibedit/lib/refextract_config.py
@@ -64,12 +64,17 @@
 CFG_REFEXTRACT_SUBFIELD_URL_DESCR = "z"          ## ref url-text subfield
 CFG_REFEXTRACT_SUBFIELD_AUTH = "h"               ## ref author subfield
 
-## refextract statisticts fields:
+## refextract statistics fields:
 CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS = "999"   ## ref-stats tag
 CFG_REFEXTRACT_IND1_EXTRACTION_STATS = "C"       ## ref-stats ind1
 CFG_REFEXTRACT_IND2_EXTRACTION_STATS = "6"       ## ref-stats ind2
 CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS = "a"   ## ref-stats subfield
 
+## refextract author-extraction fields:
+CFG_REFEXTRACT_AE_TAG_ID_HEAD_AUTHOR = "100"     ## first author-aff details
+CFG_REFEXTRACT_AE_TAG_ID_TAIL_AUTHOR = "700"     ## remaining author-affs
+CFG_REFEXTRACT_AE_SUBFIELD_AUTHOR = "a"          ## authors subfield
+CFG_REFEXTRACT_AE_SUBFIELD_AFFILIATION = "u"     ## affiliations subfield
 
 ## Internal tags are used by refextract to mark-up recognised citation
These are the "closing tags: @@ -86,6 +91,8 @@ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL= r"" CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL= r"" +CFG_REFEXTRACT_MARKER_CLOSING_AFFILIATION= r"" + ## XML Record and collection opening/closing tags: CFG_REFEXTRACT_XML_VERSION = u"""""" CFG_REFEXTRACT_XML_COLLECTION_OPEN = u"""""" diff --git a/modules/bibedit/lib/refextract_tests.py b/modules/bibedit/lib/refextract_tests.py index 71631a4931..675b753efd 100644 --- a/modules/bibedit/lib/refextract_tests.py +++ b/modules/bibedit/lib/refextract_tests.py @@ -29,7 +29,7 @@ create_marc_xml_reference_section, \ build_titles_knowledge_base, \ build_reportnum_knowledge_base, \ - display_xml_record, \ + display_references_xml_record, \ compress_subfields, \ restrict_m_subfields, \ cli_opts @@ -72,15 +72,15 @@ def extract_references(self, reference_lines): title_search_keys) # Generate the xml string to be outputted - tmp_out = display_xml_record(0, \ - count_reportnum, \ - count_title, \ - count_url, \ - count_doi, \ - count_misc, \ - count_auth_group, \ - self.rec_id, \ - processed_references) + tmp_out = display_references_xml_record(0, \ + count_reportnum, \ + count_title, \ + count_url, \ + count_doi, \ + count_misc, \ + count_auth_group, \ + self.rec_id, \ + processed_references) # Remove redundant misc subfields (m_restricted, ref_lines) = restrict_m_subfields(tmp_out.split('\n'))