Skip to content

Commit

Permalink
fill up the layout token lists of the header's part when processing
Browse files Browse the repository at this point in the history
and add getters for the title, authors and abstract layout token lists
  • Loading branch information
lfoppiano committed Apr 21, 2021
1 parent 0b42a3c commit 0306aa0
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 107 deletions.
24 changes: 17 additions & 7 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public class BiblioItem {
private List<BoundingBox> coordinates = null;

// map of labels (e.g. <title> or <abstract>) to LayoutToken
private Map<String, List<LayoutToken>> labeledTokens;
private Map<String, List<LayoutToken>> labeledTokens = new HashMap<>();

private List<LayoutToken> titleLayoutTokens = new ArrayList<>();
private List<LayoutToken> authorsLayoutTokens = new ArrayList<>();
Expand Down Expand Up @@ -4556,7 +4556,7 @@ public boolean rejectAsReference() {
// normally properties authors and authorList are null in the current Grobid version
if (!titleSet && !authorSet && (url == null) && (doi == null))
return true;
else
else
return false;
}

Expand Down Expand Up @@ -4606,7 +4606,7 @@ public void setLayoutTokensForLabel(List<LayoutToken> tokens, TaggingLabel heade
labeledTokens.put(headerLabel.getLabel(), tokens);
}

public void generalResultMapping(Document doc, String labeledResult, List<LayoutToken> tokenizations) {
public void generalResultMapping(String labeledResult, List<LayoutToken> tokenizations) {
if (labeledTokens == null)
labeledTokens = new TreeMap<>();

Expand All @@ -4624,10 +4624,8 @@ public void generalResultMapping(Document doc, String labeledResult, List<Layout
List<LayoutToken> clusterTokens = cluster.concatTokens();
List<LayoutToken> theList = labeledTokens.get(clusterLabel.getLabel());

if (theList == null)
theList = new ArrayList<>();
for (LayoutToken token : clusterTokens)
theList.add(token);
theList = theList == null ? new ArrayList<>() : theList;
theList.addAll(clusterTokens);
labeledTokens.put(clusterLabel.getLabel(), theList);
}
}
Expand All @@ -4636,6 +4634,18 @@ public void addTitleTokens(List<LayoutToken> layoutTokens) {
this.titleLayoutTokens.addAll(layoutTokens);
}

/**
 * Returns the {@link LayoutToken} sequence labelled as title, collected during
 * header processing. The returned list is the live internal list (initialized
 * to an empty list, never null) — callers must not assume it is a copy.
 *
 * @return the layout tokens of the title field, possibly empty
 */
public List<LayoutToken> getTitleLayoutTokens() {
    return this.titleLayoutTokens;
}

/**
 * Returns the {@link LayoutToken} sequence labelled as authors, collected during
 * header processing. The returned list is the live internal list (initialized
 * to an empty list, never null) — callers must not assume it is a copy.
 *
 * @return the layout tokens of the authors field, possibly empty
 */
public List<LayoutToken> getAuthorsLayoutTokens() {
    return this.authorsLayoutTokens;
}

/**
 * Returns the {@link LayoutToken} sequence labelled as abstract, collected during
 * header processing. The returned list is the live internal list — callers must
 * not assume it is a copy.
 *
 * @return the layout tokens of the abstract field
 */
public List<LayoutToken> getAbstractLayoutTokens() {
    return this.abstractLayoutTokens;
}

/**
 * Appends the given layout tokens to the accumulated authors token list.
 *
 * <p>A {@code null} argument is ignored: {@link List#addAll(java.util.Collection)}
 * would otherwise throw a {@link NullPointerException} on a null collection.
 *
 * @param layoutTokens tokens to append to the authors field; may be null or empty
 */
public void addAuthorsTokens(List<LayoutToken> layoutTokens) {
    if (layoutTokens != null) {
        this.authorsLayoutTokens.addAll(layoutTokens);
    }
}
Expand Down
209 changes: 109 additions & 100 deletions grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.grobid.core.engines;

import com.google.common.base.Splitter;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.grobid.core.GrobidModels;
Expand Down Expand Up @@ -56,6 +57,8 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.apache.commons.collections4.CollectionUtils.isNotEmpty;

/**
* @author Patrice Lopez
*/
Expand Down Expand Up @@ -115,7 +118,7 @@ public Pair<String, Document> processing(File input, BiblioItem resHeader, Grobi
*/
public String processingHeaderSection(GrobidAnalysisConfig config, Document doc, BiblioItem resHeader, boolean serialize) {
try {
SortedSet<DocumentPiece> documentHeaderParts = documentHeaderParts = doc.getDocumentPart(SegmentationLabels.HEADER);
SortedSet<DocumentPiece> documentHeaderParts = doc.getDocumentPart(SegmentationLabels.HEADER);
List<LayoutToken> tokenizations = doc.getTokenizations();

if (documentHeaderParts != null) {
Expand All @@ -137,7 +140,7 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,
String header = featuredHeader.getLeft();
List<LayoutToken> headerTokenization = featuredHeader.getRight();
String res = null;
if ((header != null) && (header.trim().length() > 0)) {
if (StringUtils.isNotBlank(header)) {
res = label(header);
resHeader = resultExtraction(res, headerTokenization, resHeader, doc);
}
Expand Down Expand Up @@ -179,120 +182,119 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,
resHeader.setLanguage(lang);
}

if (resHeader != null) {
if (resHeader.getAbstract() != null) {
resHeader.setAbstract(TextUtilities.dehyphenizeHard(resHeader.getAbstract()));
//resHeader.setAbstract(TextUtilities.dehyphenize(resHeader.getAbstract()));
}
BiblioItem.cleanTitles(resHeader);
if (resHeader.getTitle() != null) {
// String temp =
// utilities.dehyphenizeHard(resHeader.getTitle());
String temp = TextUtilities.dehyphenize(resHeader.getTitle());

if (resHeader.getAbstract() != null) {
resHeader.setAbstract(TextUtilities.dehyphenizeHard(resHeader.getAbstract()));
//resHeader.setAbstract(TextUtilities.dehyphenize(resHeader.getAbstract()));
}
BiblioItem.cleanTitles(resHeader);
if (resHeader.getTitle() != null) {
// String temp =
// utilities.dehyphenizeHard(resHeader.getTitle());
String temp = TextUtilities.dehyphenize(resHeader.getTitle());
temp = temp.trim();
if (temp.length() > 1) {
if (temp.startsWith("1"))
temp = temp.substring(1, temp.length());
temp = temp.trim();
if (temp.length() > 1) {
if (temp.startsWith("1"))
temp = temp.substring(1, temp.length());
temp = temp.trim();
}
resHeader.setTitle(temp);
}
if (resHeader.getBookTitle() != null) {
resHeader.setBookTitle(TextUtilities.dehyphenize(resHeader.getBookTitle()));
}
resHeader.setTitle(temp);
}
if (resHeader.getBookTitle() != null) {
resHeader.setBookTitle(TextUtilities.dehyphenize(resHeader.getBookTitle()));
}

resHeader.setOriginalAuthors(resHeader.getAuthors());
resHeader.getAuthorsTokens();

boolean fragmentedAuthors = false;
boolean hasMarker = false;
List<Integer> authorsBlocks = new ArrayList<Integer>();
List<List<LayoutToken>> authorSegments = new ArrayList<>();
if (resHeader.getAuthorsTokens() != null) {
// split the list of layout tokens when token "\t" is met
List<LayoutToken> currentSegment = new ArrayList<>();
for(LayoutToken theToken : resHeader.getAuthorsTokens()) {
if (theToken.getText() != null && theToken.getText().equals("\t")) {
if (currentSegment.size() > 0)
authorSegments.add(currentSegment);
currentSegment = new ArrayList<>();
} else
currentSegment.add(theToken);
}
// last segment
if (currentSegment.size() > 0)
authorSegments.add(currentSegment);
resHeader.setOriginalAuthors(resHeader.getAuthors());
resHeader.getAuthorsTokens();

boolean fragmentedAuthors = false;
boolean hasMarker = false;
List<Integer> authorsBlocks = new ArrayList<Integer>();
List<List<LayoutToken>> authorSegments = new ArrayList<>();
if (resHeader.getAuthorsTokens() != null) {
// split the list of layout tokens when token "\t" is met
List<LayoutToken> currentSegment = new ArrayList<>();
for(LayoutToken theToken : resHeader.getAuthorsTokens()) {
if (theToken.getText() != null && theToken.getText().equals("\t")) {
if (currentSegment.size() > 0)
authorSegments.add(currentSegment);
currentSegment = new ArrayList<>();
} else
currentSegment.add(theToken);
}
// last segment
if (currentSegment.size() > 0)
authorSegments.add(currentSegment);

if (authorSegments.size() > 1) {
fragmentedAuthors = true;
}
for (int k = 0; k < authorSegments.size(); k++) {
if (authorSegments.get(k).size() == 0)
continue;
List<Person> localAuthors = parsers.getAuthorParser()
.processingHeaderWithLayoutTokens(authorSegments.get(k), doc.getPDFAnnotations());
if (localAuthors != null) {
for (Person pers : localAuthors) {
resHeader.addFullAuthor(pers);
if (pers.getMarkers() != null) {
hasMarker = true;
}
authorsBlocks.add(k);
if (authorSegments.size() > 1) {
fragmentedAuthors = true;
}
for (int k = 0; k < authorSegments.size(); k++) {
if (authorSegments.get(k).size() == 0)
continue;
List<Person> localAuthors = parsers.getAuthorParser()
.processingHeaderWithLayoutTokens(authorSegments.get(k), doc.getPDFAnnotations());
if (localAuthors != null) {
for (Person pers : localAuthors) {
resHeader.addFullAuthor(pers);
if (pers.getMarkers() != null) {
hasMarker = true;
}
authorsBlocks.add(k);
}
}
}
}


// remove invalid authors (no last name, noise, etc.)
resHeader.setFullAuthors(Person.sanityCheck(resHeader.getFullAuthors()));

resHeader.setFullAffiliations(
parsers.getAffiliationAddressParser().processReflow(res, tokenizations));
resHeader.attachEmails();
boolean attached = false;
if (fragmentedAuthors && !hasMarker) {
if (resHeader.getFullAffiliations() != null) {
if (authorSegments != null) {
if (resHeader.getFullAffiliations().size() == authorSegments.size()) {
int k = 0;
List<Person> persons = resHeader.getFullAuthors();
for (Person pers : persons) {
if (k < authorsBlocks.size()) {
int indd = authorsBlocks.get(k);
if (indd < resHeader.getFullAffiliations().size()) {
pers.addAffiliation(resHeader.getFullAffiliations().get(indd));
}
}
k++;
// remove invalid authors (no last name, noise, etc.)
resHeader.setFullAuthors(Person.sanityCheck(resHeader.getFullAuthors()));

resHeader.setFullAffiliations(
parsers.getAffiliationAddressParser().processReflow(res, tokenizations));
resHeader.attachEmails();
boolean attached = false;
if (fragmentedAuthors && !hasMarker) {
if (resHeader.getFullAffiliations() != null) {
if (resHeader.getFullAffiliations().size() == authorSegments.size()) {
int k = 0;
List<Person> persons = resHeader.getFullAuthors();
for (Person pers : persons) {
if (k < authorsBlocks.size()) {
int indd = authorsBlocks.get(k);
if (indd < resHeader.getFullAffiliations().size()) {
pers.addAffiliation(resHeader.getFullAffiliations().get(indd));
}
attached = true;
resHeader.setFullAffiliations(null);
resHeader.setAffiliation(null);
}
k++;
}
attached = true;
resHeader.setFullAffiliations(null);
resHeader.setAffiliation(null);
}
}
if (!attached) {
resHeader.attachAffiliations();
}

// remove duplicated authors
resHeader.setFullAuthors(Person.deduplicate(resHeader.getFullAuthors()));
}
if (!attached) {
resHeader.attachAffiliations();
}

if (resHeader.getEditors() != null) {
// TBD: consider segments also for editors, like for authors above
resHeader.setFullEditors(parsers.getAuthorParser().processingHeader(resHeader.getEditors()));
}
// remove duplicated authors
resHeader.setFullAuthors(Person.deduplicate(resHeader.getFullAuthors()));

// below using the reference strings to improve the metadata extraction, it will have to
// be reviewed for something safer as just a straightforward correction
/*if (resHeader.getReference() != null) {
BiblioItem refer = parsers.getCitationParser().processingString(resHeader.getReference(), 0);
BiblioItem.correct(resHeader, refer);
}*/
if (resHeader.getEditors() != null) {
// TBD: consider segments also for editors, like for authors above
resHeader.setFullEditors(parsers.getAuthorParser().processingHeader(resHeader.getEditors()));
}

// below using the reference strings to improve the metadata extraction, it will have to
// be reviewed for something safer as just a straightforward correction
/*if (resHeader.getReference() != null) {
BiblioItem refer = parsers.getCitationParser().processingString(resHeader.getReference(), 0);
BiblioItem.correct(resHeader, refer);
}*/


// keyword post-processing
if (resHeader.getKeyword() != null) {
String keywords = TextUtilities.dehyphenize(resHeader.getKeyword());
Expand All @@ -305,10 +307,8 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,

// DOI pass
List<String> dois = doc.getDOIMatches();
if (dois != null) {
if ((dois.size() == 1) && (resHeader != null)) {
resHeader.setDOI(dois.get(0));
}
if (isNotEmpty(dois) && dois.size() == 1) {
resHeader.setDOI(dois.get(0));
}

resHeader = consolidateHeader(resHeader, config.getConsolidateHeader());
Expand Down Expand Up @@ -822,6 +822,15 @@ public BiblioItem resultExtraction(String result, List<LayoutToken> tokenization

String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
String clusterNonDehypenizedContent = LayoutTokensUtil.toText(cluster.concatTokens());

List<LayoutToken> clusterTokens = cluster.concatTokens();
List<LayoutToken> theList = biblio.getLabeledTokens().get(clusterLabel.getLabel());

theList = theList == null ? new ArrayList<>() : theList;
theList.addAll(clusterTokens);

biblio.getLabeledTokens().put(clusterLabel.getLabel(), theList);

if (clusterLabel.equals(TaggingLabels.HEADER_TITLE)) {
/*if (biblio.getTitle() != null && isDifferentContent(biblio.getTitle(), clusterContent))
biblio.setTitle(biblio.getTitle() + clusterContent);
Expand Down

0 comments on commit 0306aa0

Please sign in to comment.