Skip to content

Commit

Permalink
fill up the layout token lists of the header's part when processing
Browse files Browse the repository at this point in the history
and add getters for the title, authors and abstract layout token lists
  • Loading branch information
lfoppiano committed Apr 21, 2021
1 parent 0b42a3c commit 0306aa0
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 107 deletions.
24 changes: 17 additions & 7 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public class BiblioItem {
private List<BoundingBox> coordinates = null;

// map of labels (e.g. <title> or <abstract>) to LayoutToken
private Map<String, List<LayoutToken>> labeledTokens;
private Map<String, List<LayoutToken>> labeledTokens = new HashMap<>();

private List<LayoutToken> titleLayoutTokens = new ArrayList<>();
private List<LayoutToken> authorsLayoutTokens = new ArrayList<>();
Expand Down Expand Up @@ -4556,7 +4556,7 @@ public boolean rejectAsReference() {
// normally properties authors and authorList are null in the current Grobid version
if (!titleSet && !authorSet && (url == null) && (doi == null))
return true;
else
else
return false;
}

Expand Down Expand Up @@ -4606,7 +4606,7 @@ public void setLayoutTokensForLabel(List<LayoutToken> tokens, TaggingLabel heade
labeledTokens.put(headerLabel.getLabel(), tokens);
}

public void generalResultMapping(Document doc, String labeledResult, List<LayoutToken> tokenizations) {
public void generalResultMapping(String labeledResult, List<LayoutToken> tokenizations) {
if (labeledTokens == null)
labeledTokens = new TreeMap<>();

Expand All @@ -4624,10 +4624,8 @@ public void generalResultMapping(Document doc, String labeledResult, List<Layout
List<LayoutToken> clusterTokens = cluster.concatTokens();
List<LayoutToken> theList = labeledTokens.get(clusterLabel.getLabel());

if (theList == null)
theList = new ArrayList<>();
for (LayoutToken token : clusterTokens)
theList.add(token);
theList = theList == null ? new ArrayList<>() : theList;
theList.addAll(clusterTokens);
labeledTokens.put(clusterLabel.getLabel(), theList);
}
}
Expand All @@ -4636,6 +4634,18 @@ public void addTitleTokens(List<LayoutToken> layoutTokens) {
this.titleLayoutTokens.addAll(layoutTokens);
}

/**
 * Returns the {@link LayoutToken} sequence labelled as title, collected during
 * header processing. The returned list is the live internal list (initialized
 * to an empty list, never null) — callers must not assume it is a copy.
 *
 * @return the layout tokens of the title field, possibly empty
 */
public List<LayoutToken> getTitleLayoutTokens() {
    return this.titleLayoutTokens;
}

/**
 * Returns the {@link LayoutToken} sequence labelled as authors, collected during
 * header processing. The returned list is the live internal list (initialized
 * to an empty list, never null) — callers must not assume it is a copy.
 *
 * @return the layout tokens of the authors field, possibly empty
 */
public List<LayoutToken> getAuthorsLayoutTokens() {
    return this.authorsLayoutTokens;
}

/**
 * Returns the {@link LayoutToken} sequence labelled as abstract, collected during
 * header processing. The returned list is the live internal list — callers must
 * not assume it is a copy.
 *
 * @return the layout tokens of the abstract field
 */
public List<LayoutToken> getAbstractLayoutTokens() {
    return this.abstractLayoutTokens;
}

/**
 * Appends the given layout tokens to the accumulated authors token list.
 *
 * <p>A {@code null} argument is ignored: {@link List#addAll(java.util.Collection)}
 * would otherwise throw a {@link NullPointerException} on a null collection.
 *
 * @param layoutTokens tokens to append to the authors field; may be null or empty
 */
public void addAuthorsTokens(List<LayoutToken> layoutTokens) {
    if (layoutTokens != null) {
        this.authorsLayoutTokens.addAll(layoutTokens);
    }
}
Expand Down
209 changes: 109 additions & 100 deletions grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.grobid.core.engines;

import com.google.common.base.Splitter;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.grobid.core.GrobidModels;
Expand Down Expand Up @@ -56,6 +57,8 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.apache.commons.collections4.CollectionUtils.isNotEmpty;

/**
* @author Patrice Lopez
*/
Expand Down Expand Up @@ -115,7 +118,7 @@ public Pair<String, Document> processing(File input, BiblioItem resHeader, Grobi
*/
public String processingHeaderSection(GrobidAnalysisConfig config, Document doc, BiblioItem resHeader, boolean serialize) {
try {
SortedSet<DocumentPiece> documentHeaderParts = documentHeaderParts = doc.getDocumentPart(SegmentationLabels.HEADER);
SortedSet<DocumentPiece> documentHeaderParts = doc.getDocumentPart(SegmentationLabels.HEADER);
List<LayoutToken> tokenizations = doc.getTokenizations();

if (documentHeaderParts != null) {
Expand All @@ -137,7 +140,7 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,
String header = featuredHeader.getLeft();
List<LayoutToken> headerTokenization = featuredHeader.getRight();
String res = null;
if ((header != null) && (header.trim().length() > 0)) {
if (StringUtils.isNotBlank(header)) {
res = label(header);
resHeader = resultExtraction(res, headerTokenization, resHeader, doc);
}
Expand Down Expand Up @@ -179,120 +182,119 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,
resHeader.setLanguage(lang);
}

if (resHeader != null) {
if (resHeader.getAbstract() != null) {
resHeader.setAbstract(TextUtilities.dehyphenizeHard(resHeader.getAbstract()));
//resHeader.setAbstract(TextUtilities.dehyphenize(resHeader.getAbstract()));
}
BiblioItem.cleanTitles(resHeader);
if (resHeader.getTitle() != null) {
// String temp =
// utilities.dehyphenizeHard(resHeader.getTitle());
String temp = TextUtilities.dehyphenize(resHeader.getTitle());

if (resHeader.getAbstract() != null) {
resHeader.setAbstract(TextUtilities.dehyphenizeHard(resHeader.getAbstract()));
//resHeader.setAbstract(TextUtilities.dehyphenize(resHeader.getAbstract()));
}
BiblioItem.cleanTitles(resHeader);
if (resHeader.getTitle() != null) {
// String temp =
// utilities.dehyphenizeHard(resHeader.getTitle());
String temp = TextUtilities.dehyphenize(resHeader.getTitle());
temp = temp.trim();
if (temp.length() > 1) {
if (temp.startsWith("1"))
temp = temp.substring(1, temp.length());
temp = temp.trim();
if (temp.length() > 1) {
if (temp.startsWith("1"))
temp = temp.substring(1, temp.length());
temp = temp.trim();
}
resHeader.setTitle(temp);
}
if (resHeader.getBookTitle() != null) {
resHeader.setBookTitle(TextUtilities.dehyphenize(resHeader.getBookTitle()));
}
resHeader.setTitle(temp);
}
if (resHeader.getBookTitle() != null) {
resHeader.setBookTitle(TextUtilities.dehyphenize(resHeader.getBookTitle()));
}

resHeader.setOriginalAuthors(resHeader.getAuthors());
resHeader.getAuthorsTokens();

boolean fragmentedAuthors = false;
boolean hasMarker = false;
List<Integer> authorsBlocks = new ArrayList<Integer>();
List<List<LayoutToken>> authorSegments = new ArrayList<>();
if (resHeader.getAuthorsTokens() != null) {
// split the list of layout tokens when token "\t" is met
List<LayoutToken> currentSegment = new ArrayList<>();
for(LayoutToken theToken : resHeader.getAuthorsTokens()) {
if (theToken.getText() != null && theToken.getText().equals("\t")) {
if (currentSegment.size() > 0)
authorSegments.add(currentSegment);
currentSegment = new ArrayList<>();
} else
currentSegment.add(theToken);
}
// last segment
if (currentSegment.size() > 0)
authorSegments.add(currentSegment);
resHeader.setOriginalAuthors(resHeader.getAuthors());
resHeader.getAuthorsTokens();

boolean fragmentedAuthors = false;
boolean hasMarker = false;
List<Integer> authorsBlocks = new ArrayList<Integer>();
List<List<LayoutToken>> authorSegments = new ArrayList<>();
if (resHeader.getAuthorsTokens() != null) {
// split the list of layout tokens when token "\t" is met
List<LayoutToken> currentSegment = new ArrayList<>();
for(LayoutToken theToken : resHeader.getAuthorsTokens()) {
if (theToken.getText() != null && theToken.getText().equals("\t")) {
if (currentSegment.size() > 0)
authorSegments.add(currentSegment);
currentSegment = new ArrayList<>();
} else
currentSegment.add(theToken);
}
// last segment
if (currentSegment.size() > 0)
authorSegments.add(currentSegment);

if (authorSegments.size() > 1) {
fragmentedAuthors = true;
}
for (int k = 0; k < authorSegments.size(); k++) {
if (authorSegments.get(k).size() == 0)
continue;
List<Person> localAuthors = parsers.getAuthorParser()
.processingHeaderWithLayoutTokens(authorSegments.get(k), doc.getPDFAnnotations());
if (localAuthors != null) {
for (Person pers : localAuthors) {
resHeader.addFullAuthor(pers);
if (pers.getMarkers() != null) {
hasMarker = true;
}
authorsBlocks.add(k);
if (authorSegments.size() > 1) {
fragmentedAuthors = true;
}
for (int k = 0; k < authorSegments.size(); k++) {
if (authorSegments.get(k).size() == 0)
continue;
List<Person> localAuthors = parsers.getAuthorParser()
.processingHeaderWithLayoutTokens(authorSegments.get(k), doc.getPDFAnnotations());
if (localAuthors != null) {
for (Person pers : localAuthors) {
resHeader.addFullAuthor(pers);
if (pers.getMarkers() != null) {
hasMarker = true;
}
authorsBlocks.add(k);
}
}
}
}


// remove invalid authors (no last name, noise, etc.)
resHeader.setFullAuthors(Person.sanityCheck(resHeader.getFullAuthors()));

resHeader.setFullAffiliations(
parsers.getAffiliationAddressParser().processReflow(res, tokenizations));
resHeader.attachEmails();
boolean attached = false;
if (fragmentedAuthors && !hasMarker) {
if (resHeader.getFullAffiliations() != null) {
if (authorSegments != null) {
if (resHeader.getFullAffiliations().size() == authorSegments.size()) {
int k = 0;
List<Person> persons = resHeader.getFullAuthors();
for (Person pers : persons) {
if (k < authorsBlocks.size()) {
int indd = authorsBlocks.get(k);
if (indd < resHeader.getFullAffiliations().size()) {
pers.addAffiliation(resHeader.getFullAffiliations().get(indd));
}
}
k++;
// remove invalid authors (no last name, noise, etc.)
resHeader.setFullAuthors(Person.sanityCheck(resHeader.getFullAuthors()));

resHeader.setFullAffiliations(
parsers.getAffiliationAddressParser().processReflow(res, tokenizations));
resHeader.attachEmails();
boolean attached = false;
if (fragmentedAuthors && !hasMarker) {
if (resHeader.getFullAffiliations() != null) {
if (resHeader.getFullAffiliations().size() == authorSegments.size()) {
int k = 0;
List<Person> persons = resHeader.getFullAuthors();
for (Person pers : persons) {
if (k < authorsBlocks.size()) {
int indd = authorsBlocks.get(k);
if (indd < resHeader.getFullAffiliations().size()) {
pers.addAffiliation(resHeader.getFullAffiliations().get(indd));
}
attached = true;
resHeader.setFullAffiliations(null);
resHeader.setAffiliation(null);
}
k++;
}
attached = true;
resHeader.setFullAffiliations(null);
resHeader.setAffiliation(null);
}
}
if (!attached) {
resHeader.attachAffiliations();
}

// remove duplicated authors
resHeader.setFullAuthors(Person.deduplicate(resHeader.getFullAuthors()));
}
if (!attached) {
resHeader.attachAffiliations();
}

if (resHeader.getEditors() != null) {
// TBD: consider segments also for editors, like for authors above
resHeader.setFullEditors(parsers.getAuthorParser().processingHeader(resHeader.getEditors()));
}
// remove duplicated authors
resHeader.setFullAuthors(Person.deduplicate(resHeader.getFullAuthors()));

// below using the reference strings to improve the metadata extraction, it will have to
// be reviewed for something safer as just a straightforward correction
/*if (resHeader.getReference() != null) {
BiblioItem refer = parsers.getCitationParser().processingString(resHeader.getReference(), 0);
BiblioItem.correct(resHeader, refer);
}*/
if (resHeader.getEditors() != null) {
// TBD: consider segments also for editors, like for authors above
resHeader.setFullEditors(parsers.getAuthorParser().processingHeader(resHeader.getEditors()));
}

// below using the reference strings to improve the metadata extraction, it will have to
// be reviewed for something safer as just a straightforward correction
/*if (resHeader.getReference() != null) {
BiblioItem refer = parsers.getCitationParser().processingString(resHeader.getReference(), 0);
BiblioItem.correct(resHeader, refer);
}*/


// keyword post-processing
if (resHeader.getKeyword() != null) {
String keywords = TextUtilities.dehyphenize(resHeader.getKeyword());
Expand All @@ -305,10 +307,8 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,

// DOI pass
List<String> dois = doc.getDOIMatches();
if (dois != null) {
if ((dois.size() == 1) && (resHeader != null)) {
resHeader.setDOI(dois.get(0));
}
if (isNotEmpty(dois) && dois.size() == 1) {
resHeader.setDOI(dois.get(0));
}

resHeader = consolidateHeader(resHeader, config.getConsolidateHeader());
Expand Down Expand Up @@ -822,6 +822,15 @@ public BiblioItem resultExtraction(String result, List<LayoutToken> tokenization

String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
String clusterNonDehypenizedContent = LayoutTokensUtil.toText(cluster.concatTokens());

List<LayoutToken> clusterTokens = cluster.concatTokens();
List<LayoutToken> theList = biblio.getLabeledTokens().get(clusterLabel.getLabel());

theList = theList == null ? new ArrayList<>() : theList;
theList.addAll(clusterTokens);

biblio.getLabeledTokens().put(clusterLabel.getLabel(), theList);

if (clusterLabel.equals(TaggingLabels.HEADER_TITLE)) {
/*if (biblio.getTitle() != null && isDifferentContent(biblio.getTitle(), clusterContent))
biblio.setTitle(biblio.getTitle() + clusterContent);
Expand Down

0 comments on commit 0306aa0

Please sign in to comment.