Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Finalize getting Layout token lists for header fields #746

Merged
merged 6 commits into from
Apr 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 16 additions & 31 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,11 @@
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.engines.label.TaggingLabels;
import org.grobid.core.engines.tagging.GenericTaggerUtils;
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.utilities.LanguageUtilities;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.utilities.KeyGen;
import org.grobid.core.utilities.Pair;
import org.grobid.core.GrobidModels;

import java.net.URLEncoder;
Expand Down Expand Up @@ -53,9 +49,12 @@ public class BiblioItem {
// map of labels (e.g. <title> or <abstract>) to LayoutToken
private Map<String, List<LayoutToken>> labeledTokens;

private List<LayoutToken> titleLayoutTokens = new ArrayList<>();
private List<LayoutToken> authorsLayoutTokens = new ArrayList<>();
private List<LayoutToken> abstractLayoutTokens = new ArrayList<>();
/**
* This is an internal working structure, not meant to be used externally. It is also modified with respect to the other structures.
* To collect the layout tokens of the various bibliographical components, please refer to {@code getLayoutTokens(TaggingLabels label)}.
*/
private List<LayoutToken> authorsTokensWorkingCopy = new ArrayList<>();


@Override
public String toString() {
Expand Down Expand Up @@ -1177,13 +1176,13 @@ public void setAuthors(String aut) {
authors = aut;
}

public BiblioItem addAuthorsToken(LayoutToken lt) {
authorsLayoutTokens.add(lt);
public BiblioItem collectAuthorsToken(LayoutToken lt) {
authorsTokensWorkingCopy.add(lt);
return this;
}

public List<LayoutToken> getAuthorsTokens() {
return authorsLayoutTokens;
public void collectAuthorsTokens(List<LayoutToken> layoutTokens) {
this.authorsTokensWorkingCopy.addAll(layoutTokens);
}

public void addAuthor(String aut) {
Expand Down Expand Up @@ -4556,7 +4555,7 @@ public boolean rejectAsReference() {
// normally properties authors and authorList are null in the current Grobid version
if (!titleSet && !authorSet && (url == null) && (doi == null))
return true;
else
else
return false;
}

Expand Down Expand Up @@ -4606,7 +4605,7 @@ public void setLayoutTokensForLabel(List<LayoutToken> tokens, TaggingLabel heade
labeledTokens.put(headerLabel.getLabel(), tokens);
}

public void generalResultMapping(Document doc, String labeledResult, List<LayoutToken> tokenizations) {
public void generalResultMapping(String labeledResult, List<LayoutToken> tokenizations) {
if (labeledTokens == null)
labeledTokens = new TreeMap<>();

Expand All @@ -4624,27 +4623,13 @@ public void generalResultMapping(Document doc, String labeledResult, List<Layout
List<LayoutToken> clusterTokens = cluster.concatTokens();
List<LayoutToken> theList = labeledTokens.get(clusterLabel.getLabel());

if (theList == null)
theList = new ArrayList<>();
for (LayoutToken token : clusterTokens)
theList.add(token);
theList = theList == null ? new ArrayList<>() : theList;
theList.addAll(clusterTokens);
labeledTokens.put(clusterLabel.getLabel(), theList);
}
}

public void addTitleTokens(List<LayoutToken> layoutTokens) {
this.titleLayoutTokens.addAll(layoutTokens);
}

public void addAuthorsTokens(List<LayoutToken> layoutTokens) {
this.authorsLayoutTokens.addAll(layoutTokens);
}

public void addAbstractTokens(List<LayoutToken> layoutTokens) {
this.abstractLayoutTokens.addAll(layoutTokens);
}

public List<LayoutToken> getAbstractTokens() {
return this.abstractLayoutTokens;
public List<LayoutToken> getAuthorsTokensWorkingCopy() {
return authorsTokensWorkingCopy;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.exceptions.GrobidExceptionStatus;
import org.grobid.core.features.FeatureFactory;
import org.grobid.core.features.FeaturesVectorHeader;
import org.grobid.core.layout.Block;
import org.grobid.core.layout.BoundingBox;
import org.grobid.core.layout.Cluster;
Expand Down Expand Up @@ -71,7 +70,6 @@
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.stream.Collectors;

Expand Down Expand Up @@ -829,7 +827,7 @@ public static List<LayoutToken> getTokenizationParts(SortedSet<DocumentPiece> do
if (documentParts == null)
return null;

List<LayoutToken> tokenizationParts = new ArrayList<LayoutToken>();
List<LayoutToken> tokenizationParts = new ArrayList<>();
for (DocumentPiece docPiece : documentParts) {
DocumentPointer dp1 = docPiece.getLeft();
DocumentPointer dp2 = docPiece.getRight();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,7 @@ public Document processing(DocumentSource documentSource,

// structure the abstract using the fulltext model
if (isNotBlank(resHeader.getAbstract())) {
//List<LayoutToken> abstractTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT);
List<LayoutToken> abstractTokens = resHeader.getAbstractTokens();
List<LayoutToken> abstractTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT);
if (CollectionUtils.isNotEmpty(abstractTokens)) {
abstractTokens = BiblioItem.cleanAbstractLayoutTokens(abstractTokens);
Pair<String, List<LayoutToken>> abstractProcessed = processShort(abstractTokens, doc);
Expand Down
Loading