Skip to content

Commit

Permalink
remove authors, abstract and title layout tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Apr 22, 2021
1 parent b68b96f commit af229cb
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 50 deletions.
30 changes: 1 addition & 29 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
Expand Up @@ -53,10 +53,6 @@ public class BiblioItem {
// map of labels (e.g. <title> or <abstract>) to LayoutToken
private Map<String, List<LayoutToken>> labeledTokens;

private List<LayoutToken> titleLayoutTokens = new ArrayList<>();
private List<LayoutToken> authorsLayoutTokens = new ArrayList<>();
private List<LayoutToken> abstractLayoutTokens = new ArrayList<>();

@Override
public String toString() {
return "BiblioItem{" +
Expand Down Expand Up @@ -1177,15 +1173,6 @@ public void setAuthors(String aut) {
authors = aut;
}

/**
 * Appends a single layout token to the list of author layout tokens.
 *
 * @param lt the layout token to append
 * @return this item, so that calls can be chained
 */
public BiblioItem addAuthorsToken(LayoutToken lt) {
    this.authorsLayoutTokens.add(lt);
    return this;
}

/**
 * Gives access to the author layout tokens collected so far.
 *
 * @return the internal list itself (not a copy), so changes made by the
 *         caller are visible to this item
 */
public List<LayoutToken> getAuthorsTokens() {
    return this.authorsLayoutTokens;
}

public void addAuthor(String aut) {
if (authors == null)
authors = aut;
Expand Down Expand Up @@ -4606,7 +4593,7 @@ public void setLayoutTokensForLabel(List<LayoutToken> tokens, TaggingLabel heade
labeledTokens.put(headerLabel.getLabel(), tokens);
}

public void generalResultMapping(Document doc, String labeledResult, List<LayoutToken> tokenizations) {
public void generalResultMapping(String labeledResult, List<LayoutToken> tokenizations) {
if (labeledTokens == null)
labeledTokens = new TreeMap<>();

Expand All @@ -4630,19 +4617,4 @@ public void generalResultMapping(Document doc, String labeledResult, List<Layout
}
}

/**
 * Appends every token of the given list to the title layout tokens,
 * keeping the order of the given list.
 *
 * @param layoutTokens the tokens to append
 */
public void addTitleTokens(List<LayoutToken> layoutTokens) {
    titleLayoutTokens.addAll(layoutTokens);
}

/**
 * Appends every token of the given list to the author layout tokens,
 * keeping the order of the given list.
 *
 * @param layoutTokens the tokens to append
 */
public void addAuthorsTokens(List<LayoutToken> layoutTokens) {
    authorsLayoutTokens.addAll(layoutTokens);
}

/**
 * Appends every token of the given list to the abstract layout tokens,
 * keeping the order of the given list.
 *
 * @param layoutTokens the tokens to append
 */
public void addAbstractTokens(List<LayoutToken> layoutTokens) {
    abstractLayoutTokens.addAll(layoutTokens);
}

/**
 * Gives access to the abstract layout tokens collected so far.
 *
 * @return the internal list itself (not a copy), so changes made by the
 *         caller are visible to this item
 */
public List<LayoutToken> getAbstractTokens() {
    return abstractLayoutTokens;
}
}
Expand Up @@ -186,8 +186,7 @@ public Document processing(DocumentSource documentSource,

// structure the abstract using the fulltext model
if (isNotBlank(resHeader.getAbstract())) {
//List<LayoutToken> abstractTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT);
List<LayoutToken> abstractTokens = resHeader.getAbstractTokens();
List<LayoutToken> abstractTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT);
if (CollectionUtils.isNotEmpty(abstractTokens)) {
abstractTokens = BiblioItem.cleanAbstractLayoutTokens(abstractTokens);
Pair<String, List<LayoutToken>> abstractProcessed = processShort(abstractTokens, doc);
Expand Down
Expand Up @@ -121,7 +121,7 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,
String res = null;
if (StringUtils.isNotBlank(header)) {
res = label(header);
resHeader = resultExtraction(res, headerTokenization, resHeader, doc);
resHeader = resultExtraction(res, headerTokenization, resHeader);
}

// language identification
Expand Down Expand Up @@ -184,16 +184,16 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,
}

resHeader.setOriginalAuthors(resHeader.getAuthors());
resHeader.getAuthorsTokens();

boolean fragmentedAuthors = false;
boolean hasMarker = false;
List<Integer> authorsBlocks = new ArrayList<Integer>();
List<Integer> authorsBlocks = new ArrayList<>();
List<List<LayoutToken>> authorSegments = new ArrayList<>();
if (resHeader.getAuthorsTokens() != null) {
List<LayoutToken> authorLayoutTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_AUTHOR);
if (isNotEmpty(authorLayoutTokens)) {
// split the list of layout tokens when token "\t" is met
List<LayoutToken> currentSegment = new ArrayList<>();
for(LayoutToken theToken : resHeader.getAuthorsTokens()) {
for(LayoutToken theToken : authorLayoutTokens) {
if (theToken.getText() != null && theToken.getText().equals("\t")) {
if (currentSegment.size() > 0)
authorSegments.add(currentSegment);
Expand Down Expand Up @@ -785,13 +785,13 @@ else if (features.blockStatus == null)
* @param biblio biblio item
* @return a biblio item
*/
public BiblioItem resultExtraction(String result, List<LayoutToken> tokenizations, BiblioItem biblio, Document doc) {
public BiblioItem resultExtraction(String result, List<LayoutToken> tokenizations, BiblioItem biblio) {

TaggingLabel lastClusterLabel = null;
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.HEADER, result, tokenizations);

String tokenLabel = null;
List<TaggingTokenCluster> clusters = clusteror.cluster();

biblio.generalResultMapping(result, tokenizations);
for (TaggingTokenCluster cluster : clusters) {
if (cluster == null) {
continue;
Expand All @@ -807,23 +807,14 @@ public BiblioItem resultExtraction(String result, List<LayoutToken> tokenization
else*/
if (biblio.getTitle() == null) {
biblio.setTitle(clusterContent);
List<LayoutToken> tokens = getLayoutTokens(cluster);
biblio.addTitleTokens(tokens);
}
} else if (clusterLabel.equals(TaggingLabels.HEADER_AUTHOR)) {
//if (biblio.getAuthors() != null && isDifferentandNotIncludedContent(biblio.getAuthors(), clusterContent)) {
if (biblio.getAuthors() != null) {
biblio.setAuthors(biblio.getAuthors() + "\t" + clusterNonDehypenizedContent);
//biblio.addAuthorsToken(new LayoutToken("\n", TaggingLabels.HEADER_AUTHOR));
biblio.addAuthorsToken(new LayoutToken("\t", TaggingLabels.HEADER_AUTHOR));

List<LayoutToken> tokens = cluster.concatTokens();
biblio.addAuthorsTokens(tokens);
} else {
biblio.setAuthors(clusterNonDehypenizedContent);

List<LayoutToken> tokens = cluster.concatTokens();
biblio.addAuthorsTokens(tokens);
}
} /*else if (clusterLabel.equals(TaggingLabels.HEADER_TECH)) {
biblio.setItem(BiblioItem.TechReport);
Expand Down Expand Up @@ -915,8 +906,6 @@ else if (biblio.getPublicationDate() == null)
//biblio.setAbstract(biblio.getAbstract() + " " + clusterContent);
} else {
biblio.setAbstract(clusterContent);
List<LayoutToken> tokens = getLayoutTokens(cluster);
biblio.addAbstractTokens(tokens);
}
} else if (clusterLabel.equals(TaggingLabels.HEADER_REFERENCE)) {
//if (biblio.getReference() != null) {
Expand Down

0 comments on commit af229cb

Please sign in to comment.