Skip to content

Commit

Permalink
allow table model to recover merged tables
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Jun 9, 2021
1 parent 0078ecd commit 955ecca
Show file tree
Hide file tree
Showing 6 changed files with 55,522 additions and 41,130 deletions.
6 changes: 6 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,12 @@ public void setLayoutTokens(List<LayoutToken> layoutTokens) {
this.layoutTokens = layoutTokens;
}

/**
 * Appends the given layout tokens to this object's token list, creating the
 * list first if it has not been initialised yet.
 *
 * @param layoutTokens tokens to append; a {@code null} list is treated as
 *                     "nothing to add" and leaves the current tokens unchanged
 */
public void addLayoutTokens(List<LayoutToken> layoutTokens) {
    if (this.layoutTokens == null) {
        this.layoutTokens = new ArrayList<>();
    }
    // addAll(null) throws NullPointerException, so skip a missing list explicitly
    if (layoutTokens != null) {
        this.layoutTokens.addAll(layoutTokens);
    }
}

/**
 * Replaces the set of block pointers held by this object.
 * The given set is stored by reference (no defensive copy is made), so later
 * changes to it by the caller are visible here.
 *
 * @param blockPtrs the sorted set of block pointers to store
 */
public void setBlockPtrs(SortedSet<Integer> blockPtrs) {
this.blockPtrs = blockPtrs;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ else if (config.getConsolidateCitations() == 2)
figure.setCaptionLayoutTokens(captionProcess.getRight());
}
}

tables = processTables(resultBody, layoutTokenization.getTokenization(), doc);
// further parse the caption
for(Table table : tables) {
Expand Down Expand Up @@ -2078,29 +2078,34 @@ protected List<Table> processTables(String rese,
for (TaggingTokenCluster cluster : Iterables.filter(clusteror.cluster(),
new TaggingTokenClusteror.LabelTypePredicate(TaggingLabels.TABLE))) {
List<LayoutToken> tokenizationTable = cluster.concatTokens();
Table result = parsers.getTableParser().processing(
List<Table> localResults = parsers.getTableParser().processing(
tokenizationTable,
cluster.getFeatureBlock()
);

SortedSet<Integer> blockPtrs = new TreeSet<>();
for (LayoutToken lt : tokenizationTable) {
if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) {
blockPtrs.add(lt.getBlockPtr());
}
}
result.setBlockPtrs(blockPtrs);
result.setLayoutTokens(tokenizationTable);
for (Table result : localResults) {
List<LayoutToken> localTokenizationTable = result.getLayoutTokens();
//result.setLayoutTokens(tokenizationTable);

// the first token could be a space from previous page
for (LayoutToken lt : tokenizationTable) {
if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) {
result.setPage(lt.getPage());
break;
}
}
results.add(result);
result.setId("" + (results.size() - 1));
// block setting: we restrict to the tokenization of this particular table
SortedSet<Integer> blockPtrs = new TreeSet<>();
for (LayoutToken lt : localTokenizationTable) {
if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) {
blockPtrs.add(lt.getBlockPtr());
}
}
result.setBlockPtrs(blockPtrs);

// page setting: the first token could be a space from previous page
for (LayoutToken lt : localTokenizationTable) {
if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) {
result.setPage(lt.getPage());
break;
}
}
results.add(result);
result.setId("" + (results.size() - 1));
}
}

doc.setTables(results);
Expand Down
40 changes: 35 additions & 5 deletions grobid-core/src/main/java/org/grobid/core/engines/TableParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
import org.slf4j.LoggerFactory;

import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.lang3.StringUtils;

import java.util.Collections;
import java.util.List;
import java.util.ArrayList;

import static org.grobid.core.engines.label.TaggingLabels.*;

Expand All @@ -33,8 +35,11 @@ protected TableParser() {

/**
* The processing here is called from the full text parser in cascade.
* Normally we should find only one table in the sequence to be labelled.
* But for robustness, and to recover from errors made at the higher level, we allow
* sub-segmenting several tables that appear one after the other.
*/
public Table processing(List<LayoutToken> tokenizationTable, String featureVector) {
public List<Table> processing(List<LayoutToken> tokenizationTable, String featureVector) {
String res;
try {
res = label(featureVector);
Expand All @@ -49,12 +54,15 @@ public Table processing(List<LayoutToken> tokenizationTable, String featureVecto
return getExtractionResult(tokenizationTable, res);
}

private Table getExtractionResult(List<LayoutToken> tokenizations, String result) {
private List<Table> getExtractionResult(List<LayoutToken> tokenizations, String result) {
List<Table> tables = new ArrayList<>();

// first table
Table table = new Table();
table.setTextArea(Collections.singletonList(BoundingBoxCalculator.calculateOneBox(tokenizations, true)));


TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.TABLE, result, tokenizations);
List<TaggingTokenCluster> clusters = clusteror.cluster();
TaggingLabel previousLabel = null;

for (TaggingTokenCluster cluster : clusters) {
if (cluster == null) {
Expand All @@ -70,29 +78,51 @@ private Table getExtractionResult(List<LayoutToken> tokenizations, String result
table.appendCaption(clusterContent);
table.appendCaptionLayoutTokens(tokens);
table.getFullDescriptionTokens().addAll(tokens);
table.addLayoutTokens(tokens);
} else if (clusterLabel.equals(TBL_HEAD)) {
// if we already have a header (it could be via label) and we are not continuing some header/label
// we consider the non-connected header field as the introduction of a new table
// TBD: this works fine for a header located before the table content, but not sure otherwise
if (!StringUtils.isEmpty(table.getHeader()) &&
previousLabel != null &&
(previousLabel.equals(TBL_CONTENT) || previousLabel.equals(TBL_NOTE) || previousLabel.equals(TBL_DESC) )) {
// we already have a table header, this means that we have a distinct table starting now
tables.add(table);
table.setTextArea(Collections.singletonList(BoundingBoxCalculator.calculateOneBox(table.getLayoutTokens(), true)));
table = new Table();
}
table.appendHeader(clusterContent);
table.getFullDescriptionTokens().addAll(tokens);
table.addLayoutTokens(tokens);
} else if (clusterLabel.equals(TBL_LABEL)) {
//label should also go to head
table.appendHeader(" " + clusterContent + " ");
table.appendLabel(clusterContent);
table.getFullDescriptionTokens().addAll(tokens);
table.addLayoutTokens(tokens);
} else if (clusterLabel.equals(TBL_NOTE)) {
table.appendNote(clusterContent);
table.getFullDescriptionTokens().addAll(tokens);
table.addAllNoteLayoutTokens(tokens);
table.addLayoutTokens(tokens);
} else if (clusterLabel.equals(TBL_OTHER)) {
table.addLayoutTokens(tokens);
} else if (clusterLabel.equals(TBL_CONTENT)) {
table.appendContent(clusterContent);
table.getContentTokens().addAll(tokens);
table.addLayoutTokens(tokens);
} else {
LOGGER.warn("Unexpected table model label - " + clusterLabel.getLabel() + " for " + clusterContent);
}

previousLabel = clusterLabel;
}

return table;
// last table
table.setTextArea(Collections.singletonList(BoundingBoxCalculator.calculateOneBox(table.getLayoutTokens(), true)));
tables.add(table);

return tables;
}

/**
Expand Down

0 comments on commit 955ecca

Please sign in to comment.