allow table model to recover merged tables

kermitt2 · Jun 9, 2021 · 955ecca · 955ecca
1 parent 0078ecd
commit 955ecca
Show file tree

Hide file tree

Showing 6 changed files with 55,522 additions and 41,130 deletions.
diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java
@@ -529,6 +529,12 @@ public void setLayoutTokens(List<LayoutToken> layoutTokens) {
         this.layoutTokens = layoutTokens;
     }
 
+    public void addLayoutTokens(List<LayoutToken> layoutTokens) {
+        if (this.layoutTokens == null) 
+            this.layoutTokens = new ArrayList<>();
+        this.layoutTokens.addAll(layoutTokens);
+    }
+
     public void setBlockPtrs(SortedSet<Integer> blockPtrs) {
         this.blockPtrs = blockPtrs;
     }

diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
@@ -264,7 +264,7 @@ else if (config.getConsolidateCitations() == 2)
                         figure.setCaptionLayoutTokens(captionProcess.getRight());
                     }
                 }
-
+                
 				tables = processTables(resultBody, layoutTokenization.getTokenization(), doc);
                 // further parse the caption
                 for(Table table : tables) {
@@ -2078,29 +2078,34 @@ protected List<Table> processTables(String rese,
 		for (TaggingTokenCluster cluster : Iterables.filter(clusteror.cluster(),
 				new TaggingTokenClusteror.LabelTypePredicate(TaggingLabels.TABLE))) {
 			List<LayoutToken> tokenizationTable = cluster.concatTokens();
-			Table result = parsers.getTableParser().processing(
+			List<Table> localResults = parsers.getTableParser().processing(
 					tokenizationTable,
 					cluster.getFeatureBlock()
 			);
 
-			SortedSet<Integer> blockPtrs = new TreeSet<>();
-			for (LayoutToken lt : tokenizationTable) {
-				if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) {
-					blockPtrs.add(lt.getBlockPtr());
-				}
-			}
-			result.setBlockPtrs(blockPtrs);
-			result.setLayoutTokens(tokenizationTable);
+            for (Table result : localResults) {
+                List<LayoutToken> localTokenizationTable = result.getLayoutTokens();
+                //result.setLayoutTokens(tokenizationTable);
 
-			// the first token could be a space from previous page
-			for (LayoutToken lt : tokenizationTable) {
-				if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) {
-					result.setPage(lt.getPage());
-					break;
-				}
-			}
-			results.add(result);
-			result.setId("" + (results.size() - 1));
+                // block setting: we restrict to the tokenization of this particular table
+                SortedSet<Integer> blockPtrs = new TreeSet<>();
+                for (LayoutToken lt : localTokenizationTable) {
+                    if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) {
+                        blockPtrs.add(lt.getBlockPtr());
+                    }
+                }
+                result.setBlockPtrs(blockPtrs);
+
+    			// page setting: the first token could be a space from previous page
+    			for (LayoutToken lt : localTokenizationTable) {
+    				if (!LayoutTokensUtil.spaceyToken(lt.t()) && !LayoutTokensUtil.newLineToken(lt.t())) {
+    					result.setPage(lt.getPage());
+    					break;
+    				}
+    			}
+    			results.add(result);
+    			result.setId("" + (results.size() - 1));
+            }
 		}
 
 		doc.setTables(results);

diff --git a/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java b/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java
@@ -15,9 +15,11 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.lang3.StringUtils;
 
 import java.util.Collections;
 import java.util.List;
+import java.util.ArrayList;
 
 import static org.grobid.core.engines.label.TaggingLabels.*;
 
@@ -33,8 +35,11 @@ protected TableParser() {
 
     /**
      * The processing here is called from the full text parser in cascade.
+     * Normally we should find only one table in the sequence to be labelled.
+     * But for robustness, and to recover from errors made at the higher level, we allow
+     * sub-segmenting several tables that appear one after the other.
      */
-    public Table processing(List<LayoutToken> tokenizationTable, String featureVector) {
+    public List<Table> processing(List<LayoutToken> tokenizationTable, String featureVector) {
         String res;
         try {
             res = label(featureVector);
@@ -49,12 +54,15 @@ public Table processing(List<LayoutToken> tokenizationTable, String featureVecto
         return getExtractionResult(tokenizationTable, res);
     }
 
-    private Table getExtractionResult(List<LayoutToken> tokenizations, String result) {
+    private List<Table> getExtractionResult(List<LayoutToken> tokenizations, String result) {
+        List<Table> tables = new ArrayList<>();
+
+        // first table
         Table table = new Table();
-        table.setTextArea(Collections.singletonList(BoundingBoxCalculator.calculateOneBox(tokenizations, true)));
-
+
         TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.TABLE, result, tokenizations);
         List<TaggingTokenCluster> clusters = clusteror.cluster();
+        TaggingLabel previousLabel = null;
 
         for (TaggingTokenCluster cluster : clusters) {
             if (cluster == null) {
@@ -70,29 +78,51 @@ private Table getExtractionResult(List<LayoutToken> tokenizations, String result
                 table.appendCaption(clusterContent);
                 table.appendCaptionLayoutTokens(tokens);
                 table.getFullDescriptionTokens().addAll(tokens);
+                table.addLayoutTokens(tokens);
             } else if (clusterLabel.equals(TBL_HEAD)) {
+                // if we already have a header (it could be via label) and we are not continuing some header/label
+                // we consider the non-connected header field as the introduction of a new table
+                // TBD: this works fine for a header located before the table content, but not sure otherwise
+                if (!StringUtils.isEmpty(table.getHeader()) &&
+                    previousLabel != null && 
+                    (previousLabel.equals(TBL_CONTENT) || previousLabel.equals(TBL_NOTE) || previousLabel.equals(TBL_DESC) )) {
+                    // we already have a table header, this means that we have a distinct table starting now
+                    tables.add(table);
+                    table.setTextArea(Collections.singletonList(BoundingBoxCalculator.calculateOneBox(table.getLayoutTokens(), true)));
+                    table = new Table();
+                }
                 table.appendHeader(clusterContent);
                 table.getFullDescriptionTokens().addAll(tokens);
+                table.addLayoutTokens(tokens);
             } else if (clusterLabel.equals(TBL_LABEL)) {
                 //label should also go to head
                 table.appendHeader(" " + clusterContent + " ");
                 table.appendLabel(clusterContent);
                 table.getFullDescriptionTokens().addAll(tokens);
+                table.addLayoutTokens(tokens);
             } else if (clusterLabel.equals(TBL_NOTE)) {
                 table.appendNote(clusterContent);
                 table.getFullDescriptionTokens().addAll(tokens);
                 table.addAllNoteLayoutTokens(tokens);
+                table.addLayoutTokens(tokens);
             } else if (clusterLabel.equals(TBL_OTHER)) {
+                table.addLayoutTokens(tokens);
             } else if (clusterLabel.equals(TBL_CONTENT)) {
                 table.appendContent(clusterContent);
                 table.getContentTokens().addAll(tokens);
+                table.addLayoutTokens(tokens);
             } else {
                 LOGGER.warn("Unexpected table model label - " + clusterLabel.getLabel() + " for " + clusterContent);
             }
 
+            previousLabel = clusterLabel;
         }     
 
-        return table;
+        // last table
+        table.setTextArea(Collections.singletonList(BoundingBoxCalculator.calculateOneBox(table.getLayoutTokens(), true)));
+        tables.add(table);
+
+        return tables;
     }
 
     /**