kermitt2 · de-code · Apr 14, 2021
diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1131,6 +1131,9 @@ public StringBuilder toTEIAnnex(StringBuilder buffer,
                                     BiblioItem biblio,
                                     List<BibDataSet> bds,
                                     List<LayoutToken> tokenizations,
+                                    List<Figure> figures,
+                                    List<Table> tables,
+                                    List<Equation> equations,
                                     Document doc,
                                     GrobidAnalysisConfig config) throws Exception {
         if ((result == null) || (tokenizations == null)) {
@@ -1139,7 +1142,7 @@ public StringBuilder toTEIAnnex(StringBuilder buffer,
 
         buffer.append("\t\t\t<div type=\"annex\">\n");
         buffer = toTEITextPiece(buffer, result, biblio, bds, true,
-                new LayoutTokenization(tokenizations), null, null, null, doc, config);
+                new LayoutTokenization(tokenizations), figures, tables, equations, doc, config);
         buffer.append("\t\t\t</div>\n");
 
         return buffer;

diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
@@ -250,29 +250,10 @@ else if (config.getConsolidateCitations() == 2)
 
 				// we apply now the figure and table models based on the fulltext labeled output
 				figures = processFigures(resultBody, layoutTokenization.getTokenization(), doc);
-                // further parse the caption
-                for(Figure figure : figures) {
-                    if (CollectionUtils.isNotEmpty(figure.getCaptionLayoutTokens()) ) {
-                        Pair<String, List<LayoutToken>> captionProcess = processShort(figure.getCaptionLayoutTokens(), doc);
-                        figure.setLabeledCaption(captionProcess.getLeft());
-                        figure.setCaptionLayoutTokens(captionProcess.getRight());
-                    }
-                }
+				postProcessFigureCaptions(figures, doc);
 
 				tables = processTables(resultBody, layoutTokenization.getTokenization(), doc);
-                // further parse the caption
-                for(Table table : tables) {
-                    if ( CollectionUtils.isNotEmpty(table.getCaptionLayoutTokens()) ) {
-                        Pair<String, List<LayoutToken>> captionProcess = processShort(table.getCaptionLayoutTokens(), doc);
-                        table.setLabeledCaption(captionProcess.getLeft());
-                        table.setCaptionLayoutTokens(captionProcess.getRight());
-                    }
-                    if ( CollectionUtils.isNotEmpty(table.getNoteLayoutTokens())) {
-                        Pair<String, List<LayoutToken>> noteProcess = processShort(table.getNoteLayoutTokens(), doc);
-                        table.setLabeledNote(noteProcess.getLeft());
-                        table.setNoteLayoutTokens(noteProcess.getRight());
-                    }
-                }
+				postProcessTableCaptions(tables, doc);
 
 				equations = processEquations(resultBody, layoutTokenization.getTokenization(), doc);
 			} else {
@@ -283,6 +264,9 @@ else if (config.getConsolidateCitations() == 2)
 			documentBodyParts = doc.getDocumentPart(SegmentationLabels.ANNEX);
             featSeg = getBodyTextFeatured(doc, documentBodyParts);
 			String resultAnnex = null;
+            List<Figure> annexFigures = null;
+            List<Table> annexTables = null;
+            List<Equation> annexEquations = null;
 			List<LayoutToken> tokenizationsBody2 = null;
 			if (featSeg != null && isNotEmpty(trim(featSeg.getLeft()))) {
 				// if featSeg is null, it usually means that no body segment is found in the
@@ -291,6 +275,14 @@ else if (config.getConsolidateCitations() == 2)
 				tokenizationsBody2 = featSeg.getRight().getTokenization();
 				resultAnnex = label(bodytext);
 				//System.out.println(rese);
+
+				annexFigures = processFigures(resultAnnex, tokenizationsBody2, doc);
+				postProcessFigureCaptions(annexFigures, doc);
+
+				annexTables = processTables(resultAnnex, tokenizationsBody2, doc);
+				postProcessTableCaptions(annexTables, doc);
+
+				annexEquations = processEquations(resultAnnex, tokenizationsBody2, doc);
 			}
 
             // final combination
@@ -299,6 +291,7 @@ else if (config.getConsolidateCitations() == 2)
 				layoutTokenization, tokenizationsBody2, // tokenization for body and annex
 				resHeader, // header
 				figures, tables, equations,
+                annexFigures, annexTables, annexEquations,
 				config);
             return doc;
         } catch (GrobidException e) {
@@ -1930,6 +1923,19 @@ protected List<Figure> processFigures(String rese, List<LayoutToken> layoutToken
         return results;
     }
 
+    protected void postProcessFigureCaptions(
+        List<Figure> figures, // figures to update in place through their caption setters
+        Document doc // handed unchanged to processShort() for every caption
+    ) {
+        // Re-run each non-empty figure caption through processShort() and store both the labeled text and the returned tokens back on the figure.
+        for(Figure figure : figures) { // NOTE(review): no null guard, a null list would throw here; confirm processFigures() never returns null
+            if (CollectionUtils.isNotEmpty(figure.getCaptionLayoutTokens()) ) { // figures without caption tokens are skipped and left unchanged
+                Pair<String, List<LayoutToken>> captionProcess = processShort(figure.getCaptionLayoutTokens(), doc);
+                figure.setLabeledCaption(captionProcess.getLeft()); // left element: the labeled caption string
+                figure.setCaptionLayoutTokens(captionProcess.getRight()); // right element: token list that replaces the caption tokens passed in
+            }
+        }
+    }
 
     /**
      * Create training data for the figures as identified by the full text model.
@@ -2103,6 +2109,24 @@ protected List<Table> processTables(String rese,
 		return results;
 	}
 
+    protected void postProcessTableCaptions(
+        List<Table> tables, // tables to update in place through their caption and note setters
+        Document doc // handed unchanged to processShort() for every caption and note
+    ) {
+        // Re-run each non-empty table caption and table note through processShort() and store both the labeled text and the returned tokens back on the table.
+        for(Table table : tables) { // NOTE(review): no null guard, a null list would throw here; confirm processTables() never returns null
+            if ( CollectionUtils.isNotEmpty(table.getCaptionLayoutTokens()) ) { // caption step is skipped when the table has no caption tokens
+                Pair<String, List<LayoutToken>> captionProcess = processShort(table.getCaptionLayoutTokens(), doc);
+                table.setLabeledCaption(captionProcess.getLeft()); // left element: the labeled caption string
+                table.setCaptionLayoutTokens(captionProcess.getRight()); // right element: token list that replaces the caption tokens passed in
+            }
+            if ( CollectionUtils.isNotEmpty(table.getNoteLayoutTokens())) { // note step is independent of the caption step and skipped when there are no note tokens
+                Pair<String, List<LayoutToken>> noteProcess = processShort(table.getNoteLayoutTokens(), doc);
+                table.setLabeledNote(noteProcess.getLeft()); // left element: the labeled note string
+                table.setNoteLayoutTokens(noteProcess.getRight()); // right element: token list that replaces the note tokens passed in
+            }
+        }
+    }
 
  	/**
      * Create training data for the table as identified by the full text model.
@@ -2312,6 +2336,9 @@ private void toTEI(Document doc,
                        List<Figure> figures,
                        List<Table> tables,
                        List<Equation> equations,
+                       List<Figure> annexFigures,
+                       List<Table> annexTables,
+                       List<Equation> annexEquations,
                        GrobidAnalysisConfig config) {
         if (doc.getBlocks() == null) {
             return;
@@ -2348,7 +2375,10 @@ private void toTEI(Document doc,
 			}
 
 			tei = teiFormatter.toTEIAnnex(tei, reseAnnex, resHeader, resCitations,
-				tokenizationsAnnex, doc, config);
+				tokenizationsAnnex,
+				annexFigures, annexTables, annexEquations,
+				doc, config
+			);
 
 			tei = teiFormatter.toTEIReferences(tei, resCitations, config);
             doc.calculateTeiIdToBibDataSets();