From 6f45c06a1596cdf7ce5476325d0728efc086391d Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Wed, 14 Apr 2021 14:04:31 +0100 Subject: [PATCH] added annex figures, tables, equations minor indentation issue --- .../grobid/core/document/TEIFormatter.java | 5 +- .../grobid/core/engines/FullTextParser.java | 74 +++++++++++++------ 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 890f11f985..1295a062ac 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1131,6 +1131,9 @@ public StringBuilder toTEIAnnex(StringBuilder buffer, BiblioItem biblio, List bds, List tokenizations, + List
figures, + List tables, + List equations, Document doc, GrobidAnalysisConfig config) throws Exception { if ((result == null) || (tokenizations == null)) { @@ -1139,7 +1142,7 @@ public StringBuilder toTEIAnnex(StringBuilder buffer, buffer.append("\t\t\t
\n"); buffer = toTEITextPiece(buffer, result, biblio, bds, true, - new LayoutTokenization(tokenizations), null, null, null, doc, config); + new LayoutTokenization(tokenizations), figures, tables, equations, doc, config); buffer.append("\t\t\t
\n"); return buffer; diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 7707ba7144..94680b6960 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -250,29 +250,10 @@ else if (config.getConsolidateCitations() == 2) // we apply now the figure and table models based on the fulltext labeled output figures = processFigures(resultBody, layoutTokenization.getTokenization(), doc); - // further parse the caption - for(Figure figure : figures) { - if (CollectionUtils.isNotEmpty(figure.getCaptionLayoutTokens()) ) { - Pair> captionProcess = processShort(figure.getCaptionLayoutTokens(), doc); - figure.setLabeledCaption(captionProcess.getLeft()); - figure.setCaptionLayoutTokens(captionProcess.getRight()); - } - } + postProcessFigureCaptions(figures, doc); tables = processTables(resultBody, layoutTokenization.getTokenization(), doc); - // further parse the caption - for(Table table : tables) { - if ( CollectionUtils.isNotEmpty(table.getCaptionLayoutTokens()) ) { - Pair> captionProcess = processShort(table.getCaptionLayoutTokens(), doc); - table.setLabeledCaption(captionProcess.getLeft()); - table.setCaptionLayoutTokens(captionProcess.getRight()); - } - if ( CollectionUtils.isNotEmpty(table.getNoteLayoutTokens())) { - Pair> noteProcess = processShort(table.getNoteLayoutTokens(), doc); - table.setLabeledNote(noteProcess.getLeft()); - table.setNoteLayoutTokens(noteProcess.getRight()); - } - } + postProcessTableCaptions(tables, doc); equations = processEquations(resultBody, layoutTokenization.getTokenization(), doc); } else { @@ -283,6 +264,9 @@ else if (config.getConsolidateCitations() == 2) documentBodyParts = doc.getDocumentPart(SegmentationLabels.ANNEX); featSeg = getBodyTextFeatured(doc, documentBodyParts); String resultAnnex = null; + List
annexFigures = null; + List
annexTables = null; + List annexEquations = null; List tokenizationsBody2 = null; if (featSeg != null && isNotEmpty(trim(featSeg.getLeft()))) { // if featSeg is null, it usually means that no body segment is found in the @@ -291,6 +275,14 @@ else if (config.getConsolidateCitations() == 2) tokenizationsBody2 = featSeg.getRight().getTokenization(); resultAnnex = label(bodytext); //System.out.println(rese); + + annexFigures = processFigures(resultAnnex, tokenizationsBody2, doc); + postProcessFigureCaptions(annexFigures, doc); + + annexTables = processTables(resultAnnex, tokenizationsBody2, doc); + postProcessTableCaptions(annexTables, doc); + + annexEquations = processEquations(resultAnnex, tokenizationsBody2, doc); } // final combination @@ -299,6 +291,7 @@ else if (config.getConsolidateCitations() == 2) layoutTokenization, tokenizationsBody2, // tokenization for body and annex resHeader, // header figures, tables, equations, + annexFigures, annexTables, annexEquations, config); return doc; } catch (GrobidException e) { @@ -1930,6 +1923,19 @@ protected List
processFigures(String rese, List layoutToken return results; } + protected void postProcessFigureCaptions( + List
figures, + Document doc + ) { + // further parse the caption + for(Figure figure : figures) { + if (CollectionUtils.isNotEmpty(figure.getCaptionLayoutTokens()) ) { + Pair> captionProcess = processShort(figure.getCaptionLayoutTokens(), doc); + figure.setLabeledCaption(captionProcess.getLeft()); + figure.setCaptionLayoutTokens(captionProcess.getRight()); + } + } + } /** * Create training data for the figures as identified by the full text model. @@ -2103,6 +2109,24 @@ protected List
processTables(String rese, return results; } + protected void postProcessTableCaptions( + List
tables, + Document doc + ) { + // further parse the caption + for(Table table : tables) { + if ( CollectionUtils.isNotEmpty(table.getCaptionLayoutTokens()) ) { + Pair> captionProcess = processShort(table.getCaptionLayoutTokens(), doc); + table.setLabeledCaption(captionProcess.getLeft()); + table.setCaptionLayoutTokens(captionProcess.getRight()); + } + if ( CollectionUtils.isNotEmpty(table.getNoteLayoutTokens())) { + Pair> noteProcess = processShort(table.getNoteLayoutTokens(), doc); + table.setLabeledNote(noteProcess.getLeft()); + table.setNoteLayoutTokens(noteProcess.getRight()); + } + } + } /** * Create training data for the table as identified by the full text model. @@ -2312,6 +2336,9 @@ private void toTEI(Document doc, List
figures, List
tables, List equations, + List
annexFigures, + List
annexTables, + List annexEquations, GrobidAnalysisConfig config) { if (doc.getBlocks() == null) { return; @@ -2348,7 +2375,10 @@ private void toTEI(Document doc, } tei = teiFormatter.toTEIAnnex(tei, reseAnnex, resHeader, resCitations, - tokenizationsAnnex, doc, config); + tokenizationsAnnex, + annexFigures, annexTables, annexEquations, + doc, config + ); tei = teiFormatter.toTEIReferences(tei, resCitations, config); doc.calculateTeiIdToBibDataSets();