diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java
index 2b202ea4cb..d1199fbbb8 100644
--- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java
@@ -17,20 +17,26 @@ import org.grobid.core.engines.tagging.GenericTaggerUtils;
 import org.grobid.core.exceptions.GrobidException;
 import org.grobid.core.features.FeaturesVectorFunding;
+import org.grobid.core.layout.BoundingBox;
 import org.grobid.core.layout.LayoutToken;
 import org.grobid.core.tokenization.TaggingTokenCluster;
 import org.grobid.core.tokenization.TaggingTokenClusteror;
 import org.grobid.core.utilities.UnicodeUtil;
 import org.grobid.core.utilities.*;
+import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
 import java.util.List;
+import java.util.stream.Collectors;
 
 import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
 import static org.grobid.core.engines.label.TaggingLabels.*;
+import static org.grobid.core.layout.VectorGraphicBoxCalculator.mergeBoxes;
 
 public class FundingAcknowledgementParser extends AbstractParser {
 
@@ -153,11 +159,11 @@ protected static Element injectedAnnotationsInNode(List<LayoutToken> tokenizatio
     }
 
     /**
-     * For convenience, a processing method taking an TEI XML segment as input - only paragraphs (Element p) 
+     * For convenience, a processing method taking an TEI XML segment as input - only paragraphs (Element p)
      * will be processed in this segment and paragraph element will be replaced with the processed content.
      * Resulting entities are relative to the whole processed XML segment.
-     * 
-     * Tokenization is done with the default Grobid analyzer triggered by the identified language. 
+     *
+     * Tokenization is done with the default Grobid analyzer triggered by the identified language.
      **/
     public MutablePair<Element, MutableTriple<List<Funding>, List<Person>, List<Affiliation>>> processingXmlFragment(String tei,
                                                                                          GrobidAnalysisConfig config) {
@@ -188,8 +194,8 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Af
             List<Pair<OffsetPosition, Element>> annotations = localResult.left;
             FundingAcknowledgmentParse localEntities = localResult.right;
 
-            List<OffsetPosition> list = annotations.stream().map(Pair::getLeft).toList();
-            List<OffsetPosition> annotationsPositionText = TextUtilities.matchTokenAndString(tokenizationFunding, paragraphText, list);
+            List<OffsetPosition> annotationsPositionTokens = annotations.stream().map(Pair::getLeft).toList();
+            List<OffsetPosition> annotationsPositionText = TextUtilities.matchTokenAndString(tokenizationFunding, paragraphText, annotationsPositionTokens);
             List<Pair<OffsetPosition, Element>> annotationsWithPosRefToText = new ArrayList<>();
             for (int i = 0; i < annotationsPositionText.size(); i++) {
                 annotationsWithPosRefToText.add(Pair.of(annotationsPositionText.get(i), annotations.get(i).getRight()));
@@ -205,7 +211,7 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Af
+    /**
+     * <p>This method modify the sentences in input</p>
+     */
+    private static Nodes mergeSentencesFallingOnAnnotations(Nodes sentences, List<Pair<OffsetPosition, Element>> annotations, GrobidAnalysisConfig config) {
+        // We merge the sentences (including their coordinates) for which the annotations
+        // are falling in between two of them or they will be lost later.
+
+        List<OffsetPosition> sentencePositions = getOffsetPositionsFromNodes(sentences);
+
+        // We obtain the corrected coordinates that don't fall over the annotations
+        List<OffsetPosition> correctedOffsetPositions = SentenceUtilities.correctSentencePositions(sentencePositions, annotations
+            .stream()
+            .map(Pair::getLeft).toList());
+
+        List<Integer> toRemove = new ArrayList<>();
+        for (OffsetPosition correctedOffsetPosition : correctedOffsetPositions) {
+            List<OffsetPosition> originalSentences = sentencePositions.stream()
+                .filter(a -> a.start >= correctedOffsetPosition.start && a.end <= correctedOffsetPosition.end)
+                .toList();
+
+            // if for each "corrected sentences offset" there are more than one original sentence that
+            // falls into it, it means we need to merge
+            if (originalSentences.size() > 1) {
+                List<Integer> toMerge = originalSentences.stream()
+                    .map(sentencePositions::indexOf)
+                    .toList();
+
+                Element destination = (Element) sentences.get(toMerge.get(0));
+                boolean needToMergeCoordinates = config.isGenerateTeiCoordinates("s");
+                List<BoundingBox> boundingBoxes = new ArrayList<>();
+                Attribute destCoordinates = null;
+
+                if (needToMergeCoordinates) {
+                    destCoordinates = destination.getAttribute("coords");
+                    String coordinates = destCoordinates.getValue();
+                    boundingBoxes = Arrays.stream(coordinates.split(";"))
+                        .map(BoundingBox::fromString)
+                        .collect(Collectors.toList());
+                }
+
+                for (int i = 1; i < toMerge.size(); i++) {
+                    Integer sentenceToMergeIndex = toMerge.get(i);
+                    Node sentenceToMerge = sentences.get(sentenceToMergeIndex);
+
+                    // Merge coordinates
+                    if (needToMergeCoordinates) {
+                        Attribute coords = destination.getAttribute("coords");
+                        String coordinates = coords.getValue();
+                        boundingBoxes.addAll(Arrays.stream(coordinates.split(";"))
+                            .map(BoundingBox::fromString)
+                            .toList());
+
+                        List<BoundingBox> mergedBoundingBoxes = mergeBoxes(boundingBoxes);
+                        String coordsAsString = String.join(";", mergedBoundingBoxes.stream().map(BoundingBox::toString).toList());
+                        Attribute newCoords = new Attribute("coords", coordsAsString);
+                        destination.removeAttribute(coords);
+                        destination.addAttribute(newCoords);
+                    }
+
+                    // Merge content
+                    boolean first = true;
+                    Node previous = null;
+                    for (int c = 0; c < sentenceToMerge.getChildCount(); c++) {
+                        Node child = sentenceToMerge.getChild(c);
+
+                        if (first) {
+                            first = false;
+                            Node lastNodeDestination = destination.getChild(destination.getChildCount() - 1);
+                            previous = lastNodeDestination;
+//                            if (lastNodeDestination instanceof Text) {
+//                                ((Text) lastNodeDestination).setValue(((Text) lastNodeDestination).getValue() + " ");
+//                                previous = lastNodeDestination;
+//                            } else {
+//                                Text newSpace = new Text(" ");
+//                                destination.appendChild(newSpace);
+//                                previous = newSpace;
+//                            }
+                        }
+
+                        if (previous instanceof Text && child instanceof Text) {
+                            ((Text) previous).setValue(previous.getValue() + child.getValue());
+                        } else {
+                            ((Element) sentenceToMerge).replaceChild(child, new Text("placeholder"));
+                            child.detach();
+                            destination.appendChild(child);
+                            previous = child;
+                        }
+                    }
+                    sentenceToMerge.detach();
+                    toRemove.add(sentenceToMergeIndex);
+                }
+            }
+        }
+        toRemove.stream()
+            .sorted(Comparator.reverseOrder())
+            .forEach(sentences::remove);
+
+        return sentences;
+    }
+
+    private static @NotNull List<OffsetPosition> getOffsetPositionsFromNodes(Nodes sentences) {
+        List<OffsetPosition> sentencePositions = new ArrayList<>();
+        int start = 0;
+        for (Node sentence : sentences) {
+            int end = start + sentence.getValue().length();
+            sentencePositions.add(new OffsetPosition(start, end));
+            start = end;
+        }
+        return sentencePositions;
+    }
+
     private static void updateParagraphNodeWithAnnotations(Node paragraph, List<Pair<OffsetPosition, Element>> annotations) {
         int pos = 0;
         List<Node> newChildren = new ArrayList<>();
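As context for the coordinate handling above: when two s elements are merged, their coords attributes are parsed into BoundingBox objects, merged via VectorGraphicBoxCalculator.mergeBoxes(), and re-serialized into a single coords value. Below is a minimal, hypothetical sketch of that step only; it is not part of the patch, the class name and the literal coordinate values are invented, and the usual page,x,y,width,height layout of coords values is assumed.

import org.grobid.core.layout.BoundingBox;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import static org.grobid.core.layout.VectorGraphicBoxCalculator.mergeBoxes;

public class CoordsMergeSketch {
    public static void main(String[] args) {
        // "coords" attribute values of two adjacent <s> elements (invented values,
        // assuming the usual page,x,y,width,height format)
        String firstCoords = "1,53.80,371.68,183.51,9.52";
        String secondCoords = "1,53.80,383.22,204.07,9.52";

        // parse both coords strings into bounding boxes, as done in the patch
        List<BoundingBox> boundingBoxes = new ArrayList<>();
        boundingBoxes.addAll(Arrays.stream(firstCoords.split(";"))
                .map(BoundingBox::fromString)
                .collect(Collectors.toList()));
        boundingBoxes.addAll(Arrays.stream(secondCoords.split(";"))
                .map(BoundingBox::fromString)
                .collect(Collectors.toList()));

        // merge the boxes and serialize them back into a single coords value
        List<BoundingBox> merged = mergeBoxes(boundingBoxes);
        String coordsAsString = merged.stream()
                .map(BoundingBox::toString)
                .collect(Collectors.joining(";"));
        System.out.println(coordsAsString);
    }
}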
@@ -400,18 +520,18 @@ protected static Pair<List, List> extractSentencesAndPos
      * The processing here is called from the header and/or full text parser in cascade
      * when one of these higher-level model detect a "funding" section, or in case
      * no funding section is found, when a acknolwedgements section is detected.
-     * 
-     * Independently from the place this parser is called, it process the input sequence 
-     * of layout tokens in a context free manner. 
-     * 
+     *
+     * Independently from the place this parser is called, it process the input sequence
+     * of layout tokens in a context free manner.
+     *
      * The expected input here is a paragraph.
      *
      * // This returns a Element of the annotation and the position where should be injected, relative to the paragraph.
      * // TODO: make new data objects for the annotations
-     * 
-     * Return an XML fragment with inline annotations of the input text, together with 
-     * extracted normalized entities. These entities are referenced by the inline 
-     * annotations with the usual @target attribute pointing to xml:id. 
+     *
+     * Return an XML fragment with inline annotations of the input text, together with
+     * extracted normalized entities. These entities are referenced by the inline
+     * annotations with the usual @target attribute pointing to xml:id.
      */
     protected MutablePair<List<Pair<OffsetPosition, Element>>, FundingAcknowledgmentParse> getExtractionResult(List<LayoutToken> tokensParagraph, String labellingResult) {
         List<Funding> fundings = new ArrayList<>();
diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java
index c0b4498835..7446f26bc5 100644
--- a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java
+++ b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java
@@ -141,27 +141,7 @@ public List<OffsetPosition> runSentenceDetection(String text, List
-        List<OffsetPosition> finalSentencePositions = new ArrayList<>();
-        int forbiddenIndex = 0;
-        for(int j=0; j < sentencePositions.size(); j++) {
-            OffsetPosition position = sentencePositions.get(j);
-            for(int i=forbiddenIndex; i < forbidden.size(); i++) {
-                OffsetPosition forbiddenPos = forbidden.get(i);
-                if (forbiddenPos.end < position.end)
-                    continue;
-                if (forbiddenPos.start > position.end)
-                    break;
-                while ( (forbiddenPos.start < position.end && position.end < forbiddenPos.end) ) {
-                    if (j+1 < sentencePositions.size()) {
-                        position.end = sentencePositions.get(j+1).end;
-                        j++;
-                        forbiddenIndex = i;
-                    } else
-                        break;
-                }
-            }
-            finalSentencePositions.add(position);
-        }
+        List<OffsetPosition> finalSentencePositions = correctSentencePositions(sentencePositions, forbidden);
 
         // as a heuristics for all implementations, because they clearly all fail for this case, we
         // attached to the right sentence the numerical bibliographical references markers expressed
@@ -286,6 +266,31 @@ public List<OffsetPosition> runSentenceDetection(String text, List
+    public static List<OffsetPosition> correctSentencePositions(List<OffsetPosition> sentencePositions, List<OffsetPosition> forbiddenPositions) {
+        List<OffsetPosition> finalSentencePositions = new ArrayList<>();
+        int forbiddenIndex = 0;
+        for(int j = 0; j < sentencePositions.size(); j++) {
+            OffsetPosition position = new OffsetPosition(sentencePositions.get(j).start, sentencePositions.get(j).end);
+            for(int i = forbiddenIndex; i < forbiddenPositions.size(); i++) {
+                OffsetPosition forbiddenPos = forbiddenPositions.get(i);
+                if (forbiddenPos.end < position.end)
+                    continue;
+                if (forbiddenPos.start > position.end)
+                    break;
+                while ( (forbiddenPos.start < position.end && position.end < forbiddenPos.end) ) {
+                    if (j+1 < sentencePositions.size()) {
+                        position.end = sentencePositions.get(j+1).end;
+                        j++;
+                        forbiddenIndex = i;
+                    } else
+                        break;
+                }
+            }
+            finalSentencePositions.add(position);
+        }
+        return finalSentencePositions;
+    }
+
     /**
      * Return true if the token should be skipped when considering sentence content.
     */
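To make the behaviour of the extracted helper concrete, here is a minimal, hypothetical sketch (not part of the patch; the class name and the literal offsets are invented): a sentence whose end falls inside a forbidden span - for example a funder annotation - is grown to the end of the following sentence, which is what later drives the merging of s elements in FundingAcknowledgementParser.

import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.SentenceUtilities;

import java.util.List;

public class CorrectSentencePositionsSketch {
    public static void main(String[] args) {
        // three detected sentences: [0,25), [25,60), [60,90)
        List<OffsetPosition> sentences = List.of(
                new OffsetPosition(0, 25),
                new OffsetPosition(25, 60),
                new OffsetPosition(60, 90));

        // an annotation spanning [20,40) crosses the first sentence boundary
        List<OffsetPosition> forbidden = List.of(new OffsetPosition(20, 40));

        List<OffsetPosition> corrected = SentenceUtilities.correctSentencePositions(sentences, forbidden);
        for (OffsetPosition position : corrected) {
            System.out.println(position.start + "-" + position.end);
        }
        // prints 0-60 and 60-90: the first two sentences are merged so that the
        // annotation no longer straddles a sentence boundary
    }
}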
\n" + + "
Acknowledgements

Our warmest thanks to PatriceLopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank Pedro BaptistadeCastro for his support during this work.Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + + "\t\t\t
\n\n" + + val output = "
\n" + + "
Acknowledgements

Our warmest thanks to PatriceLopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank Pedro BaptistadeCastro for his support during this work.Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + + "\t\t\t
" + + val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() + .withSentenceSegmentation(true) + .build() + + val (element, mutableTriple) = target.processingXmlFragment(input, config) + + assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) + } + companion object { @JvmStatic @BeforeClass