Skip to content

Commit

Permalink
merge sentences whose boundaries are clashing with the annotations fr…
Browse files Browse the repository at this point in the history
…om the funding-acknowledgment
  • Loading branch information
lfoppiano committed May 5, 2024
1 parent fb17eec commit 6336512
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,26 @@
import org.grobid.core.engines.tagging.GenericTaggerUtils;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.features.FeaturesVectorFunding;
import org.grobid.core.layout.BoundingBox;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
import org.grobid.core.utilities.UnicodeUtil;
import org.grobid.core.utilities.*;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
import static org.grobid.core.engines.label.TaggingLabels.*;
import static org.grobid.core.layout.VectorGraphicBoxCalculator.mergeBoxes;

public class FundingAcknowledgementParser extends AbstractParser {

Expand Down Expand Up @@ -153,11 +159,11 @@ protected static Element injectedAnnotationsInNode(List<LayoutToken> tokenizatio
}

/**
* For convenience, a processing method taking a TEI XML segment as input - only paragraphs (Element p)
* For convenience, a processing method taking a TEI XML segment as input - only paragraphs (Element p)
* will be processed in this segment and paragraph element will be replaced with the processed content.
* Resulting entities are relative to the whole processed XML segment.
*
* Tokenization is done with the default Grobid analyzer triggered by the identified language.
*
* Tokenization is done with the default Grobid analyzer triggered by the identified language.
**/
public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affiliation>>> processingXmlFragment(String tei,
GrobidAnalysisConfig config) {
Expand Down Expand Up @@ -188,8 +194,8 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affili
List<Pair<OffsetPosition, Element>> annotations = localResult.left;
FundingAcknowledgmentParse localEntities = localResult.right;

List<OffsetPosition> list = annotations.stream().map(Pair::getLeft).toList();
List<OffsetPosition> annotationsPositionText = TextUtilities.matchTokenAndString(tokenizationFunding, paragraphText, list);
List<OffsetPosition> annotationsPositionTokens = annotations.stream().map(Pair::getLeft).toList();
List<OffsetPosition> annotationsPositionText = TextUtilities.matchTokenAndString(tokenizationFunding, paragraphText, annotationsPositionTokens);
List<Pair<OffsetPosition, Element>> annotationsWithPosRefToText = new ArrayList<>();
for (int i = 0; i < annotationsPositionText.size(); i++) {
annotationsWithPosRefToText.add(Pair.of(annotationsPositionText.get(i), annotations.get(i).getRight()));
Expand All @@ -205,7 +211,7 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affili
LOGGER.warn("While the configuration claim that paragraphs must be segmented, we did not find any sentence. ");
updateParagraphNodeWithAnnotations(paragraph, annotations);
}

mergeSentencesFallingOnAnnotations(sentences, annotations, config);
updateSentencesNodesWithAnnotations(sentences, annotations);
} else {
updateParagraphNodeWithAnnotations(paragraph, annotations);
Expand Down Expand Up @@ -233,6 +239,120 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affili
return globalResult;
}

/**
 * Merges the sentences whose boundaries clash with an annotation.
 * <p>
 * The annotations are extracted at paragraph level, while the sentences are computed from the
 * plain text. When an annotation spans the boundary between two (or more) sentences, those
 * sentences are merged into the first one, otherwise the annotation would be lost when it is
 * later applied sentence by sentence.
 * <p>
 * For every group of sentences to merge, the content of each following sentence is appended to
 * the first sentence of the group (adjacent text nodes are concatenated without any separator)
 * and, when coordinates are generated for the {@code s} elements, the {@code coords} attribute of
 * the first sentence is replaced with the merged bounding boxes of the whole group.
 * <p>
 * <b>This method modifies the sentences given in input</b>: merged sentences are detached from
 * their parent and removed from {@code sentences}.
 *
 * @param sentences   the sentence nodes of the paragraph, in document order
 * @param annotations the annotations with their offset positions
 * @param config      the analysis configuration, used to know whether sentence coordinates are generated
 * @return the same {@code sentences} instance, after the merge
 */
private static Nodes mergeSentencesFallingOnAnnotations(Nodes sentences, List<Pair<OffsetPosition, Element>> annotations, GrobidAnalysisConfig config) {
    List<OffsetPosition> sentencePositions = getOffsetPositionsFromNodes(sentences);

    List<OffsetPosition> annotationPositions = annotations.stream()
        .map(Pair::getLeft)
        .toList();

    // Sentence positions corrected so that no sentence boundary falls inside an annotation
    List<OffsetPosition> correctedOffsetPositions =
        SentenceUtilities.correctSentencePositions(sentencePositions, annotationPositions);

    boolean needToMergeCoordinates = config.isGenerateTeiCoordinates("s");
    List<Integer> toRemove = new ArrayList<>();

    for (OffsetPosition correctedOffsetPosition : correctedOffsetPositions) {
        // Indexes of the original sentences covered by this corrected sentence. Sentences that were
        // already merged are skipped, so that a zero-length sentence sitting exactly on a boundary
        // cannot be merged twice or be used as destination after having been detached.
        List<Integer> toMerge = new ArrayList<>();
        for (int i = 0; i < sentencePositions.size(); i++) {
            OffsetPosition original = sentencePositions.get(i);
            boolean covered = original.start >= correctedOffsetPosition.start
                && original.end <= correctedOffsetPosition.end;
            if (covered && !toRemove.contains(i)) {
                toMerge.add(i);
            }
        }

        // More than one original sentence within a corrected sentence means we need to merge
        if (toMerge.size() < 2) {
            continue;
        }

        Element destination = (Element) sentences.get(toMerge.get(0));
        List<BoundingBox> boundingBoxes = new ArrayList<>();
        if (needToMergeCoordinates) {
            boundingBoxes.addAll(parseSentenceCoordinates(destination));
        }

        for (int i = 1; i < toMerge.size(); i++) {
            int sentenceToMergeIndex = toMerge.get(i);
            Node sentenceToMerge = sentences.get(sentenceToMergeIndex);

            // Merge coordinates: the boxes of the sentence being merged are added to the destination ones
            if (needToMergeCoordinates && sentenceToMerge instanceof Element) {
                boundingBoxes.addAll(parseSentenceCoordinates((Element) sentenceToMerge));

                if (!boundingBoxes.isEmpty()) {
                    String coordsAsString = mergeBoxes(boundingBoxes).stream()
                        .map(BoundingBox::toString)
                        .collect(Collectors.joining(";"));
                    // addAttribute replaces an existing attribute having the same name
                    destination.addAttribute(new Attribute("coords", coordsAsString));
                }
            }

            // Merge content: snapshot the children first, because detaching them changes the child list
            List<Node> children = new ArrayList<>();
            for (int c = 0; c < sentenceToMerge.getChildCount(); c++) {
                children.add(sentenceToMerge.getChild(c));
            }

            int destinationChildCount = destination.getChildCount();
            Node previous = destinationChildCount > 0
                ? destination.getChild(destinationChildCount - 1)
                : null;

            for (Node child : children) {
                if (previous instanceof Text && child instanceof Text) {
                    // Two adjacent text nodes are collapsed into the one already in the destination
                    ((Text) previous).setValue(previous.getValue() + child.getValue());
                } else {
                    child.detach();
                    destination.appendChild(child);
                    previous = child;
                }
            }

            sentenceToMerge.detach();
            toRemove.add(sentenceToMergeIndex);
        }
    }

    // Remove from the highest index down, so that the remaining indexes stay valid
    toRemove.stream()
        .sorted(Comparator.reverseOrder())
        .forEach(sentences::remove);

    return sentences;
}

/**
 * Parses the bounding boxes stored in the {@code coords} attribute of a sentence element.
 *
 * @param sentence the sentence element
 * @return the bounding boxes, or an empty list when the attribute is missing or blank
 */
private static List<BoundingBox> parseSentenceCoordinates(Element sentence) {
    Attribute coords = sentence.getAttribute("coords");
    if (coords == null || coords.getValue().isBlank()) {
        return new ArrayList<>();
    }
    return Arrays.stream(coords.getValue().split(";"))
        .map(BoundingBox::fromString)
        .collect(Collectors.toList());
}

/**
 * Computes the offsets of each sentence node, assuming the sentences are contiguous: the first
 * sentence starts at 0 and every sentence starts where the previous one ends. The length of a
 * sentence is the length of its text value.
 *
 * @param sentences the sentence nodes, in order
 * @return one offset position per sentence, in the same order
 */
private static @NotNull List<OffsetPosition> getOffsetPositionsFromNodes(Nodes sentences) {
    final List<OffsetPosition> positions = new ArrayList<>();
    int cursor = 0;
    for (int index = 0; index < sentences.size(); index++) {
        final int length = sentences.get(index).getValue().length();
        positions.add(new OffsetPosition(cursor, cursor + length));
        cursor += length;
    }
    return positions;
}

private static void updateParagraphNodeWithAnnotations(Node paragraph, List<Pair<OffsetPosition, Element>> annotations) {
int pos = 0;
List<Node> newChildren = new ArrayList<>();
Expand Down Expand Up @@ -400,18 +520,18 @@ protected static Pair<List<String>, List<OffsetPosition>> extractSentencesAndPos
* The processing here is called from the header and/or full text parser in cascade
* when one of these higher-level models detects a "funding" section, or, in case
* no funding section is found, when an acknowledgements section is detected.
*
* Independently from the place this parser is called, it process the input sequence
* of layout tokens in a context free manner.
*
*
* Independently from the place this parser is called, it process the input sequence
* of layout tokens in a context free manner.
*
* The expected input here is a paragraph.
*
* // This returns a Element of the annotation and the position where should be injected, relative to the paragraph.
* // TODO: make new data objects for the annotations
*
* Return an XML fragment with inline annotations of the input text, together with
* extracted normalized entities. These entities are referenced by the inline
* annotations with the usual @target attribute pointing to xml:id.
*
* Return an XML fragment with inline annotations of the input text, together with
* extracted normalized entities. These entities are referenced by the inline
* annotations with the usual @target attribute pointing to xml:id.
*/
protected MutablePair<List<Pair<OffsetPosition, Element>>, FundingAcknowledgmentParse> getExtractionResult(List<LayoutToken> tokensParagraph, String labellingResult) {
List<Funding> fundings = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,27 +141,7 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
Collections.sort(forbidden);

// cancel sentence boundaries within the forbidden spans
List<OffsetPosition> finalSentencePositions = new ArrayList<>();
int forbiddenIndex = 0;
for(int j=0; j < sentencePositions.size(); j++) {
OffsetPosition position = sentencePositions.get(j);
for(int i=forbiddenIndex; i < forbidden.size(); i++) {
OffsetPosition forbiddenPos = forbidden.get(i);
if (forbiddenPos.end < position.end)
continue;
if (forbiddenPos.start > position.end)
break;
while ( (forbiddenPos.start < position.end && position.end < forbiddenPos.end) ) {
if (j+1 < sentencePositions.size()) {
position.end = sentencePositions.get(j+1).end;
j++;
forbiddenIndex = i;
} else
break;
}
}
finalSentencePositions.add(position);
}
List<OffsetPosition> finalSentencePositions = correctSentencePositions(sentencePositions, forbidden);

// as a heuristics for all implementations, because they clearly all fail for this case, we
// attached to the right sentence the numerical bibliographical references markers expressed
Expand Down Expand Up @@ -286,6 +266,31 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
}
}

/**
 * Cancels the sentence boundaries that fall strictly inside a forbidden span.
 * <p>
 * Each sentence is copied, and while its end falls strictly inside a forbidden span the copy is
 * extended up to the end of the following sentence (which is thereby absorbed). The input
 * positions are not modified. The scan over the forbidden spans never restarts before the span
 * that triggered the last extension.
 *
 * @param sentencePositions  the original sentence positions
 * @param forbiddenPositions the spans within which no sentence boundary is allowed
 * @return the corrected sentence positions, as new objects
 */
public static List<OffsetPosition> correctSentencePositions(List<OffsetPosition> sentencePositions, List<OffsetPosition> forbiddenPositions) {
    final List<OffsetPosition> corrected = new ArrayList<>();
    final int sentenceCount = sentencePositions.size();
    final int forbiddenCount = forbiddenPositions.size();

    int firstForbidden = 0;
    int s = 0;
    while (s < sentenceCount) {
        final OffsetPosition source = sentencePositions.get(s);
        final OffsetPosition current = new OffsetPosition(source.start, source.end);

        for (int f = firstForbidden; f < forbiddenCount; f++) {
            final OffsetPosition span = forbiddenPositions.get(f);
            if (span.end < current.end) {
                // this span ends before the sentence boundary, look at the next one
                continue;
            }
            if (span.start > current.end) {
                // this span starts after the sentence boundary, nothing more to check
                break;
            }
            // absorb the following sentences as long as the boundary is strictly inside the span
            while (span.start < current.end && current.end < span.end && s + 1 < sentenceCount) {
                s++;
                current.end = sentencePositions.get(s).end;
                firstForbidden = f;
            }
        }

        corrected.add(current);
        s++;
    }
    return corrected;
}

/**
* Return true if the token should be skipped when considering sentence content.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,26 @@ class FundingAcknowledgementParserIntegrationTest {
assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output))
}

/**
 * Sentences split in the middle of a person name ("Patrice" / "Lopez" and
 * "Pedro Baptista" / "de" / "Castro") are expected to be merged into a single sentence,
 * so that the person annotation can be applied to the whole name.
 */
@Test
fun testXmlFragmentProcessing_mergingSentences_shouldMergeCorrectly() {
    val analysisConfig = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder()
        .withSentenceSegmentation(true)
        .build()

    val teiFragment = "\n" +
        "\t\t\t<div type=\"acknowledgement\">\n" +
        "<div xmlns=\"http://www.tei-c.org/ns/1.0\"><head>Acknowledgements</head><p><s>Our warmest thanks to Patrice</s><s>Lopez, the author of Grobid <ref type=\"bibr\" target=\"#b21\">[22]</ref>, DeLFT <ref type=\"bibr\" target=\"#b19\">[20]</ref>, and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.</s><s>We thank Pedro Baptista</s><s>de</s><s>Castro for his support during this work.</s><s>Special thanks to Erina Fujita for useful tips on the manuscript.</s></p></div>\n" +
        "\t\t\t</div>\n\n"

    val expectedXml = "<div type=\"acknowledgement\">\n" +
        "<div><head>Acknowledgements</head><p><s>Our warmest thanks to <rs xmlns=\"http://www.tei-c.org/ns/1.0\" type=\"person\">PatriceLopez</rs>, the author of Grobid <ref type=\"bibr\" target=\"#b21\">[22]</ref>, DeLFT <ref type=\"bibr\" target=\"#b19\">[20]</ref>, and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.</s><s>We thank <rs xmlns=\"http://www.tei-c.org/ns/1.0\" type=\"person\">Pedro BaptistadeCastro</rs> for his support during this work.</s><s>Special thanks to <rs xmlns=\"http://www.tei-c.org/ns/1.0\" type=\"person\">Erina Fujita</rs> for useful tips on the manuscript.</s></p></div>\n" +
        "\t\t\t</div>"

    // only the processed element is checked here, the extracted entities are ignored
    val (processedElement, _) = target.processingXmlFragment(teiFragment, analysisConfig)

    assertThat(processedElement.toXML(), CompareMatcher.isIdenticalTo(expectedXml))
}

companion object {
@JvmStatic
@BeforeClass
Expand Down

0 comments on commit 6336512

Please sign in to comment.