Skip to content

Commit

Permalink
Merge dbc7312 into 506c00a
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Nov 5, 2020
2 parents 506c00a + dbc7312 commit 0143025
Show file tree
Hide file tree
Showing 8 changed files with 926 additions and 52 deletions.
Expand Up @@ -507,7 +507,6 @@ static public Pair<String, LayoutTokenization> getBodyTextFeatured(Document doc,
else if (nbNumbType > (bibDataSets.size() / 2))
bibRefCalloutType = "AUTHOR";
} catch(EntityMatcherException e) {
e.printStackTrace();
LOGGER.info("Could not build the bibliographical matcher", e);
}
}
Expand Down
Expand Up @@ -5,13 +5,15 @@
import org.grobid.core.layout.LayoutToken;

import java.util.*;
import java.util.function.Predicate;
import java.util.stream.Collectors;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Class for using sentence segmentation (singleton). The actual sentence segmentation implementation
* is specified in the Grobid configuration. See org.grobid.core.lang.impl.* for the available
* is specified in the Grobid configuration. See org.grobid.core.lang.impl.* for the available
* implementations.
*
*/
Expand Down Expand Up @@ -60,8 +62,7 @@ private SentenceUtilities() {
* Basic run for sentence identification, return the offset positions of the
* identified sentences
*
* @param text
* text to segment into sentences
* @param text text to segment into sentences
* @return list of offset positions for the identified sentence, relative to the input text
*/
public List<OffsetPosition> runSentenceDetection(String text) {
Expand All @@ -80,21 +81,19 @@ public List<OffsetPosition> runSentenceDetection(String text) {
* identified sentences without sentence boundaries within a forbidden span (typically a reference marker
* and we don't want a sentence end/start in the middle of that).
*
* @param text
* text to segment into sentences
* @param forbidden
* list of offset positions where sentence boundaries are forbidden
* @param text text to segment into sentences
* @param forbidden list of offset positions where sentence boundaries are forbidden
* @return list of offset positions for the identified sentence, relative to the input text
*/
public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPosition> forbidden) {
return runSentenceDetection(text, forbidden, null);
}

/**
* Run for sentence identification with some forbidden span constraints, return the offset positions of the
* Run for sentence identification with some forbidden span constraints, return the offset positions of the
* identified sentences without sentence boundaries within a forbidden span (typically a reference marker
* and we don't want a sentence end/start in the middle of that). The original LayoutToken objects are
* provided, which allows to apply additional heuristics based on document layout and font features.
* provided, which allows to apply additional heuristics based on document layout and font features.
*
* @param text
* text to segment into sentences
Expand All @@ -114,6 +113,7 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
// to be sure, we sort the forbidden positions
if (forbidden == null)
return sentencePositions;

Collections.sort(forbidden);

// cancel sentence boundaries within the forbidden spans
Expand All @@ -139,21 +139,48 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
finalSentencePositions.add(position);
}

// as a heuristics for all implementations, because they clearly all fail for this case, we
// attached to the right sentence the numerical bibliographical references markers expressed
// adjust the forbidden spans - if they are present at the beginning of the sentence, move them to the
// end of the previous sentence

for (int index = 0; index < finalSentencePositions.size(); index++) {
OffsetPosition currentSentence = finalSentencePositions.get(index);
for (OffsetPosition forbiddenSpan : forbidden) {
if (forbiddenSpan.start == currentSentence.start && index > 0) {
// Adjust the previous sentence to include this span
OffsetPosition previousSentence = finalSentencePositions.get(index - 1);
previousSentence.end = forbiddenSpan.end;
currentSentence.start = forbiddenSpan.end;
while (text.charAt(currentSentence.start) == ' ') {
if (currentSentence.start == text.length() - 1) {
break;
} else {
currentSentence.start++;
}
}
}
}
}

finalSentencePositions = finalSentencePositions
.stream()
.filter(offsetPosition -> offsetPosition.end - offsetPosition.start > 0)
.collect(Collectors.toList());

// as a heuristics for all implementations, because they clearly all fail for this case, we
// attached to the right sentence the numerical bibliographical references markers expressed
// in superscript just *after* the final sentence comma, e.g.
// "Laboratory tests at the time of injury were not predictive of outcome. 32"
// or
// "CSF-1 has been linked to tumor growth and progression in breast cancer, 5,6 and has been
// shown to effectively reduce the number of tumor-associated macrophages in different tumor
// "CSF-1 has been linked to tumor growth and progression in breast cancer, 5,6 and has been
// shown to effectively reduce the number of tumor-associated macrophages in different tumor
// types. 4,5"
// or
// or
// "Even if the symmetry is s- like, it does not necessarily indicate that the
// superconductivity is not exotic, because the s- like symmetry or the fully gapped state
// may be realized by the pairing mediated by the interband excitations of the electrons. 23) "

if (finalSentencePositions.size() == 0) {
// this should normally not happen, but it happens (depending on sentence splitter, usually the text
// this should normally not happen, but it happens (depending on sentence splitter, usually the text
// is just a punctuation)
// in this case we consider the current text as a unique sentence as fall back
finalSentencePositions.add(new OffsetPosition(0, text.length()));
Expand All @@ -166,14 +193,14 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio

// init sentence index
int currentSentenceIndex = 0;
String sentenceChunk = text.substring(finalSentencePositions.get(currentSentenceIndex).start,
String sentenceChunk = text.substring(finalSentencePositions.get(currentSentenceIndex).start,
finalSentencePositions.get(currentSentenceIndex).end);
boolean moved = false;

// iterate on layout tokens in sync with sentences
for(int i=0; i<textLayoutTokens.size(); i++) {
LayoutToken token = textLayoutTokens.get(i);
if (token.getText() == null || token.getText().length() == 0)
if (token.getText() == null || token.getText().length() == 0)
continue;

if (this.toSkipToken(token.getText()))
Expand All @@ -191,10 +218,10 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
int j = i;
for(; j<textLayoutTokens.size(); j++) {
LayoutToken nextToken = textLayoutTokens.get(j);
if (nextToken.getText() == null || nextToken.getText().length() == 0)
if (nextToken.getText() == null || nextToken.getText().length() == 0)
continue;

// we don't look beyond an end of line (to prevent from numbered list/notes)
// we don't look beyond an end of line (to prevent from numbered list/notes)
if (nextToken.getText().equals("\n"))
break;

Expand All @@ -210,7 +237,7 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
if (this.isValidSuperScriptNumericalReferenceMarker(nextToken)) {
pushedEnd += buffer + nextToken.getText().length();
buffer = 0;
} else
} else
break;
}

Expand Down Expand Up @@ -240,30 +267,52 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
currentSentenceIndex++;
if (currentSentenceIndex >= finalSentencePositions.size())
break;
sentenceChunk = text.substring(finalSentencePositions.get(currentSentenceIndex).start,
sentenceChunk = text.substring(finalSentencePositions.get(currentSentenceIndex).start,
finalSentencePositions.get(currentSentenceIndex).end);
moved = false;
}
pos = 0;
}

if (currentSentenceIndex >= finalSentencePositions.size())
break;
}


// other heuristics/post-corrections based on layout/style features of the tokens could be added
// here, for instance non-breakable italic or bold chunks, or adding sentence split based on
// here, for instance non-breakable italic or bold chunks, or adding sentence split based on
// spacing/indent

return finalSentencePositions;

} catch (Exception e) {
LOGGER.warn("Cannot detect sentences. ", e);
return null;
}

}

/**
 * Serialise a text and its sentence spans as a simple XML string: every span is wrapped
 * in an {@code <s>} element, the characters outside the spans are kept as they are, and
 * the whole is enclosed in a {@code <sents>} root element.
 *
 * NOTE(review): the text is copied verbatim, characters such as {@code <} or {@code &}
 * are not XML-escaped - confirm whether the callers expect escaped output.
 *
 * @param text            the original text the offsets refer to
 * @param offsetPositions sentence spans relative to text, expected in increasing order and
 *                        non-overlapping; null or empty yields the text without any
 *                        {@code <s>} element
 * @return the text with the sentence markup
 */
public String getXml(String text, List<OffsetPosition> offsetPositions) {
    // a failed sentence detection yields null: treat it like "no sentence found"
    List<OffsetPosition> positions = offsetPositions == null
            ? Collections.<OffsetPosition>emptyList()
            : offsetPositions;

    StringBuilder outputText = new StringBuilder("<sents>");
    int cursor = 0;
    for (OffsetPosition position : positions) {
        // text between the previous sentence (or the beginning) and this sentence
        outputText.append(text, cursor, position.start);
        outputText.append("<s>").append(text, position.start, position.end).append("</s>");
        cursor = position.end;
    }
    // trailing text after the last sentence
    outputText.append(text, cursor, text.length());

    return outputText.append("</sents>").toString();
}
/**
* Return true if the token should be skipped when considering sentence content.
* Return true if the token should be skipped when considering sentence content.
*/
public static boolean toSkipToken(String tok) {
// the hyphen is considered to be skipped to cover the case of word hyphenation
Expand All @@ -282,7 +331,7 @@ private static boolean toSkipTokenNoHyphen(String tok) {


/**
* Return true if the token is a valid numerical reference marker ([0-9,())\-\]\[) in superscript.
* Return true if the token is a valid numerical reference marker ([0-9,())\-\]\[) in superscript.
*/
private static boolean isValidSuperScriptNumericalReferenceMarker(LayoutToken token) {

Expand All @@ -292,10 +341,10 @@ private static boolean isValidSuperScriptNumericalReferenceMarker(LayoutToken to
return true;
}
if (token.isSuperscript() && token.getText().matches("[0-9,\\-\\(\\)\\[\\]]+")) {
//System.out.println("isValidSuperScriptNumericalReferenceMarker: " + token.getText() + " -> true");
//System.out.println("isValidSuperScriptNumericalReferenceMarker: " + token.getText() + " -> true");
return true;
} else {
//System.out.println("isValidSuperScriptNumericalReferenceMarker: " + token.getText() + " -> false");
//System.out.println("isValidSuperScriptNumericalReferenceMarker: " + token.getText() + " -> false");
return false;
}
}
Expand Down
Expand Up @@ -200,12 +200,12 @@ public Void handleResponse(HttpResponse response) throws ClientProtocolException
}

/**
 * Textual form of the request parameters: {@code " ("} followed by one
 * {@code ,key=value} entry per parameter and a closing {@code ")"}.
 * Every entry, including the first one, is preceded by a comma.
 *
 * @return the parameters rendered as a string, {@code " ()"} when params is null or empty
 */
@Override
public String toString() {
    StringBuilder str = new StringBuilder(" (");
    if (params != null) {
        for (Entry<String, String> cursor : params.entrySet()) {
            str.append(",").append(cursor.getKey()).append("=").append(cursor.getValue());
        }
    }
    str.append(")");
    return str.toString();
}
}
Expand Up @@ -4,13 +4,13 @@
import org.junit.Before;
import org.junit.Test;

import java.util.List;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.Matchers.hasSize;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertThat;

public class SentenceUtilitiesTest {

Expand Down Expand Up @@ -66,4 +66,41 @@ public void testTwoSentencesTextWithUsefullForbidden() throws Exception {
List<OffsetPosition> theSentences = SentenceUtilities.getInstance().runSentenceDetection(text, forbidden);
assertThat(theSentences.size(), is(1));
}

/**
 * getXml must wrap exactly the given spans in s elements and keep the characters
 * outside the spans untouched, even when a span does not line up with a sentence.
 */
@Test
public void testGetText() throws Exception {
    final String input = "Bla bla bla. Bli bli bli.";

    final List<OffsetPosition> spans = new ArrayList<>();
    spans.add(new OffsetPosition(0, 12));
    spans.add(new OffsetPosition(14, 21));

    final String expected = "<sents><s>Bla bla bla.</s> B<s>li bli </s>bli.</sents>";

    assertThat(SentenceUtilities.getInstance().getXml(input, spans), is(expected));
}

/**
 * Reference markers located right after the final punctuation ("1−4" and "5−8", given as
 * forbidden spans) must end up inside the sentence they follow, not at the start of the next one.
 */
@Test
public void testParagraphWithMarkersOutsideTheSentence() throws Exception {
    final SentenceUtilities segmenter = SentenceUtilities.getInstance();

    final String paragraph = "Precisely controlling surface chemistry using self-assembled monolayers (SAMs) and bilayers " +
        "has been a central focus of research in both synthetic and biological interfaces. " +
        "1−4 Much synthetic monolayer chemistry has its basis in the formation of SAMs of alkanethiols " +
        "on gold and the coinage metals, pioneered by groups including those of Whitesides, " +
        "Nuzzo, and Allara in the 1980s. 5−8 ";

    // offsets of the two reference markers within the paragraph
    final List<OffsetPosition> markerSpans = new ArrayList<>();
    markerSpans.add(new OffsetPosition(174, 177));
    markerSpans.add(new OffsetPosition(383, 386));

    final String expectedXml = "<sents><s>Precisely controlling " +
        "surface chemistry using self-assembled monolayers (SAMs) and bilayers " +
        "has been a central focus of research in both synthetic and biological interfaces. " +
        "1−4</s> <s>Much synthetic monolayer chemistry has its basis in the formation of SAMs of alkanethiols " +
        "on gold and the coinage metals, pioneered by groups including those of Whitesides, " +
        "Nuzzo, and Allara in the 1980s. 5−8</s> </sents>";

    final List<OffsetPosition> sentences = segmenter.runSentenceDetection(paragraph, markerSpans);

    assertThat(segmenter.getXml(paragraph, sentences), is(expectedXml));
}
}
Expand Up @@ -702,6 +702,21 @@ public Response annotatePDFPatentCitation(
return restProcessFiles.annotateCitationPatentPDF(inputStream, consol, includeRaw);
}

/**
 * POST endpoint on /segmentSentence: reads the text to segment from the multipart form
 * field named by INPUT and delegates to restProcessString.segmentSentences.
 *
 * @param text the raw text received in the form field
 * @return the response built by restProcessString.segmentSentences, declared as XML
 */
@Path("/segmentSentence")
@Consumes(MediaType.MULTIPART_FORM_DATA)
@Produces(MediaType.APPLICATION_XML)
@POST
public Response segmentSentenceText_post(@FormDataParam(INPUT) String text) {
return restProcessString.segmentSentences(text);
}

/**
 * GET variant of the /segmentSentence endpoint: reads the text to segment from the query
 * parameter named by INPUT and delegates to restProcessString.segmentSentences.
 *
 * @param text the raw text received in the query parameter
 * @return the response built by restProcessString.segmentSentences, declared as XML
 */
@Path("/segmentSentence")
@Produces(MediaType.APPLICATION_XML)
@GET
public Response segmentSentenceText_get(@QueryParam(INPUT) String text) {
return restProcessString.segmentSentences(text);
}

/**
 * Replace the file-processing delegate used by this resource.
 *
 * @param restProcessFiles the instance to store in the restProcessFiles field
 */
public void setRestProcessFiles(GrobidRestProcessFiles restProcessFiles) {
this.restProcessFiles = restProcessFiles;
}
Expand Down

0 comments on commit 0143025

Please sign in to comment.