diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java index 98b374d2a7..38afe84976 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java @@ -139,6 +139,33 @@ public List runSentenceDetection(String text, List 0) { + // Adjust the previous sentence to include this span + OffsetPosition previousSentence = finalSentencePositions.get(index - 1); + previousSentence.end = forbiddenSpan.end; + currentSentence.start = forbiddenSpan.end; + while (text.charAt(currentSentence.start) == ' ') { + if (currentSentence.start == text.length() - 1) { + break; + } else { + currentSentence.start++; + } + } + } + } + } + + finalSentencePositions = finalSentencePositions + .stream() + .filter(offsetPosition -> offsetPosition.end - offsetPosition.start > 0) + .collect(Collectors.toList()); + // as a heuristics for all implementations, because they clearly all fail for this case, we // attached to the right sentence the numerical bibliographical references markers expressed // in superscript just *after* the final sentence comma, e.g. @@ -251,44 +278,18 @@ public List runSentenceDetection(String text, List 0) { - // Adjust the previous sentence to include this span - OffsetPosition previousSentence = finalSentencePositions.get(index - 1); - previousSentence.end = forbiddenSpan.end; - currentSentence.start = forbiddenSpan.end; - while (text.charAt(currentSentence.start) == ' ') { - if (currentSentence.start == text.length() - 1) { - break; - } else { - currentSentence.start++; - } - } - } - } - } - - List cleanedSentencesPositions = finalSentencePositions - .stream() - .filter(offsetPosition -> offsetPosition.end - offsetPosition.start > 0) - .collect(Collectors.toList()); - - return cleanedSentencesPositions; } catch (Exception e) { LOGGER.warn("Cannot detect sentences. ", e); return null; } + } public String getXml(String text, List offsetPositions) {