From 79ebdd7f87254da060c53b28801a931dac686018 Mon Sep 17 00:00:00 2001 From: lopez Date: Sat, 26 Mar 2022 20:17:10 +0100 Subject: [PATCH] more robustness wrt sentence segmenter output --- .../main/java/org/grobid/core/document/TEIFormatter.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 86ac78a571..1b4a6d9ac7 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1502,8 +1502,11 @@ public void segmentIntoSentences(Element curParagraph, List curPara if (refPos >= pos+posInSentence && refPos <= pos+sentenceLength) { Node valueNode = mapRefNodes.get(new Integer(refPos)); - if (pos+posInSentence < refPos) - sentenceElement.appendChild(text.substring(pos+posInSentence, refPos)); + if (pos+posInSentence < refPos) { + String local_text_chunk = text.substring(pos+posInSentence, refPos); + local_text_chunk = XmlBuilderUtils.stripNonValidXMLCharacters(local_text_chunk); + sentenceElement.appendChild(local_text_chunk); + } valueNode.detach(); sentenceElement.appendChild(valueNode); refIndex = j;