Skip to content

Commit

Permalink
filter out possible corrupted characters introduced by the sentence s…
Browse files Browse the repository at this point in the history
…egmenter
  • Loading branch information
kermitt2 committed Mar 23, 2022
1 parent dcc8438 commit d8e2afd
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 4 deletions.
Expand Up @@ -1515,7 +1515,9 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
}

if (pos+posInSentence <= theSentences.get(i).end) {
sentenceElement.appendChild(text.substring(pos+posInSentence, theSentences.get(i).end));
String local_text_chunk = text.substring(pos+posInSentence, theSentences.get(i).end);
local_text_chunk = XmlBuilderUtils.stripNonValidXMLCharacters(local_text_chunk);
sentenceElement.appendChild(local_text_chunk);
curParagraph.appendChild(sentenceElement);
}
}
Expand All @@ -1532,8 +1534,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
}
}

}

}

/**
* Return the graphic objects in a given interval position in the document.
Expand Down
Expand Up @@ -100,4 +100,23 @@ public static void main(String[] args) throws ParsingException, IOException {
System.out.println(toXml(e));

}

/**
 * Remove from a string every character that is not allowed in an XML 1.0 document.
 *
 * The characters kept are those matching the XML 1.0 "Char" production:
 * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF].
 *
 * The input is scanned by Unicode code point rather than by UTF-16 code unit, so
 * that a well-formed surrogate pair (a supplementary character in the range
 * #x10000-#x10FFFF) is kept as a whole, while an unpaired surrogate, which is not
 * a valid XML character, is dropped.
 *
 * @param in the string to be filtered, possibly null
 * @return the input without the characters invalid in XML 1.0, or an empty string
 *         if the input is null or empty
 */
public static String stripNonValidXMLCharacters(String in) {
    if (in == null || in.isEmpty()) {
        return "";
    }

    StringBuilder out = new StringBuilder(in.length());
    int i = 0;
    while (i < in.length()) {
        // codePointAt returns the surrogate value itself when the surrogate is
        // unpaired, which is then rejected by the range checks below
        int codePoint = in.codePointAt(i);
        i += Character.charCount(codePoint);

        if ((codePoint == 0x9) ||
            (codePoint == 0xA) ||
            (codePoint == 0xD) ||
            ((codePoint >= 0x20) && (codePoint <= 0xD7FF)) ||
            ((codePoint >= 0xE000) && (codePoint <= 0xFFFD)) ||
            ((codePoint >= 0x10000) && (codePoint <= 0x10FFFF))) {
            out.appendCodePoint(codePoint);
        }
    }
    return out.toString();
}
}
Expand Up @@ -71,8 +71,10 @@ public class TextUtilities {

// a regular expression for identifying url pattern in text
// TODO: maybe find a better regex (better == more robust, not more "standard")
static public final Pattern urlPattern = Pattern
static public final Pattern urlPattern0 = Pattern
.compile("(?i)(https?|ftp)\\s?:\\s?//\\s?[-A-Z0-9+&@#/%?=~_()|!:,.;]*[-A-Z0-9+&@#/%=~_()|]");
static public final Pattern urlPattern = Pattern
.compile("(?i)(https?|ftp)\\s?:\\s?\\/\\/\\s?[-A-Z0-9+&@#\\/%?=~_()|!:.;]*[-A-Z0-9+&@#\\/%=~_()]");

// a regular expression for identifying email pattern in text
// TODO: maybe find a better regex (better == more robust, not more "standard")
Expand Down

0 comments on commit d8e2afd

Please sign in to comment.