Skip to content

Commit

Permalink
[uk] do not generate warnings for words with dot
Browse files Browse the repository at this point in the history
  • Loading branch information
arysin committed Jan 25, 2019
1 parent 532d5b1 commit 6672ba6
Showing 1 changed file with 11 additions and 3 deletions.
Expand Up @@ -39,6 +39,7 @@ public final class PatternTestTools {

// These characters should not be present in token values as they split tokens in all languages.
// A non-regex, non-POS token value containing any of them is reported as impossible to match.
private static final Pattern TOKEN_SEPARATOR_PATTERN = Pattern.compile("[ .,:;…!?(){}<>«»\"]");
// Same character set as TOKEN_SEPARATOR_PATTERN but without the dot. Selected for Ukrainian
// ("uk"), whose dictionary contains some abbreviations with a dot.
private static final Pattern TOKEN_SEPARATOR_PATTERN_NO_DOT = Pattern.compile("[ ,:;…!?(){}<>«»\"]");

// Heuristic for values that look like a regular expression although they are not flagged as one.
// It matches a backslash followed by one of d, D, s, S, w, W; a '*' preceded by a character
// other than '*'; or any of the metacharacters . + ? { } ( ) | [ ].
private static final Pattern PROBABLE_PATTERN = Pattern.compile("(\\\\[dDsSwW])|.*([^*]\\*|[.+?{}()|\\[\\]].*|\\\\d).*");

Expand Down Expand Up @@ -330,7 +331,13 @@ private static void warnIfElementNotKosher(
if (!isPos && !isRegularExpression && stringValue.length() > 1) {
// Example: <token>foo bar</token> can't be valid because
// token value contains a space which is a token separator.
if (TOKEN_SEPARATOR_PATTERN.matcher(stringValue).find()) {

// The Ukrainian dictionary contains some abbreviations with a dot, so for Ukrainian the dot is not treated as a token separator.
Pattern tokenSeparatorPattern = lang.getShortCode().equals("uk")
? TOKEN_SEPARATOR_PATTERN_NO_DOT
: TOKEN_SEPARATOR_PATTERN;

if (tokenSeparatorPattern.matcher(stringValue).find()) {
System.err.println("The " + lang + " rule: "
+ ruleId + ", token [" + tokenIndex + "], contains " + "\"" + stringValue
+ "\" that contains token separators, so can't possibly be matched.");
Expand All @@ -341,8 +348,9 @@ private static void warnIfElementNotKosher(
// since Polish uses dot '.' in POS tags. So a dot does not indicate that
// it's a probable regexp for Polish POS tags.
Pattern regexPattern = (isPos && lang.getShortCode().equals("pl"))
? PROBABLE_PATTERN_PL_POS // Polish POS tag.
: PROBABLE_PATTERN; // something else than Polish POS tag.
|| (!isPos && lang.getShortCode().equals("uk"))
? PROBABLE_PATTERN_PL_POS // Polish POS tag or Ukrainian token
: PROBABLE_PATTERN; // other usual cases

if (!isRegularExpression && stringValue.length() > 1
&& regexPattern.matcher(stringValue).find() && !NO_REGEXP.contains(stringValue)) {
Expand Down

0 comments on commit 6672ba6

Please sign in to comment.