Skip to content

Commit

Permalink
Allow per-language set of characters to ignore in tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
arysin committed Jan 22, 2015
1 parent b39733f commit af08112
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 5 deletions.
Expand Up @@ -31,6 +31,7 @@
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
Expand All @@ -40,6 +41,7 @@
import java.util.*;
import java.util.concurrent.Callable;
import java.util.jar.Manifest;
import java.util.regex.Pattern;

/**
* The main class used for checking text against different rules:
Expand Down Expand Up @@ -737,14 +739,19 @@ public AnalyzedSentence getRawAnalyzedSentence(final String sentence) throws IOE
}

/**
* Strips ignored characters from the given tokens, overwriting the affected list entries in place.
* NOTE(review): this listing appears to interleave the earlier soft-hyphen-only statements
* with the regex-based statements that supersede them (two result maps, two nested
* conditions, two returns) - confirm against the real file before editing.
*
* @param tokens tokens to clean; entries containing ignored characters are replaced by their stripped form
* @return map from token index to the token's original text, one entry per token that was changed
*/
private Map<Integer, String> replaceSoftHyphens(List<String> tokens) {
final Map<Integer, String> softHyphenTokens = new HashMap<>();
// pattern of characters to drop from tokens, as supplied by the current language
Pattern ignoredCharacterRegex = language.getIgnoredCharactersRegex();

final Map<Integer, String> ignoredCharsTokens = new HashMap<>();
// no pattern available: nothing can be stripped, so hand back the empty map
if( ignoredCharacterRegex == null )
return ignoredCharsTokens;

for (int i = 0; i < tokens.size(); i++) {
// soft-hyphen-only variant: reacts to U+00AD exclusively
if (tokens.get(i).indexOf('\u00ad') != -1) {
softHyphenTokens.put(i, tokens.get(i));
tokens.set(i, tokens.get(i).replaceAll("\u00ad", ""));
// regex-based variant: remember the original token first, then remove every match from it
if ( ignoredCharacterRegex.matcher(tokens.get(i)).find() ) {
ignoredCharsTokens.put(i, tokens.get(i));
tokens.set(i, ignoredCharacterRegex.matcher(tokens.get(i)).replaceAll(""));
}
}
return softHyphenTokens;
return ignoredCharsTokens;
}

/**
Expand Down
18 changes: 18 additions & 0 deletions languagetool-core/src/main/java/org/languagetool/Language.java
Expand Up @@ -43,6 +43,7 @@
import java.lang.reflect.Constructor;
import java.net.URL;
import java.util.*;
import java.util.regex.Pattern;

/**
* Base class for any supported language (English, German, etc). Language classes
Expand All @@ -62,6 +63,7 @@ public abstract class Language {
private final List<String> externalRuleFiles = new ArrayList<>();

private boolean isExternalLanguage = false;
private Pattern ignoredCharactersRegex = Pattern.compile("[\u00AD]");
private List<PatternRule> patternRules;

/**
Expand Down Expand Up @@ -716,4 +718,20 @@ private boolean hasCountry() {
return getCountries().length == 1;
}

/**
 * Provides the pattern describing the characters that are ignored inside tokens.
 *
 * @return the compiled regular expression currently configured for this language
 */
public Pattern getIgnoredCharactersRegex() {
  return this.ignoredCharactersRegex;
}

/**
 * Sets the regular expression (usually a set of characters) to ignore inside tokens.
 * By default only the soft hyphen (U+00AD) is ignored.
 *
 * @param ignoredCharactersRegex source of the regular expression, compiled via
 *        {@link Pattern#compile(String)}; must not be {@code null}
 * @throws NullPointerException if {@code ignoredCharactersRegex} is {@code null}
 * @throws java.util.regex.PatternSyntaxException if the expression's syntax is invalid
 * @since 2.9
 */
public void setIgnoredCharactersRegex(String ignoredCharactersRegex) {
  // Fail fast with a descriptive message instead of an anonymous NPE from inside Pattern.compile.
  Objects.requireNonNull(ignoredCharactersRegex, "ignoredCharactersRegex must not be null");
  this.ignoredCharactersRegex = Pattern.compile(ignoredCharactersRegex);
}

}

0 comments on commit af08112

Please sign in to comment.