Skip to content

Commit

Permalink
small code simplification
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed Sep 12, 2015
1 parent 09420b9 commit a24c7d5
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 17 deletions.
Expand Up @@ -24,7 +24,7 @@
import org.languagetool.Language;
import org.languagetool.databroker.ResourceDataBroker;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.tokenizers.WordTokenizer;
import org.languagetool.tokenizers.Tokenizer;
import org.languagetool.tools.StringTools;

import java.io.IOException;
Expand Down Expand Up @@ -53,11 +53,10 @@ public abstract class ConfusionProbabilityRule extends Rule {
private final LanguageModel lm;  // ngram language model used for probability lookups; never null (requireNonNull in ctor)
private final long totalTokenCount;  // NOTE(review): presumably the model's total corpus token count — set outside this view, confirm
private final int grams;  // ngram order; the constructor rejects values outside 1..5
private final Language language;  // language this rule runs for; supplies the word tokenizer (see getWordTokenizer())

/**
 * Returns the message shown to the user for a match where {@code textString} was found
 * in the text and {@code suggestion} is the proposed replacement. Implemented per language.
 */
public abstract String getMessage(ConfusionString textString, ConfusionString suggestion);

protected abstract WordTokenizer getTokenizer();

/**
 * Creates a rule using the default ngram order of 3 (trigrams).
 * Delegates to the 4-argument constructor, which validates the grams range.
 */
public ConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language) {
this(messages, languageModel, language, 3);
}
Expand All @@ -75,6 +74,7 @@ public ConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageM
throw new RuntimeException(e);
}
this.lm = Objects.requireNonNull(languageModel);
this.language = Objects.requireNonNull(language);
if (grams < 1 || grams > 5) {
throw new IllegalArgumentException("grams must be between 1 and 5: " + grams);
}
Expand Down Expand Up @@ -121,14 +121,18 @@ public RuleMatch[] match(AnalyzedSentence sentence) {
return matches.toArray(new RuleMatch[matches.size()]);
}

/**
 * Returns the word tokenizer to use for splitting sentences into tokens.
 * By default this is the tokenizer provided by this rule's {@code Language};
 * subclasses may override to supply a different one.
 */
protected Tokenizer getWordTokenizer() {
  Tokenizer wordTokenizer = language.getWordTokenizer();
  return wordTokenizer;
}

// Tokenization in the google ngram corpus is different from LT tokenization (e.g. {@code you ' re} -> {@code you 're}),
// so we use getWordTokenizer() and simply ignore the LT tokens.
private List<GoogleToken> getGoogleTokens(String sentence, boolean addStartToken) {
List<GoogleToken> result = new ArrayList<>();
if (addStartToken) {
result.add(new GoogleToken(LanguageModel.GOOGLE_SENTENCE_START, 0, 0));
}
List<String> tokens = getTokenizer().tokenize(sentence);
List<String> tokens = getWordTokenizer().tokenize(sentence);
int startPos = 0;
for (String token : tokens) {
if (!StringTools.isWhitespace(token)) {
Expand Down
Expand Up @@ -21,7 +21,6 @@
import org.junit.Test;
import org.languagetool.*;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.tokenizers.WordTokenizer;
import org.languagetool.tools.StringTools;

import java.io.IOException;
Expand Down Expand Up @@ -181,10 +180,6 @@ private FakeRule(LanguageModel languageModel, Language language) {
super(JLanguageTool.getMessageBundle(), languageModel, language);
this.language = language;
}
@Override
protected WordTokenizer getTokenizer() {
return (WordTokenizer) language.getWordTokenizer();
}
/** Fake rule used only in tests — it has no user-visible description. */
@Override
public String getDescription() {
  return null;
}
/** Fake rule used only in tests — it produces no match message. */
@Override
public String getMessage(ConfusionString textString, ConfusionString suggestion) {
  return null;
}
}
Expand Down
Expand Up @@ -23,7 +23,6 @@
import org.languagetool.rules.ConfusionProbabilityRule;
import org.languagetool.rules.ConfusionString;
import org.languagetool.rules.Example;
import org.languagetool.tokenizers.WordTokenizer;

import java.util.ResourceBundle;

Expand All @@ -32,8 +31,6 @@
*/
public class GermanConfusionProbabilityRule extends ConfusionProbabilityRule {

private static final WordTokenizer WORD_TOKENIZER = new WordTokenizer();

/**
 * Creates the German confusion rule using the default ngram order of 3 (trigrams).
 * Delegates to the 4-argument constructor.
 */
public GermanConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language) {
this(messages, languageModel, language, 3);
}
Expand Down Expand Up @@ -61,8 +58,4 @@ public String getMessage(ConfusionString textString, ConfusionString suggestion)
}
}

@Override
protected WordTokenizer getTokenizer() {
return WORD_TOKENIZER;
}
}
Expand Up @@ -98,7 +98,7 @@ public String getMessage(ConfusionString textString, ConfusionString suggestion)
}

@Override
protected WordTokenizer getTokenizer() {
protected WordTokenizer getWordTokenizer() {
return tokenizer;
}
}

0 comments on commit a24c7d5

Please sign in to comment.