Skip to content

Commit

Permalink
avoid code duplication
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed Nov 13, 2015
1 parent 282e720 commit 5deee32
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 39 deletions.
Expand Up @@ -90,7 +90,7 @@ public String getId() {
@Override @Override
public RuleMatch[] match(AnalyzedSentence sentence) { public RuleMatch[] match(AnalyzedSentence sentence) {
String text = sentence.getText(); String text = sentence.getText();
List<GoogleToken> tokens = getGoogleTokens(text, true); List<GoogleToken> tokens = GoogleToken.getGoogleTokens(text, true, getWordTokenizer());
List<RuleMatch> matches = new ArrayList<>(); List<RuleMatch> matches = new ArrayList<>();
int pos = 0; int pos = 0;
for (GoogleToken googleToken : tokens) { for (GoogleToken googleToken : tokens) {
Expand Down Expand Up @@ -148,24 +148,6 @@ private String getMessage(ConfusionString textString, ConfusionString suggestion
} }
} }


// Tokenization in the Google ngram corpus differs from LT tokenization (e.g. {@code you ' re} -> {@code you 're}),
// so we re-tokenize the raw sentence text with getWordTokenizer() and simply ignore the LT tokens.
// Returns tokens with their character offsets into {@code sentence}; whitespace tokens are skipped
// but still advance the offsets. If {@code addStartToken} is true, a zero-width sentence-start
// marker token is prepended at offset 0.
private List<GoogleToken> getGoogleTokens(String sentence, boolean addStartToken) {
  List<GoogleToken> result = new ArrayList<>();
  if (addStartToken) {
    result.add(new GoogleToken(LanguageModel.GOOGLE_SENTENCE_START, 0, 0));
  }
  List<String> tokens = getWordTokenizer().tokenize(sentence);
  int startPos = 0;
  for (String token : tokens) {
    if (!StringTools.isWhitespace(token)) {
      result.add(new GoogleToken(token, startPos, startPos+token.length()));
    }
    // advance even for whitespace tokens so offsets stay aligned with the original text
    startPos += token.length();
  }
  return result;
}

@Override @Override
public void reset() { public void reset() {
} }
Expand Down Expand Up @@ -279,7 +261,7 @@ private List<String> getContext(GoogleToken token, List<GoogleToken> tokens, Lis
} }


private double get3gramProbabilityFor(GoogleToken token, List<GoogleToken> tokens, String term) { private double get3gramProbabilityFor(GoogleToken token, List<GoogleToken> tokens, String term) {
List<GoogleToken> newTokens = getGoogleTokens(term, false); List<GoogleToken> newTokens = GoogleToken.getGoogleTokens(term, false, getWordTokenizer());
Probability ngram3Left; Probability ngram3Left;
Probability ngram3Middle; Probability ngram3Middle;
Probability ngram3Right; Probability ngram3Right;
Expand Down
Expand Up @@ -18,8 +18,13 @@
*/ */
package org.languagetool.rules.ngrams; package org.languagetool.rules.ngrams;


import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.tokenizers.Tokenizer;
import org.languagetool.tools.StringTools; import org.languagetool.tools.StringTools;


import java.util.ArrayList;
import java.util.List;

/** /**
* A token as tokenized in the Google ngram index. * A token as tokenized in the Google ngram index.
* @since 3.2 * @since 3.2
Expand All @@ -45,4 +50,22 @@ public String toString() {
return token; return token;
} }


// Tokenization in the Google ngram corpus differs from LT tokenization (e.g. {@code you ' re} -> {@code you 're}),
// so we run the given word tokenizer over the raw sentence and simply ignore the LT tokens.
// Whitespace tokens are dropped but still advance the character offsets, so each returned
// token carries its exact start/end position in {@code sentence}. When {@code addStartToken}
// is set, a zero-width sentence-start marker is prepended.
static List<GoogleToken> getGoogleTokens(String sentence, boolean addStartToken, Tokenizer wordTokenizer) {
  List<GoogleToken> googleTokens = new ArrayList<>();
  if (addStartToken) {
    googleTokens.add(new GoogleToken(LanguageModel.GOOGLE_SENTENCE_START, 0, 0));
  }
  int pos = 0;
  for (String part : wordTokenizer.tokenize(sentence)) {
    int end = pos + part.length();
    if (!StringTools.isWhitespace(part)) {
      googleTokens.add(new GoogleToken(part, pos, end));
    }
    pos = end;  // whitespace still consumes offsets
  }
  return googleTokens;
}

} }
Expand Up @@ -71,7 +71,7 @@ public String getId() {
@Override @Override
public RuleMatch[] match(AnalyzedSentence sentence) { public RuleMatch[] match(AnalyzedSentence sentence) {
String text = sentence.getText(); String text = sentence.getText();
List<GoogleToken> tokens = getGoogleTokens(text, true); List<GoogleToken> tokens = GoogleToken.getGoogleTokens(text, true, getWordTokenizer());
List<RuleMatch> matches = new ArrayList<>(); List<RuleMatch> matches = new ArrayList<>();
GoogleToken prevPrevToken = null; GoogleToken prevPrevToken = null;
GoogleToken prevToken = null; GoogleToken prevToken = null;
Expand Down Expand Up @@ -105,24 +105,6 @@ public void reset() {
protected Tokenizer getWordTokenizer() { protected Tokenizer getWordTokenizer() {
return language.getWordTokenizer(); return language.getWordTokenizer();
} }

// Tokenization in the Google ngram corpus differs from LT tokenization (e.g. {@code you ' re} -> {@code you 're}),
// so we re-tokenize the raw sentence text with getWordTokenizer() and simply ignore the LT tokens.
// Returns tokens with their character offsets into {@code sentence}; whitespace tokens are skipped
// but still advance the offsets. If {@code addStartToken} is true, a zero-width sentence-start
// marker token is prepended at offset 0.
private List<GoogleToken> getGoogleTokens(String sentence, boolean addStartToken) {
  List<GoogleToken> result = new ArrayList<>();
  if (addStartToken) {
    result.add(new GoogleToken(LanguageModel.GOOGLE_SENTENCE_START, 0, 0));
  }
  List<String> tokens = getWordTokenizer().tokenize(sentence);
  int startPos = 0;
  for (String token : tokens) {
    if (!StringTools.isWhitespace(token)) {
      result.add(new GoogleToken(token, startPos, startPos+token.length()));
    }
    // advance even for whitespace tokens so offsets stay aligned with the original text
    startPos += token.length();
  }
  return result;
}


private void debug(String message, Object... vars) { private void debug(String message, Object... vars) {
if (DEBUG) { if (DEBUG) {
Expand Down

0 comments on commit 5deee32

Please sign in to comment.