Skip to content

Commit

Permalink
avoid code duplication
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed Nov 13, 2015
1 parent 282e720 commit 5deee32
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 39 deletions.
Expand Up @@ -90,7 +90,7 @@ public String getId() {
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
String text = sentence.getText();
List<GoogleToken> tokens = getGoogleTokens(text, true);
List<GoogleToken> tokens = GoogleToken.getGoogleTokens(text, true, getWordTokenizer());
List<RuleMatch> matches = new ArrayList<>();
int pos = 0;
for (GoogleToken googleToken : tokens) {
Expand Down Expand Up @@ -148,24 +148,6 @@ private String getMessage(ConfusionString textString, ConfusionString suggestion
}
}

// Tokenization in the google ngram corpus is different from LT tokenization (e.g. {@code you ' re} -> {@code you 're}),
// so we re-tokenize with getWordTokenizer() and simply ignore the LT tokens.
// Returns the sentence tokenized the way the Google ngram index expects, with character
// offsets into {@code sentence}; if {@code addStartToken} is true, the artificial
// sentence-start marker is prepended with a zero-length (0, 0) offset.
private List<GoogleToken> getGoogleTokens(String sentence, boolean addStartToken) {
  List<GoogleToken> result = new ArrayList<>();
  if (addStartToken) {
    result.add(new GoogleToken(LanguageModel.GOOGLE_SENTENCE_START, 0, 0));
  }
  List<String> tokens = getWordTokenizer().tokenize(sentence);
  int startPos = 0;
  for (String token : tokens) {
    // whitespace tokens are dropped from the result but still advance the offset
    if (!StringTools.isWhitespace(token)) {
      result.add(new GoogleToken(token, startPos, startPos+token.length()));
    }
    startPos += token.length();
  }
  return result;
}

@Override
public void reset() {
}
Expand Down Expand Up @@ -279,7 +261,7 @@ private List<String> getContext(GoogleToken token, List<GoogleToken> tokens, Lis
}

private double get3gramProbabilityFor(GoogleToken token, List<GoogleToken> tokens, String term) {
List<GoogleToken> newTokens = getGoogleTokens(term, false);
List<GoogleToken> newTokens = GoogleToken.getGoogleTokens(term, false, getWordTokenizer());
Probability ngram3Left;
Probability ngram3Middle;
Probability ngram3Right;
Expand Down
Expand Up @@ -18,8 +18,13 @@
*/
package org.languagetool.rules.ngrams;

import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.tokenizers.Tokenizer;
import org.languagetool.tools.StringTools;

import java.util.ArrayList;
import java.util.List;

/**
* A token as tokenized in the Google ngram index.
* @since 3.2
Expand All @@ -45,4 +50,22 @@ public String toString() {
return token;
}

// The Google ngram corpus tokenizes differently from LT (e.g. {@code you ' re} -> {@code you 're}),
// so we re-tokenize here with the given word tokenizer and simply ignore the LT tokens.
// Offsets of the returned tokens refer to positions in {@code sentence}; the optional
// artificial sentence-start marker gets a zero-length (0, 0) offset.
static List<GoogleToken> getGoogleTokens(String sentence, boolean addStartToken, Tokenizer wordTokenizer) {
  List<GoogleToken> googleTokens = new ArrayList<>();
  if (addStartToken) {
    googleTokens.add(new GoogleToken(LanguageModel.GOOGLE_SENTENCE_START, 0, 0));
  }
  int pos = 0;
  for (String tok : wordTokenizer.tokenize(sentence)) {
    int endPos = pos + tok.length();
    // whitespace is skipped, but the position still advances past it
    if (!StringTools.isWhitespace(tok)) {
      googleTokens.add(new GoogleToken(tok, pos, endPos));
    }
    pos = endPos;
  }
  return googleTokens;
}

}
Expand Up @@ -71,7 +71,7 @@ public String getId() {
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
String text = sentence.getText();
List<GoogleToken> tokens = getGoogleTokens(text, true);
List<GoogleToken> tokens = GoogleToken.getGoogleTokens(text, true, getWordTokenizer());
List<RuleMatch> matches = new ArrayList<>();
GoogleToken prevPrevToken = null;
GoogleToken prevToken = null;
Expand Down Expand Up @@ -105,24 +105,6 @@ public void reset() {
// Returns the word tokenizer of this rule's language.
protected Tokenizer getWordTokenizer() {
  return language.getWordTokenizer();
}

// Tokenization in the Google ngram corpus differs from LT tokenization (e.g. {@code you ' re} -> {@code you 're}),
// so we re-tokenize with getWordTokenizer() and simply ignore the LT tokens.
// Token offsets refer to character positions in {@code sentence}; when
// {@code addStartToken} is set, the artificial sentence-start marker is
// prepended with a zero-length (0, 0) offset.
private List<GoogleToken> getGoogleTokens(String sentence, boolean addStartToken) {
  List<GoogleToken> googleTokens = new ArrayList<>();
  if (addStartToken) {
    googleTokens.add(new GoogleToken(LanguageModel.GOOGLE_SENTENCE_START, 0, 0));
  }
  int offset = 0;
  for (String tok : getWordTokenizer().tokenize(sentence)) {
    // whitespace tokens are not emitted, but they still move the offset forward
    if (!StringTools.isWhitespace(tok)) {
      googleTokens.add(new GoogleToken(tok, offset, offset + tok.length()));
    }
    offset += tok.length();
  }
  return googleTokens;
}

private void debug(String message, Object... vars) {
if (DEBUG) {
Expand Down

0 comments on commit 5deee32

Please sign in to comment.