From 5deee32e258a1d55a75be9cdf24a0d417f58bcae Mon Sep 17 00:00:00 2001
From: Daniel Naber
Date: Fri, 13 Nov 2015 14:22:20 +0100
Subject: [PATCH] avoid code duplication

---
 .../ngrams/ConfusionProbabilityRule.java   | 22 ++----------------
 .../rules/ngrams/GoogleToken.java          | 23 +++++++++++++++++++
 .../rules/ngrams/NgramProbabilityRule.java | 20 +---------------
 3 files changed, 26 insertions(+), 39 deletions(-)

diff --git a/languagetool-core/src/main/java/org/languagetool/rules/ngrams/ConfusionProbabilityRule.java b/languagetool-core/src/main/java/org/languagetool/rules/ngrams/ConfusionProbabilityRule.java
index 1b620e2a2b36..c8756e6053e1 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/ngrams/ConfusionProbabilityRule.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/ngrams/ConfusionProbabilityRule.java
@@ -90,7 +90,7 @@ public String getId() {
   @Override
   public RuleMatch[] match(AnalyzedSentence sentence) {
     String text = sentence.getText();
-    List<GoogleToken> tokens = getGoogleTokens(text, true);
+    List<GoogleToken> tokens = GoogleToken.getGoogleTokens(text, true, getWordTokenizer());
     List<RuleMatch> matches = new ArrayList<>();
     int pos = 0;
     for (GoogleToken googleToken : tokens) {
@@ -148,24 +148,6 @@ private String getMessage(ConfusionString textString, ConfusionString suggestion
     }
   }
 
-  // Tokenization in google ngram corpus is different from LT tokenization (e.g. {@code you ' re} -> {@code you 're}),
-  // so we use getTokenizer() and simple ignore the LT tokens.
-  private List<GoogleToken> getGoogleTokens(String sentence, boolean addStartToken) {
-    List<GoogleToken> result = new ArrayList<>();
-    if (addStartToken) {
-      result.add(new GoogleToken(LanguageModel.GOOGLE_SENTENCE_START, 0, 0));
-    }
-    List<String> tokens = getWordTokenizer().tokenize(sentence);
-    int startPos = 0;
-    for (String token : tokens) {
-      if (!StringTools.isWhitespace(token)) {
-        result.add(new GoogleToken(token, startPos, startPos+token.length()));
-      }
-      startPos += token.length();
-    }
-    return result;
-  }
-
   @Override
   public void reset() {
   }
@@ -279,7 +261,7 @@ private List<GoogleToken> getContext(GoogleToken token, List<GoogleToken> tokens, Lis
   }
 
   private double get3gramProbabilityFor(GoogleToken token, List<GoogleToken> tokens, String term) {
-    List<GoogleToken> newTokens = getGoogleTokens(term, false);
+    List<GoogleToken> newTokens = GoogleToken.getGoogleTokens(term, false, getWordTokenizer());
     Probability ngram3Left;
     Probability ngram3Middle;
     Probability ngram3Right;
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/ngrams/GoogleToken.java b/languagetool-core/src/main/java/org/languagetool/rules/ngrams/GoogleToken.java
index e7ee2c9afe87..653e76c2b0e2 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/ngrams/GoogleToken.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/ngrams/GoogleToken.java
@@ -18,8 +18,13 @@
  */
 package org.languagetool.rules.ngrams;
 
+import org.languagetool.languagemodel.LanguageModel;
+import org.languagetool.tokenizers.Tokenizer;
 import org.languagetool.tools.StringTools;
 
+import java.util.ArrayList;
+import java.util.List;
+
 /**
  * A token as tokenized in the Google ngram index.
  * @since 3.2
@@ -45,4 +50,22 @@ public String toString() {
     return token;
   }
 
+  // Tokenization in google ngram corpus is different from LT tokenization (e.g. {@code you ' re} -> {@code you 're}),
+  // so we use getTokenizer() and simple ignore the LT tokens.
+  static List<GoogleToken> getGoogleTokens(String sentence, boolean addStartToken, Tokenizer wordTokenizer) {
+    List<GoogleToken> result = new ArrayList<>();
+    if (addStartToken) {
+      result.add(new GoogleToken(LanguageModel.GOOGLE_SENTENCE_START, 0, 0));
+    }
+    List<String> tokens = wordTokenizer.tokenize(sentence);
+    int startPos = 0;
+    for (String token : tokens) {
+      if (!StringTools.isWhitespace(token)) {
+        result.add(new GoogleToken(token, startPos, startPos+token.length()));
+      }
+      startPos += token.length();
+    }
+    return result;
+  }
+
 }
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/ngrams/NgramProbabilityRule.java b/languagetool-core/src/main/java/org/languagetool/rules/ngrams/NgramProbabilityRule.java
index 32229a1320bb..bddc15f68997 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/ngrams/NgramProbabilityRule.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/ngrams/NgramProbabilityRule.java
@@ -71,7 +71,7 @@ public String getId() {
   @Override
   public RuleMatch[] match(AnalyzedSentence sentence) {
     String text = sentence.getText();
-    List<GoogleToken> tokens = getGoogleTokens(text, true);
+    List<GoogleToken> tokens = GoogleToken.getGoogleTokens(text, true, getWordTokenizer());
     List<RuleMatch> matches = new ArrayList<>();
     GoogleToken prevPrevToken = null;
     GoogleToken prevToken = null;
@@ -105,24 +105,6 @@ public void reset() {
   protected Tokenizer getWordTokenizer() {
     return language.getWordTokenizer();
   }
-
-  // Tokenization in google ngram corpus is different from LT tokenization (e.g. {@code you ' re} -> {@code you 're}),
-  // so we use getTokenizer() and simple ignore the LT tokens.
-  private List<GoogleToken> getGoogleTokens(String sentence, boolean addStartToken) {
-    List<GoogleToken> result = new ArrayList<>();
-    if (addStartToken) {
-      result.add(new GoogleToken(LanguageModel.GOOGLE_SENTENCE_START, 0, 0));
-    }
-    List<String> tokens = getWordTokenizer().tokenize(sentence);
-    int startPos = 0;
-    for (String token : tokens) {
-      if (!StringTools.isWhitespace(token)) {
-        result.add(new GoogleToken(token, startPos, startPos+token.length()));
-      }
-      startPos += token.length();
-    }
-    return result;
-  }
 
   private void debug(String message, Object... vars) {
     if (DEBUG) {
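Usage sketch (not part of the patch): after this change both rules call the shared package-private helper instead of keeping a private copy of getGoogleTokens(). The snippet below is illustrative only; the class name GoogleTokenUsageSketch is made up, and it assumes a Language instance is available, as both rules already hold one.

    package org.languagetool.rules.ngrams;

    import java.util.List;

    import org.languagetool.AnalyzedSentence;
    import org.languagetool.Language;

    // Illustrative caller in the same package (the helper is package-private).
    class GoogleTokenUsageSketch {

      private final Language language;

      GoogleTokenUsageSketch(Language language) {
        this.language = language;
      }

      // Same call shape as the changed call sites in ConfusionProbabilityRule and
      // NgramProbabilityRule: tokenize the sentence text the way the Google ngram
      // corpus does, prepending the artificial sentence-start token.
      List<GoogleToken> tokenize(AnalyzedSentence sentence) {
        return GoogleToken.getGoogleTokens(sentence.getText(), true, language.getWordTokenizer());
      }
    }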