From 118a634a1bdc7f40617ba4136b4ee48f405c1033 Mon Sep 17 00:00:00 2001 From: Daniel Naber Date: Fri, 13 Nov 2015 14:36:58 +0100 Subject: [PATCH] cleanup: move Google-style tokenizer to its own class, will soon be used from more than one place --- .../ngrams/ConfusionProbabilityRule.java | 10 ++- .../rules/ngrams/NgramProbabilityRule.java | 5 +- .../en/EnglishConfusionProbabilityRule.java | 42 +---------- .../rules/en/GoogleStyleWordTokenizer.java | 70 +++++++++++++++++++ languagetool-standalone/CHANGES.md | 2 + 5 files changed, 83 insertions(+), 46 deletions(-) create mode 100644 languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/GoogleStyleWordTokenizer.java diff --git a/languagetool-core/src/main/java/org/languagetool/rules/ngrams/ConfusionProbabilityRule.java b/languagetool-core/src/main/java/org/languagetool/rules/ngrams/ConfusionProbabilityRule.java index c8756e6053e1..fa759d07c9b5 100644 --- a/languagetool-core/src/main/java/org/languagetool/rules/ngrams/ConfusionProbabilityRule.java +++ b/languagetool-core/src/main/java/org/languagetool/rules/ngrams/ConfusionProbabilityRule.java @@ -90,7 +90,7 @@ public String getId() { @Override public RuleMatch[] match(AnalyzedSentence sentence) { String text = sentence.getText(); - List tokens = GoogleToken.getGoogleTokens(text, true, getWordTokenizer()); + List tokens = GoogleToken.getGoogleTokens(text, true, getGoogleStyleWordTokenizer()); List matches = new ArrayList<>(); int pos = 0; for (GoogleToken googleToken : tokens) { @@ -134,7 +134,11 @@ public String getDescription() { return Tools.i18n(messages, "statistics_rule_description"); } - protected Tokenizer getWordTokenizer() { + /** + * Return a tokenizer that works more like Google does for its ngram index (which + * doesn't seem to be properly documented). 
+ */ + protected Tokenizer getGoogleStyleWordTokenizer() { return language.getWordTokenizer(); } @@ -261,7 +265,7 @@ private List getContext(GoogleToken token, List tokens, Lis } private double get3gramProbabilityFor(GoogleToken token, List tokens, String term) { - List newTokens = GoogleToken.getGoogleTokens(term, false, getWordTokenizer()); + List newTokens = GoogleToken.getGoogleTokens(term, false, getGoogleStyleWordTokenizer()); Probability ngram3Left; Probability ngram3Middle; Probability ngram3Right; diff --git a/languagetool-core/src/main/java/org/languagetool/rules/ngrams/NgramProbabilityRule.java b/languagetool-core/src/main/java/org/languagetool/rules/ngrams/NgramProbabilityRule.java index bddc15f68997..9dd5b6e785ee 100644 --- a/languagetool-core/src/main/java/org/languagetool/rules/ngrams/NgramProbabilityRule.java +++ b/languagetool-core/src/main/java/org/languagetool/rules/ngrams/NgramProbabilityRule.java @@ -26,7 +26,6 @@ import org.languagetool.rules.Rule; import org.languagetool.rules.RuleMatch; import org.languagetool.tokenizers.Tokenizer; -import org.languagetool.tools.StringTools; import java.util.*; @@ -71,7 +70,7 @@ public String getId() { @Override public RuleMatch[] match(AnalyzedSentence sentence) { String text = sentence.getText(); - List tokens = GoogleToken.getGoogleTokens(text, true, getWordTokenizer()); + List tokens = GoogleToken.getGoogleTokens(text, true, getGoogleStyleWordTokenizer()); List matches = new ArrayList<>(); GoogleToken prevPrevToken = null; GoogleToken prevToken = null; @@ -102,7 +101,7 @@ public String getDescription() { public void reset() { } - protected Tokenizer getWordTokenizer() { + protected Tokenizer getGoogleStyleWordTokenizer() { return language.getWordTokenizer(); } diff --git a/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/EnglishConfusionProbabilityRule.java b/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/EnglishConfusionProbabilityRule.java index 
2b2bea102219..c2a5e8b2ad40 100644 --- a/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/EnglishConfusionProbabilityRule.java +++ b/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/EnglishConfusionProbabilityRule.java @@ -23,53 +23,15 @@ import org.languagetool.rules.ngrams.ConfusionProbabilityRule; import org.languagetool.rules.Example; import org.languagetool.tokenizers.WordTokenizer; -import org.languagetool.tokenizers.en.EnglishWordTokenizer; -import java.util.List; import java.util.ResourceBundle; -import java.util.Stack; /** * @since 2.7 */ public class EnglishConfusionProbabilityRule extends ConfusionProbabilityRule { - private final EnglishWordTokenizer tokenizer = new EnglishWordTokenizer() { - @Override - public String getTokenizingCharacters() { - return super.getTokenizingCharacters() + "-"; - } - @Override - public List tokenize(final String text) { - List tokens = super.tokenize(text); - String prev = null; - final Stack l = new Stack<>(); - for (String token : tokens) { - if ("'".equals(prev)) { - // TODO: add more cases if needed: - if (token.equals("m")) { - l.pop(); - l.push("'m"); - } else if (token.equals("re")) { - l.pop(); - l.push("'re"); - } else if (token.equals("ve")) { - l.pop(); - l.push("'ve"); - } else if (token.equals("ll")) { - l.pop(); - l.push("'ll"); - } else { - l.push(token); - } - } else { - l.push(token); - } - prev = token; - } - return l; - } - }; + private final WordTokenizer tokenizer = new GoogleStyleWordTokenizer(); public EnglishConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language) { this(messages, languageModel, language, 3); @@ -82,7 +44,7 @@ public EnglishConfusionProbabilityRule(ResourceBundle messages, LanguageModel la } @Override - protected WordTokenizer getWordTokenizer() { + protected WordTokenizer getGoogleStyleWordTokenizer() { return tokenizer; } } diff --git 
a/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/GoogleStyleWordTokenizer.java b/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/GoogleStyleWordTokenizer.java new file mode 100644 index 000000000000..0aa865dd9108 --- /dev/null +++ b/languagetool-language-modules/en/src/main/java/org/languagetool/rules/en/GoogleStyleWordTokenizer.java @@ -0,0 +1,70 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package org.languagetool.rules.en; + +import org.languagetool.tokenizers.WordTokenizer; + +import java.util.List; +import java.util.Stack; + +/** + * Tokenizes sentences into tokens the way Google does for its ngram index. Note: there + * doesn't seem to be any official documentation of how Google tokenizes text for that + * index, so this is just an approximation. 
+ * @since 3.2 + */ +class GoogleStyleWordTokenizer extends WordTokenizer { + + @Override + public String getTokenizingCharacters() { + return super.getTokenizingCharacters() + "-"; + } + + @Override + public List tokenize(final String text) { + List tokens = super.tokenize(text); + String prev = null; + final Stack l = new Stack<>(); + for (String token : tokens) { + if ("'".equals(prev)) { + // TODO: add more cases if needed: + if (token.equals("m")) { + l.pop(); + l.push("'m"); + } else if (token.equals("re")) { + l.pop(); + l.push("'re"); + } else if (token.equals("ve")) { + l.pop(); + l.push("'ve"); + } else if (token.equals("ll")) { + l.pop(); + l.push("'ll"); + } else { + l.push(token); + } + } else { + l.push(token); + } + prev = token; + } + return l; + } + +} diff --git a/languagetool-standalone/CHANGES.md b/languagetool-standalone/CHANGES.md index 3ab50eda1d36..6c4ccba6d064 100644 --- a/languagetool-standalone/CHANGES.md +++ b/languagetool-standalone/CHANGES.md @@ -42,6 +42,8 @@ #### API * `ConfusionProbabilityRule` has been moved to package `org.languagetool.rules.ngrams` + * `ConfusionProbabilityRule.getWordTokenizer()` is now called + `ConfusionProbabilityRule.getGoogleStyleWordTokenizer()` * `RuleAsXmlSerializer` has been renamed to `RuleMatchAsXmlSerializer` * some formerly deprecated code has been removed * some code has been deprecated