cleanup: move Google-style tokenizer to its own class, will soon be used from more than one place
danielnaber committed Nov 13, 2015
1 parent 5deee32 commit 118a634
Showing 5 changed files with 83 additions and 46 deletions.
@@ -90,7 +90,7 @@ public String getId() {
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
String text = sentence.getText();
List<GoogleToken> tokens = GoogleToken.getGoogleTokens(text, true, getWordTokenizer());
List<GoogleToken> tokens = GoogleToken.getGoogleTokens(text, true, getGoogleStyleWordTokenizer());
List<RuleMatch> matches = new ArrayList<>();
int pos = 0;
for (GoogleToken googleToken : tokens) {
@@ -134,7 +134,11 @@ public String getDescription() {
return Tools.i18n(messages, "statistics_rule_description");
}

protected Tokenizer getWordTokenizer() {
/**
* Return a tokenizer that works more like Google does for its ngram index (which
* doesn't seem to be properly documented).
*/
protected Tokenizer getGoogleStyleWordTokenizer() {
return language.getWordTokenizer();
}

@@ -261,7 +265,7 @@ private List<String> getContext(GoogleToken token, List<GoogleToken> tokens, Lis
}

private double get3gramProbabilityFor(GoogleToken token, List<GoogleToken> tokens, String term) {
List<GoogleToken> newTokens = GoogleToken.getGoogleTokens(term, false, getWordTokenizer());
List<GoogleToken> newTokens = GoogleToken.getGoogleTokens(term, false, getGoogleStyleWordTokenizer());
Probability ngram3Left;
Probability ngram3Middle;
Probability ngram3Right;
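
As a side note (not part of this commit): any language-specific rule can plug in its own Google-style tokenizer by overriding the renamed extension point above; the English rule further down in this diff does exactly that. A minimal sketch for a hypothetical language "xx" — the class name, package, and constructor wiring are assumptions, only getGoogleStyleWordTokenizer() and the WordTokenizer API come from the code shown here:

package org.languagetool.rules.xx;  // hypothetical package

import java.util.ResourceBundle;

import org.languagetool.Language;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.rules.ngrams.ConfusionProbabilityRule;
import org.languagetool.tokenizers.Tokenizer;
import org.languagetool.tokenizers.WordTokenizer;

public class XxConfusionProbabilityRule extends ConfusionProbabilityRule {

  // Tokenizer tweaked to come closer to Google's ngram tokenization;
  // here it simply also treats '-' as a tokenizing character:
  private final Tokenizer tokenizer = new WordTokenizer() {
    @Override
    public String getTokenizingCharacters() {
      return super.getTokenizingCharacters() + "-";
    }
  };

  public XxConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language) {
    super(messages, languageModel, language);  // assumed base constructor, mirroring the English rule below
  }

  @Override
  protected Tokenizer getGoogleStyleWordTokenizer() {
    return tokenizer;
  }
}
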
@@ -26,7 +26,6 @@
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.tokenizers.Tokenizer;
import org.languagetool.tools.StringTools;

import java.util.*;

@@ -71,7 +70,7 @@ public String getId() {
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
String text = sentence.getText();
List<GoogleToken> tokens = GoogleToken.getGoogleTokens(text, true, getWordTokenizer());
List<GoogleToken> tokens = GoogleToken.getGoogleTokens(text, true, getGoogleStyleWordTokenizer());
List<RuleMatch> matches = new ArrayList<>();
GoogleToken prevPrevToken = null;
GoogleToken prevToken = null;
@@ -102,7 +101,7 @@ public String getDescription() {
public void reset() {
}

protected Tokenizer getWordTokenizer() {
protected Tokenizer getGoogleStyleWordTokenizer() {
return language.getWordTokenizer();
}

@@ -23,53 +23,15 @@
import org.languagetool.rules.ngrams.ConfusionProbabilityRule;
import org.languagetool.rules.Example;
import org.languagetool.tokenizers.WordTokenizer;
import org.languagetool.tokenizers.en.EnglishWordTokenizer;

import java.util.List;
import java.util.ResourceBundle;
import java.util.Stack;

/**
* @since 2.7
*/
public class EnglishConfusionProbabilityRule extends ConfusionProbabilityRule {

private final EnglishWordTokenizer tokenizer = new EnglishWordTokenizer() {
@Override
public String getTokenizingCharacters() {
return super.getTokenizingCharacters() + "-";
}
@Override
public List<String> tokenize(final String text) {
List<String> tokens = super.tokenize(text);
String prev = null;
final Stack<String> l = new Stack<>();
for (String token : tokens) {
if ("'".equals(prev)) {
// TODO: add more cases if needed:
if (token.equals("m")) {
l.pop();
l.push("'m");
} else if (token.equals("re")) {
l.pop();
l.push("'re");
} else if (token.equals("ve")) {
l.pop();
l.push("'ve");
} else if (token.equals("ll")) {
l.pop();
l.push("'ll");
} else {
l.push(token);
}
} else {
l.push(token);
}
prev = token;
}
return l;
}
};
private final WordTokenizer tokenizer = new GoogleStyleWordTokenizer();

public EnglishConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language) {
this(messages, languageModel, language, 3);
@@ -82,7 +44,7 @@ public EnglishConfusionProbabilityRule(ResourceBundle messages, LanguageModel la
}

@Override
protected WordTokenizer getWordTokenizer() {
protected WordTokenizer getGoogleStyleWordTokenizer() {
return tokenizer;
}
}
@@ -0,0 +1,70 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules.en;

import org.languagetool.tokenizers.WordTokenizer;

import java.util.List;
import java.util.Stack;

/**
* Tokenizes sentences into tokens the way Google does for its ngram index. Note: there
* doesn't seem to be official documentation about the way Google tokenizes, so this is
* just an approximation.
* @since 3.2
*/
class GoogleStyleWordTokenizer extends WordTokenizer {

@Override
public String getTokenizingCharacters() {
return super.getTokenizingCharacters() + "-";
}

@Override
public List<String> tokenize(final String text) {
List<String> tokens = super.tokenize(text);
String prev = null;
final Stack<String> l = new Stack<>();
for (String token : tokens) {
if ("'".equals(prev)) {
// TODO: add more cases if needed:
if (token.equals("m")) {
l.pop();
l.push("'m");
} else if (token.equals("re")) {
l.pop();
l.push("'re");
} else if (token.equals("ve")) {
l.pop();
l.push("'ve");
} else if (token.equals("ll")) {
l.pop();
l.push("'ll");
} else {
l.push(token);
}
} else {
l.push(token);
}
prev = token;
}
return l;
}

}
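
A quick usage sketch (not part of this commit) to illustrate what the class above does; the demo class name is made up, and the expected output assumes WordTokenizer returns delimiter characters such as whitespace and punctuation as tokens of their own:

package org.languagetool.rules.en;  // GoogleStyleWordTokenizer is package-private

import java.util.List;

class GoogleStyleWordTokenizerDemo {
  public static void main(String[] args) {
    GoogleStyleWordTokenizer tokenizer = new GoogleStyleWordTokenizer();
    // The split-off "re" is re-attached to its apostrophe, and '-' now splits words:
    List<String> tokens = tokenizer.tokenize("You're well-known");
    System.out.println(tokens);
    // expected roughly: [You, 're,  , well, -, known]  (whitespace token included)
  }
}
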
2 changes: 2 additions & 0 deletions languagetool-standalone/CHANGES.md
@@ -42,6 +42,8 @@

#### API
* `ConfusionProbabilityRule` has been moved to package `org.languagetool.rules.ngrams`
* `ConfusionProbabilityRule.getWordTokenizer()` is now called
`ConfusionProbabilityRule.getGoogleStyleWordTokenizer()`
* `RuleAsXmlSerializer` has been renamed to `RuleMatchAsXmlSerializer`
* some formerly deprecated code has been removed
* some code has been deprecated
