Skip to content

Commit

Permalink
[de] update to latest compound splitter library
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed Apr 20, 2015
1 parent dcd6754 commit e294ba1
Show file tree
Hide file tree
Showing 10 changed files with 82 additions and 47 deletions.
Expand Up @@ -57,29 +57,7 @@ public List<String> getSuggestions(String word) throws IOException {
if (needsInit) {
init();
}
final List<String> candidates = new ArrayList<>();
final List<String> parts = wordSplitter.tokenize(word);
int partCount = 0;
for (String part : parts) {
if (hunspellDict.misspelled(part)) {
List<String> suggestions = morfoSpeller.getSuggestions(part);
if (suggestions.size() == 0) {
suggestions = morfoSpeller.getSuggestions(StringTools.uppercaseFirstChar(part));
}
for (String suggestion : suggestions) {
final List<String> partsCopy = new ArrayList<>(parts);
if (partCount > 0 && !parts.get(partCount-1).endsWith("-")) {
partsCopy.set(partCount, suggestion.toLowerCase());
} else {
partsCopy.set(partCount, suggestion);
}
candidates.add(StringTools.listToString(partsCopy, ""));
}
}
// TODO: what if there's no misspelled parts like for Arbeitamt = Arbeit+Amt ??
// -> morfologik must be extended to return similar words even for known words
partCount++;
}
final List<String> candidates = getCandidates(word);
final List<String> suggestions = getCorrectWords(candidates);

final List<String> noSplitSuggestions = morfoSpeller.getSuggestions(word); // after getCorrectWords() so spelling.txt is considered
Expand All @@ -102,6 +80,43 @@ public List<String> getSuggestions(String word) throws IOException {
return sortedSuggestions.subList(0, Math.min(MAX_SUGGESTIONS, sortedSuggestions.size()));
}

/**
 * Builds correction candidates for a (possibly compound) word.
 * <p>
 * The word is split with {@code wordSplitter} and the resulting parts are handed to
 * {@link #getCandidates(List)}, which replaces misspelled parts by suggestions and
 * re-joins them. Returning the bare parts here would make the caller treat word
 * fragments as suggestions for the whole word.
 * Subclasses may override this to use a different splitting strategy.
 *
 * @param word the word to create candidates for
 * @return re-assembled candidate words, possibly empty
 */
protected List<String> getCandidates(String word) {
  return getCandidates(wordSplitter.tokenize(word));
}

/**
 * Creates correction candidates for a compound that has already been split into parts.
 * <p>
 * For every part that {@code hunspellDict} reports as misspelled, each suggestion from
 * {@code morfoSpeller} is substituted for that part, the parts are joined without a
 * separator, and the joined word is kept only if {@code isMisspelled} does not reject it.
 *
 * @param parts the parts of the compound, in order
 * @return the accepted re-assembled words; empty if no part is misspelled or no
 *         re-assembled word is accepted
 */
protected List<String> getCandidates(List<String> parts) {
  final List<String> candidates = new ArrayList<>();
  for (int partIndex = 0; partIndex < parts.size(); partIndex++) {
    final String part = parts.get(partIndex);
    if (!hunspellDict.misspelled(part)) {
      // TODO: what if there are no misspelled parts, like for Arbeitamt = Arbeit+Amt ??
      // -> morfologik must be extended to return similar words even for known words
      continue;
    }
    // assume noun, so use uppercase:
    final boolean doUpperCase = partIndex > 0 && !StringTools.startsWithUppercase(part);
    List<String> suggestions = morfoSpeller.getSuggestions(doUpperCase ? StringTools.uppercaseFirstChar(part) : part);
    // Only retry when the first lookup used the uppercased form; otherwise the
    // retry would repeat exactly the same query.
    if (suggestions.isEmpty() && doUpperCase) {
      suggestions = morfoSpeller.getSuggestions(StringTools.lowercaseFirstChar(part));
    }
    for (String suggestion : suggestions) {
      final List<String> partsCopy = new ArrayList<>(parts);
      if (partIndex > 0 && part.startsWith("-") && part.length() > 1) {
        // Keep the hyphen of the original part and capitalize what follows it.
        // Strip a leading hyphen from the suggestion only if it has one, so that a
        // suggestion without hyphen does not lose its first letter.
        final String afterHyphen = suggestion.startsWith("-") ? suggestion.substring(1) : suggestion;
        partsCopy.set(partIndex, "-" + StringTools.uppercaseFirstChar(afterHyphen));
      } else if (partIndex > 0 && !parts.get(partIndex - 1).endsWith("-")) {
        // a part inside a compound that is not introduced by a hyphen is written lowercase
        partsCopy.set(partIndex, suggestion.toLowerCase());
      } else {
        partsCopy.set(partIndex, suggestion);
      }
      final String candidate = StringTools.listToString(partsCopy, "");
      if (!isMisspelled(candidate)) {
        candidates.add(candidate);
      }
    }
  }
  return candidates;
}

/**
 * Orders the suggestions for a misspelled word. This implementation returns
 * {@code suggestions} unchanged and does not use {@code misspelling};
 * presumably meant as an override point for subclasses (the method is protected).
 *
 * @param misspelling the misspelled word the suggestions were created for
 * @param suggestions the suggestions to order
 * @return the same list instance that was passed in
 */
protected List<String> sortSuggestionByQuality(String misspelling, List<String> suggestions) {
return suggestions;
}
Expand Down
Expand Up @@ -116,7 +116,7 @@ public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
return toRuleMatchArray(ruleMatches);
}

private boolean isMisspelled(String word) {
boolean isMisspelled(String word) {
boolean isAlphabetic = true;
if (word.length() == 1) { // hunspell dictionaries usually do not contain punctuation
isAlphabetic = Character.isAlphabetic(word.charAt(0));
Expand Down
Expand Up @@ -18,15 +18,14 @@
*/
package org.languagetool.dev.blogs;

import de.abelssoft.tools.FileTools;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.commandline.CommandLineTools;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

/**
Expand All @@ -41,11 +40,9 @@ private void check(File dir, Language lang) throws IOException {
lt.disableRule("UNPAIRED_BRACKETS");
File[] files = dir.listFiles();
for (File file : files) {
try (FileInputStream is = new FileInputStream(file)) {
System.out.println("\n=== " + file.getName() + " ================================");
String content = cleanup(FileTools.loadFile(is, "utf-8"));
CommandLineTools.checkText(content, lt);
}
System.out.println("\n=== " + file.getName() + " ================================");
String content = cleanup(FileUtils.readFileToString(file, "utf-8"));
CommandLineTools.checkText(content, lt);
}
}

Expand Down
Expand Up @@ -23,7 +23,7 @@
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
import de.abelssoft.tools.FileTools;
import org.apache.commons.io.FileUtils;
import org.languagetool.tools.StringTools;

import java.io.*;
Expand Down Expand Up @@ -113,7 +113,7 @@ public static void main(String[] args) throws IOException {
System.err.println("Usage: " + BlogFetcher.class.getSimpleName() + " <urlListFile> <outputDir>");
System.exit(1);
}
String secret = FileTools.loadFile(new FileInputStream(READABILITY_API_KEY_FILE), "utf-8").trim();
String secret = FileUtils.readFileToString(new File(READABILITY_API_KEY_FILE), "utf-8").trim();
BlogFetcher fetcher = new BlogFetcher(secret);
File outputDir = new File(args[1]);
if (!outputDir.exists() || !outputDir.isDirectory()) {
Expand Down
4 changes: 2 additions & 2 deletions languagetool-language-modules/de/pom.xml
Expand Up @@ -88,9 +88,9 @@
</dependency>

<dependency>
<groupId>de.abelssoft</groupId>
<groupId>de.danielnaber</groupId>
<artifactId>jwordsplitter</artifactId>
<version>3.4</version>
<version>4.0</version>
</dependency>

<dependency>
Expand Down
Expand Up @@ -24,8 +24,8 @@
import java.util.List;
import java.util.ResourceBundle;

import de.abelssoft.wordtools.jwordsplitter.AbstractWordSplitter;
import de.abelssoft.wordtools.jwordsplitter.impl.GermanWordSplitter;
import de.danielnaber.jwordsplitter.AbstractWordSplitter;
import de.danielnaber.jwordsplitter.GermanWordSplitter;
import org.jetbrains.annotations.NotNull;
import org.languagetool.Language;
import org.languagetool.chunking.Chunker;
Expand Down Expand Up @@ -172,7 +172,7 @@ public CompoundWordTokenizer getNonStrictCompoundSplitter() {
try {
final AbstractWordSplitter wordSplitter = new GermanWordSplitter(false);
wordSplitter.setStrictMode(false); // there's a spelling mistake in (at least) one part, so strict mode wouldn't split the word
((GermanWordSplitter)wordSplitter).setMinimumWordLength(3);
wordSplitter.setMinimumWordLength(3);
compoundTokenizer = new CompoundWordTokenizer() {
@Override
public List<String> tokenize(String word) {
Expand Down
Expand Up @@ -18,6 +18,7 @@
*/
package org.languagetool.rules.de;

import de.danielnaber.jwordsplitter.GermanWordSplitter;
import org.jetbrains.annotations.Nullable;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
Expand Down Expand Up @@ -73,19 +74,37 @@ public class GermanSpellerRule extends CompoundAwareHunspellRule {
);

private final GermanCompoundTokenizer compoundTokenizer;
private final GermanWordSplitter splitter;

/**
 * Creates the German spell checking rule.
 * <p>
 * Besides the non-strict compound splitter passed to the super class, the rule keeps the
 * language's strict compound tokenizer and its own {@link GermanWordSplitter}, which
 * {@code getCandidates(String)} uses to obtain all possible splits of a word.
 *
 * @param messages the resource bundle passed on to the super class
 * @param language the German language instance that provides the splitters and the speller
 * @throws RuntimeException if the {@link GermanWordSplitter} cannot be created; the
 *         {@link IOException} is kept as the cause
 */
public GermanSpellerRule(ResourceBundle messages, German language) {
  super(messages, language, language.getNonStrictCompoundSplitter(), getSpeller(language));
  addExamplePair(Example.wrong("LanguageTool kann mehr als eine <marker>nromale</marker> Rechtschreibprüfung."),
                 Example.fixed("LanguageTool kann mehr als eine <marker>normale</marker> Rechtschreibprüfung."));
  compoundTokenizer = language.getStrictCompoundTokenizer();
  try {
    splitter = new GermanWordSplitter(false);
  } catch (IOException e) {
    // add context so the failure can be attributed to the splitter setup
    throw new RuntimeException("Could not set up German compound splitter", e);
  }
}

@Override
public String getId() {
return RULE_ID;
}

/**
 * Builds correction candidates from every split that {@code splitter} finds for the word,
 * not only from a single split. The candidates of each split are created by
 * {@code super.getCandidates(List)} and appended in the order the splits are returned.
 *
 * @param word the word to create candidates for
 * @return a new list with the candidates of all splits, possibly empty
 */
@Override
public List<String> getCandidates(String word) {
  final List<String> candidates = new ArrayList<>();
  for (List<String> parts : splitter.getAllSplits(word)) {
    candidates.addAll(super.getCandidates(parts));
  }
  return candidates;
}

@Override
protected void addIgnoreWords(String origLine, Set<String> wordsToBeIgnored) {
String line;
Expand Down
Expand Up @@ -21,10 +21,9 @@
import java.io.IOException;
import java.util.List;

import de.danielnaber.jwordsplitter.GermanWordSplitter;
import org.languagetool.tokenizers.Tokenizer;

import de.abelssoft.wordtools.jwordsplitter.impl.GermanWordSplitter;

/**
* Split German nouns using the jWordSplitter library.
*
Expand All @@ -42,7 +41,7 @@ public GermanCompoundTokenizer() throws IOException {

@Override
public List<String> tokenize(String word) {
return (List<String>) wordSplitter.splitWord(word);
return wordSplitter.splitWord(word);
}

public static void main(String[] args) throws IOException {
Expand Down
Expand Up @@ -175,8 +175,8 @@ public void testGetSuggestions() throws Exception {

assertCorrection(rule, "Hauk", "Haus", "Haut");
assertCorrection(rule, "Hauk", "Haus", "Haut");
assertCorrection(rule, "Eisnbahn", "Einbahn", "Eisbahn", "Eisenbahn");
//assertCorrection(rule, "Rechtschreipreform", "Rechtschreibreform");
assertCorrection(rule, "Eisnbahn", "Einbahn", "Eisbahn", "Eisenbahn");
assertCorrection(rule, "Rechtschreipreform", "Rechtschreibreform");
assertCorrection(rule, "Theatrekasse", "Theaterkasse");
assertCorrection(rule, "Traprennen", "Trabrennen");
assertCorrection(rule, "Autuverkehr", "Autoverkehr");
Expand All @@ -186,8 +186,7 @@ public void testGetSuggestions() throws Exception {
//TODO: requires morfologik-speller change (suggestions for known words):
//assertCorrection(rule, "Arbeitamt", "Arbeitsamt");

// TODO: "Auto, verkehr, r"
//assertEquals("[Autoverkehr]", rule.getMorfologikSuggestions("Autoverkehrr").toString());
assertCorrection(rule, "Autoverkehrr", "Autoverkehr");

assertCorrection(rule, "hasslich", "hässlich", "fasslich");
assertCorrection(rule, "Struße", "Strauße", "Straße", "Sträuße");
Expand Down Expand Up @@ -216,15 +215,20 @@ public void testGetSuggestions() throws Exception {

assertCorrection(rule, "barfuss", "barfuß");
assertCorrection(rule, "Batallion", "Bataillon");
assertCorrection(rule, "Handselvertreter", "Handelsvertreter");

// used to work with jwordsplitter 3.4: too many other suggestions with Levenshtein=2
//assertCorrection(rule, "Handselvertreter", "Handelsvertreter");
//assertCorrection(rule, "Handselvertretertreffen", "Handelsvertretertreffen");

assertCorrection(rule, "aul", "auf");
assertCorrection(rule, "Icj", "Ich"); // only "ich" (lowercase) is in the lexicon
//assertCorrection(rule, "Ihj", "Ich"); // only "ich" (lowercase) is in the lexicon - does not work because of the limit

// three part compounds:
assertCorrection(rule, "Handselvertretertreffen", "Handelsvertretertreffen");
assertCorrection(rule, "Handelsvertretertrffen", "Handelsvertretertreffen");
assertCorrection(rule, "Handelsvartretertreffen", "Handelsvertretertreffen");
assertCorrection(rule, "Handelsvertretertriffen", "Handelsvertretertreffen");

// this won't work as jwordsplitter splits into Handelsvertrter + Treffen but
// the Hunspell dict doesn't contain "Handelsvertreter", thus it's a known limitation
// because jwordsplitter doesn't use the same dictionary as Hunspell:
Expand Down
1 change: 1 addition & 0 deletions languagetool-standalone/CHANGES.md
Expand Up @@ -12,6 +12,7 @@

#### German
* fixed some false alarms
* updated to jwordsplitter 4.0 for better compound splitting

#### Polish
* added a few new rules
Expand Down

0 comments on commit e294ba1

Please sign in to comment.