[de] update to latest compound splitter library

languagetool-org · Apr 20, 2015 · e294ba1 · e294ba1
1 parent dcd6754
commit e294ba1
Show file tree

Hide file tree

Showing 10 changed files with 82 additions and 47 deletions.
diff --git a/...ore/src/main/java/org/languagetool/rules/spelling/hunspell/CompoundAwareHunspellRule.java b/...ore/src/main/java/org/languagetool/rules/spelling/hunspell/CompoundAwareHunspellRule.java
@@ -57,29 +57,7 @@ public List<String> getSuggestions(String word) throws IOException {
     if (needsInit) {
       init();
     }
-    final List<String> candidates = new ArrayList<>();
-    final List<String> parts = wordSplitter.tokenize(word);
-    int partCount = 0;
-    for (String part : parts) {
-      if (hunspellDict.misspelled(part)) {
-        List<String> suggestions = morfoSpeller.getSuggestions(part);
-        if (suggestions.size() == 0) {
-          suggestions = morfoSpeller.getSuggestions(StringTools.uppercaseFirstChar(part));
-        }
-        for (String suggestion : suggestions) {
-          final List<String> partsCopy = new ArrayList<>(parts);
-          if (partCount > 0 && !parts.get(partCount-1).endsWith("-")) {
-            partsCopy.set(partCount, suggestion.toLowerCase());
-          } else {
-            partsCopy.set(partCount, suggestion);
-          }
-          candidates.add(StringTools.listToString(partsCopy, ""));
-        }
-      }
-      // TODO: what if there's no misspelled parts like for Arbeitamt = Arbeit+Amt ??
-      // -> morfologik must be extended to return similar words even for known words
-      partCount++;
-    }
+    final List<String> candidates = getCandidates(word);
     final List<String> suggestions = getCorrectWords(candidates);
 
     final List<String> noSplitSuggestions = morfoSpeller.getSuggestions(word);  // after getCorrectWords() so spelling.txt is considered
@@ -102,6 +80,43 @@ public List<String> getSuggestions(String word) throws IOException {
     return sortedSuggestions.subList(0, Math.min(MAX_SUGGESTIONS, sortedSuggestions.size()));
   }
 
+  /**
+   * Builds correction candidates for a (possibly compound) word: the word is split
+   * with {@code wordSplitter} and the resulting parts are handed to
+   * {@link #getCandidates(List)}. The method is {@code protected}, so subclasses can
+   * override it to split the word differently.
+   *
+   * @param word the word to build candidates for
+   * @return the candidate words; the caller still filters them with {@code getCorrectWords()}
+   */
+  protected List<String> getCandidates(String word) {
+    // Returning the split parts themselves would offer fragments of the word as candidates;
+    // the candidates have to be built from the parts.
+    return getCandidates(wordSplitter.tokenize(word));
+  }
+
+  /**
+   * Builds correction candidates from the parts of a split compound. For every part
+   * that {@code hunspellDict} reports as misspelled, each suggestion from
+   * {@code morfoSpeller} is substituted for that part, the parts are joined again,
+   * and the joined word is kept only if {@code isMisspelled()} does not reject it.
+   *
+   * @param parts the parts of a split word, in order
+   * @return the re-joined candidate words; empty if no part is misspelled or no
+   *         substitution yields an accepted word
+   */
+  protected List<String> getCandidates(List<String> parts) {
+    final List<String> result = new ArrayList<>();
+    for (int index = 0; index < parts.size(); index++) {
+      final String part = parts.get(index);
+      // TODO: what if there's no misspelled parts like for Arbeitamt = Arbeit+Amt ??
+      // -> morfologik must be extended to return similar words even for known words
+      if (!hunspellDict.misspelled(part)) {
+        continue;
+      }
+      final boolean notFirstPart = index > 0;
+      // assume noun, so query the uppercase form first:
+      final boolean tryUpperCase = notFirstPart && !StringTools.startsWithUppercase(part);
+      final String firstQuery = tryUpperCase ? StringTools.uppercaseFirstChar(part) : part;
+      List<String> replacements = morfoSpeller.getSuggestions(firstQuery);
+      if (replacements.isEmpty()) {
+        final String fallbackQuery = tryUpperCase ? StringTools.lowercaseFirstChar(part) : part;
+        replacements = morfoSpeller.getSuggestions(fallbackQuery);
+      }
+      for (String replacement : replacements) {
+        final List<String> modifiedParts = new ArrayList<>(parts);
+        final String substitute;
+        if (notFirstPart && part.startsWith("-") && part.length() > 1) {
+          // the part starts with a hyphen: re-add it and uppercase what follows
+          substitute = "-" + StringTools.uppercaseFirstChar(replacement.substring(1));
+        } else if (notFirstPart && !parts.get(index - 1).endsWith("-")) {
+          // the previous part does not end with a hyphen: use the lowercase form
+          substitute = replacement.toLowerCase();
+        } else {
+          substitute = replacement;
+        }
+        modifiedParts.set(index, substitute);
+        final String joined = StringTools.listToString(modifiedParts, "");
+        if (!isMisspelled(joined)) {
+          result.add(joined);
+        }
+      }
+    }
+    return result;
+  }
+
   /**
    * Returns the given suggestions unchanged. The method is {@code protected} and
    * not final, so a subclass can override it to reorder the list.
    *
    * @param misspelling the misspelled word (not used by this implementation)
    * @param suggestions the suggestions to order
    * @return {@code suggestions}, exactly as passed in
    */
   protected List<String> sortSuggestionByQuality(String misspelling, List<String> suggestions) {
     return suggestions;
   }

diff --git a/languagetool-core/src/main/java/org/languagetool/rules/spelling/hunspell/HunspellRule.java b/languagetool-core/src/main/java/org/languagetool/rules/spelling/hunspell/HunspellRule.java
@@ -116,7 +116,7 @@ public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
     return toRuleMatchArray(ruleMatches);
   }
 
-  private boolean isMisspelled(String word) {
+  boolean isMisspelled(String word) {
     boolean isAlphabetic = true;
     if (word.length() == 1) { // hunspell dictionaries usually do not contain punctuation
       isAlphabetic = Character.isAlphabetic(word.charAt(0));

diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/blogs/BlogChecker.java b/languagetool-dev/src/main/java/org/languagetool/dev/blogs/BlogChecker.java
@@ -18,15 +18,14 @@
  */
 package org.languagetool.dev.blogs;
 
-import de.abelssoft.tools.FileTools;
+import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang.StringEscapeUtils;
 import org.languagetool.JLanguageTool;
 import org.languagetool.Language;
 import org.languagetool.Languages;
 import org.languagetool.commandline.CommandLineTools;
 
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
 
 /**
@@ -41,11 +40,9 @@ private void check(File dir, Language lang) throws IOException {
     lt.disableRule("UNPAIRED_BRACKETS");
     File[] files = dir.listFiles();
     for (File file : files) {
-      try (FileInputStream is = new FileInputStream(file)) {
-        System.out.println("\n=== " + file.getName() +  " ================================");
-        String content = cleanup(FileTools.loadFile(is, "utf-8"));
-        CommandLineTools.checkText(content, lt);
-      }
+      System.out.println("\n=== " + file.getName() +  " ================================");
+      String content = cleanup(FileUtils.readFileToString(file, "utf-8"));
+      CommandLineTools.checkText(content, lt);
     }
   }
 

diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/blogs/BlogFetcher.java b/languagetool-dev/src/main/java/org/languagetool/dev/blogs/BlogFetcher.java
@@ -23,7 +23,7 @@
 import com.sun.syndication.feed.synd.SyndFeed;
 import com.sun.syndication.io.SyndFeedInput;
 import com.sun.syndication.io.XmlReader;
-import de.abelssoft.tools.FileTools;
+import org.apache.commons.io.FileUtils;
 import org.languagetool.tools.StringTools;
 
 import java.io.*;
@@ -113,7 +113,7 @@ public static void main(String[] args) throws IOException {
       System.err.println("Usage: " + BlogFetcher.class.getSimpleName() + " <urlListFile> <outputDir>");
       System.exit(1);
     }
-    String secret = FileTools.loadFile(new FileInputStream(READABILITY_API_KEY_FILE), "utf-8").trim();
+    String secret = FileUtils.readFileToString(new File(READABILITY_API_KEY_FILE), "utf-8").trim();
     BlogFetcher fetcher = new BlogFetcher(secret);
     File outputDir = new File(args[1]);
     if (!outputDir.exists() || !outputDir.isDirectory()) {

diff --git a/languagetool-language-modules/de/pom.xml b/languagetool-language-modules/de/pom.xml
@@ -88,9 +88,9 @@
         </dependency>
 
         <dependency>
-            <groupId>de.abelssoft</groupId>
+            <groupId>de.danielnaber</groupId>
             <artifactId>jwordsplitter</artifactId>
-            <version>3.4</version>
+            <version>4.0</version>
         </dependency>
 
         <dependency>

diff --git a/languagetool-language-modules/de/src/main/java/org/languagetool/language/German.java b/languagetool-language-modules/de/src/main/java/org/languagetool/language/German.java
@@ -24,8 +24,8 @@
 import java.util.List;
 import java.util.ResourceBundle;
 
-import de.abelssoft.wordtools.jwordsplitter.AbstractWordSplitter;
-import de.abelssoft.wordtools.jwordsplitter.impl.GermanWordSplitter;
+import de.danielnaber.jwordsplitter.AbstractWordSplitter;
+import de.danielnaber.jwordsplitter.GermanWordSplitter;
 import org.jetbrains.annotations.NotNull;
 import org.languagetool.Language;
 import org.languagetool.chunking.Chunker;
@@ -172,7 +172,7 @@ public CompoundWordTokenizer getNonStrictCompoundSplitter() {
       try {
         final AbstractWordSplitter wordSplitter = new GermanWordSplitter(false);
         wordSplitter.setStrictMode(false); // there's a spelling mistake in (at least) one part, so strict mode wouldn't split the word
-        ((GermanWordSplitter)wordSplitter).setMinimumWordLength(3);
+        wordSplitter.setMinimumWordLength(3);
         compoundTokenizer = new CompoundWordTokenizer() {
           @Override
           public List<String> tokenize(String word) {

diff --git a/...getool-language-modules/de/src/main/java/org/languagetool/rules/de/GermanSpellerRule.java b/...getool-language-modules/de/src/main/java/org/languagetool/rules/de/GermanSpellerRule.java
@@ -18,6 +18,7 @@
  */
 package org.languagetool.rules.de;
 
+import de.danielnaber.jwordsplitter.GermanWordSplitter;
 import org.jetbrains.annotations.Nullable;
 import org.languagetool.JLanguageTool;
 import org.languagetool.Language;
@@ -73,19 +74,37 @@ public class GermanSpellerRule extends CompoundAwareHunspellRule {
   );
 
   private final GermanCompoundTokenizer compoundTokenizer;
+  private final GermanWordSplitter splitter;
 
   /**
    * Creates the German speller rule. The parent class receives the language's
    * non-strict compound splitter and the speller from {@code getSpeller(language)};
    * this class additionally keeps the language's strict compound tokenizer and its
    * own {@code GermanWordSplitter}.
    *
    * @param messages resource bundle, passed on to the parent constructor
    * @param language the German language object that provides the splitters and the speller
    * @throws RuntimeException if creating the {@code GermanWordSplitter} fails with an IOException
    */
   public GermanSpellerRule(ResourceBundle messages, German language) {
     super(messages, language, language.getNonStrictCompoundSplitter(), getSpeller(language));
     addExamplePair(Example.wrong("LanguageTool kann mehr als eine <marker>nromale</marker> Rechtschreibprüfung."),
                    Example.fixed("LanguageTool kann mehr als eine <marker>normale</marker> Rechtschreibprüfung."));
     compoundTokenizer = language.getStrictCompoundTokenizer();
+    // separate splitter instance: getCandidates() calls getAllSplits() on it
+    try {
+      splitter = new GermanWordSplitter(false);
+    } catch (IOException e) {
+      // the constructor declares no checked exception, so rethrow unchecked and keep the cause
+      throw new RuntimeException(e);
+    }
   }
 
   /**
    * @return the ID of this rule, {@code RULE_ID}
    */
   @Override
   public String getId() {
     return RULE_ID;
   }
 
+  /**
+   * Collects correction candidates over all splits of the word: each split returned by
+   * {@code splitter.getAllSplits(word)} is passed to the parent's
+   * {@code getCandidates(List)} and the results are concatenated in split order.
+   *
+   * @param word the word to build candidates for
+   * @return the candidates of all splits in one new list; duplicates are not removed
+   */
+  @Override
+  public List<String> getCandidates(String word) {
+    final List<String> candidates = new ArrayList<>();
+    for (List<String> parts : splitter.getAllSplits(word)) {
+      candidates.addAll(super.getCandidates(parts));
+    }
+    return candidates;
+  }
+
   @Override
   protected void addIgnoreWords(String origLine, Set<String> wordsToBeIgnored) {
     String line;

diff --git a/...uage-modules/de/src/main/java/org/languagetool/tokenizers/de/GermanCompoundTokenizer.java b/...uage-modules/de/src/main/java/org/languagetool/tokenizers/de/GermanCompoundTokenizer.java
@@ -21,10 +21,9 @@
 import java.io.IOException;
 import java.util.List;
 
+import de.danielnaber.jwordsplitter.GermanWordSplitter;
 import org.languagetool.tokenizers.Tokenizer;
 
-import de.abelssoft.wordtools.jwordsplitter.impl.GermanWordSplitter;
-
 /**
  * Split German nouns using the jWordSplitter library.
  * 
@@ -42,7 +41,7 @@ public GermanCompoundTokenizer() throws IOException {
 
   /**
    * Splits the given word by delegating to {@code wordSplitter.splitWord()}.
    *
    * @param word the word to split
    * @return the list returned by the word splitter
    */
   @Override
   public List<String> tokenize(String word) {
-    return (List<String>) wordSplitter.splitWord(word);
+    return wordSplitter.splitWord(word);
   }
 
   public static void main(String[] args) throws IOException {

diff --git a/...ol-language-modules/de/src/test/java/org/languagetool/rules/de/GermanSpellerRuleTest.java b/...ol-language-modules/de/src/test/java/org/languagetool/rules/de/GermanSpellerRuleTest.java
@@ -175,8 +175,8 @@ public void testGetSuggestions() throws Exception {
 
     assertCorrection(rule, "Hauk", "Haus", "Haut");
     assertCorrection(rule, "Hauk", "Haus", "Haut");
-    assertCorrection(rule, "Eisnbahn", "Einbahn", "Eisbahn", "Eisenbahn");
-    //assertCorrection(rule, "Rechtschreipreform", "Rechtschreibreform");
+    assertCorrection(rule, "Eisnbahn", "Einbahn", "Eisbahn", "Eisenbahn"); 
+    assertCorrection(rule, "Rechtschreipreform", "Rechtschreibreform");
     assertCorrection(rule, "Theatrekasse", "Theaterkasse");
     assertCorrection(rule, "Traprennen", "Trabrennen");
     assertCorrection(rule, "Autuverkehr", "Autoverkehr");
@@ -186,8 +186,7 @@ public void testGetSuggestions() throws Exception {
     //TODO: requires morfologik-speller change (suggestions for known words):
     //assertCorrection(rule, "Arbeitamt", "Arbeitsamt");
 
-    // TODO: "Auto, verkehr, r"
-    //assertEquals("[Autoverkehr]", rule.getMorfologikSuggestions("Autoverkehrr").toString());
+    assertCorrection(rule, "Autoverkehrr", "Autoverkehr");
 
     assertCorrection(rule, "hasslich", "hässlich", "fasslich");
     assertCorrection(rule, "Struße", "Strauße", "Straße", "Sträuße");
@@ -216,15 +215,20 @@ public void testGetSuggestions() throws Exception {
 
     assertCorrection(rule, "barfuss", "barfuß");
     assertCorrection(rule, "Batallion", "Bataillon");
-    assertCorrection(rule, "Handselvertreter", "Handelsvertreter");
+
+    // used to work with jwordsplitter 3.4; now there are too many other suggestions with Levenshtein=2
+    //assertCorrection(rule, "Handselvertreter", "Handelsvertreter");
+    //assertCorrection(rule, "Handselvertretertreffen", "Handelsvertretertreffen");
 
     assertCorrection(rule, "aul", "auf");
     assertCorrection(rule, "Icj", "Ich");   // only "ich" (lowercase) is in the lexicon
     //assertCorrection(rule, "Ihj", "Ich");   // only "ich" (lowercase) is in the lexicon - does not work because of the limit
 
     // three part compounds:
-    assertCorrection(rule, "Handselvertretertreffen", "Handelsvertretertreffen");
     assertCorrection(rule, "Handelsvertretertrffen", "Handelsvertretertreffen");
+    assertCorrection(rule, "Handelsvartretertreffen", "Handelsvertretertreffen");
+    assertCorrection(rule, "Handelsvertretertriffen", "Handelsvertretertreffen");
+
     // this won't work as jwordsplitter splits into Handelsvertrter + Treffen but
     // the Hunspell dict doesn't contain "Handelsvertreter", thus it's a known limitation
     // because jwordsplitter doesn't use the same dictionary as Hunspell:

diff --git a/languagetool-standalone/CHANGES.md b/languagetool-standalone/CHANGES.md
@@ -12,6 +12,7 @@
 
 #### German
   * fixed some false alarms
+  * updated to jwordsplitter 4.0 for better compound splitting
 
 #### Polish
   * added a few new rules