Skip to content

Commit

Permalink
[de] use expanded terms from spelling.txt also for suggestions
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed Mar 30, 2015
1 parent a2c80bc commit 73af096
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 26 deletions.
Expand Up @@ -46,11 +46,26 @@ public class MorfologikMultiSpeller {
* @param maxEditDistance maximum edit distance for accepting suggestions
*/
public MorfologikMultiSpeller(String binaryDictPath, String plainTextPath, int maxEditDistance) throws IOException {
  // A constructor delegation has to be the first statement, so the suffix check lives in the
  // helper that produces the reader argument: an unsupported path is rejected before the
  // resource is opened and before any dictionary is built.
  this(binaryDictPath, openPlainTextReader(plainTextPath), maxEditDistance);
}

/**
 * Checks that the plain text dictionary path has the expected suffix and opens it as UTF-8 text.
 *
 * @param plainTextPath path to a plain text {@code .txt} file, looked up via the data broker's resource directory
 * @return a buffered reader over the resource
 * @throws RuntimeException if {@code plainTextPath} does not end with {@code .txt}
 */
private static BufferedReader openPlainTextReader(String plainTextPath) throws IOException {
  if (!plainTextPath.endsWith(".txt")) {
    throw new RuntimeException("Unsupported dictionary, plain text file needs to have suffix .txt: " + plainTextPath);
  }
  return new BufferedReader(new InputStreamReader(JLanguageTool.getDataBroker().getFromResourceDirAsStream(plainTextPath), "utf-8"));
}

/**
* @param binaryDictPath path in classpath to a {@code .dict} binary Morfologik file
* @param plainTextReader reader for a plain text {@code .txt} file (like from spelling.txt)
* @param maxEditDistance maximum edit distance for accepting suggestions
* @since 3.0
*/
public MorfologikMultiSpeller(String binaryDictPath, BufferedReader plainTextReader, int maxEditDistance) throws IOException {
MorfologikSpeller speller = getBinaryDict(binaryDictPath, maxEditDistance);
spellers.add(speller);
convertsCase = speller.convertsCase();
String infoFile = binaryDictPath.replace(".dict", ".info");
MorfologikSpeller plainTextDict = getPlainTextDictOrNull(plainTextPath, infoFile, maxEditDistance);
MorfologikSpeller plainTextDict = getPlainTextDictOrNull(plainTextReader, infoFile, maxEditDistance);
if (plainTextDict != null) {
spellers.add(plainTextDict);
}
Expand All @@ -65,20 +80,13 @@ private MorfologikSpeller getBinaryDict(String binaryDictPath, int maxEditDistan
}

@Nullable
private MorfologikSpeller getPlainTextDictOrNull(String plainTextPath, String infoFile, int maxEditDistance) throws IOException {
if (plainTextPath.endsWith(".txt")) {
InputStream stream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(plainTextPath);
try (BufferedReader br = new BufferedReader(new InputStreamReader(stream, "utf-8"))) {
List<byte[]> lines = getLines(br);
if (lines.size() == 0) {
return null;
}
Dictionary dictionary = getDictionary(lines, infoFile);
return new MorfologikSpeller(dictionary, maxEditDistance);
}
} else {
throw new RuntimeException("Unsupported dictionary, plain text file needs to have suffix .txt: " + plainTextPath);
private MorfologikSpeller getPlainTextDictOrNull(BufferedReader plainTextReader, String infoFile, int maxEditDistance) throws IOException {
List<byte[]> lines = getLines(plainTextReader);
if (lines.size() == 0) {
return null;
}
Dictionary dictionary = getDictionary(lines, infoFile);
return new MorfologikSpeller(dictionary, maxEditDistance);
}

private List<byte[]> getLines(BufferedReader br) throws IOException {
Expand Down
Expand Up @@ -28,7 +28,7 @@
import org.languagetool.tokenizers.de.GermanCompoundTokenizer;
import org.languagetool.tools.StringTools;

import java.io.IOException;
import java.io.*;
import java.util.*;

public class GermanSpellerRule extends CompoundAwareHunspellRule {
Expand Down Expand Up @@ -96,30 +96,36 @@ protected void addIgnoreWords(String origLine, Set<String> wordsToBeIgnored) {
} else {
line = origLine;
}
if (line.contains("/")) {
wordsToBeIgnored.addAll(expandLine(line));
}

private static List<String> expandLine(String line) {
List<String> result = new ArrayList<>();
if (!line.startsWith("#") && line.contains("/")) {
String[] parts = line.split("/");
if (parts.length != 2) {
throw new RuntimeException("Unexpected line format, expected at most one slash: " + line);
}
String word = parts[0];
String suffix = parts[1];
wordsToBeIgnored.add(word);
result.add(word);
if (suffix.equals("S")) {
wordsToBeIgnored.add(word + "s");
result.add(word + "s");
} else if (suffix.equals("N")) {
wordsToBeIgnored.add(word + "n");
result.add(word + "n");
} else if (suffix.equals("A")) { // Adjektiv
wordsToBeIgnored.add(word + "e");
wordsToBeIgnored.add(word + "er");
wordsToBeIgnored.add(word + "es");
wordsToBeIgnored.add(word + "en");
wordsToBeIgnored.add(word + "em");
result.add(word + "e");
result.add(word + "er");
result.add(word + "es");
result.add(word + "en");
result.add(word + "em");
} else {
throw new RuntimeException("Unknown suffix: " + suffix + " in line: " + line);
}
} else {
wordsToBeIgnored.add(line);
result.add(line);
}
return result;
}

@Nullable
Expand All @@ -131,7 +137,10 @@ private static MorfologikMultiSpeller getSpeller(Language language) {
final String morfoFile = "/de/hunspell/de_" + language.getCountries()[0] + ".dict";
if (JLanguageTool.getDataBroker().resourceExists(morfoFile)) {
// spell data will not exist in LibreOffice/OpenOffice context
return new MorfologikMultiSpeller(morfoFile, "/de/hunspell/spelling.txt", MAX_EDIT_DISTANCE);
final InputStream stream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/de/hunspell/spelling.txt");
try (BufferedReader br = new BufferedReader(new InputStreamReader(stream, "utf-8"))) {
return new MorfologikMultiSpeller(morfoFile, new ExpandingReader(br), MAX_EDIT_DISTANCE);
}
} else {
return null;
}
Expand Down Expand Up @@ -275,4 +284,28 @@ private Replacement(String key, String value) {
this.value = value;
}
}

/**
 * A {@link BufferedReader} whose {@link #readLine()} hands out the expanded forms of each
 * underlying line one at a time, as produced by {@code expandLine}.
 * Only {@code readLine()} is overridden here; the character-level {@code read} methods are
 * inherited unchanged.
 */
static class ExpandingReader extends BufferedReader {

  // Expanded forms of the most recently read line that have not been returned yet (FIFO).
  private final Deque<String> buffer = new ArrayDeque<>();

  ExpandingReader(Reader in) {
    super(in);
  }

  /**
   * @return the next expanded form, or {@code null} once the underlying reader is exhausted
   */
  @Override
  public String readLine() throws IOException {
    // Loop instead of reading exactly once, so that a line whose expansion is empty
    // is skipped rather than causing a failure when taking the first element.
    while (buffer.isEmpty()) {
      String line = super.readLine();
      if (line == null) {
        return null;
      }
      buffer.addAll(expandLine(line));
    }
    return buffer.removeFirst();
  }
}

}
Expand Up @@ -90,6 +90,9 @@ public void testGetSuggestionsFromSpellingTxt() throws Exception {
assertThat(ruleGermany.getSuggestions("Ligafußboll").toString(), is("[Ligafußball, Ligafußballs]")); // from spelling.txt
MyGermanSpellerRule ruleSwiss = new MyGermanSpellerRule(TestTools.getMessages("de"), GERMAN_CH);
assertThat(ruleSwiss.getSuggestions("Ligafußboll").toString(), is("[Ligafussball, Ligafussballs]"));
assertThat(ruleSwiss.getSuggestions("konfliktbereid").toString(), is("[konfliktbereit, konfliktbereite]"));
assertThat(ruleSwiss.getSuggestions("konfliktbereitel").toString(),
is("[konfliktbereite, konfliktbereitem, konfliktbereiten, konfliktbereiter, konfliktbereites, konfliktbereit]"));
}

@Test
Expand Down

0 comments on commit 73af096

Please sign in to comment.