Skip to content

Commit

Permalink
embedded server: speed up for testing short sentences for de-DE, de-A…
Browse files Browse the repository at this point in the history
…T, and de-CH by avoiding init overhead (only affects German because it's the only one with a large spelling.txt file)
  • Loading branch information
danielnaber committed Nov 16, 2015
1 parent 92a3190 commit fa00d66
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 15 deletions.
Expand Up @@ -26,9 +26,7 @@
import org.languagetool.JLanguageTool;

import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.*;

/**
* Morfologik speller that merges results from binary (.dict) and plain text (.txt) dictionaries.
Expand All @@ -37,6 +35,8 @@
*/
public class MorfologikMultiSpeller {

private static final Map<String,Dictionary> dicPathToDict = new HashMap<>();

private final List<MorfologikSpeller> spellers;
private final boolean convertsCase;

Expand Down Expand Up @@ -65,10 +65,9 @@ public MorfologikMultiSpeller(String binaryDictPath, BufferedReader plainTextRea
List<MorfologikSpeller> spellers = new ArrayList<>();
spellers.add(speller);
convertsCase = speller.convertsCase();
String infoFile = binaryDictPath.replace(".dict", ".info");
MorfologikSpeller plainTextDict = getPlainTextDictOrNull(plainTextReader, infoFile, maxEditDistance);
if (plainTextDict != null) {
spellers.add(plainTextDict);
MorfologikSpeller plainTextSpeller = getPlainTextDictSpellerOrNull(plainTextReader, binaryDictPath, maxEditDistance);
if (plainTextSpeller != null) {
spellers.add(plainTextSpeller);
}
this.spellers = Collections.unmodifiableList(spellers);
}
Expand All @@ -82,12 +81,12 @@ private MorfologikSpeller getBinaryDict(String binaryDictPath, int maxEditDistan
}

@Nullable
private MorfologikSpeller getPlainTextDictOrNull(BufferedReader plainTextReader, String infoFile, int maxEditDistance) throws IOException {
private MorfologikSpeller getPlainTextDictSpellerOrNull(BufferedReader plainTextReader, String dictPath, int maxEditDistance) throws IOException {
List<byte[]> lines = getLines(plainTextReader);
if (lines.size() == 0) {
return null;
}
Dictionary dictionary = getDictionary(lines, infoFile);
Dictionary dictionary = getDictionary(lines, dictPath);
return new MorfologikSpeller(dictionary, maxEditDistance);
}

Expand All @@ -102,12 +101,23 @@ private List<byte[]> getLines(BufferedReader br) throws IOException {
return lines;
}

private Dictionary getDictionary(List<byte[]> lines, String infoFile) throws IOException {
Collections.sort(lines, FSABuilder.LEXICAL_ORDERING);
FSA fsa = FSABuilder.build(lines);
ByteArrayOutputStream fsaOutStream = new CFSA2Serializer().serialize(fsa, new ByteArrayOutputStream());
ByteArrayInputStream fsaInStream = new ByteArrayInputStream(fsaOutStream.toByteArray());
return Dictionary.readAndClose(fsaInStream, JLanguageTool.getDataBroker().getFromResourceDirAsStream(infoFile));
/**
 * Returns the dictionary built from the given plain text entries, using a
 * static cache keyed by {@code dictPath}.
 *
 * <p>On a cache hit the {@code lines} argument is ignored and the previously
 * built dictionary for {@code dictPath} is returned. On a cache miss,
 * {@code lines} is sorted in place, compiled into an FSA, and combined with
 * the {@code .info} resource whose path is derived from {@code dictPath} by
 * replacing {@code ".dict"} with {@code ".info"}.
 *
 * <p>Lookup and population of the cache happen under a lock on the cache map:
 * the map is a plain {@code HashMap} shared by all instances, so unsynchronized
 * access from several threads would be unsafe and could also cause the same
 * dictionary to be built more than once.
 *
 * @param lines the dictionary entries as byte arrays; sorted in place on a cache miss
 * @param dictPath path of the binary dictionary, used as cache key and to locate the info file
 * @return the cached or newly built dictionary
 * @throws IOException if serializing the FSA or reading the dictionary fails
 */
private Dictionary getDictionary(List<byte[]> lines, String dictPath) throws IOException {
  synchronized (dicPathToDict) {
    Dictionary dictFromCache = dicPathToDict.get(dictPath);
    if (dictFromCache != null) {
      return dictFromCache;
    }
    // Creating the dictionary at runtime can easily take 50ms for spelling.txt files
    // that are ~50KB. We don't want that overhead for every check of a short sentence,
    // so we cache the result:
    Collections.sort(lines, FSABuilder.LEXICAL_ORDERING);
    FSA fsa = FSABuilder.build(lines);
    ByteArrayOutputStream fsaOutStream = new CFSA2Serializer().serialize(fsa, new ByteArrayOutputStream());
    ByteArrayInputStream fsaInStream = new ByteArrayInputStream(fsaOutStream.toByteArray());
    String infoFile = dictPath.replace(".dict", ".info");
    Dictionary dict = Dictionary.readAndClose(fsaInStream, JLanguageTool.getDataBroker().getFromResourceDirAsStream(infoFile));
    dicPathToDict.put(dictPath, dict);
    return dict;
  }
}

/**
Expand Down
3 changes: 3 additions & 0 deletions languagetool-standalone/CHANGES.md
Expand Up @@ -40,6 +40,9 @@
using a non-breaking space could cause a rule not to match.
* `<filter>` can now also be used in `disambiguation.xml`

#### Embedded server
* Speed up for testing short sentences for de-DE, de-AT, and de-CH

#### API
* `ConfusionProbabilityRule` has been moved to package `org.languagetool.rules.ngrams`
* `ConfusionProbabilityRule.getWordTokenizer()` is now called
Expand Down

0 comments on commit fa00d66

Please sign in to comment.