Skip to content

Commit

Permalink
embedded server: speed up for testing short sentences for de-DE, de-AT, and de-CH by avoiding init overhead (only affects German because it's the only one with a large spelling.txt file)
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed Nov 16, 2015
1 parent 92a3190 commit fa00d66
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 15 deletions.
Expand Up @@ -26,9 +26,7 @@
import org.languagetool.JLanguageTool; import org.languagetool.JLanguageTool;


import java.io.*; import java.io.*;
import java.util.ArrayList; import java.util.*;
import java.util.Collections;
import java.util.List;


/** /**
* Morfologik speller that merges results from binary (.dict) and plain text (.txt) dictionaries. * Morfologik speller that merges results from binary (.dict) and plain text (.txt) dictionaries.
Expand All @@ -37,6 +35,8 @@
*/ */
public class MorfologikMultiSpeller { public class MorfologikMultiSpeller {


private static final Map<String,Dictionary> dicPathToDict = new HashMap<>();

private final List<MorfologikSpeller> spellers; private final List<MorfologikSpeller> spellers;
private final boolean convertsCase; private final boolean convertsCase;


Expand Down Expand Up @@ -65,10 +65,9 @@ public MorfologikMultiSpeller(String binaryDictPath, BufferedReader plainTextRea
List<MorfologikSpeller> spellers = new ArrayList<>(); List<MorfologikSpeller> spellers = new ArrayList<>();
spellers.add(speller); spellers.add(speller);
convertsCase = speller.convertsCase(); convertsCase = speller.convertsCase();
String infoFile = binaryDictPath.replace(".dict", ".info"); MorfologikSpeller plainTextSpeller = getPlainTextDictSpellerOrNull(plainTextReader, binaryDictPath, maxEditDistance);
MorfologikSpeller plainTextDict = getPlainTextDictOrNull(plainTextReader, infoFile, maxEditDistance); if (plainTextSpeller != null) {
if (plainTextDict != null) { spellers.add(plainTextSpeller);
spellers.add(plainTextDict);
} }
this.spellers = Collections.unmodifiableList(spellers); this.spellers = Collections.unmodifiableList(spellers);
} }
Expand All @@ -82,12 +81,12 @@ private MorfologikSpeller getBinaryDict(String binaryDictPath, int maxEditDistan
} }


@Nullable @Nullable
private MorfologikSpeller getPlainTextDictOrNull(BufferedReader plainTextReader, String infoFile, int maxEditDistance) throws IOException { private MorfologikSpeller getPlainTextDictSpellerOrNull(BufferedReader plainTextReader, String dictPath, int maxEditDistance) throws IOException {
List<byte[]> lines = getLines(plainTextReader); List<byte[]> lines = getLines(plainTextReader);
if (lines.size() == 0) { if (lines.size() == 0) {
return null; return null;
} }
Dictionary dictionary = getDictionary(lines, infoFile); Dictionary dictionary = getDictionary(lines, dictPath);
return new MorfologikSpeller(dictionary, maxEditDistance); return new MorfologikSpeller(dictionary, maxEditDistance);
} }


Expand All @@ -102,12 +101,23 @@ private List<byte[]> getLines(BufferedReader br) throws IOException {
return lines; return lines;
} }


private Dictionary getDictionary(List<byte[]> lines, String infoFile) throws IOException { private Dictionary getDictionary(List<byte[]> lines, String dictPath) throws IOException {
Collections.sort(lines, FSABuilder.LEXICAL_ORDERING); Dictionary dictFromCache = dicPathToDict.get(dictPath);
FSA fsa = FSABuilder.build(lines); if (dictFromCache != null) {
ByteArrayOutputStream fsaOutStream = new CFSA2Serializer().serialize(fsa, new ByteArrayOutputStream()); return dictFromCache;
ByteArrayInputStream fsaInStream = new ByteArrayInputStream(fsaOutStream.toByteArray()); } else {
return Dictionary.readAndClose(fsaInStream, JLanguageTool.getDataBroker().getFromResourceDirAsStream(infoFile)); // Creating the dictionary at runtime can easily take 50ms for spelling.txt files
// that are ~50KB. We don't want that overhead for every check of a short sentence,
// so we cache the result:
Collections.sort(lines, FSABuilder.LEXICAL_ORDERING);
FSA fsa = FSABuilder.build(lines);
ByteArrayOutputStream fsaOutStream = new CFSA2Serializer().serialize(fsa, new ByteArrayOutputStream());
ByteArrayInputStream fsaInStream = new ByteArrayInputStream(fsaOutStream.toByteArray());
String infoFile = dictPath.replace(".dict", ".info");
Dictionary dict = Dictionary.readAndClose(fsaInStream, JLanguageTool.getDataBroker().getFromResourceDirAsStream(infoFile));
dicPathToDict.put(dictPath, dict);
return dict;
}
} }


/** /**
Expand Down
3 changes: 3 additions & 0 deletions languagetool-standalone/CHANGES.md
Expand Up @@ -40,6 +40,9 @@
using a non-breaking space could cause a rule not to match. using a non-breaking space could cause a rule not to match.
* `<filter>` can now also be used in `disambiguation.xml` * `<filter>` can now also be used in `disambiguation.xml`


#### Embedded server
* Speed up for testing short sentences for de-DE, de-AT, and de-CH

#### API #### API
* `ConfusionProbabilityRule` has been moved to package `org.languagetool.rules.ngrams` * `ConfusionProbabilityRule` has been moved to package `org.languagetool.rules.ngrams`
* `ConfusionProbabilityRule.getWordTokenizer()` is now called * `ConfusionProbabilityRule.getWordTokenizer()` is now called
Expand Down

0 comments on commit fa00d66

Please sign in to comment.