embedded server: speed up for testing short sentences for de-DE, de-AT, and de-CH by avoiding init overhead (only affects German because it's the only one with a large spelling.txt file)
languagetool-org · Nov 16, 2015 · fa00d66 · fa00d66
1 parent 92a3190
commit fa00d66
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 15 deletions.
diff --git a/...core/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikMultiSpeller.java b/...core/src/main/java/org/languagetool/rules/spelling/morfologik/MorfologikMultiSpeller.java
@@ -26,9 +26,7 @@
 import org.languagetool.JLanguageTool;
 
 import java.io.*;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;
 
 /**
  * Morfologik speller that merges results from binary (.dict) and plain text (.txt) dictionaries.
@@ -37,6 +35,8 @@
  */
 public class MorfologikMultiSpeller {
 
+  private static final Map<String,Dictionary> dicPathToDict = new HashMap<>();
+
   private final List<MorfologikSpeller> spellers;
   private final boolean convertsCase;
 
@@ -65,10 +65,9 @@ public MorfologikMultiSpeller(String binaryDictPath, BufferedReader plainTextRea
     List<MorfologikSpeller> spellers = new ArrayList<>();
     spellers.add(speller);
     convertsCase = speller.convertsCase();
-    String infoFile = binaryDictPath.replace(".dict", ".info");
-    MorfologikSpeller plainTextDict = getPlainTextDictOrNull(plainTextReader, infoFile, maxEditDistance);
-    if (plainTextDict != null) {
-      spellers.add(plainTextDict);
+    MorfologikSpeller plainTextSpeller = getPlainTextDictSpellerOrNull(plainTextReader, binaryDictPath, maxEditDistance);
+    if (plainTextSpeller != null) {
+      spellers.add(plainTextSpeller);
     }
     this.spellers = Collections.unmodifiableList(spellers);
   }
@@ -82,12 +81,12 @@ private MorfologikSpeller getBinaryDict(String binaryDictPath, int maxEditDistan
   }
 
   @Nullable
-  private MorfologikSpeller getPlainTextDictOrNull(BufferedReader plainTextReader, String infoFile, int maxEditDistance) throws IOException {
+  private MorfologikSpeller getPlainTextDictSpellerOrNull(BufferedReader plainTextReader, String dictPath, int maxEditDistance) throws IOException {
     List<byte[]> lines = getLines(plainTextReader);
     if (lines.size() == 0) {
       return null;
     }
-    Dictionary dictionary = getDictionary(lines, infoFile);
+    Dictionary dictionary = getDictionary(lines, dictPath);
     return new MorfologikSpeller(dictionary, maxEditDistance);
   }
 
@@ -102,12 +101,23 @@ private List<byte[]> getLines(BufferedReader br) throws IOException {
     return lines;
   }
 
-  private Dictionary getDictionary(List<byte[]> lines, String infoFile) throws IOException {
-    Collections.sort(lines, FSABuilder.LEXICAL_ORDERING);
-    FSA fsa = FSABuilder.build(lines);
-    ByteArrayOutputStream fsaOutStream = new CFSA2Serializer().serialize(fsa, new ByteArrayOutputStream());
-    ByteArrayInputStream fsaInStream = new ByteArrayInputStream(fsaOutStream.toByteArray());
-    return Dictionary.readAndClose(fsaInStream, JLanguageTool.getDataBroker().getFromResourceDirAsStream(infoFile));
+  private Dictionary getDictionary(List<byte[]> lines, String dictPath) throws IOException {
+    Dictionary dictFromCache = dicPathToDict.get(dictPath);
+    if (dictFromCache != null) {
+      return dictFromCache;
+    } else {
+      // Creating the dictionary at runtime can easily take 50ms for spelling.txt files
+      // that are ~50KB. We don't want that overhead for every check of a short sentence,
+      // so we cache the result:
+      Collections.sort(lines, FSABuilder.LEXICAL_ORDERING);
+      FSA fsa = FSABuilder.build(lines);
+      ByteArrayOutputStream fsaOutStream = new CFSA2Serializer().serialize(fsa, new ByteArrayOutputStream());
+      ByteArrayInputStream fsaInStream = new ByteArrayInputStream(fsaOutStream.toByteArray());
+      String infoFile = dictPath.replace(".dict", ".info");
+      Dictionary dict = Dictionary.readAndClose(fsaInStream, JLanguageTool.getDataBroker().getFromResourceDirAsStream(infoFile));
+      dicPathToDict.put(dictPath, dict);
+      return dict;
+    }
   }
 
   /**

diff --git a/languagetool-standalone/CHANGES.md b/languagetool-standalone/CHANGES.md
@@ -40,6 +40,9 @@
     using a non-breaking space could cause a rule not to match.
   * `<filter>` can now also be used in `disambiguation.xml`
 
+#### Embedded server
+  * Speed up for testing short sentences for de-DE, de-AT, and de-CH
+
 #### API
   * `ConfusionProbabilityRule` has been moved to package `org.languagetool.rules.ngrams`
   * `ConfusionProbabilityRule.getWordTokenizer()` is now called