Skip to content

Commit

Permalink
Rewrite of the confusion probability rule for more precision. Also, only a few confusion pairs are now enabled by default, but precision and recall have been checked for these.

Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed May 26, 2015
1 parent 44400c4 commit b1b9c52
Show file tree
Hide file tree
Showing 30 changed files with 1,951 additions and 3,857 deletions.
Expand Up @@ -18,28 +18,42 @@
*/
package org.languagetool.languagemodel;

import java.util.List;

/**
 * A very simple language model that contains information about ngram occurrences.
 * @since 2.7
 */
public interface LanguageModel extends AutoCloseable {

  /** ngram sentence start marker - note: this is not in the v1 data from Google */
  String GOOGLE_SENTENCE_START = "_START_";
  /** ngram sentence end marker */
  String GOOGLE_SENTENCE_END = ".";

  /**
   * Get the occurrence count for {@code token1}.
   */
  long getCount(String token1);

  /**
   * Get the occurrence count for the given token sequence.
   */
  long getCount(List<String> tokens);

  /**
   * Get the occurrence count for the phrase {@code token1 token2}.
   */
  long getCount(String token1, String token2);

  /**
   * Get the occurrence count for the phrase {@code token1 token2 token3}.
   */
  long getCount(String token1, String token2, String token3);

  /**
   * Get the total token count of the underlying ngram data.
   */
  long getTotalTokenCount();

  @Override
  void close();

}
Expand Up @@ -18,6 +18,7 @@
*/
package org.languagetool.languagemodel;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.FSDirectory;
Expand Down Expand Up @@ -46,10 +47,11 @@ public LuceneLanguageModel(File topIndexDir) throws IOException {
throw new RuntimeException("Not found or is not a directory: " + topIndexDir);
}
this.topIndexDir = topIndexDir;
addIndex(topIndexDir, 1);
addIndex(topIndexDir, 2);
addIndex(topIndexDir, 3);
if (luceneSearcherMap.size() == 0) {
throw new RuntimeException("No directories '2grams' and/or '3grams' found in " + topIndexDir);
throw new RuntimeException("No directories '1grams', '2grams', and/or '3grams' found in " + topIndexDir);
}
}

Expand All @@ -61,25 +63,54 @@ private void addIndex(File topIndexDir, int ngramSize) throws IOException {
}
}

@Override
public long getCount(List<String> tokens) {
  Objects.requireNonNull(tokens);
  // The index key is the space-separated token sequence; pick the searcher
  // for the matching ngram size (presumably the 1grams/2grams/3grams
  // directories added in the constructor).
  String ngram = StringUtils.join(tokens, " ");
  LuceneSearcher searcher = getLuceneSearcher(tokens.size());
  return getCount(new Term("ngram", ngram), searcher);
}

@Override
public long getCount(String token1) {
  // Unigram lookup: wrap the single token and delegate to the
  // list-based implementation.
  Objects.requireNonNull(token1);
  List<String> unigram = Arrays.asList(token1);
  return getCount(unigram);
}

@Override
public long getCount(String token1, String token2) {
  // Bigram lookup: delegate to the list-based implementation, which
  // builds the index term and selects the 2grams searcher.
  Objects.requireNonNull(token1);
  Objects.requireNonNull(token2);
  return getCount(Arrays.asList(token1, token2));
}

@Override
public long getCount(String token1, String token2, String token3) {
  // Trigram lookup: delegate to the list-based implementation, which
  // builds the index term and selects the 3grams searcher.
  Objects.requireNonNull(token1);
  Objects.requireNonNull(token2);
  Objects.requireNonNull(token3);
  return getCount(Arrays.asList(token1, token2, token3));
}

@Override
public long getTotalTokenCount() {
  // Each 1grams index stores its token total in 'totalTokenCount' meta
  // documents; the regexp matches any stored value so all of them are found.
  LuceneSearcher searcher = getLuceneSearcher(1);
  try {
    RegexpQuery anyValueQuery = new RegexpQuery(new Term("totalTokenCount", ".*"));
    // Result window capped at 1000: Integer.MAX_VALUE might cause an
    // out-of-memory error on a broken index.
    TopDocs topDocs = searcher.searcher.search(anyValueQuery, 1000);
    if (topDocs.totalHits == 0) {
      throw new RuntimeException("Expected 'totalTokenCount' meta documents not found in 1grams index");
    }
    if (topDocs.totalHits > 1000) {
      throw new RuntimeException("Did not expect more than 1000 'totalTokenCount' meta documents");
    }
    // Sum the totals of every meta document found.
    long sum = 0;
    for (ScoreDoc match : topDocs.scoreDocs) {
      sum += Long.parseLong(searcher.reader.document(match.doc).get("totalTokenCount"));
    }
    return sum;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}

protected LuceneSearcher getLuceneSearcher(int ngramSize) {
Expand Down

0 comments on commit b1b9c52

Please sign in to comment.