Commit fdee326: extend ngram indexing - still work in progress

danielnaber committed Nov 24, 2015
1 parent ba0e935 commit fdee326

Showing 3 changed files with 89 additions and 49 deletions.
CommonCrawlToNgram.java

@@ -30,6 +30,7 @@
 import org.jetbrains.annotations.NotNull;
 import org.languagetool.Language;
 import org.languagetool.Languages;
+import org.languagetool.dev.eval.SimpleCorpusEvaluator;
 import org.languagetool.tokenizers.Tokenizer;
 import org.tukaani.xz.XZInputStream;

@@ -48,29 +49,30 @@ class CommonCrawlToNgram implements AutoCloseable {

   private final Language language;
   private final File input;
-  private final Directory directory;
-  private final IndexWriter indexWriter;
+  private final File indexTopDir;
+  private final File evalFile;
+  private final Map<String, Long> unigramToCount = new HashMap<>();
+  private final Map<String, Long> bigramToCount = new HashMap<>();
+  private final Map<String, Long> trigramToCount = new HashMap<>();
+  private final Map<Integer, LuceneLiveIndex> indexes = new HashMap<>();

-  private DirectoryReader reader;
-  private IndexSearcher searcher;
   private int limit = 1000;

-  CommonCrawlToNgram(Language language, File input, File outputDir) throws IOException {
+  CommonCrawlToNgram(Language language, File input, File indexTopDir, File evalFile) throws IOException {
     this.language = language;
     this.input = input;
-    Analyzer analyzer = new StandardAnalyzer();
-    IndexWriterConfig config = new IndexWriterConfig(analyzer);
-    directory = FSDirectory.open(outputDir.toPath());
-    indexWriter = new IndexWriter(directory, config);
-    reader = DirectoryReader.open(indexWriter, true);  // TODO: see if false is faster
-    searcher = new IndexSearcher(reader);
+    this.indexTopDir = indexTopDir;
+    this.evalFile = evalFile;
+    indexes.put(1, new LuceneLiveIndex(new File(indexTopDir, "1grams")));
+    indexes.put(2, new LuceneLiveIndex(new File(indexTopDir, "2grams")));
+    indexes.put(3, new LuceneLiveIndex(new File(indexTopDir, "3grams")));
   }

   @Override
   public void close() throws IOException {
-    indexWriter.close();
-    reader.close();
-    directory.close();
+    for (LuceneLiveIndex index : indexes.values()) {
+      index.close();
+    }
   }

   void setLimit(int limit) {
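After this change the top-level index directory holds one Lucene sub-index per n-gram order, created by the constructor. The resulting on-disk layout (directory names taken from the code above, annotations paraphrased):

    <indexTopDir>/
        1grams/    Lucene index holding unigram counts
        2grams/    bigram counts
        3grams/    trigram counts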
@@ -81,19 +83,18 @@ void indexInputFile() throws IOException {
     FileInputStream fin = new FileInputStream(input);
     BufferedInputStream in = new BufferedInputStream(fin);
     Tokenizer wordTokenizer = language.getWordTokenizer();  // TODO: use a more Google-like tokenizer
-    Map<String, Long> ngramToCount = new HashMap<>();
     try (XZInputStream xzIn = new XZInputStream(in)) {
       final byte[] buffer = new byte[8192];
       int n;
       while ((n = xzIn.read(buffer)) != -1) {
         String buf = new String(buffer, 0, n);  // TODO: not always correct, we need to wait for line end first?
         String[] lines = buf.split("\n");
-        indexLine(wordTokenizer, lines, ngramToCount);
+        indexLine(wordTokenizer, lines);
       }
     }
   }

-  private void indexLine(Tokenizer wordTokenizer, String[] lines, Map<String, Long> ngramToCount) throws IOException {
+  private void indexLine(Tokenizer wordTokenizer, String[] lines) throws IOException {
     for (String line : lines) {
       List<String> tokens = wordTokenizer.tokenize(line);
       //System.out.println("L: " + tokens);
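The TODO about chunk boundaries is a real issue: an 8192-byte read can end in the middle of a line (or even of a multi-byte character), so tokens at buffer edges get split and counted wrongly. A minimal line-safe sketch, assuming the same XZ input and UTF-8 content (needs java.io.BufferedReader, java.io.InputStreamReader, and java.nio.charset.StandardCharsets):

    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new XZInputStream(in), StandardCharsets.UTF_8))) {
      String line;
      while ((line = br.readLine()) != null) {
        indexLine(wordTokenizer, new String[] { line });  // readLine() never splits a line
      }
    }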
@@ -104,63 +105,77 @@ private void indexLine(Tokenizer wordTokenizer, String[] lines, Map<String, Long> ngramToCount)
         if (token.trim().isEmpty()) {
           continue;
         }
+        unigramToCount.compute(token, (k, v) -> v == null ? 1 : v + 1);
+        if (prev != null) {
+          String ngram = prev + " " + token;
+          bigramToCount.compute(ngram, (k, v) -> v == null ? 1 : v + 1);
+        }
         if (prevPrev != null && prev != null) {
           String ngram = prevPrev + " " + prev + " " + token;
-          Long count = ngramToCount.get(ngram);
-          if (count == null) {
-            ngramToCount.put(ngram, 1L);
-          } else {
-            ngramToCount.put(ngram, count + 1);
-          }
-          if (ngramToCount.size() > limit) {
-            writeToLucene(ngramToCount);
-            ngramToCount.clear();
+          trigramToCount.compute(ngram, (k, v) -> v == null ? 1 : v + 1);
+          if (trigramToCount.size() > limit) {
+            writeAndEvaluate();
           }
         }
         prevPrev = prev;
         prev = token;
       }
     }
-    writeToLucene(ngramToCount);
+    writeAndEvaluate();
   }
+
+  private void writeAndEvaluate() throws IOException {
+    writeToLucene(1, unigramToCount);
+    writeToLucene(2, bigramToCount);
+    writeToLucene(3, trigramToCount);
+    if (evalFile != null) {
+      SimpleCorpusEvaluator evaluator = new SimpleCorpusEvaluator(indexTopDir);
+      evaluator.run(evalFile);
+    } else {
+      System.out.println("Skipping evaluation, no evaluation file specified");
+    }
+  }
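For a line tokenized as [the, quick, brown, fox], the sliding window above counts:

    unigrams: the, quick, brown, fox
    bigrams:  "the quick", "quick brown", "brown fox"
    trigrams: "the quick brown", "quick brown fox"

Note that only trigramToCount's size is checked against the limit; the unigram and bigram maps are flushed at the same time via writeAndEvaluate().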


-  private void writeToLucene(Map<String, Long> ngramToCount) throws IOException {
+  private void writeToLucene(int ngramSize, Map<String, Long> ngramToCount) throws IOException {
     //System.out.println("WRITE: ");
+    LuceneLiveIndex index = indexes.get(ngramSize);
     for (Map.Entry<String, Long> entry : ngramToCount.entrySet()) {
       Term ngram = new Term("ngram", entry.getKey());
-      reader = DirectoryReader.open(indexWriter, true);
-      searcher = new IndexSearcher(reader);
+      index.reader = DirectoryReader.open(index.indexWriter, true);
+      index.searcher = new IndexSearcher(index.reader);
       // not sure why this doesn't work, should be faster:
       /*DirectoryReader newReader = DirectoryReader.openIfChanged(reader);
       if (newReader != null) {
         reader = newReader;
       }*/
-      TopDocs topDocs = searcher.search(new TermQuery(ngram), 2);
+      TopDocs topDocs = index.searcher.search(new TermQuery(ngram), 2);
       //System.out.println(ngram + " ==> " + topDocs.totalHits);
       if (topDocs.totalHits == 0) {
         Document doc = getDoc(entry.getKey(), entry.getValue());
-        indexWriter.addDocument(doc);
+        index.indexWriter.addDocument(doc);
       } else if (topDocs.totalHits == 1) {
         int docNumber = topDocs.scoreDocs[0].doc;
-        Document document = reader.document(docNumber);
+        Document document = index.reader.document(docNumber);
         long oldCount = Long.parseLong(document.getField("count").stringValue());
         //System.out.println(ngram + " -> " + oldCount + "+" + entry.getValue());
-        indexWriter.deleteDocuments(ngram);
-        indexWriter.addDocument(getDoc(entry.getKey(), oldCount + entry.getValue()));
+        index.indexWriter.deleteDocuments(ngram);
+        index.indexWriter.addDocument(getDoc(entry.getKey(), oldCount + entry.getValue()));
         // would probably be faster, but we currently rely on the count being a common field:
         //indexWriter.updateNumericDocValue(ngram, "count", oldCount + entry.getValue());
       } else if (topDocs.totalHits > 1) {
         throw new RuntimeException("Got more than one hit for: " + ngram);
       }
       //System.out.println("  " + entry.getKey() + " -> " + entry.getValue());
     }
-    indexWriter.commit();
+    // TODO: add/update 'totalTokenCount'
+    index.indexWriter.commit();
+    ngramToCount.clear();
   }
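On the "not sure why this doesn't work" comment: one possible reason is that the commented-out block replaces the reader but never rebuilds the searcher, so searches keep hitting the stale reader. A hedged sketch of the usual near-real-time reopen pattern, using the openIfChanged overload that takes the writer (present in Lucene 4.x/5.x; not verified against this code):

    DirectoryReader newReader = DirectoryReader.openIfChanged(index.reader, index.indexWriter, true);
    if (newReader != null) {
      index.reader.close();  // release the stale reader
      index.reader = newReader;
      index.searcher = new IndexSearcher(index.reader);  // rebuild, or searches use the old reader
    }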


   @NotNull
   private Document getDoc(String ngram, long count) {
     Document doc = new Document();
-    doc.add(new Field("ngram", ngram, StringField.TYPE_STORED));
+    doc.add(new Field("ngram", ngram, StringField.TYPE_STORED));  // TODO: store only for debugging
     doc.add(getCountField(count));
     return doc;
   }

@@ -175,15 +190,41 @@ private LongField getCountField(long count) {
   }


   public static void main(String[] args) throws IOException {
-    if (args.length != 2) {
-      System.out.println("Usage: " + CommonCrawlToNgram.class + " <langCode> <input.xz>");
+    if (args.length != 4) {
+      System.out.println("Usage: " + CommonCrawlToNgram.class + " <langCode> <input.xz> <ngramIndexDir> <simpleEvalFile>");
+      System.out.println("   <simpleEvalFile> a plain text file with simple error markup");
       System.exit(1);
     }
     Language language = Languages.getLanguageForShortName(args[0]);
     File input = new File(args[1]);
     File outputDir = new File(args[2]);
-    try (CommonCrawlToNgram prg = new CommonCrawlToNgram(language, input, outputDir)) {
+    File evalFile = new File(args[3]);
+    try (CommonCrawlToNgram prg = new CommonCrawlToNgram(language, input, outputDir, evalFile)) {
       prg.indexInputFile();
     }
   }
+
+  class LuceneLiveIndex {
+
+    private final Directory directory;
+    private final IndexWriter indexWriter;
+
+    private DirectoryReader reader;
+    private IndexSearcher searcher;
+
+    LuceneLiveIndex(File dir) throws IOException {
+      Analyzer analyzer = new StandardAnalyzer();
+      IndexWriterConfig config = new IndexWriterConfig(analyzer);
+      directory = FSDirectory.open(dir.toPath());
+      indexWriter = new IndexWriter(directory, config);
+      reader = DirectoryReader.open(indexWriter, true);  // TODO: see if false is faster
+      searcher = new IndexSearcher(reader);
+    }
+
+    void close() throws IOException {
+      indexWriter.close();
+      directory.close();
+    }
+
+  }
 }
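With the new four-argument signature, an invocation could look roughly like this; the jar name and file paths are made up for illustration, and the package org.languagetool.dev.bigdata is inferred from the test's resource path:

    java -cp languagetool-dev.jar org.languagetool.dev.bigdata.CommonCrawlToNgram \
        en commoncrawl-en.xz /data/ngram-index simple-eval.txt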
SimpleCorpusEvaluator.java

@@ -42,7 +42,7 @@
  * Evaluates the ngram rule with a simple corpus, see {@link SimpleCorpus}.
  * @since 3.2
  */
-class SimpleCorpusEvaluator {
+public class SimpleCorpusEvaluator {

   // without bigrams:
   private static final double START_THRESHOLD = 0.000001;
@@ -64,8 +64,8 @@ class SimpleCorpusEvaluator {
   private int goodConfusionMatches;
   private int badConfusionMatches;

-  SimpleCorpusEvaluator(File indexDir) throws IOException {
-    evaluator = getEvaluator(indexDir);
+  public SimpleCorpusEvaluator(File indexTopDir) throws IOException {
+    evaluator = getEvaluator(indexTopDir);
   }

   @NotNull
@@ -74,23 +74,22 @@ private Evaluator getEvaluator(File indexTopDir) throws IOException {
   }

   @NotNull
-  private ErrorCorpus getCorpus(File dir) throws IOException {
-    return new SimpleCorpus(dir);
+  private ErrorCorpus getCorpus(File file) throws IOException {
+    return new SimpleCorpus(file);
   }

   void close() {
     evaluator.close();
   }

-  PrecisionRecall run(File dir) throws IOException {
+  public PrecisionRecall run(File file) throws IOException {
     System.out.println("Output explanation:");
     System.out.println("  [  ] = this is not an expected error");
     System.out.println("  [+ ] = this is an expected error");
     System.out.println("  [++] = this is an expected error and the first suggestion is correct");
     System.out.println("  [//] = not counted because already matches by a different rule");
     System.out.println("");
-    ErrorCorpus corpus = getCorpus(dir);
-    checkLines(corpus);
+    checkLines(getCorpus(file));
     return printAndResetResults();
   }
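Making the class, its constructor, and run() public is what lets CommonCrawlToNgram invoke the evaluator after each flush. Programmatic use would look roughly like this (paths are placeholders):

    SimpleCorpusEvaluator evaluator = new SimpleCorpusEvaluator(new File("/data/ngram-index"));
    PrecisionRecall result = evaluator.run(new File("simple-eval.txt"));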
CommonCrawlToNgramTest.java

@@ -35,7 +35,7 @@ public void testIndexing() throws IOException {
     try {
       tempDir.mkdir();
       String filename = CommonCrawlToNgramTest.class.getResource("/org/languagetool/dev/bigdata/ngram-input.txt.xz").getFile();
-      try (CommonCrawlToNgram prg = new CommonCrawlToNgram(new German(), new File(filename), tempDir)) {
+      try (CommonCrawlToNgram prg = new CommonCrawlToNgram(new German(), new File(filename), tempDir, null)) {
         prg.setLimit(1);
         prg.indexInputFile();
       }
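The test passes null as the new evalFile argument; writeAndEvaluate() checks for null and prints "Skipping evaluation, no evaluation file specified" in that case, so indexing is exercised without an evaluation corpus.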
