more aggressive skipping of rare words
danielnaber committed Dec 20, 2017
1 parent 106d5b7 commit f58204c
Showing 1 changed file with 48 additions and 18 deletions.
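For orientation, here is a minimal, self-contained sketch of the control flow this commit introduces: the example-sentence lookup now reports how many sentences it found, and any word with MIN_EXAMPLES or fewer hits aborts its confusion pair via the new TooFewExamples exception. The pair words and the fake counts below are illustrative only; in the actual class the count comes from a Lucene index search, as shown in the diff that follows.

import java.util.Arrays;
import java.util.List;

class RareWordSkipSketch {

  private static final int MIN_EXAMPLES = 50;  // same threshold the commit adds

  // Stand-in for findExampleSentences(): the real code queries a Lucene index and
  // returns how many example sentences it wrote; the counts here are made up.
  private static int findExampleSentences(String word) {
    return word.length() > 3 ? 120 : 7;
  }

  // Mirrors the new TooFewExamples exception from the diff.
  static class TooFewExamples extends RuntimeException {
    private final String word;
    private final int exampleCount;
    TooFewExamples(String word, int exampleCount) {
      this.word = word;
      this.exampleCount = exampleCount;
    }
    @Override
    public String getMessage() {
      return exampleCount + " matches for " + word;
    }
  }

  public static void main(String[] args) {
    List<String[]> pairs = Arrays.asList(
        new String[]{"their", "there"},   // hypothetical confusion pairs
        new String[]{"oar", "ore"});
    for (String[] pair : pairs) {
      try {
        for (String word : pair) {
          int count = findExampleSentences(word);
          if (count <= MIN_EXAMPLES) {
            throw new TooFewExamples(word, count);  // rare word: give up on the whole pair
          }
        }
        System.out.println("Evaluating " + pair[0] + "/" + pair[1]);
      } catch (TooFewExamples e) {
        System.out.println("Skipping " + pair[0] + "/" + pair[1]
            + ", too few examples: " + e.getMessage());
      }
    }
  }
}

The design matches the diff: throwing from inside writeExampleSentencesToTempFile() lets a single rare word short-circuit both the temp-file writing and the much more expensive evaluator.run() call.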
@@ -43,11 +43,13 @@
class AutomaticConfusionRuleEvaluator {

private static final String LANGUAGE = "en";
private static final boolean CASE_SENSITIVE = false;
private static final boolean CASE_SENSITIVE = true;
private static final int MAX_EXAMPLES = 1000;
private static final int MIN_EXAMPLES = 50;
private static final List<Long> EVAL_FACTORS = Arrays.asList(10L, 100L, 1_000L, 10_000L, 100_000L, 1_000_000L, 10_000_000L);
private static final float MIN_PRECISION = 0.99f;
private static final float MIN_RECALL = 0.1f;
private static final String LUCENE_CONTENT_FIELD = "field";

private final IndexSearcher searcher;
private final Map<String, List<ConfusionSet>> knownSets;
@@ -114,18 +116,22 @@ private void runOnPair(ConfusionRuleEvaluator evaluator, String line, String par
}
}
System.out.println("Working on: " + line);
File sentencesFile = writeExampleSentencesToTempFile(new String[]{part1, part2});
List<String> input = Arrays.asList(sentencesFile.getAbsolutePath());
Map<Long, ConfusionRuleEvaluator.EvalResult> results = evaluator.run(input, part1, part2, MAX_EXAMPLES, EVAL_FACTORS);
Map<Long, ConfusionRuleEvaluator.EvalResult> bestResults = findBestFactor(results);
if (bestResults.size() > 0) {
for (Map.Entry<Long, ConfusionRuleEvaluator.EvalResult> entry : bestResults.entrySet()) {
System.out.println("=> " + entry.getValue().getSummary());
try {
File sentencesFile = writeExampleSentencesToTempFile(new String[]{part1, part2});
List<String> input = Arrays.asList(sentencesFile.getAbsolutePath());
Map<Long, ConfusionRuleEvaluator.EvalResult> results = evaluator.run(input, part1, part2, MAX_EXAMPLES, EVAL_FACTORS);
Map<Long, ConfusionRuleEvaluator.EvalResult> bestResults = findBestFactor(results);
if (bestResults.size() > 0) {
for (Map.Entry<Long, ConfusionRuleEvaluator.EvalResult> entry : bestResults.entrySet()) {
System.out.println("=> " + entry.getValue().getSummary());
}
} else {
System.out.println("No good result found for " + part1 + "/" + part2);
}
} else {
System.out.println("No good result found for " + part1 + "/" + part2);
finishedPairs.add(part1 + "/" + part2);
} catch (TooFewExamples e) {
System.out.println("Skipping " + part1 + "/" + part2 + ", too few examples: " + e.getMessage());
}
finishedPairs.add(part1 + "/" + part2);
}

private Map<Long, ConfusionRuleEvaluator.EvalResult> findBestFactor(Map<Long, ConfusionRuleEvaluator.EvalResult> results) {
@@ -142,21 +148,29 @@ private Map<Long, ConfusionRuleEvaluator.EvalResult> findBestFactor(Map<Long, Co

private File writeExampleSentencesToTempFile(String[] words) throws IOException {
File tempFile = new File("/tmp/example-sentences.txt");
int count = 0;
try (FileWriter fw = new FileWriter(tempFile)) {
for (String word : words) {
findExampleSentences(word, fw);
int tmpCount = findExampleSentences(word, fw);
if (tmpCount <= MIN_EXAMPLES) {
throw new TooFewExamples(word, tmpCount);
}
count += tmpCount;
}
System.out.println("Example sentences written to " + tempFile);
System.out.println(count + " example sentences written to " + tempFile);
}
return tempFile;
}

private void findExampleSentences(String word, FileWriter fw) throws IOException {
Term term = new Term(TextIndexCreator.FIELD, CASE_SENSITIVE ? word.toLowerCase() : word);
TopDocs topDocs = searcher.search(new TermQuery(term), CASE_SENSITIVE ? Integer.MAX_VALUE : MAX_EXAMPLES);
private int findExampleSentences(String word, FileWriter fw) throws IOException {
Term term = new Term(LUCENE_CONTENT_FIELD, CASE_SENSITIVE ? word.toLowerCase() : word);
long t1 = System.currentTimeMillis();
//TopDocs topDocs = searcher.search(new TermQuery(term), CASE_SENSITIVE ? Integer.MAX_VALUE : MAX_EXAMPLES);
TopDocs topDocs = searcher.search(new TermQuery(term), MAX_EXAMPLES);
long t2 = System.currentTimeMillis();
int count = 0;
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
String sentence = searcher.doc(scoreDoc.doc).get(TextIndexCreator.FIELD);
String sentence = searcher.doc(scoreDoc.doc).get(LUCENE_CONTENT_FIELD);
if (CASE_SENSITIVE) {
if (sentence.contains(word)) {
fw.write(sentence + "\n");
@@ -170,7 +184,11 @@ private void findExampleSentences(String word, FileWriter fw) throws IOException
break;
}
}
System.out.println("Found " + count + " examples for " + word);
long t3 = System.currentTimeMillis();
long searchTime = t2 - t1;
long iterateTime = t3 - t2;
System.out.println("Found " + count + " examples for " + word + " (" + searchTime + "ms, " + iterateTime + "ms)");
return count;
}

public static void main(String[] args) throws IOException {
@@ -185,4 +203,16 @@ public static void main(String[] args) throws IOException {
eval.run(lines, new File(args[2]));
}

class TooFewExamples extends RuntimeException {
private String word;
private int exampleCount;
TooFewExamples(String word, int exampleCount) {
this.word = word;
this.exampleCount = exampleCount;
}
@Override
public String getMessage() {
return exampleCount + " matches for " + word;
}
}
}
