another evaluation script for ngrams

languagetool-org · Jul 24, 2015 · 64107be · 64107be
1 parent e682243
commit 64107be
Show file tree

Hide file tree

Showing 2 changed files with 134 additions and 24 deletions.
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AllConfusionRulesEvaluator.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AllConfusionRulesEvaluator.java
@@ -0,0 +1,89 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package org.languagetool.dev.bigdata;
+
+import org.apache.commons.lang.StringUtils;
+import org.languagetool.JLanguageTool;
+import org.languagetool.Language;
+import org.languagetool.Languages;
+import org.languagetool.languagemodel.LanguageModel;
+import org.languagetool.languagemodel.LuceneLanguageModel;
+import org.languagetool.rules.ConfusionSet;
+import org.languagetool.rules.ConfusionSetLoader;
+import org.languagetool.rules.ConfusionString;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.*;
+
+/**
+ * Command line tool that runs {@link ConfusionRuleEvaluator} on every two-word
+ * confusion set loaded from {@code /en/confusion_sets.txt} and prints one line per
+ * word pair: the pair, its factor and, after a {@code #}, the evaluation summary.
+ */
+final class AllConfusionRulesEvaluator {
+
+  private static final int MAX_SENTENCES = 1000;
+  // column at which the "# <summary>" part of an output line starts
+  private static final int SUMMARY_COLUMN = 82;
+
+  /**
+   * @param args language code, language model directory, and one or two input
+   *             files/directories with example sentences
+   * @throws IOException if the confusion sets or the inputs cannot be read
+   */
+  public static void main(String[] args) throws IOException {
+    if (args.length < 3 || args.length > 4) {
+      // print this tool's own name (the original printed ConfusionRuleEvaluator's)
+      System.err.println("Usage: " + AllConfusionRulesEvaluator.class.getSimpleName()
+              + " <langCode> <languageModelTopDir> <wikipediaXml|tatoebaFile|dir>...");
+      System.err.println("   <languageModelTopDir> is a directory with sub-directories '1grams', '2grams', and '3grams' with Lucene indexes");
+      System.err.println("   <wikipediaXml|tatoebaFile|dir> either a Wikipedia XML dump, or a Tatoeba file or");
+      System.err.println("                      a directory with example sentences (where <word>.txt contains only the sentences for <word>).");
+      System.err.println("                      You can specify both a Wikipedia file and a Tatoeba file.");
+      System.exit(1);
+    }
+    Language lang = Languages.getLanguageForShortName(args[0]);
+    LanguageModel languageModel = new LuceneLanguageModel(new File(args[1]));
+    List<String> inputsFiles = new ArrayList<>();
+    inputsFiles.add(args[2]);
+    if (args.length >= 4) {
+      inputsFiles.add(args[3]);
+    }
+    // NOTE(review): the confusion sets are always read from the "/en" resource,
+    // whatever <langCode> is given - confirm that this is intended
+    Map<String,List<ConfusionSet>> confusionSetMap;
+    try (InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/confusion_sets.txt")) {
+      confusionSetMap = new ConfusionSetLoader().loadConfusionSet(inputStream);
+    }
+    Set<String> done = new HashSet<>();
+    for (List<ConfusionSet> entry : confusionSetMap.values()) {
+      for (ConfusionSet confusionSet : entry) {
+        Set<ConfusionString> set = confusionSet.getSet();
+        if (set.size() != 2) {
+          System.out.println("Skipping confusion set with size != 2: " + confusionSet);
+          continue;
+        }
+        Iterator<ConfusionString> iterator = set.iterator();
+        ConfusionString set1 = iterator.next();
+        ConfusionString set2 = iterator.next();
+        String word1 = set1.getString();
+        String word2 = set2.getString();
+        String key = word1 + " " + word2;
+        if (!done.add(key)) {
+          // this pair has already been evaluated
+          continue;
+        }
+        // use a fresh evaluator per pair so that counts from earlier pairs
+        // cannot leak into this pair's precision/recall
+        ConfusionRuleEvaluator eval = new ConfusionRuleEvaluator(lang, languageModel, 3);
+        eval.setVerboseMode(false);
+        String summary = eval.run(inputsFiles, word1, word2, confusionSet.getFactor(), MAX_SENTENCES);
+        String summary1 = set1.getDescription() != null ? word1 + "|" + set1.getDescription() : word1;
+        String summary2 = set2.getDescription() != null ? word2 + "|" + set2.getDescription() : word2;
+        String start = summary1 + "; " + summary2 + "; " + confusionSet.getFactor();
+        // always keep at least one space before '#', even for very long entries
+        String spaces = StringUtils.repeat(" ", Math.max(1, SUMMARY_COLUMN - start.length()));
+        System.out.println(start + spaces + "# " + summary);
+      }
+    }
+  }
+
+}
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/ConfusionRuleEvaluator.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/ConfusionRuleEvaluator.java
@@ -64,29 +64,34 @@ class ConfusionRuleEvaluator {
   private int trueNegatives = 0;
   private int falsePositives = 0;
   private int falseNegatives = 0;
+  private boolean verbose = true;
 
-  private ConfusionRuleEvaluator(Language language, LanguageModel languageModel, int grams) {
+  ConfusionRuleEvaluator(Language language, LanguageModel languageModel, int grams) { // package-private (was private); the confusion set is now set per run() call instead of here
     this.language = language;
     this.rule = new EnglishConfusionProbabilityRule(JLanguageTool.getMessageBundle(), languageModel, language, grams);
-    rule.setConfusionSet(new ConfusionSet(FACTOR, TOKEN_HOMOPHONE, TOKEN));
     this.grams = grams;
   }
+
+  /**
+   * Switch the output that is guarded by the {@code verbose} flag on or off.
+   * @param verbose {@code true} to print that output, {@code false} to suppress it
+   */
+  void setVerboseMode(boolean verbose) {
+    this.verbose = verbose;
+  }
 
-  private void run(List<String> inputsOrDir, String token, String homophoneToken, int maxSentences) throws IOException {
+  /**
+   * Evaluate the rule for one word pair: sentences containing each word are loaded
+   * and checked once as they are and once with the word swapped for the other one.
+   * @param inputsOrDir input files or directory to load sentences from
+   * @param factor factor used for the confusion set of the two words
+   * @param maxSentences maximum number of sentences to load per word
+   * @return the one-line summary produced by {@code printEvalResult}
+   */
+  String run(List<String> inputsOrDir, String token, String homophoneToken, long factor, int maxSentences) throws IOException {
+    // start from zero so that repeated calls on the same instance
+    // report numbers for this call only, not cumulative ones
+    truePositives = 0;
+    trueNegatives = 0;
+    falsePositives = 0;
+    falseNegatives = 0;
+    // NOTE(review): the factor is multiplied by 10 here; the reason is not visible in this change
+    rule.setConfusionSet(new ConfusionSet(factor*10, homophoneToken, token));
     List<Sentence> allTokenSentences = getRelevantSentences(inputsOrDir, token, maxSentences);
     // Load the sentences with a homophone and later replace it so we get error sentences:
     List<Sentence> allHomophoneSentences = getRelevantSentences(inputsOrDir, homophoneToken, maxSentences);
     evaluate(allTokenSentences, true, token, homophoneToken);
     evaluate(allTokenSentences, false, homophoneToken, token);
     evaluate(allHomophoneSentences, false, token, homophoneToken);
     evaluate(allHomophoneSentences, true, homophoneToken, token);
-    printEvalResult(allTokenSentences, allHomophoneSentences, inputsOrDir);
+    return printEvalResult(allTokenSentences, allHomophoneSentences, inputsOrDir);
   }
 
   @SuppressWarnings("ConstantConditions")
   private void evaluate(List<Sentence> sentences, boolean isCorrect, String token, String homophoneToken) throws IOException {
-    System.out.println("======================");
+    println("======================");
-    System.out.printf("Starting evaluation on " + sentences.size() + " sentences with %s/%s:\n", token, homophoneToken);
+    printf("Starting evaluation on " + sentences.size() + " sentences with %s/%s:\n", token, homophoneToken);
     JLanguageTool lt = new JLanguageTool(new English());
     for (Sentence sentence : sentences) {
       String textToken = isCorrect ? token : homophoneToken;
@@ -101,7 +106,7 @@ private void evaluate(List<Sentence> sentences, boolean isCorrect, String token,
         trueNegatives++;
       } else if (!consideredCorrect && isCorrect) {
         falsePositives++;
-        System.out.println("false positive: " + displayStr);
+        println("false positive: " + displayStr);
       } else if (consideredCorrect && !isCorrect) {
         falseNegatives++;
       } else {
@@ -111,21 +116,25 @@ private void evaluate(List<Sentence> sentences, boolean isCorrect, String token,
     }
   }
 
-  private void printEvalResult(List<Sentence> allTokenSentences, List<Sentence> allHomophoneSentences, List<String> inputsOrDir) {
+  /**
+   * Build the one-line summary (precision, recall, date, ngram size) from the
+   * counters and, in verbose mode, also print the detailed results to stdout.
+   * @return the summary line
+   */
+  private String printEvalResult(List<Sentence> allTokenSentences, List<Sentence> allHomophoneSentences, List<String> inputsOrDir) {
-    int sentences = allTokenSentences.size() + allHomophoneSentences.size();
-    System.out.println("======================");
-    System.out.println("Evaluation results for " + TOKEN + "/" + TOKEN_HOMOPHONE
-            + " with " + sentences + " sentences as of " + new Date() + ":");
     float precision = (float) truePositives / (truePositives + falsePositives);
     float recall = (float) truePositives / (truePositives + falseNegatives);
-    double fMeasure = FMeasure.getWeightedFMeasure(precision, recall);
+    String summary = String.format(ENGLISH, "precision=%.3f, recall=%.3f (%s) using %dgrams",
-    System.out.printf(ENGLISH, "  Precision: %.3f (%d false positives)\n", precision, falsePositives);
-    System.out.printf(ENGLISH, "  Recall:    %.3f (%d false negatives)\n", recall, falseNegatives);
-    System.out.printf(ENGLISH, "  F-measure: %.3f (beta=0.5)\n", fMeasure);
-    System.out.printf(ENGLISH, "  Matches:   %d (true positives)\n", truePositives);
-    System.out.printf(ENGLISH, "  Inputs:    %s\n", inputsOrDir);
-    System.out.printf(ENGLISH, "  Summary:   precision=%.3f, recall=%.3f (%s) using %dgrams\n",
             precision, recall, new SimpleDateFormat("yyyy-MM-dd").format(new Date()), grams);
+    if (verbose) {
+      int sentences = allTokenSentences.size() + allHomophoneSentences.size();
+      System.out.println("======================");
+      // NOTE(review): this header prints the TOKEN/TOKEN_HOMOPHONE constants,
+      // not the words that were passed to run() - confirm that this is intended
+      System.out.println("Evaluation results for " + TOKEN + "/" + TOKEN_HOMOPHONE
+              + " with " + sentences + " sentences as of " + new Date() + ":");
+      System.out.printf(ENGLISH, "  Precision: %.3f (%d false positives)\n", precision, falsePositives);
+      System.out.printf(ENGLISH, "  Recall:    %.3f (%d false negatives)\n", recall, falseNegatives);
+      double fMeasure = FMeasure.getWeightedFMeasure(precision, recall);
+      System.out.printf(ENGLISH, "  F-measure: %.3f (beta=0.5)\n", fMeasure);
+      System.out.printf(ENGLISH, "  Matches:   %d (true positives)\n", truePositives);
+      System.out.printf(ENGLISH, "  Inputs:    %s\n", inputsOrDir);
+      // pass the summary as an argument: it is already formatted and must not
+      // be interpreted as a format string itself
+      System.out.printf("  Summary:   %s\n", summary);
+    }
+    return summary;
   }
 
   private List<Sentence> getRelevantSentences(List<String> inputs, String token, int maxSentences) throws IOException {
@@ -154,24 +163,36 @@ private List<Sentence> getSentencesFromSource(List<String> inputs, String token,
       Sentence sentence = sentenceSource.next();
       if (sentence.getText().toLowerCase().matches(".*\\b" + token + "\\b.*")) {
         sentences.add(sentence);
-        if (sentences.size() % 100 == 0) {
+        if (sentences.size() % 250 == 0) {
-          System.out.println("Loaded sentence " + sentences.size() + " with '" + token + "' from " + inputs);
+          println("Loaded sentence " + sentences.size() + " with '" + token + "' from " + inputs);
         }
         if (sentences.size() >= maxSentences) {
           break;
         }
       }
     }
-    System.out.println("Loaded " + sentences.size() + " sentences with '" + token + "' from " + inputs);
+    println("Loaded " + sentences.size() + " sentences with '" + token + "' from " + inputs);
     return sentences;
   }
+
+  /**
+   * Print the message as a line to stdout, but only in verbose mode.
+   */
+  private void println(String msg) {
+    if (!verbose) {
+      return;
+    }
+    System.out.println(msg);
+  }
+
+  /**
+   * Print a formatted message to stdout, but only in verbose mode.
+   * @param msg format string as accepted by {@code System.out.printf}
+   * @param args the values for the format specifiers in {@code msg}
+   */
+  private void printf(String msg, String... args) {
+    if (verbose) {
+      // explicit cast: hand the strings over as the format arguments without
+      // javac's "inexact argument type" warning for the Object... parameter
+      System.out.printf(msg, (Object[]) args);
+    }
+  }
 
   public static void main(String[] args) throws IOException {
     if (args.length < 3 || args.length > 4) {
       System.err.println("Usage: " + ConfusionRuleEvaluator.class.getSimpleName()
               + " <langCode> <languageModelTopDir> <wikipediaXml|tatoebaFile|dir>...");
       System.err.println("   <languageModelTopDir> is a directory with sub-directories '1grams', '2grams' and '3grams' with Lucene indexes");
-      System.err.println("   <wikipediaXml|tatoebaFile| dir> either a Wikipedia XML dump, or a Tatoeba file or");
+      System.err.println("   <wikipediaXml|tatoebaFile|dir> either a Wikipedia XML dump, or a Tatoeba file or");
       System.err.println("                      a directory with example sentences (where <word>.txt contains only the sentences for <word>).");
       System.err.println("                      You can specify both a Wikipedia file and a Tatoeba file.");
       System.exit(1);
@@ -184,7 +205,7 @@ public static void main(String[] args) throws IOException {
       inputsFiles.add(args[3]);
     }
     ConfusionRuleEvaluator generator = new ConfusionRuleEvaluator(lang, languageModel, 3);
-    generator.run(inputsFiles, TOKEN, TOKEN_HOMOPHONE, MAX_SENTENCES);
+    generator.run(inputsFiles, TOKEN, TOKEN_HOMOPHONE, FACTOR, MAX_SENTENCES);
     //ConfusionRuleEvaluator generator2 = new ConfusionRuleEvaluator(lang, languageModel, 4);
     //generator2.run(inputsFiles, TOKEN, TOKEN_HOMOPHONE, MAX_SENTENCES);
   }