Skip to content

Commit

Permalink
another evaluation script for ngrams
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed Jul 24, 2015
1 parent e682243 commit 64107be
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 24 deletions.
@@ -0,0 +1,89 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.bigdata;

import org.apache.commons.lang.StringUtils;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.languagemodel.LuceneLanguageModel;
import org.languagetool.rules.ConfusionSet;
import org.languagetool.rules.ConfusionSetLoader;
import org.languagetool.rules.ConfusionString;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;

/**
 * Runs {@link ConfusionRuleEvaluator} on every two-word confusion set found in
 * {@code /en/confusion_sets.txt} and prints one line per pair, consisting of the
 * pair, its factor and the evaluation summary as a trailing {@code #} comment.
 */
final class AllConfusionRulesEvaluator {

  /** Maximum number of sentences passed to the evaluator per run. */
  private static final int MAX_SENTENCES = 1000;
  /** Column at which the trailing "# summary" comment starts in the output. */
  private static final int SUMMARY_COLUMN = 82;
  // NOTE(review): the confusion set file is always the English one, regardless of <langCode>.
  private static final String CONFUSION_SETS_PATH = "/en/confusion_sets.txt";

  private AllConfusionRulesEvaluator() {
    // command-line tool, not meant to be instantiated
  }

  /**
   * Command-line entry point.
   *
   * @param args {@code <langCode> <languageModelTopDir> <input> [<input2>]}
   * @throws IOException if reading the confusion sets or running an evaluation fails
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 3 || args.length > 4) {
      // Print this class's name, not the one of the single-pair evaluator.
      System.err.println("Usage: " + AllConfusionRulesEvaluator.class.getSimpleName()
          + " <langCode> <languageModelTopDir> <wikipediaXml|tatoebaFile|dir>...");
      System.err.println(" <languageModelTopDir> is a directory with sub-directories '1grams', '2grams', and '3grams' with Lucene indexes");
      System.err.println(" <wikipediaXml|tatoebaFile|dir> either a Wikipedia XML dump, or a Tatoeba file or");
      System.err.println(" a directory with example sentences (where <word>.txt contains only the sentences for <word>).");
      System.err.println(" You can specify both a Wikipedia file and a Tatoeba file.");
      System.exit(1);
    }
    Language lang = Languages.getLanguageForShortName(args[0]);
    LanguageModel languageModel = new LuceneLanguageModel(new File(args[1]));
    List<String> inputsFiles = new ArrayList<>();
    inputsFiles.add(args[2]);
    if (args.length >= 4) {
      inputsFiles.add(args[3]);
    }
    ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
    Map<String,List<ConfusionSet>> confusionSetMap;
    // Make sure the resource stream is released once the sets are loaded.
    try (InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(CONFUSION_SETS_PATH)) {
      confusionSetMap = confusionSetLoader.loadConfusionSet(inputStream);
    }
    Set<String> done = new HashSet<>();
    for (List<ConfusionSet> entry : confusionSetMap.values()) {
      for (ConfusionSet confusionSet : entry) {
        Set<ConfusionString> set = confusionSet.getSet();
        if (set.size() != 2) {
          System.out.println("Skipping confusion set with size != 2: " + confusionSet);
          continue;
        }
        Iterator<ConfusionString> iterator = set.iterator();
        ConfusionString set1 = iterator.next();
        ConfusionString set2 = iterator.next();
        String word1 = set1.getString();
        String word2 = set2.getString();
        // add() returns false if the pair has been evaluated already
        if (!done.add(word1 + " " + word2)) {
          continue;
        }
        // NOTE(review): ConfusionRuleEvaluator.run() does not appear to reset its
        // true/false positive counters, so a shared instance would report numbers
        // accumulated over all previous pairs. Use a fresh evaluator per pair.
        ConfusionRuleEvaluator eval = new ConfusionRuleEvaluator(lang, languageModel, 3);
        eval.setVerboseMode(false);
        String summary = eval.run(inputsFiles, word1, word2, confusionSet.getFactor(), MAX_SENTENCES);
        String summary1 = set1.getDescription() != null ? word1 + "|" + set1.getDescription() : word1;
        String summary2 = set2.getDescription() != null ? word2 + "|" + set2.getDescription() : word2;
        String start = summary1 + "; " + summary2 + "; " + confusionSet.getFactor();
        // StringUtils.repeat() yields "" for non-positive counts, so long lines are safe
        String spaces = StringUtils.repeat(" ", SUMMARY_COLUMN - start.length());
        System.out.println(start + spaces + "# " + summary);
      }
    }
  }

}
Expand Up @@ -64,29 +64,34 @@ class ConfusionRuleEvaluator {
private int trueNegatives = 0; private int trueNegatives = 0;
private int falsePositives = 0; private int falsePositives = 0;
private int falseNegatives = 0; private int falseNegatives = 0;
private boolean verbose = true;


private ConfusionRuleEvaluator(Language language, LanguageModel languageModel, int grams) { ConfusionRuleEvaluator(Language language, LanguageModel languageModel, int grams) {
this.language = language; this.language = language;
this.rule = new EnglishConfusionProbabilityRule(JLanguageTool.getMessageBundle(), languageModel, language, grams); this.rule = new EnglishConfusionProbabilityRule(JLanguageTool.getMessageBundle(), languageModel, language, grams);
rule.setConfusionSet(new ConfusionSet(FACTOR, TOKEN_HOMOPHONE, TOKEN));
this.grams = grams; this.grams = grams;
} }

/**
 * Turns verbose output on or off.
 *
 * @param verbose value to store in the {@code verbose} field
 */
void setVerboseMode(boolean verbose) {
  this.verbose = verbose;
}


private void run(List<String> inputsOrDir, String token, String homophoneToken, int maxSentences) throws IOException { String run(List<String> inputsOrDir, String token, String homophoneToken, long factor, int maxSentences) throws IOException {
rule.setConfusionSet(new ConfusionSet(factor*10, homophoneToken, token));
List<Sentence> allTokenSentences = getRelevantSentences(inputsOrDir, token, maxSentences); List<Sentence> allTokenSentences = getRelevantSentences(inputsOrDir, token, maxSentences);
// Load the sentences with a homophone and later replace it so we get error sentences: // Load the sentences with a homophone and later replace it so we get error sentences:
List<Sentence> allHomophoneSentences = getRelevantSentences(inputsOrDir, homophoneToken, maxSentences); List<Sentence> allHomophoneSentences = getRelevantSentences(inputsOrDir, homophoneToken, maxSentences);
evaluate(allTokenSentences, true, token, homophoneToken); evaluate(allTokenSentences, true, token, homophoneToken);
evaluate(allTokenSentences, false, homophoneToken, token); evaluate(allTokenSentences, false, homophoneToken, token);
evaluate(allHomophoneSentences, false, token, homophoneToken); evaluate(allHomophoneSentences, false, token, homophoneToken);
evaluate(allHomophoneSentences, true, homophoneToken, token); evaluate(allHomophoneSentences, true, homophoneToken, token);
printEvalResult(allTokenSentences, allHomophoneSentences, inputsOrDir); return printEvalResult(allTokenSentences, allHomophoneSentences, inputsOrDir);
} }


@SuppressWarnings("ConstantConditions") @SuppressWarnings("ConstantConditions")
private void evaluate(List<Sentence> sentences, boolean isCorrect, String token, String homophoneToken) throws IOException { private void evaluate(List<Sentence> sentences, boolean isCorrect, String token, String homophoneToken) throws IOException {
System.out.println("======================"); println("======================");
System.out.printf("Starting evaluation on " + sentences.size() + " sentences with %s/%s:\n", token, homophoneToken); printf("Starting evaluation on " + sentences.size() + " sentences with %s/%s:\n", token, homophoneToken);
JLanguageTool lt = new JLanguageTool(new English()); JLanguageTool lt = new JLanguageTool(new English());
for (Sentence sentence : sentences) { for (Sentence sentence : sentences) {
String textToken = isCorrect ? token : homophoneToken; String textToken = isCorrect ? token : homophoneToken;
Expand All @@ -101,7 +106,7 @@ private void evaluate(List<Sentence> sentences, boolean isCorrect, String token,
trueNegatives++; trueNegatives++;
} else if (!consideredCorrect && isCorrect) { } else if (!consideredCorrect && isCorrect) {
falsePositives++; falsePositives++;
System.out.println("false positive: " + displayStr); println("false positive: " + displayStr);
} else if (consideredCorrect && !isCorrect) { } else if (consideredCorrect && !isCorrect) {
falseNegatives++; falseNegatives++;
} else { } else {
Expand All @@ -111,21 +116,25 @@ private void evaluate(List<Sentence> sentences, boolean isCorrect, String token,
} }
} }


private void printEvalResult(List<Sentence> allTokenSentences, List<Sentence> allHomophoneSentences, List<String> inputsOrDir) { private String printEvalResult(List<Sentence> allTokenSentences, List<Sentence> allHomophoneSentences, List<String> inputsOrDir) {
int sentences = allTokenSentences.size() + allHomophoneSentences.size();
System.out.println("======================");
System.out.println("Evaluation results for " + TOKEN + "/" + TOKEN_HOMOPHONE
+ " with " + sentences + " sentences as of " + new Date() + ":");
float precision = (float) truePositives / (truePositives + falsePositives); float precision = (float) truePositives / (truePositives + falsePositives);
float recall = (float) truePositives / (truePositives + falseNegatives); float recall = (float) truePositives / (truePositives + falseNegatives);
double fMeasure = FMeasure.getWeightedFMeasure(precision, recall); String summary = String.format(ENGLISH, "precision=%.3f, recall=%.3f (%s) using %dgrams",
System.out.printf(ENGLISH, " Precision: %.3f (%d false positives)\n", precision, falsePositives);
System.out.printf(ENGLISH, " Recall: %.3f (%d false negatives)\n", recall, falseNegatives);
System.out.printf(ENGLISH, " F-measure: %.3f (beta=0.5)\n", fMeasure);
System.out.printf(ENGLISH, " Matches: %d (true positives)\n", truePositives);
System.out.printf(ENGLISH, " Inputs: %s\n", inputsOrDir);
System.out.printf(ENGLISH, " Summary: precision=%.3f, recall=%.3f (%s) using %dgrams\n",
precision, recall, new SimpleDateFormat("yyyy-MM-dd").format(new Date()), grams); precision, recall, new SimpleDateFormat("yyyy-MM-dd").format(new Date()), grams);
if (verbose) {
int sentences = allTokenSentences.size() + allHomophoneSentences.size();
System.out.println("======================");
System.out.println("Evaluation results for " + TOKEN + "/" + TOKEN_HOMOPHONE
+ " with " + sentences + " sentences as of " + new Date() + ":");
System.out.printf(ENGLISH, " Precision: %.3f (%d false positives)\n", precision, falsePositives);
System.out.printf(ENGLISH, " Recall: %.3f (%d false negatives)\n", recall, falseNegatives);
double fMeasure = FMeasure.getWeightedFMeasure(precision, recall);
System.out.printf(ENGLISH, " F-measure: %.3f (beta=0.5)\n", fMeasure);
System.out.printf(ENGLISH, " Matches: %d (true positives)\n", truePositives);
System.out.printf(ENGLISH, " Inputs: %s\n", inputsOrDir);
System.out.printf(" Summary: " + summary + "\n");
}
return summary;
} }


private List<Sentence> getRelevantSentences(List<String> inputs, String token, int maxSentences) throws IOException { private List<Sentence> getRelevantSentences(List<String> inputs, String token, int maxSentences) throws IOException {
Expand Down Expand Up @@ -154,24 +163,36 @@ private List<Sentence> getSentencesFromSource(List<String> inputs, String token,
Sentence sentence = sentenceSource.next(); Sentence sentence = sentenceSource.next();
if (sentence.getText().toLowerCase().matches(".*\\b" + token + "\\b.*")) { if (sentence.getText().toLowerCase().matches(".*\\b" + token + "\\b.*")) {
sentences.add(sentence); sentences.add(sentence);
if (sentences.size() % 100 == 0) { if (sentences.size() % 250 == 0) {
System.out.println("Loaded sentence " + sentences.size() + " with '" + token + "' from " + inputs); println("Loaded sentence " + sentences.size() + " with '" + token + "' from " + inputs);
} }
if (sentences.size() >= maxSentences) { if (sentences.size() >= maxSentences) {
break; break;
} }
} }
} }
System.out.println("Loaded " + sentences.size() + " sentences with '" + token + "' from " + inputs); println("Loaded " + sentences.size() + " sentences with '" + token + "' from " + inputs);
return sentences; return sentences;
} }

/**
 * Prints {@code msg} as a line on standard output, but only in verbose mode.
 */
private void println(String msg) {
  if (!verbose) {
    return;
  }
  System.out.println(msg);
}

/**
 * Formats {@code msg} with {@code args} and prints the result to standard
 * output, but only in verbose mode.
 *
 * @param msg  a {@link java.util.Formatter} format string
 * @param args the values referenced by the format string
 */
private void printf(String msg, String... args) {
  if (verbose) {
    // Explicit cast: hand the String[] over as the varargs array itself, which
    // avoids javac's "inexact argument type for last parameter" warning.
    System.out.printf(msg, (Object[]) args);
  }
}


public static void main(String[] args) throws IOException { public static void main(String[] args) throws IOException {
if (args.length < 3 || args.length > 4) { if (args.length < 3 || args.length > 4) {
System.err.println("Usage: " + ConfusionRuleEvaluator.class.getSimpleName() System.err.println("Usage: " + ConfusionRuleEvaluator.class.getSimpleName()
+ " <langCode> <languageModelTopDir> <wikipediaXml|tatoebaFile|dir>..."); + " <langCode> <languageModelTopDir> <wikipediaXml|tatoebaFile|dir>...");
System.err.println(" <languageModelTopDir> is a directory with sub-directories '1grams', '2grams' and '3grams' with Lucene indexes"); System.err.println(" <languageModelTopDir> is a directory with sub-directories '1grams', '2grams' and '3grams' with Lucene indexes");
System.err.println(" <wikipediaXml|tatoebaFile| dir> either a Wikipedia XML dump, or a Tatoeba file or"); System.err.println(" <wikipediaXml|tatoebaFile|dir> either a Wikipedia XML dump, or a Tatoeba file or");
System.err.println(" a directory with example sentences (where <word>.txt contains only the sentences for <word>)."); System.err.println(" a directory with example sentences (where <word>.txt contains only the sentences for <word>).");
System.err.println(" You can specify both a Wikipedia file and a Tatoeba file."); System.err.println(" You can specify both a Wikipedia file and a Tatoeba file.");
System.exit(1); System.exit(1);
Expand All @@ -184,7 +205,7 @@ public static void main(String[] args) throws IOException {
inputsFiles.add(args[3]); inputsFiles.add(args[3]);
} }
ConfusionRuleEvaluator generator = new ConfusionRuleEvaluator(lang, languageModel, 3); ConfusionRuleEvaluator generator = new ConfusionRuleEvaluator(lang, languageModel, 3);
generator.run(inputsFiles, TOKEN, TOKEN_HOMOPHONE, MAX_SENTENCES); generator.run(inputsFiles, TOKEN, TOKEN_HOMOPHONE, FACTOR, MAX_SENTENCES);
//ConfusionRuleEvaluator generator2 = new ConfusionRuleEvaluator(lang, languageModel, 4); //ConfusionRuleEvaluator generator2 = new ConfusionRuleEvaluator(lang, languageModel, 4);
//generator2.run(inputsFiles, TOKEN, TOKEN_HOMOPHONE, MAX_SENTENCES); //generator2.run(inputsFiles, TOKEN, TOKEN_HOMOPHONE, MAX_SENTENCES);
} }
Expand Down

0 comments on commit 64107be

Please sign in to comment.