From 8dea6763f45c7f50590df40edd559c057eccca41 Mon Sep 17 00:00:00 2001 From: Daniel Naber Date: Sat, 29 Dec 2018 14:26:46 +0100 Subject: [PATCH] turn hard-coded fields into command-line parameters --- .../AutomaticConfusionRuleEvaluator.java | 48 +++++++++++-------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AutomaticConfusionRuleEvaluator.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AutomaticConfusionRuleEvaluator.java index 4df52032968d..c50466fa8afc 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AutomaticConfusionRuleEvaluator.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AutomaticConfusionRuleEvaluator.java @@ -42,32 +42,32 @@ @SuppressWarnings({"resource", "CallToPrintStackTrace"}) class AutomaticConfusionRuleEvaluator { - private static final String LANGUAGE = "en"; - private static final boolean CASE_SENSITIVE = true; private static final int MAX_EXAMPLES = 1000; private static final int MIN_EXAMPLES = 50; private static final List EVAL_FACTORS = Arrays.asList(10L, 100L, 1_000L, 10_000L, 100_000L, 1_000_000L, 10_000_000L); private static final float MIN_PRECISION = 0.95f; private static final float MIN_RECALL = 0.1f; - private static final String LUCENE_CONTENT_FIELD = "field"; private final IndexSearcher searcher; private final Map> knownSets; private final Set finishedPairs = new HashSet<>(); + private final String fieldName; + private final boolean caseInsensitive; private int ignored = 0; - AutomaticConfusionRuleEvaluator(File luceneIndexDir) throws IOException { + private AutomaticConfusionRuleEvaluator(File luceneIndexDir, String fieldName, boolean caseInsensitive) throws IOException { + this.fieldName = fieldName; + this.caseInsensitive = caseInsensitive; DirectoryReader reader = DirectoryReader.open(FSDirectory.open(luceneIndexDir.toPath())); searcher = new IndexSearcher(reader); InputStream 
confusionSetStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/confusion_sets.txt"); knownSets = new ConfusionSetLoader().loadConfusionSet(confusionSetStream); } - private void run(List<String> lines, File indexDir) throws IOException { - Language language = Languages.getLanguageForShortCode(LANGUAGE); + private void run(List<String> lines, File indexDir, Language lang) throws IOException { LanguageModel lm = new LuceneLanguageModel(indexDir); - ConfusionRuleEvaluator evaluator = new ConfusionRuleEvaluator(language, lm, CASE_SENSITIVE); + ConfusionRuleEvaluator evaluator = new ConfusionRuleEvaluator(lang, lm, caseInsensitive); int lineCount = 0; for (String line : lines) { lineCount++; @@ -165,25 +165,27 @@ private File writeExampleSentencesToTempFile(String[] words) throws IOException } private int findExampleSentences(String word, FileWriter fw) throws IOException { - Term term = new Term(LUCENE_CONTENT_FIELD, CASE_SENSITIVE ? word.toLowerCase() : word); + Term term = new Term(fieldName, caseInsensitive ? word.toLowerCase() : word); long t1 = System.currentTimeMillis(); - //TopDocs topDocs = searcher.search(new TermQuery(term), CASE_SENSITIVE ? Integer.MAX_VALUE : MAX_EXAMPLES); + //TopDocs topDocs = searcher.search(new TermQuery(term), caseInsensitive ? 
Integer.MAX_VALUE : MAX_EXAMPLES); TopDocs topDocs = searcher.search(new TermQuery(term), MAX_EXAMPLES); long t2 = System.currentTimeMillis(); int count = 0; Set<String> foundSentences = new HashSet<>(); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { - String sentence = searcher.doc(scoreDoc.doc).get(LUCENE_CONTENT_FIELD); - if (CASE_SENSITIVE) { + String sentence = searcher.doc(scoreDoc.doc).get(fieldName); + if (caseInsensitive) { + if (!foundSentences.contains(sentence)) { + fw.write(sentence + "\n"); + foundSentences.add(sentence); + count++; + } + } else { if (sentence.contains(word) && !foundSentences.contains(sentence)) { fw.write(sentence + "\n"); foundSentences.add(sentence); count++; } - } else if (!foundSentences.contains(sentence)) { - fw.write(sentence + "\n"); - foundSentences.add(sentence); - count++; } if (count > MAX_EXAMPLES) { break; @@ -192,20 +194,24 @@ private int findExampleSentences(String word, FileWriter fw) throws IOException long t3 = System.currentTimeMillis(); long searchTime = t2 - t1; long iterateTime = t3 - t2; - System.out.println("Found " + count + " examples for " + word + " (" + searchTime + "ms, " + iterateTime + "ms)"); + System.out.println("Found " + count + " examples for " + word + " (" + searchTime + "ms, " + iterateTime + "ms), case insensitive=" + caseInsensitive); return count; } public static void main(String[] args) throws IOException { - if (args.length != 3) { - System.out.println("Usage: " + AutomaticConfusionRuleEvaluator.class.getSimpleName() + " "); + if (args.length != 6) { + System.out.println("Usage: " + AutomaticConfusionRuleEvaluator.class.getSimpleName() + " "); System.out.println(" is a semicolon-separated list of words (one pair per line)"); System.out.println(" is a Lucene index created by TextIndexCreator"); + System.out.println(" is the Lucene index field name, usually 'field' or 'fieldLowercase'"); + System.out.println(" whether to run in case-insensitive mode"); System.exit(1); } - List<String> lines = 
IOUtils.readLines(new FileInputStream(args[0]), "utf-8"); - AutomaticConfusionRuleEvaluator eval = new AutomaticConfusionRuleEvaluator(new File(args[1])); - eval.run(lines, new File(args[2])); + Language lang = Languages.getLanguageForShortCode(args[0]); + List<String> lines = IOUtils.readLines(new FileInputStream(args[1]), "utf-8"); + boolean caseInsensitive = args[5].equalsIgnoreCase("true"); + AutomaticConfusionRuleEvaluator eval = new AutomaticConfusionRuleEvaluator(new File(args[2]), args[4], caseInsensitive); + eval.run(lines, new File(args[3]), lang); } class TooFewExamples extends RuntimeException {