Skip to content

Commit

Permalink
turn hard-coded fields into command-line parameters
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed Dec 29, 2018
1 parent 5b006e4 commit 8dea676
Showing 1 changed file with 27 additions and 21 deletions.
Expand Up @@ -42,32 +42,32 @@
@SuppressWarnings({"resource", "CallToPrintStackTrace"})
class AutomaticConfusionRuleEvaluator {

private static final String LANGUAGE = "en";
private static final boolean CASE_SENSITIVE = true;
private static final int MAX_EXAMPLES = 1000;
private static final int MIN_EXAMPLES = 50;
private static final List<Long> EVAL_FACTORS = Arrays.asList(10L, 100L, 1_000L, 10_000L, 100_000L, 1_000_000L, 10_000_000L);
private static final float MIN_PRECISION = 0.95f;
private static final float MIN_RECALL = 0.1f;
private static final String LUCENE_CONTENT_FIELD = "field";

private final IndexSearcher searcher;
private final Map<String, List<ConfusionSet>> knownSets;
private final Set<String> finishedPairs = new HashSet<>();
private final String fieldName;
private final boolean caseInsensitive;

private int ignored = 0;

AutomaticConfusionRuleEvaluator(File luceneIndexDir) throws IOException {
private AutomaticConfusionRuleEvaluator(File luceneIndexDir, String fieldName, boolean caseInsensitive) throws IOException {
this.fieldName = fieldName;
this.caseInsensitive = caseInsensitive;
DirectoryReader reader = DirectoryReader.open(FSDirectory.open(luceneIndexDir.toPath()));
searcher = new IndexSearcher(reader);
InputStream confusionSetStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/confusion_sets.txt");
knownSets = new ConfusionSetLoader().loadConfusionSet(confusionSetStream);
}

private void run(List<String> lines, File indexDir) throws IOException {
Language language = Languages.getLanguageForShortCode(LANGUAGE);
private void run(List<String> lines, File indexDir, Language lang) throws IOException {
LanguageModel lm = new LuceneLanguageModel(indexDir);
ConfusionRuleEvaluator evaluator = new ConfusionRuleEvaluator(language, lm, CASE_SENSITIVE);
ConfusionRuleEvaluator evaluator = new ConfusionRuleEvaluator(lang, lm, caseInsensitive);
int lineCount = 0;
for (String line : lines) {
lineCount++;
Expand Down Expand Up @@ -165,25 +165,27 @@ private File writeExampleSentencesToTempFile(String[] words) throws IOException
}

private int findExampleSentences(String word, FileWriter fw) throws IOException {
Term term = new Term(LUCENE_CONTENT_FIELD, CASE_SENSITIVE ? word.toLowerCase() : word);
Term term = new Term(fieldName, caseInsensitive ? word.toLowerCase() : word);
long t1 = System.currentTimeMillis();
//TopDocs topDocs = searcher.search(new TermQuery(term), CASE_SENSITIVE ? Integer.MAX_VALUE : MAX_EXAMPLES);
//TopDocs topDocs = searcher.search(new TermQuery(term), caseInsensitive ? Integer.MAX_VALUE : MAX_EXAMPLES);
TopDocs topDocs = searcher.search(new TermQuery(term), MAX_EXAMPLES);
long t2 = System.currentTimeMillis();
int count = 0;
Set<String> foundSentences = new HashSet<>();
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
String sentence = searcher.doc(scoreDoc.doc).get(LUCENE_CONTENT_FIELD);
if (CASE_SENSITIVE) {
String sentence = searcher.doc(scoreDoc.doc).get(fieldName);
if (caseInsensitive) {
if (!foundSentences.contains(sentence)) {
fw.write(sentence + "\n");
foundSentences.add(sentence);
count++;
}
} else {
if (sentence.contains(word) && !foundSentences.contains(sentence)) {
fw.write(sentence + "\n");
foundSentences.add(sentence);
count++;
}
} else if (!foundSentences.contains(sentence)) {
fw.write(sentence + "\n");
foundSentences.add(sentence);
count++;
}
if (count > MAX_EXAMPLES) {
break;
Expand All @@ -192,20 +194,24 @@ private int findExampleSentences(String word, FileWriter fw) throws IOException
long t3 = System.currentTimeMillis();
long searchTime = t2 - t1;
long iterateTime = t3 - t2;
System.out.println("Found " + count + " examples for " + word + " (" + searchTime + "ms, " + iterateTime + "ms)");
System.out.println("Found " + count + " examples for " + word + " (" + searchTime + "ms, " + iterateTime + "ms), case insensitive=" + caseInsensitive);
return count;
}

public static void main(String[] args) throws IOException {
if (args.length != 3) {
System.out.println("Usage: " + AutomaticConfusionRuleEvaluator.class.getSimpleName() + " <confusionPairCandidates> <exampleSentenceIndexDir> <ngramDir>");
if (args.length != 6) {
System.out.println("Usage: " + AutomaticConfusionRuleEvaluator.class.getSimpleName() + " <languageCode> <confusionPairCandidates> <exampleSentenceIndexDir> <ngramDir> <fieldName> <true|false>");
System.out.println(" <confusionPairCandidates> is a semicolon-separated list of words (one pair per line)");
System.out.println(" <exampleSentenceIndexDir> is a Lucene index created by TextIndexCreator");
System.out.println(" <fieldName> is the Lucene index field name, usually 'field' or 'fieldLowercase'");
System.out.println(" <true|false> whether to run in case-insensitive mode");
System.exit(1);
}
List<String> lines = IOUtils.readLines(new FileInputStream(args[0]), "utf-8");
AutomaticConfusionRuleEvaluator eval = new AutomaticConfusionRuleEvaluator(new File(args[1]));
eval.run(lines, new File(args[2]));
Language lang = Languages.getLanguageForShortCode(args[0]);
List<String> lines = IOUtils.readLines(new FileInputStream(args[1]), "utf-8");
boolean caseInsensitive = args[5].equalsIgnoreCase("true");
AutomaticConfusionRuleEvaluator eval = new AutomaticConfusionRuleEvaluator(new File(args[2]), args[4], caseInsensitive);
eval.run(lines, new File(args[3]), lang);
}

class TooFewExamples extends RuntimeException {
Expand Down

0 comments on commit 8dea676

Please sign in to comment.