more aggressive skipping of rare words
danielnaber committed Dec 20, 2017
1 parent 106d5b7 commit f58204c
Showing 1 changed file with 48 additions and 18 deletions.
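For orientation, here is a minimal, self-contained sketch of the control flow this commit introduces: the example-sentence lookup now reports how many sentences it found, and any word with MIN_EXAMPLES or fewer hits aborts its confusion pair via the new TooFewExamples exception. The pair words and the fake counts below are illustrative only; in the actual class the count comes from a Lucene index search, as shown in the diff that follows.

import java.util.Arrays;
import java.util.List;

class RareWordSkipSketch {

  private static final int MIN_EXAMPLES = 50;  // same threshold the commit adds

  // Stand-in for findExampleSentences(): the real code queries a Lucene index and
  // returns how many example sentences it wrote; the counts here are made up.
  private static int findExampleSentences(String word) {
    return word.length() > 3 ? 120 : 7;
  }

  // Mirrors the new TooFewExamples exception from the diff.
  static class TooFewExamples extends RuntimeException {
    private final String word;
    private final int exampleCount;
    TooFewExamples(String word, int exampleCount) {
      this.word = word;
      this.exampleCount = exampleCount;
    }
    @Override
    public String getMessage() {
      return exampleCount + " matches for " + word;
    }
  }

  public static void main(String[] args) {
    List<String[]> pairs = Arrays.asList(
        new String[]{"their", "there"},   // hypothetical confusion pairs
        new String[]{"oar", "ore"});
    for (String[] pair : pairs) {
      try {
        for (String word : pair) {
          int count = findExampleSentences(word);
          if (count <= MIN_EXAMPLES) {
            throw new TooFewExamples(word, count);  // rare word: give up on the whole pair
          }
        }
        System.out.println("Evaluating " + pair[0] + "/" + pair[1]);
      } catch (TooFewExamples e) {
        System.out.println("Skipping " + pair[0] + "/" + pair[1]
            + ", too few examples: " + e.getMessage());
      }
    }
  }
}

The design matches the diff: throwing from inside writeExampleSentencesToTempFile() lets a single rare word short-circuit both the temp-file writing and the much more expensive evaluator.run() call.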
@@ -43,11 +43,13 @@
class AutomaticConfusionRuleEvaluator {

private static final String LANGUAGE = "en";
private static final boolean CASE_SENSITIVE = false;
private static final boolean CASE_SENSITIVE = true;
private static final int MAX_EXAMPLES = 1000;
private static final int MIN_EXAMPLES = 50;
private static final List<Long> EVAL_FACTORS = Arrays.asList(10L, 100L, 1_000L, 10_000L, 100_000L, 1_000_000L, 10_000_000L);
private static final float MIN_PRECISION = 0.99f;
private static final float MIN_RECALL = 0.1f;
private static final String LUCENE_CONTENT_FIELD = "field";

private final IndexSearcher searcher;
private final Map<String, List<ConfusionSet>> knownSets;
@@ -114,18 +116,22 @@ private void runOnPair(ConfusionRuleEvaluator evaluator, String line, String par
}
}
System.out.println("Working on: " + line);
File sentencesFile = writeExampleSentencesToTempFile(new String[]{part1, part2});
List<String> input = Arrays.asList(sentencesFile.getAbsolutePath());
Map<Long, ConfusionRuleEvaluator.EvalResult> results = evaluator.run(input, part1, part2, MAX_EXAMPLES, EVAL_FACTORS);
Map<Long, ConfusionRuleEvaluator.EvalResult> bestResults = findBestFactor(results);
if (bestResults.size() > 0) {
for (Map.Entry<Long, ConfusionRuleEvaluator.EvalResult> entry : bestResults.entrySet()) {
System.out.println("=> " + entry.getValue().getSummary());
try {
File sentencesFile = writeExampleSentencesToTempFile(new String[]{part1, part2});
List<String> input = Arrays.asList(sentencesFile.getAbsolutePath());
Map<Long, ConfusionRuleEvaluator.EvalResult> results = evaluator.run(input, part1, part2, MAX_EXAMPLES, EVAL_FACTORS);
Map<Long, ConfusionRuleEvaluator.EvalResult> bestResults = findBestFactor(results);
if (bestResults.size() > 0) {
for (Map.Entry<Long, ConfusionRuleEvaluator.EvalResult> entry : bestResults.entrySet()) {
System.out.println("=> " + entry.getValue().getSummary());
}
} else {
System.out.println("No good result found for " + part1 + "/" + part2);
}
} else {
System.out.println("No good result found for " + part1 + "/" + part2);
finishedPairs.add(part1 + "/" + part2);
} catch (TooFewExamples e) {
System.out.println("Skipping " + part1 + "/" + part2 + ", too few examples: " + e.getMessage());
}
finishedPairs.add(part1 + "/" + part2);
}

private Map<Long, ConfusionRuleEvaluator.EvalResult> findBestFactor(Map<Long, ConfusionRuleEvaluator.EvalResult> results) {
@@ -142,21 +148,29 @@ private Map<Long, ConfusionRuleEvaluator.EvalResult> findBestFactor(Map<Long, Co

private File writeExampleSentencesToTempFile(String[] words) throws IOException {
File tempFile = new File("/tmp/example-sentences.txt");
int count = 0;
try (FileWriter fw = new FileWriter(tempFile)) {
for (String word : words) {
findExampleSentences(word, fw);
int tmpCount = findExampleSentences(word, fw);
if (tmpCount <= MIN_EXAMPLES) {
throw new TooFewExamples(word, tmpCount);
}
count += tmpCount;
}
System.out.println("Example sentences written to " + tempFile);
System.out.println(count + " example sentences written to " + tempFile);
}
return tempFile;
}

private void findExampleSentences(String word, FileWriter fw) throws IOException {
Term term = new Term(TextIndexCreator.FIELD, CASE_SENSITIVE ? word.toLowerCase() : word);
TopDocs topDocs = searcher.search(new TermQuery(term), CASE_SENSITIVE ? Integer.MAX_VALUE : MAX_EXAMPLES);
private int findExampleSentences(String word, FileWriter fw) throws IOException {
Term term = new Term(LUCENE_CONTENT_FIELD, CASE_SENSITIVE ? word.toLowerCase() : word);
long t1 = System.currentTimeMillis();
//TopDocs topDocs = searcher.search(new TermQuery(term), CASE_SENSITIVE ? Integer.MAX_VALUE : MAX_EXAMPLES);
TopDocs topDocs = searcher.search(new TermQuery(term), MAX_EXAMPLES);
long t2 = System.currentTimeMillis();
int count = 0;
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
String sentence = searcher.doc(scoreDoc.doc).get(TextIndexCreator.FIELD);
String sentence = searcher.doc(scoreDoc.doc).get(LUCENE_CONTENT_FIELD);
if (CASE_SENSITIVE) {
if (sentence.contains(word)) {
fw.write(sentence + "\n");
@@ -170,7 +184,11 @@ private void findExampleSentences(String word, FileWriter fw) throws IOException
break;
}
}
System.out.println("Found " + count + " examples for " + word);
long t3 = System.currentTimeMillis();
long searchTime = t2 - t1;
long iterateTime = t3 - t2;
System.out.println("Found " + count + " examples for " + word + " (" + searchTime + "ms, " + iterateTime + "ms)");
return count;
}

public static void main(String[] args) throws IOException {
@@ -185,4 +203,16 @@ public static void main(String[] args) throws IOException {
eval.run(lines, new File(args[2]));
}

class TooFewExamples extends RuntimeException {
private String word;
private int exampleCount;
TooFewExamples(String word, int exampleCount) {
this.word = word;
this.exampleCount = exampleCount;
}
@Override
public String getMessage() {
return exampleCount + " matches for " + word;
}
}
}
