From fdee326fe0951b60006ee8c581de654554b9fbd1 Mon Sep 17 00:00:00 2001 From: Daniel Naber Date: Tue, 24 Nov 2015 17:44:53 +0100 Subject: [PATCH] extend ngram indexing - still work in progress --- .../dev/bigdata/CommonCrawlToNgram.java | 121 ++++++++++++------ .../dev/eval/SimpleCorpusEvaluator.java | 15 +-- .../dev/bigdata/CommonCrawlToNgramTest.java | 2 +- 3 files changed, 89 insertions(+), 49 deletions(-) diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java index d81cda7dd79b..07d21030eb63 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java @@ -30,6 +30,7 @@ import org.jetbrains.annotations.NotNull; import org.languagetool.Language; import org.languagetool.Languages; +import org.languagetool.dev.eval.SimpleCorpusEvaluator; import org.languagetool.tokenizers.Tokenizer; import org.tukaani.xz.XZInputStream; @@ -48,29 +49,30 @@ class CommonCrawlToNgram implements AutoCloseable { private final Language language; private final File input; - private final Directory directory; - private final IndexWriter indexWriter; + private final File indexTopDir; + private final File evalFile; + private final Map unigramToCount = new HashMap<>(); + private final Map bigramToCount = new HashMap<>(); + private final Map trigramToCount = new HashMap<>(); + private final Map indexes = new HashMap<>(); - private DirectoryReader reader; - private IndexSearcher searcher; private int limit = 1000; - CommonCrawlToNgram(Language language, File input, File outputDir) throws IOException { + CommonCrawlToNgram(Language language, File input, File indexTopDir, File evalFile) throws IOException { this.language = language; this.input = input; - Analyzer analyzer = new StandardAnalyzer(); - IndexWriterConfig config = new IndexWriterConfig(analyzer); - directory = FSDirectory.open(outputDir.toPath()); - indexWriter = new IndexWriter(directory, config); - reader = DirectoryReader.open(indexWriter, true); // TODO: see if false is faster - searcher = new IndexSearcher(reader); + this.indexTopDir = indexTopDir; + this.evalFile = evalFile; + indexes.put(1, new LuceneLiveIndex(new File(indexTopDir, "1grams"))); + indexes.put(2, new LuceneLiveIndex(new File(indexTopDir, "2grams"))); + indexes.put(3, new LuceneLiveIndex(new File(indexTopDir, "3grams"))); } @Override public void close() throws IOException { - indexWriter.close(); - reader.close(); - directory.close(); + for (LuceneLiveIndex index : indexes.values()) { + index.close(); + } } void setLimit(int limit) { @@ -81,19 +83,18 @@ void indexInputFile() throws IOException { FileInputStream fin = new FileInputStream(input); BufferedInputStream in = new BufferedInputStream(fin); Tokenizer wordTokenizer = language.getWordTokenizer(); // TODO: use a more Google-like tokenizer - Map ngramToCount = new HashMap<>(); try (XZInputStream xzIn = new XZInputStream(in)) { final byte[] buffer = new byte[8192]; int n; while ((n = xzIn.read(buffer)) != -1) { String buf = new String(buffer, 0, n); // TODO: not always correct, we need to wait for line end first? String[] lines = buf.split("\n"); - indexLine(wordTokenizer, lines, ngramToCount); + indexLine(wordTokenizer, lines); } } } - private void indexLine(Tokenizer wordTokenizer, String[] lines, Map ngramToCount) throws IOException { + private void indexLine(Tokenizer wordTokenizer, String[] lines) throws IOException { for (String line : lines) { List tokens = wordTokenizer.tokenize(line); //System.out.println("L: " + tokens); @@ -104,49 +105,61 @@ private void indexLine(Tokenizer wordTokenizer, String[] lines, Map v == null ? 1 : v + 1); + if (prev != null) { + String ngram = prev + " " + token; + bigramToCount.compute(ngram, (k, v) -> v == null ? 1 : v + 1); + } if (prevPrev != null && prev != null) { String ngram = prevPrev + " " + prev + " " + token; - Long count = ngramToCount.get(ngram); - if (count == null) { - ngramToCount.put(ngram, 1L); - } else { - ngramToCount.put(ngram, count + 1); - } - if (ngramToCount.size() > limit) { - writeToLucene(ngramToCount); - ngramToCount.clear(); + trigramToCount.compute(ngram, (k, v) -> v == null ? 1 : v + 1); + if (trigramToCount.size() > limit) { + writeAndEvaluate(); } } prevPrev = prev; prev = token; } } - writeToLucene(ngramToCount); + writeAndEvaluate(); + } + + private void writeAndEvaluate() throws IOException { + writeToLucene(1, unigramToCount); + writeToLucene(2, bigramToCount); + writeToLucene(3, trigramToCount); + if (evalFile != null) { + SimpleCorpusEvaluator evaluator = new SimpleCorpusEvaluator(indexTopDir); + evaluator.run(evalFile); + } else { + System.out.println("Skipping evaluation, no evaluation file specified"); + } } - private void writeToLucene(Map ngramToCount) throws IOException { + private void writeToLucene(int ngramSize, Map ngramToCount) throws IOException { //System.out.println("WRITE: "); + LuceneLiveIndex index = indexes.get(ngramSize); for (Map.Entry entry : ngramToCount.entrySet()) { Term ngram = new Term("ngram", entry.getKey()); - reader = DirectoryReader.open(indexWriter, true); - searcher = new IndexSearcher(reader); + index.reader = DirectoryReader.open(index.indexWriter, true); + index.searcher = new IndexSearcher(index.reader); // not sure why this doesn't work, should be faster: /*DirectoryReader newReader = DirectoryReader.openIfChanged(reader); if (newReader != null) { reader = newReader; }*/ - TopDocs topDocs = searcher.search(new TermQuery(ngram), 2); + TopDocs topDocs = index.searcher.search(new TermQuery(ngram), 2); //System.out.println(ngram + " ==> " + topDocs.totalHits); if (topDocs.totalHits == 0) { Document doc = getDoc(entry.getKey(), entry.getValue()); - indexWriter.addDocument(doc); + index.indexWriter.addDocument(doc); } else if (topDocs.totalHits == 1) { int docNumber = topDocs.scoreDocs[0].doc; - Document document = reader.document(docNumber); + Document document = index.reader.document(docNumber); long oldCount = Long.parseLong(document.getField("count").stringValue()); //System.out.println(ngram + " -> " + oldCount + "+" + entry.getValue()); - indexWriter.deleteDocuments(ngram); - indexWriter.addDocument(getDoc(entry.getKey(), oldCount + entry.getValue())); + index.indexWriter.deleteDocuments(ngram); + index.indexWriter.addDocument(getDoc(entry.getKey(), oldCount + entry.getValue())); // would probably be faster, but we currently rely on the count being a common field: //indexWriter.updateNumericDocValue(ngram, "count", oldCount + entry.getValue()); } else if (topDocs.totalHits > 1) { @@ -154,13 +167,15 @@ private void writeToLucene(Map ngramToCount) throws IOException { } //System.out.println(" " + entry.getKey() + " -> " + entry.getValue()); } - indexWriter.commit(); + // TODO: add/update 'totalTokenCount' + index.indexWriter.commit(); + ngramToCount.clear(); } @NotNull private Document getDoc(String ngram, long count) { Document doc = new Document(); - doc.add(new Field("ngram", ngram, StringField.TYPE_STORED)); + doc.add(new Field("ngram", ngram, StringField.TYPE_STORED)); // TODO: store only for debugging doc.add(getCountField(count)); return doc; } @@ -175,15 +190,41 @@ private LongField getCountField(long count) { } public static void main(String[] args) throws IOException { - if (args.length != 2) { - System.out.println("Usage: " + CommonCrawlToNgram.class + " "); + if (args.length != 4) { + System.out.println("Usage: " + CommonCrawlToNgram.class + " "); + System.out.println(" a plain text file with simple error markup"); System.exit(1); } Language language = Languages.getLanguageForShortName(args[0]); File input = new File(args[1]); File outputDir = new File(args[2]); - try (CommonCrawlToNgram prg = new CommonCrawlToNgram(language, input, outputDir)) { + File evalFile = new File(args[3]); + try (CommonCrawlToNgram prg = new CommonCrawlToNgram(language, input, outputDir, evalFile)) { prg.indexInputFile(); } } + + class LuceneLiveIndex { + + private final Directory directory; + private final IndexWriter indexWriter; + + private DirectoryReader reader; + private IndexSearcher searcher; + + LuceneLiveIndex(File dir) throws IOException { + Analyzer analyzer = new StandardAnalyzer(); + IndexWriterConfig config = new IndexWriterConfig(analyzer); + directory = FSDirectory.open(dir.toPath()); + indexWriter = new IndexWriter(directory, config); + reader = DirectoryReader.open(indexWriter, true); // TODO: see if false is faster + searcher = new IndexSearcher(reader); + } + + void close() throws IOException { + indexWriter.close(); + directory.close(); + } + + } } diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/eval/SimpleCorpusEvaluator.java b/languagetool-dev/src/main/java/org/languagetool/dev/eval/SimpleCorpusEvaluator.java index 1b94b2c07e04..9c99a2fb7b47 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/eval/SimpleCorpusEvaluator.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/eval/SimpleCorpusEvaluator.java @@ -42,7 +42,7 @@ * Evaluates the ngram rule with a simple corpus, see {@link SimpleCorpus}. * @since 3.2 */ -class SimpleCorpusEvaluator { +public class SimpleCorpusEvaluator { // without bigrams: private static final double START_THRESHOLD = 0.000001; @@ -64,8 +64,8 @@ class SimpleCorpusEvaluator { private int goodConfusionMatches; private int badConfusionMatches; - SimpleCorpusEvaluator(File indexDir) throws IOException { - evaluator = getEvaluator(indexDir); + public SimpleCorpusEvaluator(File indexTopDir) throws IOException { + evaluator = getEvaluator(indexTopDir); } @NotNull @@ -74,23 +74,22 @@ private Evaluator getEvaluator(File indexTopDir) throws IOException { } @NotNull - private ErrorCorpus getCorpus(File dir) throws IOException { - return new SimpleCorpus(dir); + private ErrorCorpus getCorpus(File file) throws IOException { + return new SimpleCorpus(file); } void close() { evaluator.close(); } - PrecisionRecall run(File dir) throws IOException { + public PrecisionRecall run(File file) throws IOException { System.out.println("Output explanation:"); System.out.println(" [ ] = this is not an expected error"); System.out.println(" [+ ] = this is an expected error"); System.out.println(" [++] = this is an expected error and the first suggestion is correct"); System.out.println(" [//] = not counted because already matches by a different rule"); System.out.println(""); - ErrorCorpus corpus = getCorpus(dir); - checkLines(corpus); + checkLines(getCorpus(file)); return printAndResetResults(); } diff --git a/languagetool-dev/src/test/java/org/languagetool/dev/bigdata/CommonCrawlToNgramTest.java b/languagetool-dev/src/test/java/org/languagetool/dev/bigdata/CommonCrawlToNgramTest.java index 147321403a08..25358a050b6a 100644 --- a/languagetool-dev/src/test/java/org/languagetool/dev/bigdata/CommonCrawlToNgramTest.java +++ b/languagetool-dev/src/test/java/org/languagetool/dev/bigdata/CommonCrawlToNgramTest.java @@ -35,7 +35,7 @@ public void testIndexing() throws IOException { try { tempDir.mkdir(); String filename = CommonCrawlToNgramTest.class.getResource("/org/languagetool/dev/bigdata/ngram-input.txt.xz").getFile(); - try (CommonCrawlToNgram prg = new CommonCrawlToNgram(new German(), new File(filename), tempDir)) { + try (CommonCrawlToNgram prg = new CommonCrawlToNgram(new German(), new File(filename), tempDir, null)) { prg.setLimit(1); prg.indexInputFile(); }