Commit fdee326: extend ngram indexing - still work in progress

danielnaber committed Nov 24, 2015
1 parent ba0e935 commit fdee326

Showing 3 changed files with 89 additions and 49 deletions.
CommonCrawlToNgram.java

@@ -30,6 +30,7 @@
 import org.jetbrains.annotations.NotNull;
 import org.languagetool.Language;
 import org.languagetool.Languages;
+import org.languagetool.dev.eval.SimpleCorpusEvaluator;
 import org.languagetool.tokenizers.Tokenizer;
 import org.tukaani.xz.XZInputStream;

@@ -48,29 +49,30 @@ class CommonCrawlToNgram implements AutoCloseable {

   private final Language language;
   private final File input;
-  private final Directory directory;
-  private final IndexWriter indexWriter;
+  private final File indexTopDir;
+  private final File evalFile;
+  private final Map<String, Long> unigramToCount = new HashMap<>();
+  private final Map<String, Long> bigramToCount = new HashMap<>();
+  private final Map<String, Long> trigramToCount = new HashMap<>();
+  private final Map<Integer, LuceneLiveIndex> indexes = new HashMap<>();

-  private DirectoryReader reader;
-  private IndexSearcher searcher;
   private int limit = 1000;

-  CommonCrawlToNgram(Language language, File input, File outputDir) throws IOException {
+  CommonCrawlToNgram(Language language, File input, File indexTopDir, File evalFile) throws IOException {
     this.language = language;
     this.input = input;
-    Analyzer analyzer = new StandardAnalyzer();
-    IndexWriterConfig config = new IndexWriterConfig(analyzer);
-    directory = FSDirectory.open(outputDir.toPath());
-    indexWriter = new IndexWriter(directory, config);
-    reader = DirectoryReader.open(indexWriter, true);  // TODO: see if false is faster
-    searcher = new IndexSearcher(reader);
+    this.indexTopDir = indexTopDir;
+    this.evalFile = evalFile;
+    indexes.put(1, new LuceneLiveIndex(new File(indexTopDir, "1grams")));
+    indexes.put(2, new LuceneLiveIndex(new File(indexTopDir, "2grams")));
+    indexes.put(3, new LuceneLiveIndex(new File(indexTopDir, "3grams")));
   }

   @Override
   public void close() throws IOException {
-    indexWriter.close();
-    reader.close();
-    directory.close();
+    for (LuceneLiveIndex index : indexes.values()) {
+      index.close();
+    }
   }

   void setLimit(int limit) {
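After this change the top-level index directory holds one Lucene sub-index per n-gram order, created by the constructor. The resulting on-disk layout (directory names taken from the code above, annotations paraphrased):

    <indexTopDir>/
        1grams/    Lucene index holding unigram counts
        2grams/    bigram counts
        3grams/    trigram counts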
@@ -81,19 +83,18 @@ void indexInputFile() throws IOException {
     FileInputStream fin = new FileInputStream(input);
     BufferedInputStream in = new BufferedInputStream(fin);
     Tokenizer wordTokenizer = language.getWordTokenizer();  // TODO: use a more Google-like tokenizer
-    Map<String, Long> ngramToCount = new HashMap<>();
     try (XZInputStream xzIn = new XZInputStream(in)) {
       final byte[] buffer = new byte[8192];
       int n;
       while ((n = xzIn.read(buffer)) != -1) {
         String buf = new String(buffer, 0, n);  // TODO: not always correct, we need to wait for line end first?
         String[] lines = buf.split("\n");
-        indexLine(wordTokenizer, lines, ngramToCount);
+        indexLine(wordTokenizer, lines);
       }
     }
   }

-  private void indexLine(Tokenizer wordTokenizer, String[] lines, Map<String, Long> ngramToCount) throws IOException {
+  private void indexLine(Tokenizer wordTokenizer, String[] lines) throws IOException {
     for (String line : lines) {
       List<String> tokens = wordTokenizer.tokenize(line);
       //System.out.println("L: " + tokens);
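The TODO about chunk boundaries is a real issue: an 8192-byte read can end in the middle of a line (or even of a multi-byte character), so tokens at buffer edges get split and counted wrongly. A minimal line-safe sketch, assuming the same XZ input and UTF-8 content (needs java.io.BufferedReader, java.io.InputStreamReader, and java.nio.charset.StandardCharsets):

    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new XZInputStream(in), StandardCharsets.UTF_8))) {
      String line;
      while ((line = br.readLine()) != null) {
        indexLine(wordTokenizer, new String[] { line });  // readLine() never splits a line
      }
    }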
@@ -104,63 +105,77 @@ private void indexLine(Tokenizer wordTokenizer, String[] lines, Map<String, Long> ngramToCount)
         if (token.trim().isEmpty()) {
           continue;
         }
+        unigramToCount.compute(token, (k, v) -> v == null ? 1 : v + 1);
+        if (prev != null) {
+          String ngram = prev + " " + token;
+          bigramToCount.compute(ngram, (k, v) -> v == null ? 1 : v + 1);
+        }
         if (prevPrev != null && prev != null) {
           String ngram = prevPrev + " " + prev + " " + token;
-          Long count = ngramToCount.get(ngram);
-          if (count == null) {
-            ngramToCount.put(ngram, 1L);
-          } else {
-            ngramToCount.put(ngram, count + 1);
-          }
-          if (ngramToCount.size() > limit) {
-            writeToLucene(ngramToCount);
-            ngramToCount.clear();
+          trigramToCount.compute(ngram, (k, v) -> v == null ? 1 : v + 1);
+          if (trigramToCount.size() > limit) {
+            writeAndEvaluate();
           }
         }
         prevPrev = prev;
         prev = token;
       }
     }
-    writeToLucene(ngramToCount);
+    writeAndEvaluate();
   }
+
+  private void writeAndEvaluate() throws IOException {
+    writeToLucene(1, unigramToCount);
+    writeToLucene(2, bigramToCount);
+    writeToLucene(3, trigramToCount);
+    if (evalFile != null) {
+      SimpleCorpusEvaluator evaluator = new SimpleCorpusEvaluator(indexTopDir);
+      evaluator.run(evalFile);
+    } else {
+      System.out.println("Skipping evaluation, no evaluation file specified");
+    }
+  }
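For a line tokenized as [the, quick, brown, fox], the sliding window above counts:

    unigrams: the, quick, brown, fox
    bigrams:  "the quick", "quick brown", "brown fox"
    trigrams: "the quick brown", "quick brown fox"

Note that only trigramToCount's size is checked against the limit; the unigram and bigram maps are flushed at the same time via writeAndEvaluate().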


-  private void writeToLucene(Map<String, Long> ngramToCount) throws IOException {
+  private void writeToLucene(int ngramSize, Map<String, Long> ngramToCount) throws IOException {
     //System.out.println("WRITE: ");
+    LuceneLiveIndex index = indexes.get(ngramSize);
     for (Map.Entry<String, Long> entry : ngramToCount.entrySet()) {
       Term ngram = new Term("ngram", entry.getKey());
-      reader = DirectoryReader.open(indexWriter, true);
-      searcher = new IndexSearcher(reader);
+      index.reader = DirectoryReader.open(index.indexWriter, true);
+      index.searcher = new IndexSearcher(index.reader);
       // not sure why this doesn't work, should be faster:
       /*DirectoryReader newReader = DirectoryReader.openIfChanged(reader);
       if (newReader != null) {
         reader = newReader;
       }*/
-      TopDocs topDocs = searcher.search(new TermQuery(ngram), 2);
+      TopDocs topDocs = index.searcher.search(new TermQuery(ngram), 2);
       //System.out.println(ngram + " ==> " + topDocs.totalHits);
       if (topDocs.totalHits == 0) {
         Document doc = getDoc(entry.getKey(), entry.getValue());
-        indexWriter.addDocument(doc);
+        index.indexWriter.addDocument(doc);
       } else if (topDocs.totalHits == 1) {
         int docNumber = topDocs.scoreDocs[0].doc;
-        Document document = reader.document(docNumber);
+        Document document = index.reader.document(docNumber);
         long oldCount = Long.parseLong(document.getField("count").stringValue());
         //System.out.println(ngram + " -> " + oldCount + "+" + entry.getValue());
-        indexWriter.deleteDocuments(ngram);
-        indexWriter.addDocument(getDoc(entry.getKey(), oldCount + entry.getValue()));
+        index.indexWriter.deleteDocuments(ngram);
+        index.indexWriter.addDocument(getDoc(entry.getKey(), oldCount + entry.getValue()));
         // would probably be faster, but we currently rely on the count being a common field:
         //indexWriter.updateNumericDocValue(ngram, "count", oldCount + entry.getValue());
       } else if (topDocs.totalHits > 1) {
         throw new RuntimeException("Got more than one hit for: " + ngram);
       }
       //System.out.println("  " + entry.getKey() + " -> " + entry.getValue());
     }
-    indexWriter.commit();
+    // TODO: add/update 'totalTokenCount'
+    index.indexWriter.commit();
+    ngramToCount.clear();
   }
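On the "not sure why this doesn't work" comment: one possible reason is that the commented-out block replaces the reader but never rebuilds the searcher, so searches keep hitting the stale reader. A hedged sketch of the usual near-real-time reopen pattern, using the openIfChanged overload that takes the writer (present in Lucene 4.x/5.x; not verified against this code):

    DirectoryReader newReader = DirectoryReader.openIfChanged(index.reader, index.indexWriter, true);
    if (newReader != null) {
      index.reader.close();  // release the stale reader
      index.reader = newReader;
      index.searcher = new IndexSearcher(index.reader);  // rebuild, or searches use the old reader
    }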


   @NotNull
   private Document getDoc(String ngram, long count) {
     Document doc = new Document();
-    doc.add(new Field("ngram", ngram, StringField.TYPE_STORED));
+    doc.add(new Field("ngram", ngram, StringField.TYPE_STORED));  // TODO: store only for debugging
     doc.add(getCountField(count));
     return doc;
   }

@@ -175,15 +190,41 @@ private LongField getCountField(long count) {
   }


   public static void main(String[] args) throws IOException {
-    if (args.length != 2) {
-      System.out.println("Usage: " + CommonCrawlToNgram.class + " <langCode> <input.xz>");
+    if (args.length != 4) {
+      System.out.println("Usage: " + CommonCrawlToNgram.class + " <langCode> <input.xz> <ngramIndexDir> <simpleEvalFile>");
+      System.out.println("   <simpleEvalFile> a plain text file with simple error markup");
       System.exit(1);
     }
     Language language = Languages.getLanguageForShortName(args[0]);
     File input = new File(args[1]);
     File outputDir = new File(args[2]);
-    try (CommonCrawlToNgram prg = new CommonCrawlToNgram(language, input, outputDir)) {
+    File evalFile = new File(args[3]);
+    try (CommonCrawlToNgram prg = new CommonCrawlToNgram(language, input, outputDir, evalFile)) {
       prg.indexInputFile();
     }
   }
+
+  class LuceneLiveIndex {
+
+    private final Directory directory;
+    private final IndexWriter indexWriter;
+
+    private DirectoryReader reader;
+    private IndexSearcher searcher;
+
+    LuceneLiveIndex(File dir) throws IOException {
+      Analyzer analyzer = new StandardAnalyzer();
+      IndexWriterConfig config = new IndexWriterConfig(analyzer);
+      directory = FSDirectory.open(dir.toPath());
+      indexWriter = new IndexWriter(directory, config);
+      reader = DirectoryReader.open(indexWriter, true);  // TODO: see if false is faster
+      searcher = new IndexSearcher(reader);
+    }
+
+    void close() throws IOException {
+      indexWriter.close();
+      directory.close();
+    }
+
+  }
 }
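With the new four-argument signature, an invocation could look roughly like this; the jar name and file paths are made up for illustration, and the package org.languagetool.dev.bigdata is inferred from the test's resource path:

    java -cp languagetool-dev.jar org.languagetool.dev.bigdata.CommonCrawlToNgram \
        en commoncrawl-en.xz /data/ngram-index simple-eval.txt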
SimpleCorpusEvaluator.java

@@ -42,7 +42,7 @@
  * Evaluates the ngram rule with a simple corpus, see {@link SimpleCorpus}.
  * @since 3.2
  */
-class SimpleCorpusEvaluator {
+public class SimpleCorpusEvaluator {

   // without bigrams:
   private static final double START_THRESHOLD = 0.000001;
@@ -64,8 +64,8 @@ class SimpleCorpusEvaluator {
   private int goodConfusionMatches;
   private int badConfusionMatches;

-  SimpleCorpusEvaluator(File indexDir) throws IOException {
-    evaluator = getEvaluator(indexDir);
+  public SimpleCorpusEvaluator(File indexTopDir) throws IOException {
+    evaluator = getEvaluator(indexTopDir);
   }

   @NotNull
@@ -74,23 +74,22 @@ private Evaluator getEvaluator(File indexTopDir) throws IOException {
   }

   @NotNull
-  private ErrorCorpus getCorpus(File dir) throws IOException {
-    return new SimpleCorpus(dir);
+  private ErrorCorpus getCorpus(File file) throws IOException {
+    return new SimpleCorpus(file);
   }

   void close() {
     evaluator.close();
   }

-  PrecisionRecall run(File dir) throws IOException {
+  public PrecisionRecall run(File file) throws IOException {
     System.out.println("Output explanation:");
     System.out.println("  [  ] = this is not an expected error");
     System.out.println("  [+ ] = this is an expected error");
     System.out.println("  [++] = this is an expected error and the first suggestion is correct");
     System.out.println("  [//] = not counted because already matches by a different rule");
     System.out.println("");
-    ErrorCorpus corpus = getCorpus(dir);
-    checkLines(corpus);
+    checkLines(getCorpus(file));
     return printAndResetResults();
   }
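Making the class, its constructor, and run() public is what lets CommonCrawlToNgram invoke the evaluator after each flush. Programmatic use would look roughly like this (paths are placeholders):

    SimpleCorpusEvaluator evaluator = new SimpleCorpusEvaluator(new File("/data/ngram-index"));
    PrecisionRecall result = evaluator.run(new File("simple-eval.txt"));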
CommonCrawlToNgramTest.java

@@ -35,7 +35,7 @@ public void testIndexing() throws IOException {
     try {
       tempDir.mkdir();
       String filename = CommonCrawlToNgramTest.class.getResource("/org/languagetool/dev/bigdata/ngram-input.txt.xz").getFile();
-      try (CommonCrawlToNgram prg = new CommonCrawlToNgram(new German(), new File(filename), tempDir)) {
+      try (CommonCrawlToNgram prg = new CommonCrawlToNgram(new German(), new File(filename), tempDir, null)) {
         prg.setLimit(1);
         prg.indexInputFile();
       }
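The test passes null as the new evalFile argument; writeAndEvaluate() checks for null and prints "Skipping evaluation, no evaluation file specified" in that case, so indexing is exercised without an evaluation corpus.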
