Skip to content

Commit

Permalink
Rewrite of the confusion probability rule for more precision. Also, only a few confusion pairs are now enabled by default, but precision and recall have been checked for these.

Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed May 26, 2015
1 parent 44400c4 commit b1b9c52
Show file tree
Hide file tree
Showing 30 changed files with 1,951 additions and 3,857 deletions.
Expand Up @@ -18,28 +18,42 @@
*/
package org.languagetool.languagemodel;

import java.util.List;

/**
 * A very simple language model that contains information about ngram occurrences.
 * @since 2.7
 */
public interface LanguageModel extends AutoCloseable {

  /** ngram sentence start marker - note: this is not in the v1 data from Google */
  String GOOGLE_SENTENCE_START = "_START_";
  /** ngram sentence end marker */
  String GOOGLE_SENTENCE_END = ".";

  /**
   * Get the occurrence count for {@code token1}.
   */
  long getCount(String token1);

  /**
   * Get the occurrence count for the given token sequence.
   */
  long getCount(List<String> tokens);

  /**
   * Get the occurrence count for the phrase {@code token1 token2}.
   */
  long getCount(String token1, String token2);

  /**
   * Get the occurrence count for the phrase {@code token1 token2 token3}.
   */
  long getCount(String token1, String token2, String token3);

  /**
   * Get the total token count of the underlying ngram data.
   */
  long getTotalTokenCount();

  @Override
  void close();

}
Expand Up @@ -18,6 +18,7 @@
*/
package org.languagetool.languagemodel;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.FSDirectory;
Expand Down Expand Up @@ -46,10 +47,11 @@ public LuceneLanguageModel(File topIndexDir) throws IOException {
throw new RuntimeException("Not found or is not a directory: " + topIndexDir);
}
this.topIndexDir = topIndexDir;
addIndex(topIndexDir, 1);
addIndex(topIndexDir, 2);
addIndex(topIndexDir, 3);
if (luceneSearcherMap.size() == 0) {
throw new RuntimeException("No directories '2grams' and/or '3grams' found in " + topIndexDir);
throw new RuntimeException("No directories '1grams', '2grams', and/or '3grams' found in " + topIndexDir);
}
}

Expand All @@ -61,25 +63,54 @@ private void addIndex(File topIndexDir, int ngramSize) throws IOException {
}
}

@Override
public long getCount(List<String> tokens) {
  Objects.requireNonNull(tokens);
  // The index key is the space-separated token sequence; pick the searcher
  // for the matching ngram size (presumably the 1grams/2grams/3grams
  // directories added in the constructor).
  String ngram = StringUtils.join(tokens, " ");
  LuceneSearcher searcher = getLuceneSearcher(tokens.size());
  return getCount(new Term("ngram", ngram), searcher);
}

@Override
public long getCount(String token1) {
  // Unigram lookup: wrap the single token and delegate to the
  // list-based implementation.
  Objects.requireNonNull(token1);
  List<String> unigram = Arrays.asList(token1);
  return getCount(unigram);
}

@Override
public long getCount(String token1, String token2) {
  // Bigram lookup: delegate to the list-based implementation, which
  // builds the index term and selects the 2grams searcher.
  Objects.requireNonNull(token1);
  Objects.requireNonNull(token2);
  return getCount(Arrays.asList(token1, token2));
}

@Override
public long getCount(String token1, String token2, String token3) {
  // Trigram lookup: delegate to the list-based implementation, which
  // builds the index term and selects the 3grams searcher.
  Objects.requireNonNull(token1);
  Objects.requireNonNull(token2);
  Objects.requireNonNull(token3);
  return getCount(Arrays.asList(token1, token2, token3));
}

@Override
public long getTotalTokenCount() {
  // Each 1grams index stores its token total in 'totalTokenCount' meta
  // documents; the regexp matches any stored value so all of them are found.
  LuceneSearcher searcher = getLuceneSearcher(1);
  try {
    RegexpQuery anyValueQuery = new RegexpQuery(new Term("totalTokenCount", ".*"));
    // Result window capped at 1000: Integer.MAX_VALUE might cause an
    // out-of-memory error on a broken index.
    TopDocs topDocs = searcher.searcher.search(anyValueQuery, 1000);
    if (topDocs.totalHits == 0) {
      throw new RuntimeException("Expected 'totalTokenCount' meta documents not found in 1grams index");
    }
    if (topDocs.totalHits > 1000) {
      throw new RuntimeException("Did not expect more than 1000 'totalTokenCount' meta documents");
    }
    // Sum the totals of every meta document found.
    long sum = 0;
    for (ScoreDoc match : topDocs.scoreDocs) {
      sum += Long.parseLong(searcher.reader.document(match.doc).get("totalTokenCount"));
    }
    return sum;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}

protected LuceneSearcher getLuceneSearcher(int ngramSize) {
Expand Down

0 comments on commit b1b9c52

Please sign in to comment.