Skip to content

Commit

Permalink
don't index tokens longer than a defined max length (20 for now)
Browse files (browse the repository at this point in the history)
  • Loading branch information
danielnaber committed Nov 27, 2015
1 parent 84174ec commit 40a1710
Showing 1 changed file with 12 additions and 5 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -51,6 +51,7 @@
class CommonCrawlToNgram implements AutoCloseable {

private static final double THRESHOLD = 0.00000000001;
private static final int MAX_TOKEN_LENGTH = 20;

private final File input;
private final File indexTopDir;
Expand Down Expand Up @@ -128,14 +129,20 @@ private void indexSentence(String sentence) throws IOException {
if (token.trim().isEmpty()) {
continue;
}
unigramToCount.compute(token, (k, v) -> v == null ? 1 : v + 1);
if (token.length() <= MAX_TOKEN_LENGTH) {
unigramToCount.compute(token, (k, v) -> v == null ? 1 : v + 1);
}
if (prev != null) {
String ngram = prev + " " + token;
bigramToCount.compute(ngram, (k, v) -> v == null ? 1 : v + 1);
if (token.length() <= MAX_TOKEN_LENGTH && prev.length() <= MAX_TOKEN_LENGTH) {
String ngram = prev + " " + token;
bigramToCount.compute(ngram, (k, v) -> v == null ? 1 : v + 1);
}
}
if (prevPrev != null && prev != null) {
String ngram = prevPrev + " " + prev + " " + token;
trigramToCount.compute(ngram, (k, v) -> v == null ? 1 : v + 1);
if (token.length() <= MAX_TOKEN_LENGTH && prev.length() <= MAX_TOKEN_LENGTH && prevPrev.length() <= MAX_TOKEN_LENGTH) {
String ngram = prevPrev + " " + prev + " " + token;
trigramToCount.compute(ngram, (k, v) -> v == null ? 1 : v + 1);
}
if (trigramToCount.size() > cacheLimit) {
writeAndEvaluate();
}
Expand Down

0 comments on commit 40a1710

Please sign in to comment.