Skip to content

Commit

Permalink
added stopwords option
Browse files Browse the repository at this point in the history
  • Loading branch information
jedp committed Oct 13, 2011
1 parent 2d8279c commit 82a3be5
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 11 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ The example texts contain 712kB.

The resulting indexes in redis consume about 33MB.

Using `config.filterStopWords === false`, this drops slightly to almost 32MB.

So the memory required for the redis indexes is about 50 times the space used
on disk to hold the original source text.

Expand Down
46 changes: 35 additions & 11 deletions collector.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
*
*/

var config = require('./config');

var N = 'N'; // key -> int
var TF_PREFIX = 'tf:'; // zset -> { documentId, frequency }
var DF_PREFIX = 'df:'; // set -> { documentIds }
Expand All @@ -56,13 +58,16 @@ var _ = require('underscore');

var stemmer = require('porter-stemmer').memoizingStemmer;

//var stopWords = require('fs').readFileSync('./stop_words.txt').split('\n');

// @@@ to do - filter out stop words? or not?
function stemText(text) {
return _.map(
text.trim().split(/\s+/),
function(t) { return stemmer(t.replace(/\W+/g, '').toLowerCase()) });
// Lookup table of stop words (word -> true). It stays empty unless
// config.filterStopWords is set, in which case it is filled from
// stop_words.txt located next to this module (one word per line).
//
// The table is created without a prototype so that inherited names such
// as "constructor" or "toString" can never be mistaken for entries.
var stopWords = Object.create(null);
if (config.filterStopWords) {
    var fs = require('fs');
    var words = fs.readFileSync(__dirname+'/stop_words.txt').toString().split('\n');
    // Indexed loop with a declared counter: `for (i in words)` leaked `i`
    // as an implicit global and would also visit any enumerable property
    // added to Array.prototype.
    for (var wordIndex = 0; wordIndex < words.length; wordIndex++) {
        // trim() also removes a trailing '\r' from CRLF files
        var word = words[wordIndex].trim();
        // Skip blank lines (e.g. the trailing newline at end of file)
        if (word) {
            stopWords[word] = true;
        }
    }
}
}

function Collector(redisClient, redisDatabase) {
Expand Down Expand Up @@ -102,6 +107,27 @@ function Collector(redisClient, redisDatabase) {
});
};

/*
* Utility
*
* stemText(string) -> [list, of, stems]
*
* If config.filterStopWords, stop words will be filtered out
*/

self.stemText = function(text) {
    // Split `text` on whitespace, drop stop words, and return the list of
    // stems for the remaining tokens, in their original order.
    //
    // Each token is normalized (non-word characters removed, lower-cased)
    // before it is stemmed. When the stop-word table is empty (stop-word
    // filtering disabled) nothing is dropped.
    var hasOwn = Object.prototype.hasOwnProperty;
    var tokens = text.trim().split(/\s+/);
    var stemmed = [];

    // Indexed loop with a declared counter: the earlier `for (i in words)`
    // leaked `i` as an implicit global.
    for (var tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) {
        var token = tokens[tokenIndex];
        var normalized = token.replace(/\W+/g, '').toLowerCase();

        // Check the normalized form as well as the raw token, so that
        // "The" and "the," are recognized as the stop word "the".
        // An own-property test is used so that tokens like "constructor"
        // do not match members inherited from Object.prototype.
        if (hasOwn.call(stopWords, token) || hasOwn.call(stopWords, normalized)) {
            continue;
        }

        stemmed.push(stemmer(normalized));
    }
    return stemmed;
};


/*
* Private methods
*
Expand Down Expand Up @@ -266,7 +292,7 @@ function Collector(redisClient, redisDatabase) {
self._readDocument = function(id, text, callback) {
// Collect all terms and words in a document.
// Remove extraneous characters and map to lower-case.
var terms = stemText(text);
var terms = self.stemText(text);

self._updateDocumentLength(id, terms, function(err) {
self._storeDocumentTerms(id, terms, function(err) {
Expand Down Expand Up @@ -359,7 +385,7 @@ function Collector(redisClient, redisDatabase) {
console.log("search returned ids: %j", ids);
}
};
var terms = stemText(phrase);
var terms = self.stemText(phrase);
var scores = {};
var iter = 0;
var ids = [];
Expand Down Expand Up @@ -392,8 +418,6 @@ function Collector(redisClient, redisDatabase) {
return self.initialize(function() { return self });
};

var config = require('./config');

module.exports = new Collector(
require("redis").createClient(config.redisPort, config.redisHost),
config.redisDatabase
Expand Down

0 comments on commit 82a3be5

Please sign in to comment.