Permalink
Browse files

added stopwords option

  • Loading branch information...
1 parent 2d8279c commit 82a3be5399d4e3f9c49b218e74d094d0cc7b9148 @jedp committed Oct 13, 2011
Showing with 37 additions and 11 deletions.
  1. +2 −0 README.md
  2. +35 −11 collector.js
View
@@ -93,6 +93,8 @@ The example texts contain 712kB.
The resulting indexes in redis consume about 33MB.
+With `config.filterStopWords === true`, this drops slightly to almost 32MB.
+
So the memory required for the redis indexes is about 50 times the space used
on disk to hold the original source text.
View
@@ -43,6 +43,8 @@
*
*/
+var config = require('./config');
+
var N = 'N'; // key -> int
var TF_PREFIX = 'tf:'; // zset -> { documentId, frequency }
var DF_PREFIX = 'df:'; // set -> { documentIds }
@@ -56,13 +58,16 @@ var _ = require('underscore');
var stemmer = require('porter-stemmer').memoizingStemmer;
-//var stopWords = require('fs').readFileSync('./stop_words.txt').split('\n');
-
-// @@@ to do - filter out stop words? or not?
-function stemText(text) {
- return _.map(
- text.trim().split(/\s+/),
- function(t) { return stemmer(t.replace(/\W+/g, '').toLowerCase()) });
+var stopWords = {};
+if (config.filterStopWords) {
+ var fs = require('fs');
+ var words = fs.readFileSync(__dirname+'/stop_words.txt').toString().split('\n');
+ for (i in words) {
+ var word = words[i].trim();
+ if (word) {
+ stopWords[word] = true;
+ }
+ }
}
function Collector(redisClient, redisDatabase) {
@@ -103,6 +108,27 @@ function Collector(redisClient, redisDatabase) {
};
/*
+ * Utility
+ *
+ * stemText(string) -> [list, of, stems]
+ *
+ * If config.filterStopWords, stop words will be filtered out
+ */
+
+ self.stemText = function(text) {
+ var words = text.trim().split(/\s+/);
+ var stemmed = [];
+ for (i in words) {
+ var word = words[i];
+ if (! stopWords[word]) {
+ stemmed.push(stemmer(word.replace(/\W+/g, '').toLowerCase()));
+ }
+ }
+ return stemmed;
+ };
+
+
+ /*
* Private methods
*
* _calculateWeight
@@ -266,7 +292,7 @@ function Collector(redisClient, redisDatabase) {
self._readDocument = function(id, text, callback) {
// Collect all terms and words in a document.
// Remove extraneous characters and map to lower-case.
- var terms = stemText(text);
+ var terms = self.stemText(text);
self._updateDocumentLength(id, terms, function(err) {
self._storeDocumentTerms(id, terms, function(err) {
@@ -359,7 +385,7 @@ function Collector(redisClient, redisDatabase) {
console.log("search returned ids: %j", ids);
}
};
- var terms = stemText(phrase);
+ var terms = self.stemText(phrase);
var scores = {};
var iter = 0;
var ids = [];
@@ -392,8 +418,6 @@ function Collector(redisClient, redisDatabase) {
return self.initialize(function() { return self });
};
-var config = require('./config');
-
module.exports = new Collector(
require("redis").createClient(config.redisPort, config.redisHost),
config.redisDatabase

0 comments on commit 82a3be5

Please sign in to comment.