First commit

ibrow · May 21, 2010 · 9b19846 · 9b19846
commit 9b19846
Show file tree

Hide file tree

Showing 6 changed files with 5,653 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,29 @@
+----------------------------------------------------------------------
+node-rss is released under the MIT License
+
+Copyright (c) 2010 Rob Searles - http://www.robsearles.com
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+----------------------------------------------------------------------
+node-xml, of which node-rss makes heavy use, is also released under the
+MIT License - see http://github.com/robrighter/node-xml for more info
+----------------------------------------------------------------------
diff --git a/README b/README
@@ -0,0 +1,40 @@
+----------------------------------------------------------------------
+node-rss - an RSS parser for node.
+http://github.com/ibrow/node-rss
+Rob Searles - http://www.robsearles.com
+----------------------------------------------------------------------
+node-rss makes heavy use of the node-xml module written by 
+Rob Righter - @robrighter
+http://github.com/robrighter/node-xml
+----------------------------------------------------------------------
+node-rss is released under the MIT licence
+----------------------------------------------------------------------
+See example.js for working examples of node-rss
+
+----------------------------------------------------------------------
+TODO
+----------------------------------------------------------------------
+Lots, mainly:
+ - error checking
+ - writing tests
+ - make parsing more robust
+ - conform to all specifications
+
+----------------------------------------------------------------------
+HISTORY
+----------------------------------------------------------------------
+21 May 2010
+Initial release, working on v0.1.95-17-g1036aa9
+----------------------------------------------------------------------
+
+----------------------------------------------------------------------
+REFERENCE
+----------------------------------------------------------------------
+RSS 2.0 specification
+http://cyber.law.harvard.edu/rss/rss.html
+
+RSS 1.0 specification
+http://web.resource.org/rss/1.0/spec
+
+Atom 1.0 specification
+http://atompub.org/2005/07/11/draft-ietf-atompub-format-10.html
diff --git a/example.js b/example.js
@@ -0,0 +1,45 @@
+/**********************************************************************
+example.js
+Example of the node-rss feed parser
+
+**********************************************************************/
+var sys = require('sys');
+var rss = require('./node-rss');
+
+
+/**********************************************************************
+Example One:
+Getting a remote RSS feed and parsing
+rss.parseURL(feed_url, use_excerpt, callback);
+**********************************************************************/
+// URL of the feed you want to parse
+var feed_url = 'http://feeds.feedburner.com/github';
+
+var response = rss.parseURL(feed_url, function(articles) {
+    sys.puts(articles.length);
+    for(i=0; i<articles.length; i++) {
+	sys.puts("Article: "+i+", "+
+		 articles[i].title+"\n"+
+		 articles[i].link+"\n"+
+		 articles[i].description+"\n"+
+		 articles[i].content
+		);
+    }
+});
+
+/**********************************************************************
+Example Two:
+Getting a local RSS feed and parsing
+rss.parseFile(feed_file, use_excerpt, callback);
+**********************************************************************/
+var response = rss.parseFile('nodeblogs.com.feed.xml', function(articles) {
+    sys.puts(articles.length);
+    for(i=0; i<articles.length; i++) {
+	sys.puts("Article: "+i+", "+
+		 articles[i].title+"\n"+
+		 articles[i].link+"\n"+
+		 articles[i].description+"\n"+
+		 articles[i].content
+		);
+    }
+});
diff --git a/node-rss.js b/node-rss.js
@@ -0,0 +1,171 @@
+/**********************************************************************
+ node-rss - an RSS parser for node.
+ http://github.com/ibrow/node-rss
+
+ Copyright (c) 2010 Rob Searles
+ http://www.robsearles.com
+ 
+ node-rss is released under the MIT license
+  - see LICENSE for more info
+
+ *********************************************************************
+ node-rss makes heavy use of the node-xml module written by 
+ Rob Righter - @robrighter
+ http://github.com/robrighter/node-xml
+**********************************************************************/
+var sys = require('sys'), http = require('http');
+var xml = require("./node-xml");
+
+// variable for holding the callback function which is passed to the
+// exported function. This callback is passed the articles array
+var callback = function() {};
+
+// The main "meat" of this module - parses an rss feed and triggers
+// the callback when done.
+// using node-xml: http://github.com/robrighter/node-xml
+var parser = new xml.SaxParser(function(cb) {
+    var articles = Array();
+    var current_element = false;
+    var article_count = 0;
+    var in_item = false;
+    var current_chars = '';
+
+
+    cb.onStartDocument(function() { });
+
+    // when finished parsing the RSS feed, trigger the callback
+    cb.onEndDocument(function() {	    
+	callback(articles);
+    });
+
+
+    //track what element we are currently in. If it is an <item> this is
+    // an article, add container array to the list of articles
+    cb.onStartElementNS(function(elem, attrs, prefix, uri, namespaces) {
+	current_element = elem.toLowerCase();
+	if(current_element == 'item' || current_element == 'entry') {
+	    in_item = true;
+	    articles[article_count] = Array();
+	}
+    });
+    // when we are at the end of an element, save its related content
+    cb.onEndElementNS(function(elem, prefix, uri) {
+	if(in_item) {
+	    switch(current_element) 
+	    {
+	    case 'description':
+	    case 'summary':
+		articles[article_count][current_element] = current_chars.replace(/^\s\s*/, '').replace(/\s\s*$/, '');
+		break;
+	    case 'content':
+	    case 'encoded': // feedburner is <content:encoded>, node-xml reads as <encoded>
+		current_element = 'content';
+		articles[article_count][current_element] = current_chars.replace(/^\s\s*/, '').replace(/\s\s*$/, '');
+		break;
+	    case 'link':
+	    case 'title':
+		articles[article_count][current_element] = current_chars.replace(/^\s\s*/, '').replace(/\s\s*$/, '');
+		break;
+	    }
+
+	    current_element = false;
+	    current_chars = '';
+	    if(elem.toLowerCase() == 'item' || elem.toString() == 'entry') {
+		in_item = false;
+		article_count ++;   
+	    }
+	}
+    });
+
+    cb.onCharacters(addContent);
+    cb.onCdata(addContent);
+    function addContent(chars) {
+	if(in_item) {
+	    current_chars += chars;
+	}
+    };
+
+    // @TODO handle warnings and errors properly
+    cb.onWarning(function(msg) {
+	sys.puts('<WARNING>'+msg+"</WARNING>");
+    });
+    cb.onError(function(msg) {
+	sys.puts('<ERROR>'+JSON.stringify(msg)+"</ERROR>");
+    });
+});
+
+
+/**
+ * parseFile()
+ * Parses an RSS feed from a file. 
+ * @param file - path to the RSS feed file
+ * @param cb - callback function to be triggered at end of parsing
+ */
+exports.parseFile = function(file, cb) {
+    callback = cb;
+    parser.parseFile(file);
+}
+/**
+ * parseURL()
+ * Parses an RSS feed from a URL. 
+ * @param url - URL of the RSS feed file
+ * @param cb - callback function to be triggered at end of parsing
+ *
+ * @TODO - decent error checking
+ */
+exports.parseURL = function(url, cb) {
+    callback = cb;
+
+    get_rss(url);
+    function get_rss(url) {
+	var u = require('url'), http = require('http');
+	var parts = u.parse(url);
+	//sys.puts(JSON.stringify(parts));
+
+	// set the default port to 80
+	if(!parts.port) { parts.port = 80; }
+
+
+	var redirection_level = 0;
+       	var client = http.createClient(parts.port, parts.hostname);
+	var request = client.request('GET', parts.pathname, {'host': parts.hostname});
+	request.addListener('response', function (response) {
+	    //sys.puts('STATUS: ' + response.statusCode);
+	    //sys.puts('HEADERS: ' + JSON.stringify(response.headers));
+
+	    // check to see the type of status
+	    switch(response.statusCode) {
+		// check for ALL OK
+	    case 200:
+		var body = ''; 
+		response.addListener('data', function (chunk) {
+		    body += chunk;
+		});
+		response.addListener('end', function() {
+		    parser.parseString(body);
+		});
+		break;
+		// redirect status returned
+	    case 301:
+	    case 302:
+		if(redirection_level > 10) {
+		    sys.puts("too many redirects");
+		}
+		else {
+		    sys.puts("redirect to "+response.headers.location);
+		    get_rss(response.headers.location);
+		}
+		break;
+	    default:
+		/*
+		response.setEncoding('utf8');
+		response.addListener('data', function (chunk) {
+		    //sys.puts('BODY: ' + chunk);
+		});
+*/
+		break;
+	    }	  
+	});
+	request.end();	
+    }
+};