diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f2284bd --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +---------------------------------------------------------------------- +node-rss is released under the MIT License + +Copyright (c) 2010 Rob Searles - http://www.robsearles.com + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. +---------------------------------------------------------------------- +node-xml, of which node-rss makes heavy use, is also released under the +MIT License - see http://github.com/robrighter/node-xml for more info +---------------------------------------------------------------------- diff --git a/README b/README new file mode 100644 index 0000000..d74aa29 --- /dev/null +++ b/README @@ -0,0 +1,40 @@ +---------------------------------------------------------------------- +node-rss - an RSS parser for node. 
+http://github.com/ibrow/node-rss +Rob Searles - http://www.robsearles.com +---------------------------------------------------------------------- +node-rss makes heavy use of the node-xml module written by +Rob Righter - @robrighter +http://github.com/robrighter/node-xml +---------------------------------------------------------------------- +node-rss is released under the MIT licence +---------------------------------------------------------------------- +See example.js for working examples of node-rss + +---------------------------------------------------------------------- +TODO +---------------------------------------------------------------------- +Lots, mainly: + - error checking + - writing tests + - make parsing more robust + - conform to all specifications + +---------------------------------------------------------------------- +HISTORY +---------------------------------------------------------------------- +21 May 2010 +Initial release, working on v0.1.95-17-g1036aa9 +---------------------------------------------------------------------- + +---------------------------------------------------------------------- +REFERENCE +---------------------------------------------------------------------- +RSS 2.0 specification +http://cyber.law.harvard.edu/rss/rss.html + +RSS 1.0 specification +http://web.resource.org/rss/1.0/spec + +Atom 1.0 specification +http://atompub.org/2005/07/11/draft-ietf-atompub-format-10.html \ No newline at end of file diff --git a/example.js b/example.js new file mode 100644 index 0000000..aa2c910 --- /dev/null +++ b/example.js @@ -0,0 +1,45 @@ +/********************************************************************** +example.js +Example of the node-rss feed parser + +**********************************************************************/ +var sys = require('sys'); +var rss = require('./node-rss'); + + +/********************************************************************** +Example One: +Getting a remote RSS feed and parsing 
+rss.parseURL(feed_url, use_excerpt, callback); +**********************************************************************/ +// URL of the feed you want to parse +var feed_url = 'http://feeds.feedburner.com/github'; + +var response = rss.parseURL(feed_url, function(articles) { + sys.puts(articles.length); + for(i=0; i this is + // an article, add container array to the list of articles + cb.onStartElementNS(function(elem, attrs, prefix, uri, namespaces) { + current_element = elem.toLowerCase(); + if(current_element == 'item' || current_element == 'entry') { + in_item = true; + articles[article_count] = Array(); + } + }); + // when we are at the end of an element, save its related content + cb.onEndElementNS(function(elem, prefix, uri) { + if(in_item) { + switch(current_element) + { + case 'description': + case 'summary': + articles[article_count][current_element] = current_chars.replace(/^\s\s*/, '').replace(/\s\s*$/, ''); + break; + case 'content': + case 'encoded': // feedburner is , node-xml reads as + current_element = 'content'; + articles[article_count][current_element] = current_chars.replace(/^\s\s*/, '').replace(/\s\s*$/, ''); + break; + case 'link': + case 'title': + articles[article_count][current_element] = current_chars.replace(/^\s\s*/, '').replace(/\s\s*$/, ''); + break; + } + + current_element = false; + current_chars = ''; + if(elem.toLowerCase() == 'item' || elem.toString() == 'entry') { + in_item = false; + article_count ++; + } + } + }); + + cb.onCharacters(addContent); + cb.onCdata(addContent); + function addContent(chars) { + if(in_item) { + current_chars += chars; + } + }; + + // @TODO handle warnings and errors properly + cb.onWarning(function(msg) { + sys.puts(''+msg+""); + }); + cb.onError(function(msg) { + sys.puts(''+JSON.stringify(msg)+""); + }); +}); + + +/** + * parseFile() + * Parses an RSS feed from a file. 
+ * @param file - path to the RSS feed file
+ * @param cb - callback function to be triggered at end of parsing
+ *
+ * cb is not called here: it is stored in the outer `callback` variable
+ * (declared outside this function) and the work is handed to
+ * parser.parseFile(). NOTE(review): presumably the parser's end-of-document
+ * handler invokes `callback` - confirm. Because the variable is shared, a
+ * second parse started before the first finishes overwrites the callback.
+ */
+exports.parseFile = function(file, cb) {
+    callback = cb;
+    parser.parseFile(file);
+};
+/**
+ * parseURL()
+ * Parses an RSS feed from a URL.
+ * @param url - URL of the RSS feed file
+ * @param cb - callback function to be triggered at end of parsing
+ *
+ * The feed is fetched with a GET request. On a 200 response the body is
+ * accumulated and passed to parser.parseString(). On a 301/302 response the
+ * Location header is followed, at most 10 times per call. Any other status
+ * code is ignored and nothing is parsed.
+ *
+ * As with parseFile(), cb is stored in the shared outer `callback` variable
+ * rather than being called directly from this function.
+ *
+ * @TODO - decent error checking
+ */
+exports.parseURL = function(url, cb) {
+    callback = cb;
+
+    var u = require('url'), http = require('http');
+
+    // Upper bound on the number of redirects followed for one parseURL() call
+    var max_redirects = 10;
+    // Lives outside get_rss() so the count survives from one hop to the next
+    var redirection_level = 0;
+
+    get_rss(url);
+    function get_rss(url) {
+        var parts = u.parse(url);
+
+        // set the default port to 80
+        if(!parts.port) { parts.port = 80; }
+
+        // Request the path together with its query string; fall back to '/'
+        // when the URL has no path component at all
+        var path = (parts.pathname || '/') + (parts.search || '');
+
+        var client = http.createClient(parts.port, parts.hostname);
+        var request = client.request('GET', path, {'host': parts.hostname});
+        request.addListener('response', function (response) {
+            // check to see the type of status
+            switch(response.statusCode) {
+            // check for ALL OK
+            case 200:
+                var body = '';
+                response.addListener('data', function (chunk) {
+                    body += chunk;
+                });
+                response.addListener('end', function() {
+                    parser.parseString(body);
+                });
+                break;
+            // redirect status returned
+            case 301:
+            case 302:
+                redirection_level++;
+                if(redirection_level > max_redirects) {
+                    sys.puts("too many redirects");
+                }
+                else if(!response.headers.location) {
+                    sys.puts("redirect without a location header");
+                }
+                else {
+                    sys.puts("redirect to "+response.headers.location);
+                    // Location may be relative, so resolve it against the
+                    // URL that produced this response
+                    get_rss(u.resolve(url, response.headers.location));
+                }
+                break;
+            // every other status code is ignored
+            default:
+                break;
+            }
+        });
+        request.end();
+    }
+};
\ No newline at end of file
diff --git a/node-xml.js b/node-xml.js
new file mode 100755
index 0000000..2111287
--- /dev/null
+++ b/node-xml.js
@@ -0,0 +1,1248 @@
+// node-xml
+// An xml parser for node.js
+// (C) Rob Righter (@robrighter) 2009 - 2010, 
Licensed under the MIT-LICENSE +// Contributions from David Joham + + +(function () { + +// CONSTANTS +var whitespace = "\n\r\t "; + + +//XMLP is a pull-based parser. The calling application passes in a XML string +//to the constructor, then repeatedly calls .next() to parse the next segment. +//.next() returns a flag indicating what type of segment was found, and stores +//data temporarily in couple member variables (name, content, array of +//attributes), which can be accessed by several .get____() methods. +// +//Basically, XMLP is the lowest common denominator parser - an very simple +//API which other wrappers can be built against. + + +var XMLP = function(strXML) { + // Normalize line breaks + strXML = SAXStrings.replace(strXML, null, null, "\r\n", "\n"); + strXML = SAXStrings.replace(strXML, null, null, "\r", "\n"); + + this.m_xml = strXML; + this.m_iP = 0; + this.m_iState = XMLP._STATE_PROLOG; + this.m_stack = new Stack(); + this._clearAttributes(); + this.m_pause = false; + this.m_preInterruptIState = XMLP._STATE_PROLOG; + this.m_namespaceList = new Array(); + this.m_chunkTransitionContinuation = null; + +} + + +// CONSTANTS (these must be below the constructor) +XMLP._NONE = 0; +XMLP._ELM_B = 1; +XMLP._ELM_E = 2; +XMLP._ELM_EMP = 3; +XMLP._ATT = 4; +XMLP._TEXT = 5; +XMLP._ENTITY = 6; +XMLP._PI = 7; +XMLP._CDATA = 8; +XMLP._COMMENT = 9; +XMLP._DTD = 10; +XMLP._ERROR = 11; +XMLP._INTERRUPT = 12; + +XMLP._CONT_XML = 0; +XMLP._CONT_ALT = 1; + +XMLP._ATT_NAME = 0; +XMLP._ATT_VAL = 1; + +XMLP._STATE_PROLOG = 1; +XMLP._STATE_DOCUMENT = 2; +XMLP._STATE_MISC = 3; + +XMLP._errs = new Array(); +XMLP._errs[XMLP.ERR_CLOSE_PI = 0 ] = "PI: missing closing sequence"; +XMLP._errs[XMLP.ERR_CLOSE_DTD = 1 ] = "DTD: missing closing sequence"; +XMLP._errs[XMLP.ERR_CLOSE_COMMENT = 2 ] = "Comment: missing closing sequence"; +XMLP._errs[XMLP.ERR_CLOSE_CDATA = 3 ] = "CDATA: missing closing sequence"; +XMLP._errs[XMLP.ERR_CLOSE_ELM = 4 ] = "Element: missing closing sequence"; 
+XMLP._errs[XMLP.ERR_CLOSE_ENTITY = 5 ] = "Entity: missing closing sequence"; +XMLP._errs[XMLP.ERR_PI_TARGET = 6 ] = "PI: target is required"; +XMLP._errs[XMLP.ERR_ELM_EMPTY = 7 ] = "Element: cannot be both empty and closing"; +XMLP._errs[XMLP.ERR_ELM_NAME = 8 ] = "Element: name must immediatly follow \"<\""; +XMLP._errs[XMLP.ERR_ELM_LT_NAME = 9 ] = "Element: \"<\" not allowed in element names"; +XMLP._errs[XMLP.ERR_ATT_VALUES = 10] = "Attribute: values are required and must be in quotes"; +XMLP._errs[XMLP.ERR_ATT_LT_NAME = 11] = "Element: \"<\" not allowed in attribute names"; +XMLP._errs[XMLP.ERR_ATT_LT_VALUE = 12] = "Attribute: \"<\" not allowed in attribute values"; +XMLP._errs[XMLP.ERR_ATT_DUP = 13] = "Attribute: duplicate attributes not allowed"; +XMLP._errs[XMLP.ERR_ENTITY_UNKNOWN = 14] = "Entity: unknown entity"; +XMLP._errs[XMLP.ERR_INFINITELOOP = 15] = "Infininte loop"; +XMLP._errs[XMLP.ERR_DOC_STRUCTURE = 16] = "Document: only comments, processing instructions, or whitespace allowed outside of document element"; +XMLP._errs[XMLP.ERR_ELM_NESTING = 17] = "Element: must be nested correctly"; + + + +XMLP.prototype.continueParsing = function(strXML) { + + if(this.m_chunkTransitionContinuation){ + strXML = this.m_chunkTransitionContinuation + strXML; + } + // Normalize line breaks + strXML = SAXStrings.replace(strXML, null, null, "\r\n", "\n"); + strXML = SAXStrings.replace(strXML, null, null, "\r", "\n"); + + this.m_xml = strXML; + this.m_iP = 0; + this.m_iState = XMLP._STATE_DOCUMENT; + //this.m_stack = new Stack(); + //this._clearAttributes(); + this.m_pause = false; + this.m_preInterruptIState = XMLP._STATE_PROLOG; + this.m_chunkTransitionContinuation = null; + +} + +XMLP.prototype._addAttribute = function(name, value) { + this.m_atts[this.m_atts.length] = new Array(name, value); +} + +XMLP.prototype._checkStructure = function(iEvent) { + if(XMLP._STATE_PROLOG == this.m_iState) { + if((XMLP._TEXT == iEvent) || (XMLP._ENTITY == iEvent)) { + 
if(SAXStrings.indexOfNonWhitespace(this.getContent(), this.getContentBegin(), this.getContentEnd()) != -1) { + return this._setErr(XMLP.ERR_DOC_STRUCTURE); + } + } + + if((XMLP._ELM_B == iEvent) || (XMLP._ELM_EMP == iEvent)) { + this.m_iState = XMLP._STATE_DOCUMENT; + // Don't return - fall through to next state + } + } + if(XMLP._STATE_DOCUMENT == this.m_iState) { + if((XMLP._ELM_B == iEvent) || (XMLP._ELM_EMP == iEvent)) { + this.m_stack.push(this.getName()); + } + + if((XMLP._ELM_E == iEvent) || (XMLP._ELM_EMP == iEvent)) { + var strTop = this.m_stack.pop(); + if((strTop == null) || (strTop != this.getName())) { + return this._setErr(XMLP.ERR_ELM_NESTING); + } + } + + if(this.m_stack.count() == 0) { + this.m_iState = XMLP._STATE_MISC; + return iEvent; + } + } + if(XMLP._STATE_MISC == this.m_iState) { + if((XMLP._ELM_B == iEvent) || (XMLP._ELM_E == iEvent) || (XMLP._ELM_EMP == iEvent) || (XMLP.EVT_DTD == iEvent)) { + return this._setErr(XMLP.ERR_DOC_STRUCTURE); + } + + if((XMLP._TEXT == iEvent) || (XMLP._ENTITY == iEvent)) { + if(SAXStrings.indexOfNonWhitespace(this.getContent(), this.getContentBegin(), this.getContentEnd()) != -1) { + return this._setErr(XMLP.ERR_DOC_STRUCTURE); + } + } + } + + return iEvent; + +} + +XMLP.prototype._clearAttributes = function() { + this.m_atts = new Array(); +} + +XMLP.prototype._findAttributeIndex = function(name) { + for(var i = 0; i < this.m_atts.length; i++) { + if(this.m_atts[i][XMLP._ATT_NAME] == name) { + return i; + } + } + return -1; + +} + +XMLP.prototype.getAttributeCount = function() { + return this.m_atts ? this.m_atts.length : 0; +} + +XMLP.prototype.getAttributeName = function(index) { + return ((index < 0) || (index >= this.m_atts.length)) ? null : this.m_atts[index][XMLP._ATT_NAME]; +} + +XMLP.prototype.getAttributeValue = function(index) { + return ((index < 0) || (index >= this.m_atts.length)) ? 
null : __unescapeString(this.m_atts[index][XMLP._ATT_VAL]); +} + +XMLP.prototype.getAttributeValueByName = function(name) { + return this.getAttributeValue(this._findAttributeIndex(name)); +} + +XMLP.prototype.getColumnNumber = function() { + return SAXStrings.getColumnNumber(this.m_xml, this.m_iP); +} + +XMLP.prototype.getContent = function() { + return (this.m_cSrc == XMLP._CONT_XML) ? this.m_xml : this.m_cAlt; +} + +XMLP.prototype.getContentBegin = function() { + return this.m_cB; +} + +XMLP.prototype.getContentEnd = function() { + return this.m_cE; +} + +XMLP.prototype.getLineNumber = function() { + return SAXStrings.getLineNumber(this.m_xml, this.m_iP); +} + +XMLP.prototype.getName = function() { + return this.m_name; +} + +XMLP.prototype.pause = function(){ + this.m_pause = true; +} + +XMLP.prototype.resume = function(){ + this.m_pause = false; + this.m_iState = this.m_preInterruptIState; +} + +XMLP.prototype.next = function() { + if(!this.m_pause){ + return this._checkStructure(this._parse()); + } + else{ + //save off the current event loop state and set the state to interrupt + this.m_preInterruptIState = this.m_iState; + return XMLP._INTERRUPT; + } +} + +XMLP.prototype._parse = function() { + if(this.m_iP == this.m_xml.length) { + return XMLP._NONE; + } + + if(this.m_iP == this.m_xml.indexOf("= 0; i--){ + var item = this.m_namespaceList[i]; + if(item.prefix === ''){ + return item.uri; + } + } + + //still nothing, lets just return an empty string + return ''; +} + +XMLP.prototype._removeExpiredNamesapces = function (closingtagname) { + //remove the expiring namespaces from the list (you can id them by scopetag) + var keeps = []; + this.m_namespaceList.map(function (item){ + if(item.scopetag !== closingtagname){ + keeps.push(item); + } + }); + + this.m_namespaceList = keeps; + +} + +//////////////////////////////////////////////////////////////////////// + + +XMLP.prototype._parseAttribute = function(iB, iE) { + var iNB, iNE, iEq, iVB, iVE; + var cQuote, 
strN, strV; + + this.m_cAlt = ""; //resets the value so we don't use an old one by accident (see testAttribute7 in the test suite) + + iNB = SAXStrings.indexOfNonWhitespace(this.m_xml, iB, iE); + if((iNB == -1) ||(iNB >= iE)) { + return iNB; + } + + iEq = this.m_xml.indexOf("=", iNB); + if((iEq == -1) || (iEq > iE)) { + return this._setErr(XMLP.ERR_ATT_VALUES); + } + + iNE = SAXStrings.lastIndexOfNonWhitespace(this.m_xml, iNB, iEq); + + iVB = SAXStrings.indexOfNonWhitespace(this.m_xml, iEq + 1, iE); + if((iVB == -1) ||(iVB > iE)) { + return this._setErr(XMLP.ERR_ATT_VALUES); + } + + cQuote = this.m_xml.charAt(iVB); + if(SAXStrings.QUOTES.indexOf(cQuote) == -1) { + return this._setErr(XMLP.ERR_ATT_VALUES); + } + + iVE = this.m_xml.indexOf(cQuote, iVB + 1); + if((iVE == -1) ||(iVE > iE)) { + return this._setErr(XMLP.ERR_ATT_VALUES); + } + + strN = this.m_xml.substring(iNB, iNE + 1); + strV = this.m_xml.substring(iVB + 1, iVE); + + if(strN.indexOf("<") != -1) { + return this._setErr(XMLP.ERR_ATT_LT_NAME); + } + + if(strV.indexOf("<") != -1) { + return this._setErr(XMLP.ERR_ATT_LT_VALUE); + } + + strV = SAXStrings.replace(strV, null, null, "\n", " "); + strV = SAXStrings.replace(strV, null, null, "\t", " "); + iRet = this._replaceEntities(strV); + if(iRet == XMLP._ERROR) { + return iRet; + } + + strV = this.m_cAlt; + + if(this._findAttributeIndex(strN) == -1) { + this._addAttribute(strN, strV); + } + else { + return this._setErr(XMLP.ERR_ATT_DUP); + } + + this.m_iP = iVE + 2; + + return XMLP._ATT; + +} + +XMLP.prototype._parseCDATA = function(iB) { + var iE = this.m_xml.indexOf("]]>", iB); + if (iE == -1) { + //This item never closes, although it could be a malformed document, we will assume that we are mid-chunck, save the string and reurn as interrupted + this.m_chunkTransitionContinuation = this.m_xml.slice(iB-9);//the '-", iB); + if (iE == -1) { + //This item never closes, although it could be a malformed document, we will assume that we are mid-chunck, save the 
string and reurn as interrupted + this.m_chunkTransitionContinuation = this.m_xml.slice(iB-4);//the '-4' adds the '