node-soupselect

A port of Simon Willison's soupselect for use with node.js and node-htmlparser.

$ npm install soupselect

Minimal example...

var select = require('soupselect').select;
// dom provided by htmlparser (see the sketch below)...
select(dom, "#main a.article").forEach(function(element) { /* ... */ });
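Here dom is the node tree that node-htmlparser hands to its DefaultHandler callback. A minimal sketch of wiring the two together, using the same handler and parser calls as the complete example below (the sample HTML string is just for illustration):

var htmlparser = require("htmlparser"),
    select = require('soupselect').select;

var handler = new htmlparser.DefaultHandler(function(err, dom) {
    if (err) throw err;
    // dom is ready; select returns the matching nodes as an array...
    select(dom, "#main a.article").forEach(function(element) { /* ... */ });
});
new htmlparser.Parser(handler).parseComplete('<div id="main"><a class="article" href="/x">x</a></div>');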

Wanted a friendly way to scrape HTML using node.js. Tried using jsdom, prompted by this article, but unfortunately jsdom takes a strict view of lax HTML, making it unusable for scraping the kind of soup found in real-world web pages. Luckily, htmlparser is more forgiving. More details on this can be found here.

A complete example, including fetching the HTML:

var select = require('soupselect').select,
    htmlparser = require("htmlparser"),
    http = require('http'),
    sys = require('sys');

// fetch some HTML...
var host = 'www.reddit.com';
var client = http.createClient(80, host);
var request = client.request('GET', '/', {'host': host});

request.on('response', function (response) {
    response.setEncoding('utf8');

    var body = "";
    response.on('data', function (chunk) {
        body = body + chunk;
    });

    response.on('end', function() {

        // now we have the whole body, parse it and select the nodes we want...
        var handler = new htmlparser.DefaultHandler(function(err, dom) {
            if (err) {
                sys.debug("Error: " + err);
            } else {

                // soupselect happening here...
                var titles = select(dom, 'a.title');

                sys.puts("Top stories from reddit");
                titles.forEach(function(title) {
                    sys.puts("- " + title.children[0].raw + " [" + title.attribs.href + "]\n");
                });
            }
        });

        var parser = new htmlparser.Parser(handler);
        parser.parseComplete(body);
    });
});
request.end();

Notes:

  • Requires node-htmlparser > 1.6.2 & node.js 0.2+
  • Calls to select are synchronous; matches come back as a plain array, ready to use on the next line (see the sketch below). Not worth trying to make it asynchronous IMO given the use case.
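A small sketch of the synchronous call style, assuming dom comes from a DefaultHandler callback as in the examples above:

// matches are available immediately; no callback or event needed
var titles = select(dom, 'a.title');
sys.puts(titles.length + " titles found");

// class, id and descendant selectors combine as in the minimal example
select(dom, '#main a.article').forEach(function(element) {
    sys.puts(element.attribs.href);
});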