Skip to content
Browse files

Add example

  • Loading branch information...
1 parent 6dbedb4 commit 494c14c9ee24dc5a4d4b3eeb86beb84be3423b7f Harry Fuecks committed Oct 2, 2010
Showing with 95 additions and 1 deletion.
  1. +51 −1 README.md
  2. +44 −0 example.js
View
52 README.md
@@ -3,4 +3,54 @@ node-soupselect
A port of Simon Willison's [soupselect](http://code.google.com/p/soupselect/) for use with node.js and node-htmlparser.
-Wanted a friendly way to scrape HTML using node.js
+Wanted a friendly way to scrape HTML using node.js. Tried using jsdom, prompted by [this article](http://blog.nodejitsu.com/jsdom-jquery-in-5-lines-on-nodejs) but, unfortunately, [jsdom](http://github.com/tmpvar/jsdom) takes a strict view of lax HTML making it unusable for scraping the kind of soup found in real world web pages. Luckily [htmlparser](http://github.com/tautologistics/node-htmlparser/) is more forgiving.
+
+A complete example, including fetching the HTML over HTTP:
+
+ var select = require('soupselect').select,
+ htmlparser = require("htmlparser"),
+ http = require('http'),
+ sys = require('sys');
+
+ // fetch some HTML...
+ var http = require('http');
+ var host = 'www.reddit.com';
+ var client = http.createClient(80, host);
+ var request = client.request('GET', '/',{'host': host});
+
+ request.on('response', function (response) {
+ response.setEncoding('utf8');
+
+ var body = "";
+ response.on('data', function (chunk) {
+ body = body + chunk;
+ });
+
+ response.on('end', function() {
+
+ // now we have the whole body, parse it and select the nodes we want...
+ var handler = new htmlparser.DefaultHandler(function(err, dom) {
+ if (err) {
+ sys.debug("Error: " + err);
+ } else {
+
+ // soupselect happening here...
+ var titles = select(dom, 'a.title');
+
+ sys.puts("Top stories from reddit");
+ titles.forEach(function(title) {
+ sys.puts("- " + title.children[0].raw + " [" + title.attribs.href + "]\n");
+ })
+ }
+ });
+
+ var parser = new htmlparser.Parser(handler);
+ parser.parseComplete(body);
+ });
+ });
+ request.end();
+
+Notes:
+ - Requires node-htmlparser > 1.6.2 & node.js 2+
+ - Calls to select are synchronous - not worth trying to make it asynchronous IMO given the use case
+
View
44 example.js
@@ -0,0 +1,44 @@
+ var select = require('./lib/soupselect').select,
+ htmlparser = require("htmlparser"),
+ http = require('http'),
+ sys = require('sys');
+
+ // fetch some HTML...
+ var http = require('http');
+ var host = 'www.reddit.com';
+ var client = http.createClient(80, host);
+ var request = client.request('GET', '/',{'host': host});
+
+ request.on('response', function (response) {
+ response.setEncoding('utf8');
+
+ var body = "";
+ response.on('data', function (chunk) {
+ body = body + chunk;
+ });
+
+ response.on('end', function() {
+
+ // now we have the whole body, parse it and select the nodes we want...
+ var handler = new htmlparser.DefaultHandler(function(err, dom) {
+ if (err) {
+ sys.debug("Error: " + err);
+ } else {
+
+ // soupselect happening here...
+ var titles = select(dom, 'a.title');
+
+ sys.puts("Top stories from reddit");
+ titles.forEach(function(title) {
+ sys.puts("- " + title.children[0].raw + " [" + title.attribs.href + "]\n");
+ })
+ }
+ });
+
+ var parser = new htmlparser.Parser(handler);
+ parser.parseComplete(body);
+ });
+ });
+ request.end();
+
+

0 comments on commit 494c14c

Please sign in to comment.
Something went wrong with that request. Please try again.