Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

use cheerio/request instead of jsdom/YUI

  • Loading branch information...
commit b48fdcf2fc587db8b22e05169676260ef6e6d2d2 1 parent 346c925
Huge Huang authored
Showing 49 additions and 76 deletions.
  1. +33 −66 css-crawler.js
  2. +4 −4 package.json
  3. +12 −6 run
99 css-crawler.js
View
@@ -1,68 +1,35 @@
-var util = require('util')
- , YUI = require('yui').YUI
- , jsdom = require('jsdom')
- , EventEmitter = require('events').EventEmitter;
-
-module.exports = new EventEmitter();
-
-//Turn off all the things we don't want.
-jsdom.defaultDocumentFeatures = {
- //Don't bring in outside resources
- FetchExternalResources : false,
- //Don't process them
- ProcessExternalResources : false,
- //Don't expose Mutation events (for performance)
- MutationEvents : false,
- //Do not use their implementation of QSA
- QuerySelector : false
-};
-
-var dom = jsdom.defaultLevel;
-//Hack in focus and blur methods so they don't fail when a YUI widget calls them
-dom.Element.prototype.blur = function() {};
-dom.Element.prototype.focus = function() {};
-
-//Create the document and window
-module.exports.fetch = function (config) {
- var host = config.host
- , rule = config.rule
- , callback = config.callback || 'callback'
- , format = config.format || 'outerHTML'
- , obj = {}
- , document;
-
- // added http prefix
- if (-1 === host.indexOf('http://')) {
- host = 'http://' + host;
- }
-
- if (host && rule) {
- jsdom.env({ html: host, done: function (errors, win) {
- doc = win.document;
-
- YUI({
- win: win,
- doc: win.document
- }).use('node', 'io', function (Y) {
- Y.on('io:complete', function (id, o, c) {
- var results = Y.all(rule),
- items = [], item;
-
- results.each(function (n) {
- item = n.get(format);
- items.push(item);
- });
-
- obj.rule = rule;
- obj.host = host;
- obj.callback = callback;
- obj.results = items;
- module.exports.emit('data', obj);
- });
-
- Y.io(host);
- });
- }});
-
+var request = require('request');
+var cheerio = require('cheerio');
+
+/**
+ * @method fetch
+ * @param {Object} conf
+ * @param {string} conf.url url.
+ * @param {string} conf.rule css selector.
+ * @param {Function} cb callback.
+ * @chainable
+ */
+function fetch(conf, cb) {
+ var url = conf.url;
+ var rule = conf.rule;
+ var self = this;
+
+ request(url, function(error, response, body) {
+ var results, $;
+ if (!error && response.statusCode == 200) {
+ results = [];
+ $ = cheerio.load(body);
+ $(rule).each(function(i, elem) {
+ results[i] = $(this).text();
+ });
+ cb(null, results);
+ return self;
+ } else {
+ cb(error, null);
}
+ });
}
+
+module.exports = {
+ fetch: fetch
+};
8 package.json
View
@@ -2,17 +2,17 @@
"author": "huang47 <huge.huang@gmail.com> (huang47.blogspot.com)",
"name": "css-crawler",
"description": "Crawl web via css selector",
- "version": "0.3.2",
+ "version": "0.4.0",
"repository": {
"type": "git",
"url": "git://github.com/huang47/css-crawler.git"
},
"engines": {
- "node": "*"
+ "node": ">= 0.8"
},
"dependencies": {
- "yui": "3.5.0",
- "jsdom": "0.2.7"
+ "cheerio": "*",
+ "request": "*"
},
"devDependencies": {},
"main": "css-crawler"
18 run
View
@@ -2,11 +2,17 @@
var c = require('./css-crawler.js');
-c.fetch({
- host: 'http://tw.dictionary.search.yahoo.com/search?p=hello',
- rule: '.explanation_wrapper'
-});
+//var conf = {
+// url: 'http://tw.dictionary.search.yahoo.com/search?p=hello',
+// rule: '.explanation_wrapper'
+//};
+//
+var conf = {
+ url: 'http://news.ycombinator.com',
+ rule: 'table td.title a'
+};
-c.on('data', function (obj) {
- console.log(obj, 'obj');
+c.fetch(conf, function(error, data) {
+ if (error) { console.error(error); return; }
+ console.log(data.join('\n'));
});
Please sign in to comment.
Something went wrong with that request. Please try again.