Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

first commit

  • Loading branch information...
commit aabf72a2c359cc80248b1a9d85d914c53d8cf3d3 0 parents
@hdachev authored
3  .gitignore
@@ -0,0 +1,3 @@
+node*
+npm*
+test*
80 README.md
@@ -0,0 +1,80 @@
+# jscrape = jsdom + jquery + request for the truly lazy
+
+
+### Install
+
+ npm install jscrape
+
+
+### Be lazy
+
+Use it the same way you'd use request,
+passing a url or request-options as the first param,
+and a `function(err, $, response, body)` callback,
+where `$` is the jQuery object for the parsed page:
+
+```javascript
+var jscrape = require ( './index' );
+jscrape ( 'http://www.google.com', function ( error, $, response, body ) {
+ if ( !error && $ ) {
+ // print the innerHTML of the lucky button.
+ console.log( $( 'button:contains("Lucky")' ).html () )
+ }
+})
+```
+
+Or just use a `function(err, $)`-style callback:
+
+```javascript
+var jscrape = require ( "./index" );
+function getNews ( callback ) {
+ jscrape ( 'http://news.ycombinator.com/', function ( err, $ ) {
+ // jquery to the rescue
+ callback ( err, $ && $( 'span[id^=score]' ).map ( function () {
+ var link;
+ // some nested tables are better than others
+ return {
+ id : num ( $( this ).attr ( 'id' ) ),
+ url : ( link = $( this ).closest ( 'tr' ).prev ( 'tr' ).find ( 'td.title a' ) )
+ .attr ( 'href' ),
+ title : link.text (),
+ score : num ( $( this ).text () )
+ }
+ }).get () )
+ })
+ function num ( str ) {
+ return Number ( String ( str ).replace ( /[^0-9]+/g, '' ) )
+ }
+}
+getNews ( function ( err, news ) {
+ console.log ( err, news );
+})
+```
+
+When passing an object as the first param
+it's passed through directly to request.
+Instead, when passing in just a url string,
+it's wrapped in a request options object
+that has some sensible defaults
+for a simple scraping setup,
+so you can be lazy.
+Enjoy!
+
+
+### Workaround for npm trouble with contextify on Windows
+
+If you can't `npm install jscrape`
+because of contextify failing to build on Windows,
+clone the `contextify` repo under `your_project/node_modules/contextify`
+and replace the contents of `lib/contextify.js`
+with the following:
+
+```javascript
+module.exports = function ( obj ) {
+ obj.getGlobal = function () {
+ return obj;
+ }
+};
+```
+
+That should work just fine.
7 example1.js
@@ -0,0 +1,7 @@
+var jscrape = require ( './index' );
+jscrape ( 'http://www.google.com', function ( error, $, response, body ) {
+ if ( !error && $ ) {
+ // Print the innerHTML of the I'm Feeling Lucky button.
+ console.log( $( 'button:contains("Lucky")' ).html () )
+ }
+})
23 example2.js
@@ -0,0 +1,23 @@
+var jscrape = require ( "./index" );
+function getNews ( callback ) {
+ jscrape ( 'http://news.ycombinator.com/', function ( err, $ ) {
+ // jquery to the rescue
+ callback ( err, $ && $( 'span[id^=score]' ).map ( function () {
+ var link;
+ // some nested tables are better than others
+ return {
+ id : num ( $( this ).attr ( 'id' ) ),
+ url : ( link = $( this ).closest ( 'tr' ).prev ( 'tr' ).find ( 'td.title a' ) )
+ .attr ( 'href' ),
+ title : link.text (),
+ score : num ( $( this ).text () )
+ }
+ }).get () )
+ })
+ function num ( str ) {
+ return Number ( String ( str ).replace ( /[^0-9]+/g, '' ) )
+ }
+}
+getNews ( function ( err, news ) {
+ console.log ( err, news );
+})
38 index.js
@@ -0,0 +1,38 @@
+
+
+var request = require ( 'request' ),
+ jsdom = require ( 'jsdom' ),
+ jquery = require ( 'jquery' ),
+ jscrape;
+
+module.exports = jscrape = function ( options, callback )
+{
+ if ( typeof options === 'string' )
+ options =
+ { url : options
+ , pool : jscrape.pool
+ , headers : jscrape.headers
+ };
+
+ request ( options, function ( err, response, body )
+ {
+ if ( !err && response.statusCode == 200 )
+ callback ( null, jquery.create ( jsdom.jsdom ( body ).createWindow () ), response, body );
+ else
+ callback ( err || response.statusCode, null, response, body );
+ });
+};
+
+jscrape.pool =
+ { maxSockets : 64 };
+
+jscrape.headers =
+ { "Accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+ , "Accept-Charset" : "ISO-8859-1,utf-8;q=0.7,*;q=0.3"
+ , "Accept-Language" : "en-US,en;q=0.8"
+ , "Cache-Control" : "no-cache"
+ , "Connection" : "keep-alive"
+ , "Pragma" : "no-cache"
+ , "User-Agent" : "Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.162 Safari/535.19"
+ };
+
11 package.json
@@ -0,0 +1,11 @@
+{ "name" : "jscrape",
+ "version" : "0.0.1",
+ "description" : "jsdom + jquery + request for the lazy",
+ "keywords" : [ "jquery", "scrape" ],
+ "main" : "./index.js",
+ "dependencies" : {
+ "jquery" : ">= 1.6.3",
+ "jsdom" : ">= 0.2.14",
+ "request" : ">= 2.9.202"
+ }
+}
Please sign in to comment.
Something went wrong with that request. Please try again.