diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..40b878d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +node_modules/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..b7aaccb --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,3 @@ +0.0.1 / 2012-07-21 +------------------ +* Initial release. \ No newline at end of file diff --git a/Cakefile b/Cakefile new file mode 100644 index 0000000..34972c9 --- /dev/null +++ b/Cakefile @@ -0,0 +1,42 @@ +{spawn} = require('child_process') +testutil = require('testutil') +growl = require('growl') + +option '-g', '--grep [PATTERN]', 'only run tests matching ' + +task 'build', 'build lib/ from src/', -> + coffee = spawn 'coffee', ['-c', '-o', 'lib', 'src'] + coffee.stderr.on 'data', (data) -> process.stderr.write data.toString() + coffee.stdout.on 'data', (data) -> process.stdout.write data.toString() + coffee.on 'exit', (code) -> + if code is 0 + console.log 'Successfully built.' + else + console.log "Error building. Code: #{code}" + +task 'test', 'test project', (options) -> + process.env['NODE_ENV'] = 'test' + testutil.fetchTestFiles './test', (files) -> + files.unshift '--colors' + if options.grep? 
+ files.unshift options.grep + files.unshift '--grep' + + mocha = spawn 'mocha', files#, customFds: [0..2] + mocha.stdout.pipe(process.stdout, end: false); + mocha.stderr.pipe(process.stderr, end: false); + + task 'watch', 'Watch src/ for changes', -> + coffee = spawn 'coffee', ['-w', '-c', '-o', 'lib', 'src'] + coffee.stderr.on 'data', (data) -> 'ERR: ' + process.stderr.write data.toString() + coffee.stdout.on 'data', (data) -> + d = data.toString() + if d.indexOf('compiled') > 0 + #invoke 'test' + do (->) + else + growl(d, title: 'Error', image: './resources/error.png') + + process.stdout.write data.toString() + + #mocha = spawn 'mocha', ['-w'] \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f5b4f09 --- /dev/null +++ b/LICENSE @@ -0,0 +1,16 @@ + +(The MIT License) + +Copyright (c) 2012 JP Richardson + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +(the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, + merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS +OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..2c05300 --- /dev/null +++ b/README.md @@ -0,0 +1,107 @@ +Node.js - linkscrape +===================== + +This module scrapes links from an HTML string and normalizes them. It does not actually perform the HTTP request. Use [superagent][1] or [request][2] for that. + + +Installation +------------ + + npm install linkscrape + + + +Example +------- + +HTML string: +```html + + + + Test File + + + +

+ Google +

+

+ Link in page + hi + hello + Faq + About Us +

+ + +``` + +Extract links with `linkscrape`: + +```javascript +scrape('http://someserver.com/mypage', htmlString, function($, links){ + console.log(links.length);// is 6 + + console.log(links[0].href); //is 'http://google.com' + console.log(links[0].text); //is 'Google' + console.log(links[0].html); //is 'Google' + console.log(links[0].element); //object + console.log(links[0].link); //is 'http://google.com' + + console.log(links[1].href); //is '#wat' + console.log(links[1].text); //is 'Link in page' + console.log(links[1].html); //is 'Link in page' + console.log(links[1].element); //object + console.log(links[1].link); //is null + console.log($(links[1].element).attr('class')); //is 'pretty' + + console.log(links[2].href); //is "javascript:alert('hi');" + console.log(links[2].text); //is 'hi' + console.log(links[2].html); //is 'hi' + console.log(links[2].element); //object + console.log(links[2].link); //is null + + console.log(links[3].href); //is "alert('hello')" + console.log(links[3].text); //is 'hello' + console.log(links[3].html); //is 'hello' + console.log(links[3].element); //object + console.log(links[3].link); //is null + + console.log(links[4].href); //is "/faq/questions" + console.log(links[4].text); //is 'Faq' + console.log(links[4].html); //is 'Faq' + console.log(links[4].element); //object + console.log(links[4].link); //is 'http://someserver.com/faq/questions' + + console.log(links[5].href); //is "aboutus" + console.log(links[5].text); //is 'About Us' + console.log(links[5].html); //is 'About Us' + console.log(links[5].element); //object + console.log(links[5].link); //is 'http://someserver.com/aboutus' +``` + +It's currently backed by [cheerio][3]. So you can use the `$` with the jQuery selectors. See [cheerio docs][3] for more details. + + + +Test +---- + + npm test + +or... + + mocha test + +License +------- + +Licensed under MIT. See `LICENSE` for more details. 
+ +Copyright (c) 2012 JP Richardson + + +[1]:http://visionmedia.github.com/superagent/ +[2]:https://github.com/mikeal/request +[3]:https://github.com/MatthewMueller/cheerio diff --git a/lib/linkscrape.js b/lib/linkscrape.js new file mode 100644 index 0000000..087dd4a --- /dev/null +++ b/lib/linkscrape.js @@ -0,0 +1,62 @@ +// Generated by CoffeeScript 1.3.3 +(function() { + var cheerio, extractLinks, normalizeLink, url; + + cheerio = require('cheerio'); + + url = require('url'); + + extractLinks = function(pageUrl, pageHtml, callback) { + var $, links, parsedUrl; + $ = cheerio.load(pageHtml); + links = []; + parsedUrl = url.parse(pageUrl); + $('a').each(function(i, el) { + var absoluteUrl, link; + absoluteUrl = normalizeLink(parsedUrl, $(el).attr('href')); + link = {}; + link.text = $(el).text(); + link.html = $(el).html(); + link.href = $(el).attr('href'); + link.element = el; + link.link = absoluteUrl; + return links.push(link); + }); + return callback($, links); + }; + + normalizeLink = function(parsedUrl, scrapedHref) { + var pos, scrapedUrl, surl; + if (scrapedHref.indexOf('javascript:') === 0) { + return null; + } + if (scrapedHref.indexOf('#') === 0) { + return null; + } + scrapedUrl = url.parse(scrapedHref); + if (scrapedUrl.host != null) { + return scrapedHref; + } else { + if (scrapedHref.indexOf('/') === 0) { + return parsedUrl.protocol + '//' + parsedUrl.host + scrapedHref; + } else { + if (scrapedHref.indexOf('(') > 0 && scrapedHref.indexOf(')') > 0) { + return null; + } else { + pos = parsedUrl.href.lastIndexOf("/"); + surl = ""; + if (pos >= 0) { + surl = parsedUrl.href.substring(0, pos + 1); + return surl + scrapedHref; + } else { + return parsedUrl.href + "/" + scrapedHref; + } + } + } + } + return null; + }; + + module.exports = extractLinks; + +}).call(this); diff --git a/package.json b/package.json new file mode 100755 index 0000000..8065649 --- /dev/null +++ b/package.json @@ -0,0 +1,31 @@ +{ + "name" : "linkscrape", + "version" : "0.0.1", + 
"description" : "A Node.js module to scrape and normalize links from an HTML string.", + "homepage" : [ + "https://github.com/jprichardson/node-linkscrape" + ], + "repository" : { + "type" : "git", + "url" : "https://github.com/jprichardson/node-linkscrape" + }, + "keywords" : ["extract", "scrape", "html", "link", "anchor", "body", "scraper", "http"], + "author" : "JP Richardson ", + "licenses" : [ { + "type" : "MIT", + "url" : "http://github.com/jprichardson/node-linkscrape/raw/master/LICENSE" + }], + "dependencies" : { + "cheerio": "0.8.x" + }, + "devDependencies":{ + "mocha": "1.3.x", + "coffee-script": "1.3.x", + "testutil": "0.2.x", + "autoresolve": "0.0.x" + }, + "main" : "./lib/linkscrape", + "scripts": { + "test": "mocha test" + } +} diff --git a/resources/error.png b/resources/error.png new file mode 100644 index 0000000..490822a Binary files /dev/null and b/resources/error.png differ diff --git a/src/linkscrape.coffee b/src/linkscrape.coffee new file mode 100644 index 0000000..5dfaf2b --- /dev/null +++ b/src/linkscrape.coffee @@ -0,0 +1,42 @@ +cheerio = require('cheerio') +url = require('url') + +extractLinks = (pageUrl, pageHtml, callback) -> + $ = cheerio.load(pageHtml); links = []; parsedUrl = url.parse(pageUrl) + $('a').each (i, el) -> + absoluteUrl = normalizeLink(parsedUrl, $(el).attr('href')) + link = {} + link.text = $(el).text() + link.html = $(el).html() + link.href = $(el).attr('href') + link.element = el + link.link = absoluteUrl + links.push link + callback($, links) + +normalizeLink = (parsedUrl, scrapedHref) -> + if scrapedHref.indexOf('javascript:') is 0 + return null + if scrapedHref.indexOf('#') is 0 + return null + + scrapedUrl = url.parse(scrapedHref) + if scrapedUrl.host? 
#is absolute + return scrapedHref + else + if scrapedHref.indexOf('/') is 0 + return parsedUrl.protocol + '//' + parsedUrl.host + scrapedHref + else + if scrapedHref.indexOf('(') > 0 and scrapedHref.indexOf(')') > 0 #crappy JavaScript detection + return null + else + pos = parsedUrl.href.lastIndexOf("/") + surl = "" + if pos >= 0 + surl = parsedUrl.href.substring(0, pos + 1) + return surl + scrapedHref + else + return parsedUrl.href + "/" + scrapedHref + return null + +module.exports = extractLinks diff --git a/test/linkscrape.test.coffee b/test/linkscrape.test.coffee new file mode 100644 index 0000000..97fdea2 --- /dev/null +++ b/test/linkscrape.test.coffee @@ -0,0 +1,51 @@ +testutil = require('testutil') +fs = require('fs') +P = require('autoresolve') +scrape = require(P('lib/linkscrape')) +cheerio = require('cheerio') + +describe '+ linkscrape()', -> + it 'should return the parsed links from a page', (done) -> + fs.readFile P('test/resources/testfile.html'), (err, data) -> + scrape 'http://someserver.com/mypage', data.toString(), ($, links) -> + T links.length is 6 + + T links[0].href is 'http://google.com' + T links[0].text is 'Google' + T links[0].html is 'Google' + T links[0].element? + T links[0].link is 'http://google.com' + + T links[1].href is '#wat' + T links[1].text is 'Link in page' + T links[1].html is 'Link in page' + T links[1].element? + T links[1].link is null + T $(links[1].element).attr('class') is 'pretty' + + T links[2].href is "javascript:alert('hi');" + T links[2].text is 'hi' + T links[2].html is 'hi' + T links[2].element? + T links[2].link is null + + T links[3].href is "alert('hello')" + T links[3].text is 'hello' + T links[3].html is 'hello' + T links[3].element? + T links[3].link is null + + T links[4].href is "/faq/questions" + T links[4].text is 'Faq' + T links[4].html is 'Faq' + T links[4].element? 
+ T links[4].link is 'http://someserver.com/faq/questions' + + T links[5].href is "aboutus" + T links[5].text is 'About Us' + T links[5].html is 'About Us' + T links[5].element? + T links[5].link is 'http://someserver.com/aboutus' + + done() + diff --git a/test/mocha.opts b/test/mocha.opts new file mode 100644 index 0000000..c4348af --- /dev/null +++ b/test/mocha.opts @@ -0,0 +1,6 @@ +--reporter spec +--ui bdd +--growl +--compilers coffee:coffee-script +--timeout 5000 +--globals __fnoc \ No newline at end of file diff --git a/test/resources/testfile.html b/test/resources/testfile.html new file mode 100644 index 0000000..de37581 --- /dev/null +++ b/test/resources/testfile.html @@ -0,0 +1,19 @@ + + + + Test File + + + +

+ Google +

+

+ Link in page + hi + hello + Faq + About Us +

+ + \ No newline at end of file