Initial commit.

jprichardson · Jul 22, 2012 · 5c68665 · 5c68665
commit 5c68665
Show file tree

Hide file tree

Showing 12 changed files with 380 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+node_modules/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,3 @@
+0.0.1 / 2012-07-21
+------------------
+* Initial release.
diff --git a/Cakefile b/Cakefile
@@ -0,0 +1,42 @@
+{spawn} = require('child_process')
+testutil = require('testutil')
+growl = require('growl')
+
+# Command-line switch read by the 'test' task as options.grep.
+option '-g', '--grep [PATTERN]', 'only run tests matching <pattern>'
+
+# Compile src/ into lib/ by spawning the `coffee` compiler and relaying its output.
+task 'build', 'build lib/ from src/', ->
+  compiler = spawn 'coffee', ['-c', '-o', 'lib', 'src']
+  # Relay both of the compiler's output streams to this process.
+  compiler.stderr.on 'data', (chunk) -> process.stderr.write chunk.toString()
+  compiler.stdout.on 'data', (chunk) -> process.stdout.write chunk.toString()
+  compiler.on 'exit', (exitCode) ->
+    # Exit code 0 means the compiler finished without errors.
+    message = if exitCode is 0 then 'Successfully built.' else "Error building. Code: #{exitCode}"
+    console.log message
+
+# Run mocha over the files that testutil.fetchTestFiles yields for ./test.
+task 'test', 'test project', (options) ->
+  process.env.NODE_ENV = 'test'
+  testutil.fetchTestFiles './test', (files) ->
+    # Flags are placed in front of the file list: [--grep PATTERN] --colors <files...>
+    files.unshift '--colors'
+    files.unshift('--grep', options.grep) if options.grep?
+
+    runner = spawn 'mocha', files
+    # end: false keeps our own stdout/stderr open once mocha's streams finish.
+    runner.stdout.pipe process.stdout, end: false
+    runner.stderr.pipe process.stderr, end: false
+
+# Recompile src/ into lib/ whenever a source file changes. Compiler output
+# that does not mention 'compiled' is also shown as a growl notification.
+task 'watch', 'Watch src/ for changes', ->
+  coffee = spawn 'coffee', ['-w', '-c', '-o', 'lib', 'src']
+  # Prefix the text itself; the original concatenated 'ERR: ' with the return
+  # value of write(), so the prefix was never printed.
+  coffee.stderr.on 'data', (data) -> process.stderr.write 'ERR: ' + data.toString()
+  coffee.stdout.on 'data', (data) ->
+    d = data.toString()
+    # Output mentioning 'compiled' is treated as a successful compile and needs
+    # no notification; anything else is surfaced as an error popup.
+    unless d.indexOf('compiled') > 0
+      growl(d, title: 'Error', image: './resources/error.png')
+
+    process.stdout.write d
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,16 @@
+
+(The MIT License)
+
+Copyright (c) 2012 JP Richardson
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files 
+(the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
+ merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS 
+OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,107 @@
+Node.js - linkscrape
+=====================
+
+This module scrapes links from an HTML string and normalizes them. It does not actually perform the HTTP request. Use [superagent][1] or [request][2] for that.
+
+
+Installation
+------------
+
+    npm install linkscrape
+
+
+
+Example
+-------
+
+HTML string:
+```html
+<html>
+  <head>
+    <title>
+      Test File
+    </title>
+  </head>
+  <body>
+    <p id="wat">
+      <a href="http://google.com"><b>Google</b></a>
+    </p>
+    <p>
+      <a href="#wat" class="pretty">Link in page</a>
+      <a href="javascript:alert('hi');">hi</a>
+      <a href="alert('hello')">hello</a>
+      <a href="/faq/questions">Faq</a>
+      <a href="aboutus">About Us</a>
+    </p>
+  </body>
+</html>
+```
+
+Extract links with `linkscrape`:
+
+```javascript
+scrape('http://someserver.com/mypage', htmlString, function($, links){
+    console.log(links.length);// is 6
+
+    console.log(links[0].href); //is 'http://google.com'
+    console.log(links[0].text); //is 'Google'
+    console.log(links[0].html); //is '<b>Google</b>'
+    console.log(links[0].element); //object
+    console.log(links[0].link); //is 'http://google.com'
+
+    console.log(links[1].href); //is '#wat'
+    console.log(links[1].text); //is 'Link in page'
+    console.log(links[1].html); //is 'Link in page'
+    console.log(links[1].element); //object
+    console.log(links[1].link); //is null
+    console.log($(links[1].element).attr('class')); //is 'pretty'
+
+    console.log(links[2].href); //is "javascript:alert('hi');"
+    console.log(links[2].text); //is 'hi'
+    console.log(links[2].html); //is 'hi'
+    console.log(links[2].element); //object
+    console.log(links[2].link); //is null
+
+    console.log(links[3].href); //is "alert('hello')"
+    console.log(links[3].text); //is 'hello'
+    console.log(links[3].html); //is 'hello'
+    console.log(links[3].element); //object
+    console.log(links[3].link); //is null
+
+    console.log(links[4].href); //is "/faq/questions"
+    console.log(links[4].text); //is 'Faq'
+    console.log(links[4].html); //is 'Faq'
+    console.log(links[4].element); //object
+    console.log(links[4].link); //is 'http://someserver.com/faq/questions'
+
+    console.log(links[5].href); //is "aboutus"
+    console.log(links[5].text); //is 'About Us'
+    console.log(links[5].html); //is 'About Us'
+    console.log(links[5].element); //object
+    console.log(links[5].link); //is 'http://someserver.com/aboutus'
+});
+```
+
+It's currently backed by [cheerio][3]. So you can use the `$` with the jQuery selectors. See [cheerio docs][3] for more details. 
+
+
+
+Test
+----
+
+    npm test
+
+or...
+
+    mocha test
+
+License
+-------
+
+Licensed under MIT. See `LICENSE` for more details.
+
+Copyright (c) 2012 JP Richardson
+
+
+[1]:http://visionmedia.github.com/superagent/
+[2]:https://github.com/mikeal/request
+[3]:https://github.com/MatthewMueller/cheerio
diff --git a/lib/linkscrape.js b/lib/linkscrape.js
diff --git a/package.json b/package.json
@@ -0,0 +1,31 @@
+{
+    "name" : "linkscrape", 
+    "version" : "0.0.1",
+    "description" : "A Node.js module to scrape and normalize links from an HTML string.",
+    "homepage" : [
+        "https://github.com/jprichardson/node-linkscrape"
+    ],
+    "repository" : {
+        "type" : "git", 
+        "url" : "https://github.com/jprichardson/node-linkscrape"
+    },
+    "keywords" : ["extract", "scrape", "html", "link", "anchor", "body", "scraper", "http"],
+    "author" : "JP Richardson <jprichardson@gmail.com>",
+    "licenses" : [ {
+        "type" : "MIT",
+        "url" : "http://github.com/jprichardson/node-linkscrape/raw/master/LICENSE"
+    }],
+    "dependencies" : {
+        "cheerio": "0.8.x"
+    },
+    "devDependencies" : {
+        "mocha": "1.3.x",
+        "coffee-script": "1.3.x",
+        "testutil": "0.2.x",
+        "autoresolve": "0.0.x"
+    },
+    "main" : "./lib/linkscrape",
+    "scripts": {
+        "test": "mocha test"
+    }
+}
diff --git a/resources/error.png b/resources/error.png
diff --git a/src/linkscrape.coffee b/src/linkscrape.coffee
@@ -0,0 +1,42 @@
+cheerio = require('cheerio')
+url = require('url')
+
+# Collect every <a> element in pageHtml and describe it as a link record.
+#
+# pageUrl  - URL the HTML was fetched from; relative hrefs are resolved against it
+# pageHtml - HTML source as a string
+# callback - called as callback($, links), where $ is the cheerio handle for the
+#            document and links is an array of {text, html, href, element, link}
+extractLinks = (pageUrl, pageHtml, callback) ->
+  $ = cheerio.load(pageHtml)
+  base = url.parse(pageUrl)
+  links = []
+
+  $('a').each (index, anchor) ->
+    $anchor = $(anchor)
+    rawHref = $anchor.attr('href')
+    # `link` is the normalized absolute URL, or null when normalizeLink rejects the href.
+    resolved = normalizeLink(base, rawHref)
+    record =
+      text: $anchor.text()
+      html: $anchor.html()
+      href: rawHref
+      element: anchor
+      link: resolved
+    links.push record
+
+  callback($, links)
+
+# Turn an href scraped from a page into an absolute URL.
+#
+# parsedUrl   - result of url.parse() on the URL of the page the href came from
+# scrapedHref - raw value of the anchor's href attribute (may be missing)
+#
+# Returns the absolute URL as a string, or null when the href is missing, is an
+# in-page fragment, or looks like JavaScript rather than a location.
+normalizeLink = (parsedUrl, scrapedHref) ->
+  # Anchors without an href attribute (e.g. <a name="top">) have nothing to resolve.
+  return null unless typeof scrapedHref is 'string'
+  return null if scrapedHref.indexOf('javascript:') is 0
+  return null if scrapedHref.indexOf('#') is 0
+
+  # Already absolute: hand it back untouched.
+  return scrapedHref if url.parse(scrapedHref).host?
+
+  unless scrapedHref.indexOf('/') is 0
+    # crappy JavaScript detection, e.g. href="alert('hello')"
+    return null if scrapedHref.indexOf('(') > 0 and scrapedHref.indexOf(')') > 0
+
+  # url.resolve applies the standard relative-reference rules, so a '/' inside
+  # the page URL's query or fragment, '../' segments and protocol-relative
+  # hrefs are all resolved correctly.
+  url.resolve(parsedUrl.href, scrapedHref)
+
+module.exports = extractLinks
diff --git a/test/linkscrape.test.coffee b/test/linkscrape.test.coffee
@@ -0,0 +1,51 @@
+testutil = require('testutil')
+fs = require('fs')
+P = require('autoresolve')
+scrape = require(P('lib/linkscrape'))
+cheerio = require('cheerio')
+
+# Runs the scraper over test/resources/testfile.html and checks each of the six
+# anchors it contains: raw href, text, inner HTML, element and normalized link.
+describe '+ linkscrape()', ->
+  it 'should return the parsed links from a page', (done) ->
+    fs.readFile P('test/resources/testfile.html'), (err, data) ->
+      # Report a missing or unreadable fixture as a test failure instead of
+      # crashing on data.toString() below.
+      return done(err) if err?
+
+      scrape 'http://someserver.com/mypage', data.toString(), ($, links) ->
+        T links.length is 6
+
+        # absolute URL: returned unchanged
+        T links[0].href is 'http://google.com'
+        T links[0].text is 'Google'
+        T links[0].html is '<b>Google</b>'
+        T links[0].element?
+        T links[0].link is 'http://google.com'
+
+        # in-page fragment: no link
+        T links[1].href is '#wat'
+        T links[1].text is 'Link in page'
+        T links[1].html is 'Link in page'
+        T links[1].element?
+        T links[1].link is null
+        T $(links[1].element).attr('class') is 'pretty'
+
+        # javascript: scheme: no link
+        T links[2].href is "javascript:alert('hi');"
+        T links[2].text is 'hi'
+        T links[2].html is 'hi'
+        T links[2].element?
+        T links[2].link is null
+
+        # bare JavaScript call: no link
+        T links[3].href is "alert('hello')"
+        T links[3].text is 'hello'
+        T links[3].html is 'hello'
+        T links[3].element?
+        T links[3].link is null
+
+        # root-relative path: resolved against the page's host
+        T links[4].href is "/faq/questions"
+        T links[4].text is 'Faq'
+        T links[4].html is 'Faq'
+        T links[4].element?
+        T links[4].link is 'http://someserver.com/faq/questions'
+
+        # document-relative path: resolved against the page's directory
+        T links[5].href is "aboutus"
+        T links[5].text is 'About Us'
+        T links[5].html is 'About Us'
+        T links[5].element?
+        T links[5].link is 'http://someserver.com/aboutus'
+
+        done()
+
diff --git a/test/mocha.opts b/test/mocha.opts
@@ -0,0 +1,6 @@
+--reporter spec
+--ui bdd
+--growl
+--compilers coffee:coffee-script
+--timeout 5000
+--globals __fnoc
diff --git a/test/resources/testfile.html b/test/resources/testfile.html
@@ -0,0 +1,19 @@
+<html>
+  <head>
+    <title>
+      Test File
+    </title>
+  </head>
+  <body>
+    <p id="wat">
+      <a href="http://google.com"><b>Google</b></a>
+    </p>
+    <p>
+      <a href="#wat" class="pretty">Link in page</a>
+      <a href="javascript:alert('hi');">hi</a>
+      <a href="alert('hello')">hello</a>
+      <a href="/faq/questions">Faq</a>
+      <a href="aboutus">About Us</a>
+    </p>
+  </body>
+</html>