A Node.js module to scrape and normalize links from an HTML string.
JavaScript HTML
Switch branches/tags
Latest commit d4a7856 Apr 1, 2015 @jprichardson 1.0.0
Failed to load latest commit information.
lib lib/linkscrape: fix var decls Apr 1, 2015
resources Initial commit. Jul 22, 2012
.gitignore Initial commit. Jul 22, 2012
.travis.yml travis: dropped 0.6 and 0.8, added 0.10 and 0.12 Apr 1, 2015
CHANGELOG.md 1.0.0 Apr 1, 2015
LICENSE Initial commit. Jul 22, 2012
README.md Added Travis-CI. Changed order of return callback args. Remove Coffee… Nov 25, 2012
package.json 1.0.0 Apr 1, 2015


Node.js - linkscrape

build status

This module scrapes links from an HTML string and normalizes them. It does not perform the HTTP request itself; use superagent or request for that.


npm install linkscrape


HTML string:

      Test File
    <p id="wat">
      <a href="http://google.com"><b>Google</b></a>
      <a href="#wat" class="pretty">Link in page</a>
      <a href="javascript:alert('hi');">hi</a>
      <a href="alert('hello')">hello</a>
      <a href="/faq/questions">Faq</a>
      <a href="aboutus">About Us</a>

You must pass the URL that the HTML string came from as the first argument to linkscrape() so that it can normalize the links (for example, resolve relative links against that URL).

var linkscrape = require('linkscrape');

linkscrape('http://someserver.com/mypage', htmlString, function(links, $){
  console.log(links.length);// is 6

  console.log(links[0].href); //is 'http://google.com'
  console.log(links[0].text); //is 'Google'
  console.log(links[0].html); //is '<b>Google</b>'
  console.log(links[0].element); //object
  console.log(links[0].link); //is 'http://google.com'

  console.log(links[1].href); //is '#wat'
  console.log(links[1].text); //is 'Link in page'
  console.log(links[1].html); //is 'Link in page'
  console.log(links[1].element); //object
  console.log(links[1].link); //is null
  console.log($(links[1].element).attr('class')); //is 'pretty'

  console.log(links[2].href); //is "javascript:alert('hi');"
  console.log(links[2].text); //is 'hi'
  console.log(links[2].html); //is 'hi'
  console.log(links[2].element); //object
  console.log(links[2].link); //is null

  console.log(links[3].href); //is "alert('hello')"
  console.log(links[3].text); //is 'hello'
  console.log(links[3].html); //is 'hello'
  console.log(links[3].element); //object
  console.log(links[3].link); //is null

  console.log(links[4].href); //is "/faq/questions"
  console.log(links[4].text); //is 'Faq'
  console.log(links[4].html); //is 'Faq'
  console.log(links[4].element); //object
  console.log(links[4].link); //is 'http://someserver.com/faq/questions'

  console.log(links[5].href); //is "aboutus"
  console.log(links[5].text); //is 'About Us'
  console.log(links[5].html); //is 'About Us'
  console.log(links[5].element); //object
  console.log(links[5].link); //is 'http://someserver.com/aboutus'

It's currently backed by cheerio, so you can use the $ passed to the callback with jQuery-style selectors. See the cheerio docs for more details.


npm test


mocha test


Licensed under MIT. See LICENSE for more details.

Copyright (c) 2012 JP Richardson