From 379d93e9a1136c456165e0696f8536b9d38a0847 Mon Sep 17 00:00:00 2001
From: chuanye <chuanye.wang@yuelian-inc.com>
Date: Tue, 27 Aug 2013 16:04:13 +0800
Subject: [PATCH] =?UTF-8?q?=E6=94=B9=E7=94=A8cheerio=EF=BC=8C=E5=BC=83?=
 =?UTF-8?q?=E7=94=A8jsdom?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore         |   1 +
 README.md          |  22 +-
 examples/simple.js |  10 +-
 package.json       |  59 ++---
 src/helpers.js     | 590 ---------------------------------------------
 src/readability.js | 113 ++-------
 6 files changed, 52 insertions(+), 743 deletions(-)
 delete mode 100644 src/helpers.js
diff --git a/.gitignore b/.gitignore
index 3c3629e..eb79dd5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 node_modules
+.idea
diff --git a/README.md b/README.md
index 465c33b..beaf446 100644
--- a/README.md
+++ b/README.md
@@ -22,8 +22,8 @@ Example
 
     var readability = require('node-readability');
 
-    readability.read('http://howtonode.org/really-simple-file-uploads', function(err, article) {
-      console.log(article.getContent());
+    readability.read('http://howtonode.org/really-simple-file-uploads', function(err, $) {
+      console.log($('body').html());
     });
 
 **NB** If the file has been marked with charset other than utf-8, it is converted automatically. Charsets such as GBK, GB2312 is also supported via [iconv](https://github.com/bnoordhuis/node-iconv).
@@ -50,24 +50,6 @@ Possible option values
  * **timeout** set a timeout in ms
  * **agent** pass-through http.request agent parameter
 
-## article
-
-### getContent()
-
-Return the article content of the web page. Return `false` if failed.
-
-### getTitle()
-
-Return the article title of the web page.
-
-### getHTML()
-
-Return the original html of the web page.
-
-### getDocument()
-
-Return the document of the web page generated by jsdom.
-
 ## TODO
 
   * Support more readability features
diff --git a/examples/simple.js b/examples/simple.js
index fa27800..1ab28b2 100644
--- a/examples/simple.js
+++ b/examples/simple.js
@@ -1,12 +1,8 @@
-var readability = require('../src/readability')
+var readability = require('../src/readability');
 
 // uncoment the following line to print the debug info to console.
 // readability.debug(true);
 
-
-readability.read('http://colorlines.com/archives/2011/08/dispatch_from_angola_faith-based_slavery_in_a_louisiana_prison.html',
-function(err, read) {
-  var dom = read.getDocument();
-  var html = '<html><head><meta charset="utf-8"><title>'+dom.title+'</title></head><body><h1>'+read.getTitle()+'</h1>'+read.getContent()+'</body></html>';
-  console.log(html);
+readability.read('http://jb.qm120.com/', function (err, $) {
+    console.log($('body').html());
 });
diff --git a/package.json b/package.json
index 65bff61..5eac058 100644
--- a/package.json
+++ b/package.json
@@ -1,35 +1,28 @@
 {
-  "name": "node-readability",
-  "version": "0.0.8",
-  "author": "Zihua Li",
-  "description": "Turning any web page into a clean view.",
-  "homepage": "https://github.com/luin/node-readability",
-  "repository": {
-    "type": "git",
-    "url": "git://github.com/luin/node-readability.git"
-  },
-  "scripts": {
-    "test": "mocha -R spec"
-  },
-  "main": "./src/readability",
-  "licenses": [
-    {
-      "type": "Apache License 2.0",
-      "url": "http://www.apache.org/licenses/LICENSE-2.0"
-    }
-  ],
-  "dependencies": {
-    "fetch": "0.3.x",
-    "jsdom": "0.6.x"
-  },
-  "engines": [
-    "node >=0.6.0"
-  ],
-  "keywords": [
-    "readability"
-  ],
-  "devDependencies": {
-    "mocha": "~1.8.2",
-    "should": "~1.2.2"
-  }
+    "name": "node-readability",
+    "version": "0.0.9",
+    "author": "Zihua Li",
+    "description": "Turning any web page into a clean view.",
+    "homepage": "https://github.com/luin/node-readability",
+    "repository": {
+        "type": "git",
+        "url": "git://github.com/luin/node-readability.git"
+    },
+    "main": "./src/readability",
+    "licenses": [
+        {
+            "type": "Apache License 2.0",
+            "url": "http://www.apache.org/licenses/LICENSE-2.0"
+        }
+    ],
+    "dependencies": {
+        "fetch": "*",
+        "cheerio": "*"
+    },
+    "engines": [
+        "node >=0.6.0"
+    ],
+    "keywords": [
+        "readability"
+    ]
 }
diff --git a/src/helpers.js b/src/helpers.js
deleted file mode 100644
index 792b5a3..0000000
--- a/src/helpers.js
+++ /dev/null
@@ -1,590 +0,0 @@
-// All of the regular expressions in use within readability.
-var regexps = {
-  unlikelyCandidatesRe: /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
-  okMaybeItsACandidateRe: /and|article|body|column|main/i,
-  positiveRe: /article|body|content|entry|hentry|page|pagination|post|text/i,
-  negativeRe: /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|utility|tags|widget/i,
-  divToPElementsRe: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
-  replaceBrsRe: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
-  replaceFontsRe: /<(\/?)font[^>]*>/gi,
-  trimRe: /^\s+|\s+$/g,
-  normalizeRe: /\s{2,}/g,
-  killBreaksRe: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
-  videoRe: /http:\/\/(www\.)?(youtube|vimeo|youku|tudou|56|yinyuetai)\.com/i
-};
-
-var dbg;
-exports.debug = function (debug) {
-  dbg = (debug) ? console.log : function () {};
-};
-
-/**
- * Prepare the HTML document for readability to scrape it.
- * This includes things like stripping javascript, CSS, and handling terrible markup.
- *
- * @return void
- **/
-var prepDocument = module.exports.prepDocument = function (document) {
-  var frames = document.getElementsByTagName('frame');
-  if (frames.length > 0) {
-    var bestFrame = null;
-    var bestFrameSize = 0;
-
-    frames.forEach(function (frame) {
-      var frameSize = frame.offsetWidth + frame.offsetHeight;
-      var canAccessFrame = false;
-      try {
-        frame.contentWindow.document.body;
-        canAccessFrame = true;
-      } catch (e) {}
-
-      if (canAccessFrame && frameSize > bestFrameSize) {
-        bestFrame = frame;
-        bestFrameSize = frameSize;
-      }
-    });
-
-    if (bestFrame) {
-      var newBody = document.createElement('body');
-      newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML;
-      newBody.style.overflow = 'scroll';
-      document.body = newBody;
-
-      var frameset = document.getElementsByTagName('frameset')[0];
-      if (frameset) {
-        frameset.parentNode.removeChild(frameset);
-      }
-    }
-  }
-
-  // remove all scripts that are not readability
-  var scripts = document.getElementsByTagName('script');
-  for (var i = 0; i < scripts.length; ++i) {
-    scripts[i].parentNode.removeChild(scripts[i]);
-  }
-  // remove all stylesheets
-  for (var k = 0; k < document.styleSheets.length; k++) {
-    document.styleSheets[k].disabled = true;
-  }
-
-  // turn all double br's into p's
-  // note, this is pretty costly as far as processing goes. Maybe optimize later.
-  document.body.innerHTML = document.body.innerHTML.replace(regexps.replaceBrsRe, '</p><p>').replace(regexps.replaceFontsRe, '<$1span>')
-}
-
-/***
- * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
- *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
- *
- * @return Element
- **/
-var grabArticle = module.exports.grabArticle = function (document, preserveUnlikelyCandidates) {
-  /**
-   * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
-   * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
-   *
-   * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
-   * TODO: Shouldn't this be a reverse traversal?
-   **/
-  var nodes = document.getElementsByTagName('*');
-  for (var i = 0; i < nodes.length; ++i) {
-    var node = nodes[i];
-    // Remove unlikely candidates */
-    var continueFlag = false;
-    if (!preserveUnlikelyCandidates) {
-      var unlikelyMatchString = node.className + node.id;
-      if (unlikelyMatchString.search(regexps.unlikelyCandidatesRe) !== -1 && unlikelyMatchString.search(regexps.okMaybeItsACandidateRe) == -1 && node.tagName !== "BODY") {
-        dbg("Removing unlikely candidate - " + unlikelyMatchString);
-        node.parentNode.removeChild(node);
-        continueFlag = true;
-      }
-    }
-
-    // Turn all divs that don't have children block level elements into p's
-    if (!continueFlag && node.tagName === "DIV") {
-      if (node.innerHTML.search(regexps.divToPElementsRe) === -1) {
-        dbg("Altering div to p");
-        var newNode = document.createElement('p');
-        newNode.innerHTML = node.innerHTML;
-        node.parentNode.replaceChild(newNode, node);
-      } else {
-        // EXPERIMENTAL
-        node.childNodes._toArray().forEach(function (childNode) {
-          if (childNode.nodeType == 3 /*TEXT_NODE*/ ) {
-            // use span instead of p. Need more tests.
-            dbg("replacing text node with a span tag with the same content.");
-            var span = document.createElement('span');
-            span.innerHTML = childNode.nodeValue;
-            childNode.parentNode.replaceChild(span, childNode);
-          }
-        });
-      }
-    }
-  }
-
-  /**
-   * Loop through all paragraphs, and assign a score to them based on how content-y they look.
-   * Then add their score to their parent node.
-   *
-   * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
-   **/
-  var allParagraphs = document.getElementsByTagName("p");
-  var candidates = [];
-
-  for (var i = 0; i < allParagraphs.length; ++i) {
-    var paragraph = allParagraphs[i];
-    var parentNode = paragraph.parentNode;
-    var grandParentNode = parentNode.parentNode;
-    var innerText = getInnerText(paragraph);
-
-    // If this paragraph is less than 25 characters, don't even count it. 
-    if (innerText.length < 25) continue;
-
-    // Initialize readability data for the parent.
-    if (typeof parentNode.readability == 'undefined') {
-      initializeNode(parentNode);
-      candidates.push(parentNode);
-    }
-
-    // Initialize readability data for the grandparent.
-    if (typeof grandParentNode.readability == 'undefined') {
-      initializeNode(grandParentNode);
-      candidates.push(grandParentNode);
-    }
-
-    var contentScore = 0;
-
-    // Add a point for the paragraph itself as a base. */
-    ++contentScore;
-
-    // Add points for any commas within this paragraph */
-    // support Chinese commas.
-    contentScore += innerText.replace('，', ',').split(',').length;
-
-    // For every 100 characters in this paragraph, add another point. Up to 3 points. */
-    contentScore += Math.min(Math.floor(innerText.length / 100), 3);
-
-    // Add the score to the parent. The grandparent gets half. */
-    parentNode.readability.contentScore += contentScore;
-    grandParentNode.readability.contentScore += contentScore / 2;
-  }
-
-
-  /**
-   * After we've calculated scores, loop through all of the possible candidate nodes we found
-   * and find the one with the highest score.
-   **/
-  var topCandidate = null;
-  candidates.forEach(function (candidate) {
-    /**
-     * Scale the final candidates score based on link density. Good content should have a
-     * relatively small link density (5% or less) and be mostly unaffected by this operation.
-     **/
-    candidate.readability.contentScore = candidate.readability.contentScore * (1 - getLinkDensity(candidate));
-
-    dbg('Candidate: ' + candidate + " (" + candidate.className + ":" + candidate.id + ") with score " + candidate.readability.contentScore);
-
-    if (!topCandidate || candidate.readability.contentScore > topCandidate.readability.contentScore) topCandidate = candidate;
-  });
-
-  /**
-   * If we still have no top candidate, just use the body as a last resort.
-   * We also have to copy the body node so it is something we can modify.
-   **/
-  if (topCandidate === null || topCandidate.tagName === "BODY") {
-    topCandidate = document.createElement("DIV");
-    topCandidate.innerHTML = document.body.innerHTML;
-    document.body.innerHTML = "";
-    document.body.appendChild(topCandidate);
-    initializeNode(topCandidate);
-  }
-
-
-  /**
-   * Now that we have the top candidate, look through its siblings for content that might also be related.
-   * Things like preambles, content split by ads that we removed, etc.
-   **/
-  var articleContent = document.createElement("DIV");
-  articleContent.id = "readability-content";
-  var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
-  var siblingNodes = topCandidate.parentNode.childNodes;
-  for (var i = 0, il = siblingNodes.length; i < il; i++) {
-    var siblingNode = siblingNodes[i];
-    var append = false;
-
-    dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability != 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
-    dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
-
-    if (siblingNode === topCandidate) {
-      append = true;
-    }
-
-    if (typeof siblingNode.readability != 'undefined' && siblingNode.readability.contentScore >= siblingScoreThreshold) {
-      append = true;
-    }
-
-    if (siblingNode.nodeName == "P") {
-      var linkDensity = getLinkDensity(siblingNode);
-      var nodeContent = getInnerText(siblingNode);
-      var nodeLength = nodeContent.length;
-
-      if (nodeLength > 80 && linkDensity < 0.25) {
-        append = true;
-      } else if (nodeLength < 80 && linkDensity == 0 && nodeContent.search(/\.( |$)/) !== -1) {
-        append = true;
-      }
-    }
-
-    if (append) {
-      dbg("Appending node: " + siblingNode)
-
-      /* Append sibling and subtract from our list because it removes the node when you append to another node */
-      articleContent.appendChild(siblingNode);
-      i--;
-      il--;
-    }
-  }
-
-  /**
-   * So we have all of the content that we need. Now we clean it up for presentation.
-   **/
-  prepArticle(articleContent);
-
-  return articleContent;
-};
-
-/**
- * Remove the style attribute on every e and under.
- *
- * @param Element
- * @return void
- **/
-function cleanStyles (e) {
-  if (!e) return;
-
-
-  // Remove any root styles, if we're able.
-  if (typeof e.removeAttribute == 'function' && e.className != 'readability-styled') e.removeAttribute('style');
-
-  // Go until there are no more child nodes
-  var cur = e.firstChild;
-  while (cur) {
-    if (cur.nodeType == 1) {
-      // Remove style attribute(s) :
-      if (cur.className != "readability-styled") {
-        cur.removeAttribute("style");
-      }
-      cleanStyles(cur);
-    }
-    cur = cur.nextSibling;
-  }
-}
-
-/**
- * Remove extraneous break tags from a node.
- *
- * @param Element
- * @return void
- **/
-function killBreaks (e) {
-  e.innerHTML = e.innerHTML.replace(regexps.killBreaksRe, '<br />');
-}
-
-
-/**
- * Get the inner text of a node - cross browser compatibly.
- * This also strips out any excess whitespace to be found.
- *
- * @param Element
- * @return string
- **/
-getInnerText = exports.getInnerText = function (e, normalizeSpaces) {
-  var textContent = "";
-
-  normalizeSpaces = (typeof normalizeSpaces == 'undefined') ? true : normalizeSpaces;
-
-  textContent = e.textContent.trim();
-
-  if (normalizeSpaces) return textContent.replace(regexps.normalizeRe, " ");
-  else return textContent;
-}
-
-/**
- * Get the number of times a string s appears in the node e.
- *
- * @param Element
- * @param string - what to split on. Default is ","
- * @return number (integer)
- **/
-function getCharCount (e, s) {
-  s = s || ",";
-  return getInnerText(e).split(s).length;
-}
-
-/**
- * Get the density of links as a percentage of the content
- * This is the amount of text that is inside a link divided by the total text in the node.
- * 
- * @param Element
- * @return number (float)
- **/
-function getLinkDensity (e) {
-  var links = e.getElementsByTagName("a");
-
-  var textLength = getInnerText(e).length;
-  var linkLength = 0;
-  for (var i = 0, il = links.length; i < il; i++) {
-    var href = links[i].getAttribute('href');
-    // hack for <h2><a href="#menu"></a></h2> / <h2><a></a></h2>
-    if(!href || (href.length > 0 && href[0] === '#')) continue;
-    linkLength += getInnerText(links[i]).length;
-  }
-  return linkLength / textLength;
-}
-
-/**
- * Get an elements class/id weight. Uses regular expressions to tell if this 
- * element looks good or bad.
- *
- * @param Element
- * @return number (Integer)
- **/
-function getClassWeight (e) {
-  var weight = 0;
-
-  /* Look for a special classname */
-  if (e.className != "") {
-    if (e.className.search(regexps.negativeRe) !== -1) weight -= 25;
-
-    if (e.className.search(regexps.positiveRe) !== -1) weight += 25;
-  }
-
-  /* Look for a special ID */
-  if (typeof (e.id) == 'string' && e.id != "") {
-    if (e.id.search(regexps.negativeRe) !== -1) weight -= 25;
-
-    if (e.id.search(regexps.positiveRe) !== -1) weight += 25;
-  }
-
-  return weight;
-}
-
-/**
- * Clean a node of all elements of type "tag".
- * (Unless it's a youtube/vimeo video. People love movies.)
- *
- * @param Element
- * @param string tag to clean
- * @return void
- **/
-function clean (e, tag) {
-  var targetList = e.getElementsByTagName(tag);
-  var isEmbed = (tag == 'object' || tag == 'embed');
-
-  for (var y = targetList.length - 1; y >= 0; y--) {
-    /* Allow youtube and vimeo videos through as people usually want to see those. */
-    if (isEmbed && targetList[y].innerHTML.search(regexps.videoRe) !== -1) {
-      continue;
-    }
-
-    targetList[y].parentNode.removeChild(targetList[y]);
-  }
-}
-
-/**
- * Clean an element of all tags of type "tag" if they look fishy.
- * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
- *
- * @return void
- **/
-function cleanConditionally(e, tag) {
-  var tagsList = e.getElementsByTagName(tag);
-  var curTagsLength = tagsList.length;
-
-  /**
-   * Gather counts for other typical elements embedded within.
-   * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
-   *
-   * TODO: Consider taking into account original contentScore here.
-   **/
-  for (var i = curTagsLength - 1; i >= 0; i--) {
-    var weight = getClassWeight(tagsList[i]);
-
-    dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability != 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
-
-    if (weight < 0) {
-      tagsList[i].parentNode.removeChild(tagsList[i]);
-    } else if (getCharCount(tagsList[i], ',') < 10) {
-      /**
-       * If there are not very many commas, and the number of
-       * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
-       **/
-
-      var p = tagsList[i].getElementsByTagName("p").length;
-      var img = tagsList[i].getElementsByTagName("img").length;
-      var li = tagsList[i].getElementsByTagName("li").length - 100;
-      var input = tagsList[i].getElementsByTagName("input").length;
-
-      var embedCount = 0;
-      var embeds = tagsList[i].getElementsByTagName("embed");
-      for (var ei = 0, il = embeds.length; ei < il; ei++) {
-        if (embeds[ei].src && embeds[ei].src.search(regexps.videoRe) == -1) {
-          embedCount++;
-        }
-      }
-
-      var linkDensity = getLinkDensity(tagsList[i]);
-      var contentLength = getInnerText(tagsList[i]).length;
-      var toRemove = false;
-
-      if (img > p && img > 1) {
-        toRemove = true;
-      } else if (li > p && tag != "ul" && tag != "ol") {
-        toRemove = true;
-      } else if (input > Math.floor(p / 3)) {
-        toRemove = true;
-      } else if (contentLength < 25 && (img == 0 || img > 2)) {
-        toRemove = true;
-      } else if (weight < 25 && linkDensity > .2) {
-        toRemove = true;
-      } else if (weight >= 25 && linkDensity > .5) {
-        toRemove = true;
-      } else if ((embedCount == 1 && contentLength < 75) || embedCount > 1) {
-        toRemove = true;
-      }
-
-      if (toRemove) {
-        tagsList[i].parentNode.removeChild(tagsList[i]);
-      }
-    }
-  }
-}
-
-/**
- * Clean out spurious headers from an Element. Checks things like classnames and link density.
- *
- * @param Element
- * @return void
- **/
-function cleanHeaders (e) {
-  for (var headerIndex = 1; headerIndex < 7; headerIndex++) {
-    var headers = e.getElementsByTagName('h' + headerIndex);
-    for (var i = headers.length - 1; i >= 0; --i) {
-      if (getClassWeight(headers[i]) < 0 || getLinkDensity(headers[i]) > 0.33) {
-        headers[i].parentNode.removeChild(headers[i]);
-      }
-    }
-  }
-}
-
-/**
- * Remove the header that doesn't have next sibling.
- *
- * @param Element
- * @return void
- **/
-
-function cleanSingleHeader (e) {
-  for (var headerIndex = 1; headerIndex < 7; headerIndex++) {
-    var headers = e.getElementsByTagName('h' + headerIndex);
-    for (var i = headers.length - 1; i >= 0; --i) {
-      if (headers[i].nextSibling === null) {
-        headers[i].parentNode.removeChild(headers[i]);
-      }
-    }
-  }
-
-}
-
-function prepArticle (articleContent) {
-  cleanStyles(articleContent);
-  killBreaks(articleContent);
-
-  /* Clean out junk from the article content */
-  clean(articleContent, "form");
-  clean(articleContent, "object");
-  clean(articleContent, "h1");
-  /**
-   * If there is only one h2, they are probably using it
-   * as a header and not a subheader, so remove it since we already have a header.
-   ***/
-  if (articleContent.getElementsByTagName('h2').length == 1) clean(articleContent, "h2");
-
-  clean(articleContent, "iframe");
-
-  cleanHeaders(articleContent);
-
-  /* Do these last as the previous stuff may have removed junk that will affect these */
-  cleanConditionally(articleContent, "table");
-  cleanConditionally(articleContent, "ul");
-  cleanConditionally(articleContent, "div");
-
-  /* Remove extra paragraphs */
-  var articleParagraphs = articleContent.getElementsByTagName('p');
-  for (var i = articleParagraphs.length - 1; i >= 0; i--) {
-    var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
-    var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
-    var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
-
-    if (imgCount == 0 && embedCount == 0 && objectCount == 0 && getInnerText(articleParagraphs[i], false) == '') {
-      articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
-    }
-  }
-
-  cleanSingleHeader(articleContent);
-
-  try {
-    articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');
-  } catch (e) {
-    dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.");
-  }
-
-}
-
-/**
- * Initialize a node with the readability object. Also checks the
- * className/id for special names to add to its score.
- *
- * @param Element
- * @return void
- **/
-function initializeNode (node) {
-  node.readability = {
-    "contentScore": 0
-  };
-
-  switch (node.tagName) {
-  case 'DIV':
-    node.readability.contentScore += 5;
-    break;
-
-  case 'PRE':
-  case 'TD':
-  case 'BLOCKQUOTE':
-    node.readability.contentScore += 3;
-    break;
-
-  case 'ADDRESS':
-  case 'OL':
-  case 'UL':
-  case 'DL':
-  case 'DD':
-  case 'DT':
-  case 'LI':
-  case 'FORM':
-    node.readability.contentScore -= 3;
-    break;
-
-  case 'H1':
-  case 'H2':
-  case 'H3':
-  case 'H4':
-  case 'H5':
-  case 'H6':
-  case 'TH':
-    node.readability.contentScore -= 5;
-    break;
-  }
-
-  node.readability.contentScore += getClassWeight(node);
-}
diff --git a/src/readability.js b/src/readability.js
index b048759..c45ebde 100644
--- a/src/readability.js
+++ b/src/readability.js
@@ -1,105 +1,32 @@
-var jsdom = require('jsdom');
+var cheerio = require('cheerio');
 var fetchUrl = require('fetch').fetchUrl;
-var helpers = require('./helpers');
 
-exports.debug = function (debug) {
-  helpers.debug(debug);
-};
-
-exports.debug(false);
-
-function Readability(document) {
-  this._document = document;
-  this.iframeLoads = 0;
-  // Cache the body HTML in case we need to re-use it later
-  this.bodyCache = null;
-  this._articleContent = '';
-
-  this.cache = {};
-
-  helpers.prepDocument(this._document);
-  this.cache = {
-    'body': this._document.body.innerHTML
-  };
-}
-
-Readability.prototype.getContent = function () {
-  if (typeof this.cache['article-content'] !== 'undefined') {
-    return this.cache['article-content'];
-  }
-
-  var articleContent = helpers.grabArticle(this._document);
-  if (helpers.getInnerText(articleContent, false) === '') {
-    this._document.body.innerHTML = this.cache.body;
-    articleContent = helpers.grabArticle(this._document, true);
-    if (helpers.getInnerText(articleContent, false) === '') {
-      return this.cache['article-content'] = false;
+function read(html, options, callback) {
+    if (typeof options === 'function') {
+        callback = options;
+        options = {};
     }
-  }
-
-  return this.cache['article-content'] = articleContent.innerHTML;
-};
 
-Readability.prototype.getTitle = function () {
-  if (typeof this.cache['article-title'] !== 'undefined') {
-    return this.cache['article-title'];
-  }
-
-  var title = this._document.title;
-  var betterTitle;
-  var commonSeparatingCharacters = [' | ', ' _ ', ' - ', '«', '»', '—'];
-
-  var self = this;
-  commonSeparatingCharacters.forEach(function (char) {
-    var tmpArray = title.split(char);
-    if (tmpArray.length > 1) {
-      if (betterTitle) return self.cache['article-title'] = title;
-      betterTitle = tmpArray[0].trim();
+    if (html.indexOf('<') === -1) {
+        fetchUrl(html, options, jsdomParse);
+    } else {
+        jsdomParse(null, null, html);
     }
-  });
-
-  if (betterTitle && betterTitle.length > 10) {
-    return this.cache['article-title'] = betterTitle;
-  }
-
-  return this.cache['article-title'] = title;
-};
 
-Readability.prototype.getDocument = function () {
-  return this._document;
-};
+    function jsdomParse(error, meta, body) {
+        if (error) {
+            return callback(error);
+        }
 
-Readability.prototype.getHTML = function () {
-  return this._document.getElementsByTagName('html')[0].innerHTML;
-};
+        if (typeof body !== 'string') body = body.toString();
 
-function read(html, options, callback) {
-  if (typeof options === 'function') {
-    callback = options;
-    options = {};
-  }
-
-  if (html.indexOf('<') === -1) {
-    fetchUrl(html, options, jsdomParse);
-  } else {
-    jsdomParse(null, null, html);
-  }
-
-  function jsdomParse(error, meta, body) {
-    if (error) {
-      return callback(error);
+        var $ = cheerio.load(body);
+        if (!$) {
+            callback(new Error('parse html error'), null);
+        } else {
+            callback(null, $);
+        }
     }
-
-    if (typeof body !== 'string') body = body.toString();
-    jsdom.env({
-      html: body,
-      done: function (errors, window) {
-        if (errors) return callback(errors);
-        if (!window.document.body) return callback(new Error('No body tag was found.'));
-        callback(null, new Readability(window.document, options));
-      }
-    });
-  }
 }
 
 module.exports.read = read;