From 379d93e9a1136c456165e0696f8536b9d38a0847 Mon Sep 17 00:00:00 2001
From: chuanye '+read.getTitle()+'
'+read.getContent()+'';
- console.log(html);
+readability.read('http://jb.qm120.com/', function (err, $) {
+ console.log($('body').html());
});
diff --git a/package.json b/package.json
index 65bff61..5eac058 100644
--- a/package.json
+++ b/package.json
@@ -1,35 +1,28 @@
{
- "name": "node-readability",
- "version": "0.0.8",
- "author": "Zihua Li",
- "description": "Turning any web page into a clean view.",
- "homepage": "https://github.com/luin/node-readability",
- "repository": {
- "type": "git",
- "url": "git://github.com/luin/node-readability.git"
- },
- "scripts": {
- "test": "mocha -R spec"
- },
- "main": "./src/readability",
- "licenses": [
- {
- "type": "Apache License 2.0",
- "url": "http://www.apache.org/licenses/LICENSE-2.0"
- }
- ],
- "dependencies": {
- "fetch": "0.3.x",
- "jsdom": "0.6.x"
- },
- "engines": [
- "node >=0.6.0"
- ],
- "keywords": [
- "readability"
- ],
- "devDependencies": {
- "mocha": "~1.8.2",
- "should": "~1.2.2"
- }
+ "name": "node-readability",
+ "version": "0.0.8.1",
+ "author": "Zihua Li",
+ "description": "Turning any web page into a clean view.",
+ "homepage": "https://github.com/luin/node-readability",
+ "repository": {
+ "type": "git",
+ "url": "git://github.com/luin/node-readability.git"
+ },
+ "main": "./src/readability",
+ "licenses": [
+ {
+ "type": "Apache License 2.0",
+ "url": "http://www.apache.org/licenses/LICENSE-2.0"
+ }
+ ],
+ "dependencies": {
+ "fetch": "*",
+ "cheerio": "*"
+ },
+ "engines": [
+ "node >=0.6.0"
+ ],
+ "keywords": [
+ "readability"
+ ]
}
diff --git a/src/helpers.js b/src/helpers.js
deleted file mode 100644
index 792b5a3..0000000
--- a/src/helpers.js
+++ /dev/null
@@ -1,590 +0,0 @@
-// All of the regular expressions in use within readability.
-var regexps = {
- unlikelyCandidatesRe: /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
- okMaybeItsACandidateRe: /and|article|body|column|main/i,
- positiveRe: /article|body|content|entry|hentry|page|pagination|post|text/i,
- negativeRe: /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|utility|tags|widget/i,
- divToPElementsRe: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
- replaceBrsRe: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
- replaceFontsRe: /<(\/?)font[^>]*>/gi,
- trimRe: /^\s+|\s+$/g,
- normalizeRe: /\s{2,}/g,
- killBreaksRe: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
- videoRe: /http:\/\/(www\.)?(youtube|vimeo|youku|tudou|56|yinyuetai)\.com/i
-};
-
-var dbg;
-exports.debug = function (debug) {
- dbg = (debug) ? console.log : function () {};
-};
-
-/**
- * Prepare the HTML document for readability to scrape it.
- * This includes things like stripping javascript, CSS, and handling terrible markup.
- *
- * @return void
- **/
-var prepDocument = module.exports.prepDocument = function (document) {
- var frames = document.getElementsByTagName('frame');
- if (frames.length > 0) {
- var bestFrame = null;
- var bestFrameSize = 0;
-
- frames.forEach(function (frame) {
- var frameSize = frame.offsetWidth + frame.offsetHeight;
- var canAccessFrame = false;
- try {
- frame.contentWindow.document.body;
- canAccessFrame = true;
- } catch (e) {}
-
- if (canAccessFrame && frameSize > bestFrameSize) {
- bestFrame = frame;
- bestFrameSize = frameSize;
- }
- });
-
- if (bestFrame) {
- var newBody = document.createElement('body');
- newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML;
- newBody.style.overflow = 'scroll';
- document.body = newBody;
-
- var frameset = document.getElementsByTagName('frameset')[0];
- if (frameset) {
- frameset.parentNode.removeChild(frameset);
- }
- }
- }
-
- // remove all scripts that are not readability
- var scripts = document.getElementsByTagName('script');
- for (var i = 0; i < scripts.length; ++i) {
- scripts[i].parentNode.removeChild(scripts[i]);
- }
- // remove all stylesheets
- for (var k = 0; k < document.styleSheets.length; k++) {
- document.styleSheets[k].disabled = true;
- }
-
- // turn all double br's into p's
- // note, this is pretty costly as far as processing goes. Maybe optimize later.
- document.body.innerHTML = document.body.innerHTML.replace(regexps.replaceBrsRe, '</p><p>').replace(regexps.replaceFontsRe, '<$1span>')
-}
-
-/***
- * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
- * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
- *
- * @return Element
- **/
-var grabArticle = module.exports.grabArticle = function (document, preserveUnlikelyCandidates) {
- /**
- * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
- * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
- *
- * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
- * TODO: Shouldn't this be a reverse traversal?
- **/
- var nodes = document.getElementsByTagName('*');
- for (var i = 0; i < nodes.length; ++i) {
- var node = nodes[i];
- // Remove unlikely candidates */
- var continueFlag = false;
- if (!preserveUnlikelyCandidates) {
- var unlikelyMatchString = node.className + node.id;
- if (unlikelyMatchString.search(regexps.unlikelyCandidatesRe) !== -1 && unlikelyMatchString.search(regexps.okMaybeItsACandidateRe) == -1 && node.tagName !== "BODY") {
- dbg("Removing unlikely candidate - " + unlikelyMatchString);
- node.parentNode.removeChild(node);
- continueFlag = true;
- }
- }
-
- // Turn all divs that don't have children block level elements into p's
- if (!continueFlag && node.tagName === "DIV") {
- if (node.innerHTML.search(regexps.divToPElementsRe) === -1) {
- dbg("Altering div to p");
- var newNode = document.createElement('p');
- newNode.innerHTML = node.innerHTML;
- node.parentNode.replaceChild(newNode, node);
- } else {
- // EXPERIMENTAL
- node.childNodes._toArray().forEach(function (childNode) {
- if (childNode.nodeType == 3 /*TEXT_NODE*/ ) {
- // use span instead of p. Need more tests.
- dbg("replacing text node with a span tag with the same content.");
- var span = document.createElement('span');
- span.innerHTML = childNode.nodeValue;
- childNode.parentNode.replaceChild(span, childNode);
- }
- });
- }
- }
- }
-
- /**
- * Loop through all paragraphs, and assign a score to them based on how content-y they look.
- * Then add their score to their parent node.
- *
- * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
- **/
- var allParagraphs = document.getElementsByTagName("p");
- var candidates = [];
-
- for (var i = 0; i < allParagraphs.length; ++i) {
- var paragraph = allParagraphs[i];
- var parentNode = paragraph.parentNode;
- var grandParentNode = parentNode.parentNode;
- var innerText = getInnerText(paragraph);
-
- // If this paragraph is less than 25 characters, don't even count it.
- if (innerText.length < 25) continue;
-
- // Initialize readability data for the parent.
- if (typeof parentNode.readability == 'undefined') {
- initializeNode(parentNode);
- candidates.push(parentNode);
- }
-
- // Initialize readability data for the grandparent.
- if (typeof grandParentNode.readability == 'undefined') {
- initializeNode(grandParentNode);
- candidates.push(grandParentNode);
- }
-
- var contentScore = 0;
-
- // Add a point for the paragraph itself as a base. */
- ++contentScore;
-
- // Add points for any commas within this paragraph */
- // support Chinese commas.
- contentScore += innerText.replace('，', ',').split(',').length;
-
- // For every 100 characters in this paragraph, add another point. Up to 3 points. */
- contentScore += Math.min(Math.floor(innerText.length / 100), 3);
-
- // Add the score to the parent. The grandparent gets half. */
- parentNode.readability.contentScore += contentScore;
- grandParentNode.readability.contentScore += contentScore / 2;
- }
-
-
- /**
- * After we've calculated scores, loop through all of the possible candidate nodes we found
- * and find the one with the highest score.
- **/
- var topCandidate = null;
- candidates.forEach(function (candidate) {
- /**
- * Scale the final candidates score based on link density. Good content should have a
- * relatively small link density (5% or less) and be mostly unaffected by this operation.
- **/
- candidate.readability.contentScore = candidate.readability.contentScore * (1 - getLinkDensity(candidate));
-
- dbg('Candidate: ' + candidate + " (" + candidate.className + ":" + candidate.id + ") with score " + candidate.readability.contentScore);
-
- if (!topCandidate || candidate.readability.contentScore > topCandidate.readability.contentScore) topCandidate = candidate;
- });
-
- /**
- * If we still have no top candidate, just use the body as a last resort.
- * We also have to copy the body node so it is something we can modify.
- **/
- if (topCandidate === null || topCandidate.tagName === "BODY") {
- topCandidate = document.createElement("DIV");
- topCandidate.innerHTML = document.body.innerHTML;
- document.body.innerHTML = "";
- document.body.appendChild(topCandidate);
- initializeNode(topCandidate);
- }
-
-
- /**
- * Now that we have the top candidate, look through its siblings for content that might also be related.
- * Things like preambles, content split by ads that we removed, etc.
- **/
- var articleContent = document.createElement("DIV");
- articleContent.id = "readability-content";
- var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
- var siblingNodes = topCandidate.parentNode.childNodes;
- for (var i = 0, il = siblingNodes.length; i < il; i++) {
- var siblingNode = siblingNodes[i];
- var append = false;
-
- dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability != 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
- dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
-
- if (siblingNode === topCandidate) {
- append = true;
- }
-
- if (typeof siblingNode.readability != 'undefined' && siblingNode.readability.contentScore >= siblingScoreThreshold) {
- append = true;
- }
-
- if (siblingNode.nodeName == "P") {
- var linkDensity = getLinkDensity(siblingNode);
- var nodeContent = getInnerText(siblingNode);
- var nodeLength = nodeContent.length;
-
- if (nodeLength > 80 && linkDensity < 0.25) {
- append = true;
- } else if (nodeLength < 80 && linkDensity == 0 && nodeContent.search(/\.( |$)/) !== -1) {
- append = true;
- }
- }
-
- if (append) {
- dbg("Appending node: " + siblingNode)
-
- /* Append sibling and subtract from our list because it removes the node when you append to another node */
- articleContent.appendChild(siblingNode);
- i--;
- il--;
- }
- }
-
- /**
- * So we have all of the content that we need. Now we clean it up for presentation.
- **/
- prepArticle(articleContent);
-
- return articleContent;
-};
-
-/**
- * Remove the style attribute on every e and under.
- *
- * @param Element
- * @return void
- **/
-function cleanStyles (e) {
- if (!e) return;
-
-
- // Remove any root styles, if we're able.
- if (typeof e.removeAttribute == 'function' && e.className != 'readability-styled') e.removeAttribute('style');
-
- // Go until there are no more child nodes
- var cur = e.firstChild;
- while (cur) {
- if (cur.nodeType == 1) {
- // Remove style attribute(s) :
- if (cur.className != "readability-styled") {
- cur.removeAttribute("style");
- }
- cleanStyles(cur);
- }
- cur = cur.nextSibling;
- }
-}
-
-/**
- * Remove extraneous break tags from a node.
- *
- * @param Element
- * @return void
- **/
-function killBreaks (e) {
- e.innerHTML = e.innerHTML.replace(regexps.killBreaksRe, '<br />');
-}
-
-
-/**
- * Get the inner text of a node - cross browser compatibly.
- * This also strips out any excess whitespace to be found.
- *
- * @param Element
- * @return string
- **/
-getInnerText = exports.getInnerText = function (e, normalizeSpaces) {
- var textContent = "";
-
- normalizeSpaces = (typeof normalizeSpaces == 'undefined') ? true : normalizeSpaces;
-
- textContent = e.textContent.trim();
-
- if (normalizeSpaces) return textContent.replace(regexps.normalizeRe, " ");
- else return textContent;
-}
-
-/**
- * Get the number of times a string s appears in the node e.
- *
- * @param Element
- * @param string - what to split on. Default is ","
- * @return number (integer)
- **/
-function getCharCount (e, s) {
- s = s || ",";
- return getInnerText(e).split(s).length;
-}
-
-/**
- * Get the density of links as a percentage of the content
- * This is the amount of text that is inside a link divided by the total text in the node.
- *
- * @param Element
- * @return number (float)
- **/
-function getLinkDensity (e) {
- var links = e.getElementsByTagName("a");
-
- var textLength = getInnerText(e).length;
- var linkLength = 0;
- for (var i = 0, il = links.length; i < il; i++) {
- var href = links[i].getAttribute('href');
- // hack for
1) { - if (betterTitle) return self.cache['article-title'] = title; - betterTitle = tmpArray[0].trim(); + if (html.indexOf('<') === -1) { + fetchUrl(html, options, jsdomParse); + } else { + jsdomParse(null, null, html); } - }); - - if (betterTitle && betterTitle.length > 10) { - return this.cache['article-title'] = betterTitle; - } - - return this.cache['article-title'] = title; -}; -Readability.prototype.getDocument = function () { - return this._document; -}; + function jsdomParse(error, meta, body) { + if (error) { + return callback(error); + } -Readability.prototype.getHTML = function () { - return this._document.getElementsByTagName('html')[0].innerHTML; -}; + if (typeof body !== 'string') body = body.toString(); -function read(html, options, callback) { - if (typeof options === 'function') { - callback = options; - options = {}; - } - - if (html.indexOf('<') === -1) { - fetchUrl(html, options, jsdomParse); - } else { - jsdomParse(null, null, html); - } - - function jsdomParse(error, meta, body) { - if (error) { - return callback(error); + var $ = cheerio.load(body); + if (!$) { + callback(new Error('parse html error'), null); + } else { + callback(null, $); + } } - - if (typeof body !== 'string') body = body.toString(); - jsdom.env({ - html: body, - done: function (errors, window) { - if (errors) return callback(errors); - if (!window.document.body) return callback(new Error('No body tag was found.')); - callback(null, new Readability(window.document, options)); - } - }); - } } module.exports.read = read;