From 379d93e9a1136c456165e0696f8536b9d38a0847 Mon Sep 17 00:00:00 2001 From: chuanye Date: Tue, 27 Aug 2013 16:04:13 +0800 Subject: [PATCH] =?UTF-8?q?=E6=94=B9=E7=94=A8cheerio=EF=BC=8C=E5=BC=83?= =?UTF-8?q?=E7=94=A8jsdom?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + README.md | 22 +- examples/simple.js | 10 +- package.json | 59 ++--- src/helpers.js | 590 --------------------------------------------- src/readability.js | 113 ++------- 6 files changed, 52 insertions(+), 743 deletions(-) delete mode 100644 src/helpers.js diff --git a/.gitignore b/.gitignore index 3c3629e..eb79dd5 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ node_modules +.idea diff --git a/README.md b/README.md index 465c33b..beaf446 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,8 @@ Example var readability = require('node-readability'); - readability.read('http://howtonode.org/really-simple-file-uploads', function(err, article) { - console.log(article.getContent()); + readability.read('http://howtonode.org/really-simple-file-uploads', function(err, $) { + console.log($('body').html()); }); **NB** If the file has been marked with charset other than utf-8, it is converted automatically. Charsets such as GBK, GB2312 is also supported via [iconv](https://github.com/bnoordhuis/node-iconv). @@ -50,24 +50,6 @@ Possible option values * **timeout** set a timeout in ms * **agent** pass-through http.request agent parameter -## article - -### getContent() - -Return the article content of the web page. Return `false` if failed. - -### getTitle() - -Return the article title of the web page. - -### getHTML() - -Return the original html of the web page. - -### getDocument() - -Return the document of the web page generated by jsdom. - ## TODO * Support more readability features diff --git a/examples/simple.js b/examples/simple.js index fa27800..1ab28b2 100644 --- a/examples/simple.js +++ b/examples/simple.js @@ -1,12 +1,8 @@ -var readability = require('../src/readability') +var readability = require('../src/readability'); // uncoment the following line to print the debug info to console. // readability.debug(true); - -readability.read('http://colorlines.com/archives/2011/08/dispatch_from_angola_faith-based_slavery_in_a_louisiana_prison.html', -function(err, read) { - var dom = read.getDocument(); - var html = ''+dom.title+'

'+read.getTitle()+'

'+read.getContent()+''; - console.log(html); +readability.read('http://jb.qm120.com/', function (err, $) { + console.log($('body').html()); }); diff --git a/package.json b/package.json index 65bff61..5eac058 100644 --- a/package.json +++ b/package.json @@ -1,35 +1,28 @@ { - "name": "node-readability", - "version": "0.0.8", - "author": "Zihua Li", - "description": "Turning any web page into a clean view.", - "homepage": "https://github.com/luin/node-readability", - "repository": { - "type": "git", - "url": "git://github.com/luin/node-readability.git" - }, - "scripts": { - "test": "mocha -R spec" - }, - "main": "./src/readability", - "licenses": [ - { - "type": "Apache License 2.0", - "url": "http://www.apache.org/licenses/LICENSE-2.0" - } - ], - "dependencies": { - "fetch": "0.3.x", - "jsdom": "0.6.x" - }, - "engines": [ - "node >=0.6.0" - ], - "keywords": [ - "readability" - ], - "devDependencies": { - "mocha": "~1.8.2", - "should": "~1.2.2" - } + "name": "node-readability", + "version": "0.0.8.1", + "author": "Zihua Li", + "description": "Turning any web page into a clean view.", + "homepage": "https://github.com/luin/node-readability", + "repository": { + "type": "git", + "url": "git://github.com/luin/node-readability.git" + }, + "main": "./src/readability", + "licenses": [ + { + "type": "Apache License 2.0", + "url": "http://www.apache.org/licenses/LICENSE-2.0" + } + ], + "dependencies": { + "fetch": "*", + "cheerio": "*" + }, + "engines": [ + "node >=0.6.0" + ], + "keywords": [ + "readability" + ] } diff --git a/src/helpers.js b/src/helpers.js deleted file mode 100644 index 792b5a3..0000000 --- a/src/helpers.js +++ /dev/null @@ -1,590 +0,0 @@ -// All of the regular expressions in use within readability. -var regexps = { - unlikelyCandidatesRe: /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i, - okMaybeItsACandidateRe: /and|article|body|column|main/i, - positiveRe: /article|body|content|entry|hentry|page|pagination|post|text/i, - negativeRe: /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|utility|tags|widget/i, - divToPElementsRe: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, - replaceBrsRe: /(]*>[ \n\r\t]*){2,}/gi, - replaceFontsRe: /<(\/?)font[^>]*>/gi, - trimRe: /^\s+|\s+$/g, - normalizeRe: /\s{2,}/g, - killBreaksRe: /((\s| ?)*){1,}/g, - videoRe: /http:\/\/(www\.)?(youtube|vimeo|youku|tudou|56|yinyuetai)\.com/i -}; - -var dbg; -exports.debug = function (debug) { - dbg = (debug) ? console.log : function () {}; -}; - -/** - * Prepare the HTML document for readability to scrape it. - * This includes things like stripping javascript, CSS, and handling terrible markup. - * - * @return void - **/ -var prepDocument = module.exports.prepDocument = function (document) { - var frames = document.getElementsByTagName('frame'); - if (frames.length > 0) { - var bestFrame = null; - var bestFrameSize = 0; - - frames.forEach(function (frame) { - var frameSize = frame.offsetWidth + frame.offsetHeight; - var canAccessFrame = false; - try { - frame.contentWindow.document.body; - canAccessFrame = true; - } catch (e) {} - - if (canAccessFrame && frameSize > bestFrameSize) { - bestFrame = frame; - bestFrameSize = frameSize; - } - }); - - if (bestFrame) { - var newBody = document.createElement('body'); - newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML; - newBody.style.overflow = 'scroll'; - document.body = newBody; - - var frameset = document.getElementsByTagName('frameset')[0]; - if (frameset) { - frameset.parentNode.removeChild(frameset); - } - } - } - - // remove all scripts that are not readability - var scripts = document.getElementsByTagName('script'); - for (var i = 0; i < scripts.length; ++i) { - scripts[i].parentNode.removeChild(scripts[i]); - } - // remove all stylesheets - for (var k = 0; k < document.styleSheets.length; k++) { - document.styleSheets[k].disabled = true; - } - - // turn all double br's into p's - // note, this is pretty costly as far as processing goes. Maybe optimize later. - document.body.innerHTML = document.body.innerHTML.replace(regexps.replaceBrsRe, '

').replace(regexps.replaceFontsRe, '<$1span>') -} - -/*** - * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is - * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. - * - * @return Element - **/ -var grabArticle = module.exports.grabArticle = function (document, preserveUnlikelyCandidates) { - /** - * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs - * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) - * - * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 - * TODO: Shouldn't this be a reverse traversal? - **/ - var nodes = document.getElementsByTagName('*'); - for (var i = 0; i < nodes.length; ++i) { - var node = nodes[i]; - // Remove unlikely candidates */ - var continueFlag = false; - if (!preserveUnlikelyCandidates) { - var unlikelyMatchString = node.className + node.id; - if (unlikelyMatchString.search(regexps.unlikelyCandidatesRe) !== -1 && unlikelyMatchString.search(regexps.okMaybeItsACandidateRe) == -1 && node.tagName !== "BODY") { - dbg("Removing unlikely candidate - " + unlikelyMatchString); - node.parentNode.removeChild(node); - continueFlag = true; - } - } - - // Turn all divs that don't have children block level elements into p's - if (!continueFlag && node.tagName === "DIV") { - if (node.innerHTML.search(regexps.divToPElementsRe) === -1) { - dbg("Altering div to p"); - var newNode = document.createElement('p'); - newNode.innerHTML = node.innerHTML; - node.parentNode.replaceChild(newNode, node); - } else { - // EXPERIMENTAL - node.childNodes._toArray().forEach(function (childNode) { - if (childNode.nodeType == 3 /*TEXT_NODE*/ ) { - // use span instead of p. Need more tests. - dbg("replacing text node with a span tag with the same content."); - var span = document.createElement('span'); - span.innerHTML = childNode.nodeValue; - childNode.parentNode.replaceChild(span, childNode); - } - }); - } - } - } - - /** - * Loop through all paragraphs, and assign a score to them based on how content-y they look. - * Then add their score to their parent node. - * - * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. - **/ - var allParagraphs = document.getElementsByTagName("p"); - var candidates = []; - - for (var i = 0; i < allParagraphs.length; ++i) { - var paragraph = allParagraphs[i]; - var parentNode = paragraph.parentNode; - var grandParentNode = parentNode.parentNode; - var innerText = getInnerText(paragraph); - - // If this paragraph is less than 25 characters, don't even count it. - if (innerText.length < 25) continue; - - // Initialize readability data for the parent. - if (typeof parentNode.readability == 'undefined') { - initializeNode(parentNode); - candidates.push(parentNode); - } - - // Initialize readability data for the grandparent. - if (typeof grandParentNode.readability == 'undefined') { - initializeNode(grandParentNode); - candidates.push(grandParentNode); - } - - var contentScore = 0; - - // Add a point for the paragraph itself as a base. */ - ++contentScore; - - // Add points for any commas within this paragraph */ - // support Chinese commas. - contentScore += innerText.replace(',', ',').split(',').length; - - // For every 100 characters in this paragraph, add another point. Up to 3 points. */ - contentScore += Math.min(Math.floor(innerText.length / 100), 3); - - // Add the score to the parent. The grandparent gets half. */ - parentNode.readability.contentScore += contentScore; - grandParentNode.readability.contentScore += contentScore / 2; - } - - - /** - * After we've calculated scores, loop through all of the possible candidate nodes we found - * and find the one with the highest score. - **/ - var topCandidate = null; - candidates.forEach(function (candidate) { - /** - * Scale the final candidates score based on link density. Good content should have a - * relatively small link density (5% or less) and be mostly unaffected by this operation. - **/ - candidate.readability.contentScore = candidate.readability.contentScore * (1 - getLinkDensity(candidate)); - - dbg('Candidate: ' + candidate + " (" + candidate.className + ":" + candidate.id + ") with score " + candidate.readability.contentScore); - - if (!topCandidate || candidate.readability.contentScore > topCandidate.readability.contentScore) topCandidate = candidate; - }); - - /** - * If we still have no top candidate, just use the body as a last resort. - * We also have to copy the body node so it is something we can modify. - **/ - if (topCandidate === null || topCandidate.tagName === "BODY") { - topCandidate = document.createElement("DIV"); - topCandidate.innerHTML = document.body.innerHTML; - document.body.innerHTML = ""; - document.body.appendChild(topCandidate); - initializeNode(topCandidate); - } - - - /** - * Now that we have the top candidate, look through its siblings for content that might also be related. - * Things like preambles, content split by ads that we removed, etc. - **/ - var articleContent = document.createElement("DIV"); - articleContent.id = "readability-content"; - var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); - var siblingNodes = topCandidate.parentNode.childNodes; - for (var i = 0, il = siblingNodes.length; i < il; i++) { - var siblingNode = siblingNodes[i]; - var append = false; - - dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability != 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); - dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown')); - - if (siblingNode === topCandidate) { - append = true; - } - - if (typeof siblingNode.readability != 'undefined' && siblingNode.readability.contentScore >= siblingScoreThreshold) { - append = true; - } - - if (siblingNode.nodeName == "P") { - var linkDensity = getLinkDensity(siblingNode); - var nodeContent = getInnerText(siblingNode); - var nodeLength = nodeContent.length; - - if (nodeLength > 80 && linkDensity < 0.25) { - append = true; - } else if (nodeLength < 80 && linkDensity == 0 && nodeContent.search(/\.( |$)/) !== -1) { - append = true; - } - } - - if (append) { - dbg("Appending node: " + siblingNode) - - /* Append sibling and subtract from our list because it removes the node when you append to another node */ - articleContent.appendChild(siblingNode); - i--; - il--; - } - } - - /** - * So we have all of the content that we need. Now we clean it up for presentation. - **/ - prepArticle(articleContent); - - return articleContent; -}; - -/** - * Remove the style attribute on every e and under. - * - * @param Element - * @return void - **/ -function cleanStyles (e) { - if (!e) return; - - - // Remove any root styles, if we're able. - if (typeof e.removeAttribute == 'function' && e.className != 'readability-styled') e.removeAttribute('style'); - - // Go until there are no more child nodes - var cur = e.firstChild; - while (cur) { - if (cur.nodeType == 1) { - // Remove style attribute(s) : - if (cur.className != "readability-styled") { - cur.removeAttribute("style"); - } - cleanStyles(cur); - } - cur = cur.nextSibling; - } -} - -/** - * Remove extraneous break tags from a node. - * - * @param Element - * @return void - **/ -function killBreaks (e) { - e.innerHTML = e.innerHTML.replace(regexps.killBreaksRe, '
'); -} - - -/** - * Get the inner text of a node - cross browser compatibly. - * This also strips out any excess whitespace to be found. - * - * @param Element - * @return string - **/ -getInnerText = exports.getInnerText = function (e, normalizeSpaces) { - var textContent = ""; - - normalizeSpaces = (typeof normalizeSpaces == 'undefined') ? true : normalizeSpaces; - - textContent = e.textContent.trim(); - - if (normalizeSpaces) return textContent.replace(regexps.normalizeRe, " "); - else return textContent; -} - -/** - * Get the number of times a string s appears in the node e. - * - * @param Element - * @param string - what to split on. Default is "," - * @return number (integer) - **/ -function getCharCount (e, s) { - s = s || ","; - return getInnerText(e).split(s).length; -} - -/** - * Get the density of links as a percentage of the content - * This is the amount of text that is inside a link divided by the total text in the node. - * - * @param Element - * @return number (float) - **/ -function getLinkDensity (e) { - var links = e.getElementsByTagName("a"); - - var textLength = getInnerText(e).length; - var linkLength = 0; - for (var i = 0, il = links.length; i < il; i++) { - var href = links[i].getAttribute('href'); - // hack for

/

- if(!href || (href.length > 0 && href[0] === '#')) continue; - linkLength += getInnerText(links[i]).length; - } - return linkLength / textLength; -} - -/** - * Get an elements class/id weight. Uses regular expressions to tell if this - * element looks good or bad. - * - * @param Element - * @return number (Integer) - **/ -function getClassWeight (e) { - var weight = 0; - - /* Look for a special classname */ - if (e.className != "") { - if (e.className.search(regexps.negativeRe) !== -1) weight -= 25; - - if (e.className.search(regexps.positiveRe) !== -1) weight += 25; - } - - /* Look for a special ID */ - if (typeof (e.id) == 'string' && e.id != "") { - if (e.id.search(regexps.negativeRe) !== -1) weight -= 25; - - if (e.id.search(regexps.positiveRe) !== -1) weight += 25; - } - - return weight; -} - -/** - * Clean a node of all elements of type "tag". - * (Unless it's a youtube/vimeo video. People love movies.) - * - * @param Element - * @param string tag to clean - * @return void - **/ -function clean (e, tag) { - var targetList = e.getElementsByTagName(tag); - var isEmbed = (tag == 'object' || tag == 'embed'); - - for (var y = targetList.length - 1; y >= 0; y--) { - /* Allow youtube and vimeo videos through as people usually want to see those. */ - if (isEmbed && targetList[y].innerHTML.search(regexps.videoRe) !== -1) { - continue; - } - - targetList[y].parentNode.removeChild(targetList[y]); - } -} - -/** - * Clean an element of all tags of type "tag" if they look fishy. - * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. - * - * @return void - **/ -function cleanConditionally(e, tag) { - var tagsList = e.getElementsByTagName(tag); - var curTagsLength = tagsList.length; - - /** - * Gather counts for other typical elements embedded within. - * Traverse backwards so we can remove nodes at the same time without effecting the traversal. - * - * TODO: Consider taking into account original contentScore here. - **/ - for (var i = curTagsLength - 1; i >= 0; i--) { - var weight = getClassWeight(tagsList[i]); - - dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability != 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : '')); - - if (weight < 0) { - tagsList[i].parentNode.removeChild(tagsList[i]); - } else if (getCharCount(tagsList[i], ',') < 10) { - /** - * If there are not very many commas, and the number of - * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. - **/ - - var p = tagsList[i].getElementsByTagName("p").length; - var img = tagsList[i].getElementsByTagName("img").length; - var li = tagsList[i].getElementsByTagName("li").length - 100; - var input = tagsList[i].getElementsByTagName("input").length; - - var embedCount = 0; - var embeds = tagsList[i].getElementsByTagName("embed"); - for (var ei = 0, il = embeds.length; ei < il; ei++) { - if (embeds[ei].src && embeds[ei].src.search(regexps.videoRe) == -1) { - embedCount++; - } - } - - var linkDensity = getLinkDensity(tagsList[i]); - var contentLength = getInnerText(tagsList[i]).length; - var toRemove = false; - - if (img > p && img > 1) { - toRemove = true; - } else if (li > p && tag != "ul" && tag != "ol") { - toRemove = true; - } else if (input > Math.floor(p / 3)) { - toRemove = true; - } else if (contentLength < 25 && (img == 0 || img > 2)) { - toRemove = true; - } else if (weight < 25 && linkDensity > .2) { - toRemove = true; - } else if (weight >= 25 && linkDensity > .5) { - toRemove = true; - } else if ((embedCount == 1 && contentLength < 75) || embedCount > 1) { - toRemove = true; - } - - if (toRemove) { - tagsList[i].parentNode.removeChild(tagsList[i]); - } - } - } -} - -/** - * Clean out spurious headers from an Element. Checks things like classnames and link density. - * - * @param Element - * @return void - **/ -function cleanHeaders (e) { - for (var headerIndex = 1; headerIndex < 7; headerIndex++) { - var headers = e.getElementsByTagName('h' + headerIndex); - for (var i = headers.length - 1; i >= 0; --i) { - if (getClassWeight(headers[i]) < 0 || getLinkDensity(headers[i]) > 0.33) { - headers[i].parentNode.removeChild(headers[i]); - } - } - } -} - -/** - * Remove the header that doesn't have next sibling. - * - * @param Element - * @return void - **/ - -function cleanSingleHeader (e) { - for (var headerIndex = 1; headerIndex < 7; headerIndex++) { - var headers = e.getElementsByTagName('h' + headerIndex); - for (var i = headers.length - 1; i >= 0; --i) { - if (headers[i].nextSibling === null) { - headers[i].parentNode.removeChild(headers[i]); - } - } - } - -} - -function prepArticle (articleContent) { - cleanStyles(articleContent); - killBreaks(articleContent); - - /* Clean out junk from the article content */ - clean(articleContent, "form"); - clean(articleContent, "object"); - clean(articleContent, "h1"); - /** - * If there is only one h2, they are probably using it - * as a header and not a subheader, so remove it since we already have a header. - ***/ - if (articleContent.getElementsByTagName('h2').length == 1) clean(articleContent, "h2"); - - clean(articleContent, "iframe"); - - cleanHeaders(articleContent); - - /* Do these last as the previous stuff may have removed junk that will affect these */ - cleanConditionally(articleContent, "table"); - cleanConditionally(articleContent, "ul"); - cleanConditionally(articleContent, "div"); - - /* Remove extra paragraphs */ - var articleParagraphs = articleContent.getElementsByTagName('p'); - for (var i = articleParagraphs.length - 1; i >= 0; i--) { - var imgCount = articleParagraphs[i].getElementsByTagName('img').length; - var embedCount = articleParagraphs[i].getElementsByTagName('embed').length; - var objectCount = articleParagraphs[i].getElementsByTagName('object').length; - - if (imgCount == 0 && embedCount == 0 && objectCount == 0 && getInnerText(articleParagraphs[i], false) == '') { - articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); - } - } - - cleanSingleHeader(articleContent); - - try { - articleContent.innerHTML = articleContent.innerHTML.replace(/]*>\s*

1) { - if (betterTitle) return self.cache['article-title'] = title; - betterTitle = tmpArray[0].trim(); + if (html.indexOf('<') === -1) { + fetchUrl(html, options, jsdomParse); + } else { + jsdomParse(null, null, html); } - }); - - if (betterTitle && betterTitle.length > 10) { - return this.cache['article-title'] = betterTitle; - } - - return this.cache['article-title'] = title; -}; -Readability.prototype.getDocument = function () { - return this._document; -}; + function jsdomParse(error, meta, body) { + if (error) { + return callback(error); + } -Readability.prototype.getHTML = function () { - return this._document.getElementsByTagName('html')[0].innerHTML; -}; + if (typeof body !== 'string') body = body.toString(); -function read(html, options, callback) { - if (typeof options === 'function') { - callback = options; - options = {}; - } - - if (html.indexOf('<') === -1) { - fetchUrl(html, options, jsdomParse); - } else { - jsdomParse(null, null, html); - } - - function jsdomParse(error, meta, body) { - if (error) { - return callback(error); + var $ = cheerio.load(body); + if (!$) { + callback(new Error('parse html error'), null); + } else { + callback(null, $); + } } - - if (typeof body !== 'string') body = body.toString(); - jsdom.env({ - html: body, - done: function (errors, window) { - if (errors) return callback(errors); - if (!window.document.body) return callback(new Error('No body tag was found.')); - callback(null, new Readability(window.document, options)); - } - }); - } } module.exports.read = read;