diff --git a/plugins/env.readability.js b/plugins/env.readability.js index fffc3a65..bc3e79d4 100644 --- a/plugins/env.readability.js +++ b/plugins/env.readability.js @@ -9,30 +9,37 @@ load('plugins/vendor/readability.js'); Envjs.scriptTypes[""] = false; Envjs.scriptTypes["javascript"] = false; Envjs.scriptTypes["text/javascript"] = false; -document.async = false; +document.async = true; var readStyle='style-newspaper', readSize='size-medium', - readMargin='margin-medium', + readMargin='margin-wide', start = new Date().getTime(), docs = { - 'http://www.loc.gov/pictures/collection/bbc/background.html':'background.html', + 'http://www.cnn.com/2010/WORLD/europe/08/24/vbs.uk.afghanistan/index.html?hpt=C1':'cnn.html', + 'http://www.loc.gov/pictures/collection/bbc/background.html':'background.html'/*, 'http://www.loc.gov/pictures/collection/bbc/bibliographies.html':'bibliographies.html', 'http://www.loc.gov/pictures/collection/bbc/cataloging.html': 'cataloging.html', 'http://www.loc.gov/pictures/collection/bbc/digitizing.html': 'digitizing.html', 'http://www.loc.gov/pictures/collection/bbc/sets.html': 'sets.html', - 'http://www.loc.gov/pictures/collection/bbc/tinker_evers_chance.html': 'tinkers_evers_chance.html' + 'http://www.loc.gov/pictures/collection/bbc/tinker_evers_chance.html': 'tinkers_evers_chance.html'*/ /*'http://timesofindia.indiatimes.com/india/Railways-rot-as-Mamata-plays-politics-in-Bengal/articleshow/6193608.cms': '6193608.html'*/ /*'http://www.articlesbase.com/communication-articles/difference-between-analog-and-digital-69824.html': 'article1.html'*/ }; - + +jQuery(document).ready(function(){ + console.log('document ready') + makeReadable(name); +}); + jQuery.each(docs, function(doc, name){ console.log('loading document %s', doc); start = new Date().getTime(); document.location = doc; - makeReadable(name); }); + + function makeReadable(name){ console.log('document ready : %s (%s)', window.location, new Date().getTime()-start); try{ diff --git 
a/plugins/vendor/readability.js b/plugins/vendor/readability.js index defa809d..94ef34a0 100644 --- a/plugins/vendor/readability.js +++ b/plugins/vendor/readability.js @@ -13,10 +13,11 @@ var dbg = (typeof console !== 'undefined') ? function(s) { * Readability is licensed under the Apache License, Version 2.0. **/ var readability = { - version: '1.6.2', + version: '1.7.1', emailSrc: 'http://lab.arc90.com/experiments/readability/email.php', iframeLoads: 0, convertLinksToFootnotes: false, + reversePageScroll: false, /* If they hold shift and hit space, scroll up */ frameHack: false, /** * The frame hack is to workaround a firefox bug where if you * pull content out of a frame and stick it into the parent element, the scrollbar won't appear. @@ -24,30 +25,37 @@ var readability = { **/ biggestFrame: false, bodyCache: null, /* Cache the body HTML in case we need to re-use it later */ - flags: 0x1 | 0x2 | 0x4, /* Start with both flags set. */ - + flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ + /* constants */ - FLAG_STRIP_UNLIKELYS: 0x1, - FLAG_WEIGHT_CLASSES: 0x2, - FLAG_CLEAN_CONDITIONALLY: 0x4, + FLAG_STRIP_UNLIKELYS: 0x1, + FLAG_WEIGHT_CLASSES: 0x2, + FLAG_CLEAN_CONDITIONALLY: 0x4, + + maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ + parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ + pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ /** * All of the regular expressions in use within readability. * Defined up here so we don't instantiate them repeatedly in loops. 
**/ regexps: { - unlikelyCandidatesRe: /combx|comment|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor|ad-break|agegate/i, - okMaybeItsACandidateRe: /and|article|body|column|main/i, - positiveRe: /article|body|content|entry|hentry|page|pagination|post|text|blog|story/i, - negativeRe: /combx|comment|contact|foot|footer|footnote|masthead|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget/i, - divToPElementsRe: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, - replaceBrsRe: /(]*>[ \n\r\t]*){2,}/gi, - replaceFontsRe: /<(\/?)font[^>]*>/gi, - trimRe: /^\s+|\s+$/g, - normalizeRe: /\s{2,}/g, - killBreaksRe: /((\s| ?)*){1,}/g, - videoRe: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, - skipFootnoteLinkRe: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i + unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i, + okMaybeItsACandidate: /and|article|body|column|main|shadow/i, + positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, + negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, + extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i, + divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, + replaceBrs: /(]*>[ \n\r\t]*){2,}/gi, + replaceFonts: /<(\/?)font[^>]*>/gi, + trim: /^\s+|\s+$/g, + normalize: /\s{2,}/g, + killBreaks: /((\s| ?)*){1,}/g, + videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, + skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, + nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last. 
+ prevLink: /(prev|earl|old|new|<|«)/i }, /** @@ -63,28 +71,22 @@ var readability = { * @return void **/ init: function() { - /* Before we do anything, remove all scripts that are not readability. */ - window.onload = window.onunload = function() {}; - var scripts = document.getElementsByTagName('script'); - for(var i = scripts.length-1; i >= 0; i--) - { - if(typeof(scripts[i].src) == "undefined" || (scripts[i].src.indexOf('readability') == -1 && scripts[i].src.indexOf('typekit') == -1)) - { - scripts[i].nodeValue=""; - scripts[i].removeAttribute('src'); - scripts[i].parentNode.removeChild(scripts[i]); - } - } - - dbg('caching document body'); + window.onload = window.onunload = function() {}; + + readability.removeScripts(document); + if(document.body && !readability.bodyCache) { - readability.bodyCache = document.body.cloneNode(true); } - dbg('finished caching document body'); + readability.bodyCache = document.body.innerHTML; + + } + /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */ + readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; + + /* Pull out any possible next page link first */ + var nextPageLink = readability.findNextPageLink(document.body); - dbg('preping document'); readability.prepDocument(); - dbg('finished preping document'); /* Build readability's DOM tree */ var overlay = document.createElement("DIV"); @@ -94,35 +96,16 @@ var readability = { var articleContent = readability.grabArticle(); var articleFooter = readability.getArticleFooter(); - /** - * If we attempted to strip unlikely candidates on the first run through, and we ended up with no content, - * that may mean we stripped out the actual content so we couldn't parse it. So re-run init while preserving - * unlikely candidates to have a better shot at getting our content out properly. 
- **/ - if(readability.getInnerText(articleContent, false).length < 250) - { - if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { - readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); - document.body = readability.bodyCache; - return readability.init(); - } - else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { - readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); - document.body = readability.bodyCache; - return readability.init(); - } - else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { - readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); - document.body = readability.bodyCache; - return readability.init(); - } - else { - articleContent.innerHTML = [ - "

Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please let us know by submitting an issue.

", - (readability.frameHack ? "

It appears this page uses frames. Unfortunately, browser security properties often cause Readability to fail on pages that include frames. You may want to try running readability itself on this source page: " + readability.biggestFrame.src + "

" : ""), - "

Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.

" - ].join(''); - } + if(!articleContent) { + articleContent = document.createElement("DIV"); + articleContent.id = "readability-content"; + articleContent.innerHTML = [ + "

Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please let us know by submitting an issue.

", + (readability.frameHack ? "

It appears this page uses frames. Unfortunately, browser security properties often cause Readability to fail on pages that include frames. You may want to try running readability itself on this source page: " + readability.biggestFrame.src + "

" : ""), + "

Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.

" + ].join(''); + + nextPageLink = null; } overlay.id = "readOverlay"; @@ -138,22 +121,21 @@ var readability = { } innerDiv.className = readMargin + " " + readSize; - if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes == true) { + if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) { readability.convertLinksToFootnotes = true; } /* Glue the structure of our document together. */ - // articleContent.appendChild( articleFooter ); - innerDiv.appendChild( articleTitle ); - innerDiv.appendChild( articleContent ); - innerDiv.appendChild( articleFooter ); - overlay.appendChild( articleTools ); - overlay.appendChild( innerDiv ); + innerDiv.appendChild( articleTitle ); + innerDiv.appendChild( articleContent ); + innerDiv.appendChild( articleFooter ); + overlay.appendChild( articleTools ); + overlay.appendChild( innerDiv ); /* Clear the old HTML, insert the new content. */ document.body.innerHTML = ""; document.body.insertBefore(overlay, document.body.firstChild); - document.body.removeAttribute('style'); + document.body.removeAttribute('style'); if(readability.frameHack) { @@ -175,13 +157,8 @@ var readability = { innerDiv.insertBefore( rootWarning, articleContent ); } -// document.body.style.display = "block"; - - if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) { - readability.addFootnotes(articleContent); - } - readability.fixImageFloats(articleContent); + readability.postProcessContent(articleContent); window.scrollTo(0, 0); @@ -189,28 +166,83 @@ var readability = { if (readStyle == "style-athelas" || readStyle == "style-apertura") { readability.useRdbTypekit(); } + + if (nextPageLink) { + /** + * Append any additional pages after a small timeout so that people + * can start reading without having to wait for this to finish processing. 
+ **/ + window.setTimeout(function() { + readability.appendNextPage(nextPageLink); + }, 500); + } + + /** Smooth scrolling **/ + document.onkeydown = function(e) { + var code = (window.event) ? event.keyCode : e.keyCode; + if (code === 16) { + readability.reversePageScroll = true; + return; + } + + if (code === 32) { + readability.curScrollStep = 0; + var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight); + + if(readability.reversePageScroll) { + readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); + } + else { + readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); + } + + return false; + } + }; + + document.onkeyup = function(e) { + var code = (window.event) ? event.keyCode : e.keyCode; + if (code === 16) { + readability.reversePageScroll = false; + return; + } + } }, - /** - * Some content ends up looking ugly if the image is too large to be floated. - * If the image is wider than a threshold (currently 55%), no longer float it, - * center it instead. - * - * @param Element - * @return void - **/ - fixImageFloats: function (articleContent) { - var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55, - images = articleContent.getElementsByTagName('img'); - - for(var i=0, il = images.length; i < il; i++) { - var image = images[i]; - - if(image.offsetWidth > imageWidthThreshold) { - image.className += " blockImage"; - } - } - }, + /** + * Run any post-process modifications to article content as necessary. 
+ * + * @param Element + * @return void + **/ + postProcessContent: function(articleContent) { + if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) { + readability.addFootnotes(articleContent); + } + + readability.fixImageFloats(articleContent); + }, + + /** + * Some content ends up looking ugly if the image is too large to be floated. + * If the image is wider than a threshold (currently 55%), no longer float it, + * center it instead. + * + * @param Element + * @return void + **/ + fixImageFloats: function (articleContent) { + var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55, + images = articleContent.getElementsByTagName('img'); + + for(var i=0, il = images.length; i < il; i++) { + var image = images[i]; + + if(image.offsetWidth > imageWidthThreshold) { + image.className += " blockImage"; + } + } + }, /** * Get the article tools Element that has buttons like reload, print, email. @@ -272,7 +304,7 @@ var readability = { } } - curTitle = curTitle.replace( readability.regexps.trimRe, "" ); + curTitle = curTitle.replace( readability.regexps.trim, "" ); if(curTitle.split(' ').length <= 4) { curTitle = origTitle; @@ -299,21 +331,21 @@ var readability = { // var statsQueryParams = "?readStyle=" + encodeURIComponent(readStyle) + "&readMargin=" + encodeURIComponent(readMargin) + "&readSize=" + encodeURIComponent(readSize); /* TODO: attach this to an image */ - articleFooter.id = "readFooter"; - articleFooter.innerHTML = [ - "", - ""].join(''); - + articleFooter.id = "readFooter"; + articleFooter.innerHTML = [ + "", + ""].join(''); + return articleFooter; }, @@ -340,12 +372,14 @@ var readability = { } } + document.body.id = "readabilityBody"; + var frames = document.getElementsByTagName('frame'); if(frames.length > 0) { var bestFrame = null; var bestFrameSize = 0; /* The frame to try to run readability upon. Must be on same domain. */ - var biggestFrameSize = 0; /* Used for the error message. 
Can be on any domain. */ + var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. */ for(var frameIndex = 0; frameIndex < frames.length; frameIndex++) { var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight; @@ -358,10 +392,10 @@ var readability = { dbg(eFrames); } - if(frameSize > biggestFrameSize) { - biggestFrameSize = frameSize; - readability.biggestFrame = frames[frameIndex]; - } + if(frameSize > biggestFrameSize) { + biggestFrameSize = frameSize; + readability.biggestFrame = frames[frameIndex]; + } if(canAccessFrame && frameSize > bestFrameSize) { @@ -385,7 +419,7 @@ var readability = { } } - /* remove all stylesheets */ + /* Remove all stylesheets */ for (var k=0;k < document.styleSheets.length; k++) { if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") == -1) { document.styleSheets[k].disabled = true; @@ -395,20 +429,12 @@ var readability = { /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */ var styleTags = document.getElementsByTagName("style"); for (var st=0;st < styleTags.length; st++) { - if (navigator.appName != "Microsoft Internet Explorer") { - styleTags[st].textContent = ""; } + styleTags[st].textContent = ""; } /* Turn all double br's into p's */ /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ - /* Thatcher: edited for some optimization in envjs */ - /*document.body.innerHTML = readability.bodyCache ? - readability.bodyCache. - replace(readability.regexps.replaceBrsRe, '

'). - replace(readability.regexps.replaceFontsRe, '<$1span>') : - document.body.innerHTML. - replace(readability.regexps.replaceBrsRe, '

'). - replace(readability.regexps.replaceFontsRe, '<$1span>') ;*/ + document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '

').replace(readability.regexps.replaceFonts, '<$1span>'); }, /** @@ -418,15 +444,29 @@ var readability = { * @return void **/ addFootnotes: function(articleContent) { - var footnotesWrapper = document.createElement('div'); - footnotesWrapper.innerHTML = "

References

"; - - var articleFootnotes = document.createElement('ol'); - footnotesWrapper.appendChild(articleFootnotes); + var footnotesWrapper = document.getElementById('readability-footnotes'), + articleFootnotes = document.getElementById('readability-footnotes-list'); + if(!footnotesWrapper) { + footnotesWrapper = document.createElement("DIV"); + footnotesWrapper.id = 'readability-footnotes'; + footnotesWrapper.innerHTML = '

References

'; + footnotesWrapper.style.display = 'none'; /* Until we know we have footnotes, don't show the references block. */ + + articleFootnotes = document.createElement('ol'); + articleFootnotes.id = 'readability-footnotes-list'; + + footnotesWrapper.appendChild(articleFootnotes); + + var readFooter = document.getElementById('readFooter'); + + if(readFooter) { + readFooter.parentNode.insertBefore(footnotesWrapper, readFooter); + } + } + var articleLinks = articleContent.getElementsByTagName('a'); - - var linkCount = 0; + var linkCount = articleFootnotes.getElementsByTagName('li').length; for (var i = 0; i < articleLinks.length; i++) { var articleLink = articleLinks[i], @@ -436,7 +476,7 @@ var readability = { linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, linkText = readability.getInnerText(articleLink); - if(articleLink.className && articleLink.className.indexOf('readability-DoNotFootnote') !== -1 || linkText.match(readability.regexps.skipFootnoteLinkRe)) { + if(articleLink.className && articleLink.className.indexOf('readability-DoNotFootnote') !== -1 || linkText.match(readability.regexps.skipFootnoteLink)) { continue; } @@ -444,9 +484,9 @@ var readability = { /** Add a superscript reference after the article link */ refLink.href = '#readabilityFootnoteLink-' + linkCount; - refLink.innerHTML = '[' + linkCount + ']' + refLink.innerHTML = '[' + linkCount + ']'; refLink.className = 'readability-DoNotFootnote'; - refLink.style.color = 'inherit'; + try { refLink.style.color = 'inherit'; } catch(e) {} /* IE7 doesn't like inherit. */ if(articleLink.parentNode.lastChild == articleLink) { articleLink.parentNode.appendChild(refLink); @@ -454,8 +494,8 @@ var readability = { articleLink.parentNode.insertBefore(refLink, articleLink.nextSibling); } - articleLink.style.color = 'inherit'; articleLink.name = 'readabilityLink-' + linkCount; + try { articleLink.style.color = 'inherit'; } catch(e) {} /* IE7 doesn't like inherit. 
*/ footnote.innerHTML = "^ "; @@ -469,7 +509,7 @@ var readability = { } if(linkCount > 0) { - articleContent.appendChild(footnotesWrapper); + footnotesWrapper.style.display = 'block'; } }, @@ -551,7 +591,8 @@ var readability = { * as a header and not a subheader, so remove it since we already have a header. ***/ if(articleContent.getElementsByTagName('h2').length == 1) { - readability.clean(articleContent, "h2"); } + readability.clean(articleContent, "h2"); + } readability.clean(articleContent, "iframe"); readability.cleanHeaders(articleContent); @@ -563,14 +604,12 @@ var readability = { /* Remove extra paragraphs */ var articleParagraphs = articleContent.getElementsByTagName('p'); - for(var i = articleParagraphs.length-1; i >= 0; i--) - { + for(var i = articleParagraphs.length-1; i >= 0; i--) { var imgCount = articleParagraphs[i].getElementsByTagName('img').length; var embedCount = articleParagraphs[i].getElementsByTagName('embed').length; var objectCount = articleParagraphs[i].getElementsByTagName('object').length; - if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) == '') - { + if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) == '') { articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); } } @@ -633,10 +672,18 @@ var readability = { * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. * + * @param page a document to run upon. Needs to be a full document, complete with body. * @return Element **/ - grabArticle: function () { - var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS); + grabArticle: function (page) { + var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS), + isPaging = (page !== null) ? 
true: false; + + page = page ? page : document.body; + + var pageCacheHtml = page.innerHTML; + + var allElements = page.getElementsByTagName('*'); /** * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs @@ -647,20 +694,21 @@ var readability = { **/ var node = null; var nodesToScore = []; - for(var nodeIndex = 0; (node = document.getElementsByTagName('*')[nodeIndex]); nodeIndex++) - { + for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex++) { /* Remove unlikely candidates */ if (stripUnlikelyCandidates) { var unlikelyMatchString = node.className + node.id; if ( - unlikelyMatchString.search(readability.regexps.unlikelyCandidatesRe) !== -1 && - unlikelyMatchString.search(readability.regexps.okMaybeItsACandidateRe) == -1 && - node.tagName !== "BODY" - ) + ( + unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 && + unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) == -1 && + node.tagName !== "BODY" + ) + ) { dbg("Removing unlikely candidate - " + unlikelyMatchString); node.parentNode.removeChild(node); - nodeIndex--; + //nodeIndex--; continue; } } @@ -671,13 +719,12 @@ var readability = { /* Turn all divs that don't have children block level elements into p's */ if (node.tagName === "DIV") { - if (node.innerHTML.search(readability.regexps.divToPElementsRe) === -1) { - dbg("Altering div to p"); + if (node.innerHTML.search(readability.regexps.divToPElements) === -1) { var newNode = document.createElement('p'); try { newNode.innerHTML = node.innerHTML; node.parentNode.replaceChild(newNode, node); - nodeIndex--; + //nodeIndex--; nodesToScore[nodesToScore.length] = node; } @@ -691,7 +738,6 @@ var readability = { for(var i = 0, il = node.childNodes.length; i < il; i++) { var childNode = node.childNodes[i]; if(childNode.nodeType == 3) { // Node.TEXT_NODE - dbg("replacing text node with a p tag with the same content."); var p = document.createElement('p'); 
p.innerHTML = childNode.nodeValue; p.style.display = 'inline'; @@ -724,15 +770,13 @@ var readability = { continue; } /* Initialize readability data for the parent. */ - if(typeof parentNode.readability == 'undefined') - { + if(typeof parentNode.readability == 'undefined') { readability.initializeNode(parentNode); candidates.push(parentNode); } /* Initialize readability data for the grandparent. */ - if(grandParentNode && typeof(grandParentNode.readability) == 'undefined' && typeof(grandParentNode.tagName) != 'undefined') - { + if(grandParentNode && typeof(grandParentNode.readability) == 'undefined' && typeof(grandParentNode.tagName) != 'undefined') { readability.initializeNode(grandParentNode); candidates.push(grandParentNode); } @@ -782,9 +826,9 @@ var readability = { if (topCandidate === null || topCandidate.tagName == "BODY") { topCandidate = document.createElement("DIV"); - topCandidate.innerHTML = document.body.innerHTML; - document.body.innerHTML = ""; - document.body.appendChild(topCandidate); + topCandidate.innerHTML = page.innerHTML; + page.innerHTML = ""; + page.appendChild(topCandidate); readability.initializeNode(topCandidate); } @@ -793,16 +837,25 @@ var readability = { * Things like preambles, content split by ads that we removed, etc. **/ var articleContent = document.createElement("DIV"); + if (isPaging) { articleContent.id = "readability-content"; + } var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); var siblingNodes = topCandidate.parentNode.childNodes; - for(var s=0, sl=siblingNodes.length; s < sl; s++) - { + for(var s=0, sl=siblingNodes.length; s < sl; s++) { var siblingNode = siblingNodes[s]; var append = false; + /** + * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList. 
+ * Example of error visible here: http://www.esquire.com/features/honesty0707 + **/ + if(!siblingNode) { + continue; + } + dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability != 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown')); @@ -837,8 +890,7 @@ var readability = { } } - if(append) - { + if(append) { dbg("Appending node: " + siblingNode); var nodeToAppend = null; @@ -846,13 +898,12 @@ var readability = { /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); - nodeToAppend = document.createElement('div'); + nodeToAppend = document.createElement("DIV"); try { nodeToAppend.id = siblingNode.id; nodeToAppend.innerHTML = siblingNode.innerHTML; } - catch(e) - { + catch(er) { dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); nodeToAppend = siblingNode; s--; @@ -872,15 +923,63 @@ var readability = { } } - /** * So we have all of the content that we need. Now we clean it up for presentation. **/ readability.prepArticle(articleContent); + + if (readability.curPageNum === 1) { + articleContent.innerHTML = '
' + articleContent.innerHTML + '
'; + } + + /** + * Now that we've gone through the full algorithm, check to see if we got any meaningful content. + * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher + * likelihood of finding the content, and the sieve approach gives us a higher likelihood of + * finding the -right- content. + **/ + if(readability.getInnerText(articleContent, false).length < 250) { + page.innerHTML = pageCacheHtml; + + if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { + readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); + return readability.grabArticle(page); + } + else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { + readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); + return readability.grabArticle(page); + } + else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { + readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); + return readability.grabArticle(page); + } else { + return null; + } + } return articleContent; }, + /** + * Removes script tags from the document. + * + * @param Element + **/ + removeScripts: function (doc) { + var scripts = doc.getElementsByTagName('script'); + for(var i = scripts.length-1; i >= 0; i--) + { + if(typeof(scripts[i].src) == "undefined" || (scripts[i].src.indexOf('readability') == -1 && scripts[i].src.indexOf('typekit') == -1)) + { + scripts[i].nodeValue=""; + scripts[i].removeAttribute('src'); + if (scripts[i].parentNode) { + scripts[i].parentNode.removeChild(scripts[i]); + } + } + } + }, + /** * Get the inner text of a node - cross browser compatibly. * This also strips out any excess whitespace to be found. 
@@ -891,19 +990,19 @@ var readability = { getInnerText: function (e, normalizeSpaces) { var textContent = ""; - if(typeof(e.textContent) == "undefined" && typeof(e.innerText) == "undefined") { - return ""; - } + if(typeof(e.textContent) == "undefined" && typeof(e.innerText) == "undefined") { + return ""; + } normalizeSpaces = (typeof normalizeSpaces == 'undefined') ? true : normalizeSpaces; if (navigator.appName == "Microsoft Internet Explorer") { - textContent = e.innerText.replace( readability.regexps.trimRe, "" ); } + textContent = e.innerText.replace( readability.regexps.trim, "" ); } else { - textContent = e.textContent.replace( readability.regexps.trimRe, "" ); } + textContent = e.textContent.replace( readability.regexps.trim, "" ); } if(normalizeSpaces) { - return textContent.replace( readability.regexps.normalizeRe, " "); } + return textContent.replace( readability.regexps.normalize, " "); } else { return textContent; } }, @@ -970,6 +1069,410 @@ var readability = { return linkLength / textLength; }, + /** + * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. + * + * @author Dan Lacy + * @return string the base url + **/ + findBaseUrl: function () { + var noUrlParams = window.location.pathname.split("?")[0], + urlSlashes = noUrlParams.split("/").reverse(), + cleanedSegments = [], + possibleType = ""; + + for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i++) { + var segment = urlSlashes[i]; + + // Split off and save anything that looks like a file type. + if (segment.indexOf(".") !== -1) { + possibleType = segment.split(".")[1]; + + /* If the type isn't alpha-only, it's probably not actually a file extension. */ + if(!possibleType.match(/[^a-zA-Z]/)) { + segment = segment.split(".")[0]; + } + } + + /** + * EW-CMS specific segment replacement. Ugly. 
+ * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html + **/ + if(segment.indexOf(',00') !== -1) { + segment = segment.replace(',00', ''); + } + + // If our first or second segment has anything looking like a page number, remove it. + if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) { + segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, ""); + } + + + del = false; + + /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */ + if (i < 2 && segment.match(/^\d{1,2}$/)) { + del = true; + } + + /* If this is the first segment and it's just "index", remove it. */ + if(i === 0 && segment.toLowerCase() == "index") + del = true; + + /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */ + if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) + del = true; + + /* If it's not marked for deletion, push it to cleanedSegments. */ + if (!del) { + cleanedSegments.push(segment); + } + } + + // This is our final, cleaned, base article URL. + return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/"); + }, + + /** + * Look for any paging links that may occur within the document. + * + * @param body + * @return object (array) + **/ + findNextPageLink: function (elem) { + var possiblePages = {}, + allLinks = elem.getElementsByTagName('a'), + articleBaseUrl = readability.findBaseUrl(); + + /** + * Loop through all links, looking for hints that they may be next-page links. + * Things like having "page" in their textContent, className or id, or being a child + * of a node with a page-y className or id. + * + * Also possible: levenshtein distance? longest common subsequence? 
+ * + * After we do that, assign each page a score, and + **/ + for(i = 0, il = allLinks.length; i < il; i++) { + var link = allLinks[i], + linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ''); + + /* If we've already seen this page, ignore it */ + if(linkHref == "" || linkHref == articleBaseUrl || linkHref == window.location.href || linkHref in readability.parsedPages) { + continue; + } + + /* If it's on a different domain, skip it. */ + if(window.location.host != linkHref.split(/\/+/g)[1]) { + continue; + } + + var linkText = readability.getInnerText(link); + + /* If the linkText looks like it's not the next page, skip it. */ + if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) { + continue; + } + + /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */ + var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); + if(!linkHrefLeftover.match(/\d/)) { + continue; + } + + if(!(linkHref in possiblePages)) { + possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; + } else { + possiblePages[linkHref].linkText += ' | ' + linkText; + } + + linkObj = possiblePages[linkHref]; + + /** + * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower. + * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html + **/ + if(linkHref.indexOf(articleBaseUrl) !== 0) { + linkObj.score -= 25; + } + + var linkData = linkText + ' ' + link.className + ' ' + link.id; + if(linkData.match(readability.regexps.nextLink)) { + linkObj.score += 50; + } + if(linkData.match(/pag(e|ing|inat)/i)) { + linkObj.score += 25; + } + if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, + /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. 
*/ + if(!linkObj.linkText.match(readability.regexps.nextLink)) + linkObj.score -= 65; + } + if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) { + linkObj.score -= 50; + } + if(linkData.match(readability.regexps.prevLink)) { + linkObj.score -= 200; + } + + /* If a parentNode contains page or paging or paginat */ + var parentNode = link.parentNode, + positiveNodeMatch = false, + negativeNodeMatch = false; + while(parentNode) { + var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id; + if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) { + positiveNodeMatch = true; + linkObj.score += 25; + } + if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) { + /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */ + if(!parentNodeClassAndId.match(readability.regexps.positive)) { + linkObj.score -= 25; + negativeNodeMatch = true; + } + } + + parentNode = parentNode.parentNode; + } + + /** + * If the URL looks like it has paging in it, add to the score. + * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 + **/ + if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) { + linkObj.score += 25; + } + + /* If the URL contains negative values, give a slight decrease. */ + if (linkHref.match(readability.regexps.extraneous)) { + linkObj.score -= 15; + } + + /** + * Minor punishment to anything that doesn't match our current URL. + * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points. + * Dan, can you show me a counterexample where this is necessary? 
+ * if (linkHref.indexOf(window.location.href) !== 0) { + * linkObj.score -= 1; + * } + **/ + + /** + * If the link text can be parsed as a number, give it a minor bonus, with a slight + * bias towards lower numbered pages. This is so that pages that might not have 'next' + * in their text can still get scored, and sorted properly by score. + **/ + linkTextAsNumber = parseInt(linkText, 10); + if(linkTextAsNumber) { + // Punish 1 since we're either already there, or it's probably before what we want anyways. + if (linkTextAsNumber === 1) { + linkObj.score -= 10; + } + else { + // Todo: Describe this better + linkObj.score += Math.max(0, 10 - linkTextAsNumber); + } + } + } + + /** + * Loop thrugh all of our possible pages from above and find our top candidate for the next page URL. + * Require at least a score of 50, which is a relatively high confidence that this page is the next link. + **/ + var topPage = null; + for(var page in possiblePages) { + if(possiblePages.hasOwnProperty(page)) { + if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) { + topPage = possiblePages[page]; + } + } + } + + if(topPage) { + var nextHref = topPage.href.replace(/\/$/,''); + + dbg('NEXT PAGE IS ' + nextHref); + readability.parsedPages[nextHref] = true; + return nextHref; + } + else { + return null; + } + }, + + /** + * Build a simple cross browser compatible XHR. + * + * TODO: This could likely be simplified beyond what we have here right now. There's still a bit of excess junk. 
+ **/ + xhr: function () { + if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) { + return new XMLHttpRequest(); + } + else { + try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { } + try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { } + try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { } + } + + return false; + }, + + successfulRequest: function (request) { + return (request.status >= 200 && request.status < 300) || request.status == 304 || (request.status === 0 && request.responseText); + }, + + ajax: function (url, options) { + var request = readability.xhr(); + + function respondToReadyState(readyState) { + if (request.readyState == 4) { + if (readability.successfulRequest(request)) { + if (options.success) { options.success(request); } + } + else { + if (options.error) { options.error(request); } + } + } + } + + if (typeof options === 'undefined') { options = {}; } + + request.onreadystatechange = respondToReadyState; + + request.open('get', url, true); + request.setRequestHeader('Accept', 'text/html'); + + try { + request.send(options.postBody); + } + catch (e) { + if (options.error) { options.error(); } + } + + return request; + }, + + /** + * Make an AJAX request for each page and append it to the document. + **/ + curPageNum: 1, + + appendNextPage: function (nextPageLink) { + readability.curPageNum++; + + var articlePage = document.createElement("DIV"); + articlePage.id = 'readability-page-' + readability.curPageNum; + articlePage.className = 'page'; + articlePage.innerHTML = '

§

'; + + document.getElementById("readability-content").appendChild(articlePage); + + if(readability.curPageNum > readability.maxPages) { + var nextPageLink = "
View Next Page
"; + + articlePage.innerHTML = articlePage.innerHTML + nextPageLink; + return; + } + + /** + * Now that we've built the article page DOM element, get the page content + * asynchronously and load the cleaned content into the div we created for it. + * + * Todo: try using a self-calling function rather than with + * + * Yes, "with statement is considered harmful". But this is using with as a replacement for let which is in ecmascript 1.7, so it's okay. + * See here: http://stackoverflow.com/questions/61552/are-there-legitimate-uses-for-javascripts-with-statement#answer-185283 + **/ + with({pageUrl: nextPageLink, thisPage: articlePage}) { + readability.ajax(pageUrl, { + success: function(r) { + + /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ + var eTag = r.getResponseHeader('ETag'); + if(eTag) { + if(eTag in readability.pageETags) { + dbg("Exact duplicate page found via ETag. Aborting."); + articlePage.style.display = 'none'; + return; + } else { + readability.pageETags[eTag] = 1; + } + } + + // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. + var page = document.createElement("DIV"); + + /** + * Do some preprocessing to our HTML to make it ready for appending. + * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. + * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript. + * • Turn all double br's into p's - was handled by prepDocument in the original view. + * Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages. + **/ + var responseHtml = r.responseText. + replace(/\n/g,'\uffff').replace(/.*?<\/script>/gi, ''). + replace(/\uffff/g,'\n'). + replace(/<(\/?)noscript/gi, '<$1div'). + replace(readability.regexps.replaceBrs, '

'). + replace(readability.regexps.replaceFonts, '<$1span>'); + + page.innerHTML = responseHtml; + + /** + * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle. + **/ + readability.flags = 0x1 | 0x2 | 0x4; + + var nextPageLink = readability.findNextPageLink(page), + content = readability.grabArticle(page); + + if(!content) { + dbg("No content found in page to append. Aborting.") + return; + } + + /** + * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page. + * Compare it against all of the the previous document's we've gotten. If the previous + * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate. + **/ + firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null; + if(firstP && firstP.innerHTML.length > 100) { + for(var i=1; i <= readability.curPageNum; i++) { + var rPage = document.getElementById('readability-page-' + i); + if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) { + dbg('Duplicate of page ' + i + ' - skipping.'); + articlePage.style.display = 'none'; + readability.parsedPages[pageUrl] = true; + return; + } + } + } + + readability.removeScripts(content); + + thisPage.innerHTML = thisPage.innerHTML + content.innerHTML; + + /** + * After the page has rendered, post process the content. This delay is necessary because, + * in webkit at least, offsetWidth is not set in time to determine image width. We have to + * wait a little bit for reflow to finish before we can fix floating images. + **/ + window.setTimeout( + function() { readability.postProcessContent(thisPage); }, + 500 + ); + + if(nextPageLink) { + readability.appendNextPage(nextPageLink); + } + } + }); + } + }, + /** * Get an elements class/id weight. Uses regular expressions to tell if this * element looks good or bad. 
@@ -987,29 +1490,29 @@ var readability = { /* Look for a special classname */ if (typeof(e.className) === 'string' && e.className != '') { - if(e.className.search(readability.regexps.negativeRe) !== -1) { + if(e.className.search(readability.regexps.negative) !== -1) { weight -= 25; } - if(e.className.search(readability.regexps.positiveRe) !== -1) { + if(e.className.search(readability.regexps.positive) !== -1) { weight += 25; } } /* Look for a special ID */ if (typeof(e.id) === 'string' && e.id != '') { - if(e.id.search(readability.regexps.negativeRe) !== -1) { + if(e.id.search(readability.regexps.negative) !== -1) { weight -= 25; } - if(e.id.search(readability.regexps.positiveRe) !== -1) { + if(e.id.search(readability.regexps.positive) !== -1) { weight += 25; } } return weight; }, - nodeIsVisible: function (node) { - return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none'; - }, + nodeIsVisible: function (node) { + return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none'; + }, /** * Remove extraneous break tags from a node. @@ -1019,7 +1522,7 @@ var readability = { **/ killBreaks: function (e) { try { - e.innerHTML = e.innerHTML.replace(readability.regexps.killBreaksRe,'
'); + e.innerHTML = e.innerHTML.replace(readability.regexps.killBreaks,'
'); } catch (eBreaks) { dbg("KillBreaks failed - this is an IE bug. Ignoring.: " + eBreaks); @@ -1047,12 +1550,12 @@ var readability = { } /* First, check the elements attributes to see if any of them contain youtube or vimeo */ - if (attributeValues.search(readability.regexps.videoRe) !== -1) { + if (attributeValues.search(readability.regexps.videos) !== -1) { continue; } /* Then check the elements inside this element for the same. */ - if (targetList[y].innerHTML.search(readability.regexps.videoRe) !== -1) { + if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) { continue; } @@ -1070,9 +1573,9 @@ var readability = { **/ cleanConditionally: function (e, tag) { - if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { - return; - } + if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { + return; + } var tagsList = e.getElementsByTagName(tag); var curTagsLength = tagsList.length; @@ -1106,7 +1609,7 @@ var readability = { var embedCount = 0; var embeds = tagsList[i].getElementsByTagName("embed"); for(var ei=0,il=embeds.length; ei < il; ei++) { - if (embeds[ei].src.search(readability.regexps.videoRe) == -1) { + if (embeds[ei].src.search(readability.regexps.videos) == -1) { embedCount++; } } @@ -1154,6 +1657,72 @@ var readability = { } } }, + + /*** Smooth scrolling logic ***/ + + /** + * easeInOut animation algorithm - returns an integer that says how far to move at this point in the animation. + * Borrowed from jQuery's easing library. + * @return integer + **/ + easeInOut: function(start,end,totalSteps,actualStep) { + var delta = end - start; + + if ((actualStep/=totalSteps/2) < 1) { return delta/2*actualStep*actualStep + start; } + return -delta/2 * ((--actualStep)*(actualStep-2) - 1) + start; + }, + + /** + * Helper function to, in a cross compatible way, get or set the current scroll offset of the document. 
+ * @return mixed integer on get, the result of window.scrollTo on set + **/ + scrollTop: function(scroll){ + var setScroll = typeof scroll != 'undefined'; + + if(setScroll) { + return window.scrollTo(0, scroll); + } + if(typeof window.pageYOffset != 'undefined') { + return window.pageYOffset; + } + else if(document.documentElement.clientHeight) { + return document.documentElement.scrollTop; + } + else { + return document.body.scrollTop; + } + }, + + /** + * scrollTo - Smooth scroll to the point of scrollEnd in the document. + * @return void + **/ + curScrollStep: 0, + scrollTo: function (scrollStart, scrollEnd, steps, interval) { + if( + (scrollStart < scrollEnd && readability.scrollTop() < scrollEnd) || + (scrollStart > scrollEnd && readability.scrollTop() > scrollEnd) + ) { + readability.curScrollStep++; + if(readability.curScrollStep > steps) { + return; + } + + var oldScrollTop = readability.scrollTop(); + + readability.scrollTop(readability.easeInOut(scrollStart, scrollEnd, steps, readability.curScrollStep)); + + // We're at the end of the window. + if(oldScrollTop == readability.scrollTop()) { + return; + } + + window.setTimeout(function() { + readability.scrollTo(scrollStart, scrollEnd, steps, interval); + }, interval); + } + }, + /** * Show the email popup. @@ -1167,7 +1736,7 @@ var readability = { return; } - var emailContainer = document.createElement('div'); + var emailContainer = document.createElement("DIV"); emailContainer.setAttribute('id', 'email-container'); emailContainer.innerHTML = ''; @@ -1220,4 +1789,4 @@ var readability = { }; -readability.init(); +//readability.init(); \ No newline at end of file diff --git a/reports/cnn.html b/reports/cnn.html new file mode 100644 index 00000000..ce49c856 --- /dev/null +++ b/reports/cnn.html @@ -0,0 +1,52 @@ + + + + + 'Afghan village' in British countryside is training ground for elite troops - CNN.com + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

'Afghan village' in British countryside is training ground for elite troops

Click to play

Training with 'brutal' British troops

STORY HIGHLIGHTS

  • VBS embeds with esteemed, aggressive British army unit
  • Troops described as "hardest men in the world," "most brutal" soldiers in army
  • Training facility in English countryside modeled after Afghan village

Editor's note: The staff at CNN.com has recently been intrigued by the journalism of VICE, an independent media company and website based in Brooklyn, New York. VBS.TV is Vice's broadband television network. The reports, which are produced solely by VICE, reflect a transparent approach to journalism, where viewers are taken along on every step of the reporting process. We believe this unique reporting approach is worthy of sharing with our CNN.com readers.

London, England (VBS.TV) -- In April this year I embedded with the elite soldiers of the 2nd Battalion British Parachute Regiment at their training facility in Thetford, United Kingdom. We went there to film the British army's much-talked-about, multimillion-pound "replica Afghanistan village." The focus of the piece was to be on the younger members of the battalion. Some of them are as young as 18 and only months away from their first visit to Helmand province in September 2010. The majority of British casualties are recorded in that area.

I'd heard about the Afghan village training facility from an ex-Para turned photographer called Stuart Griffiths. A couple of years ago, Vice Magazine published a photo essay in which he visited the homes of badly injured soldiers returned from Iraq or Afghanistan. Perhaps more shocking than the horrific injuries the soldiers had sustained was the lack of support for these men from the British government of the time -- particularly those who suffered psychological problems as a result of combat shock.

Stuart served his time in the Paras on tours of Northern Ireland in the late 80s to early 90s. But after being discharged he fell on hard times. He became homeless and spent a long time wandering the streets of London. He slept in cardboard boxes in doorways, often coming across fellow army veterans who faced a similar lack of support and sympathy from society, the government or the army. Happily, Stuart managed to get himself off the streets and, inspired by the stories of his fellow homeless veterans, became a campaigner of sorts for veterans' rights, using his photography. His story and those of other ex-Paras are now available to watch in a moving British documentary called "Isolation."

A week before we set off for the embed, I asked Stuart what kind of experience we were letting ourselves in for and he said: "Not only are the Parachute Regiment Britain's most elite soldiers, they are the most brutal of British troops. That means the rest of the British army are simply in awe of them. I would say around 90 per cent of the SAS (the British equivalent of the Special Forces) comes from the Paras. And so the troops you are going to be sleeping next to every night are some of the hardest men in the world."

"Recently what's happened is that many former veterans have had a "call to arms" and returned to the forces again. Life in civvy (civilian) street has been a huge disappointment for many of these guys. Once you've fought 'in-theater' a number of times it's really hard for you to come back and fit in with the hum-drum of modern life. Especially if you're a Para. They miss the camaraderie and the discipline and the action that you get when you're in combat."

See the rest of Afghanistan in the UK at VBS.TV

"When Afghanistan first kicked off, all three battalions of the Paras were sent straight there because of their reputation for violence and bravery. I remember it was in Helmand 2006 that Afghanistan became very hairy indeed. The Parachute Regiment were there when it became very intense and at the time a Member of Parliament called John Reid was quoted as saying, 'I'd be quite happy that a single shot is not fired in Afghanistan.' In reality the Parachute Regiment were firing off more rounds in Helmand than the entire Korean war!"

Grinning nervously, I asked him if there's anything else I should know and he looked at me and said, "They all drink rivers of lager."

Before we set off to meet the Paras, Stuart introduced us to a young man named Richard Dare who'd been a private in the Royal Anglians. Richard had a significant part of his brain blown out of his head by a mortar attack and was slowly rehabilitating himself at his home in a small town near Leicestershire. Richard talked to us about his love for the army and for war. The severe injuries to his brain didn't seem to change his love for army life one iota. The more we spoke to him, the more we knew we had to include him in our film.

But first we had to go meet the Paras. With the help of my co-producer Jason Mojica, we packed two small cameras into our bags and traveled to the middle of the British countryside to try and ingratiate ourselves with the most brutal soldiers in the whole of the British army.

It all started off nice and civilized. We were met on the door by a dashing officer who arranged for a private to show us our room -- a basic little number in the middle of the barracks which, while not five star, would certainly meet our needs for the week. I remember feeling pretty pleased with ourselves that we'd been given this amazing access, thinking we'd have carte blanche to wander around and check out the comings and goings of the most elite units in the world.

"Ah," we thought, "This should be an informative and relaxing few days in the countryside. I don't know what we were worried about at all."

Soon we were put in a jeep, driven an hour-and-a-half away to a gloomy brick building in the middle of nowhere and told that the next time we slept in a bed would be in five days' time. As we stepped out of the van about 200 Para troops stared at us with eyes that said: "Who are these wimps?"

The dark fell fast. A cold wind started blowing and I suddenly realized we'd left all our warm clothes in the barracks miles away. Then the sergeant major introduced himself and told us to get in line with the rest of the troops.

And off we marched into the night.

\ No newline at end of file