Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 40 additions & 22 deletions src/Readability.php
Original file line number Diff line number Diff line change
Expand Up @@ -174,14 +174,15 @@ class Readability implements LoggerAwareInterface
* @param string $parser (optional) Which parser to use for turning raw HTML into a DOMDocument
* @param bool (optional) Use tidy
*/
public function __construct($html, $url = null, $parser = 'libxml', $useTidy = true)
public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true)
{
$this->url = $url;
$this->html = $html;
$this->parser = $parser;
$this->useTidy = $useTidy && function_exists('tidy_parse_string');
$this->useTidy = $use_tidy && function_exists('tidy_parse_string');

$this->logger = new NullLogger();
$this->loadHtml();
}

public function setLogger(LoggerInterface $logger)
Expand Down Expand Up @@ -235,6 +236,8 @@ public function addPostFilter($filter, $replacer = '')
* Load HTML in a DOMDocument.
* Apply Pre filters
* Cleanup HTML using Tidy (or not).
*
* @todo This should be called in init() instead of from __construct
*/
private function loadHtml()
{
Expand Down Expand Up @@ -266,7 +269,6 @@ private function loadHtml()
* Use tidy (if it exists).
* This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
* Although sometimes it makes matters worse, which is why there is an option to disable it.
*
*/
if ($this->useTidy) {
$this->logger->debug('Tidying document');
Expand Down Expand Up @@ -314,8 +316,6 @@ private function loadHtml()
*/
public function init()
{
$this->loadHtml();

if (!isset($this->dom->documentElement)) {
return false;
}
Expand Down Expand Up @@ -372,12 +372,31 @@ public function init()
return $this->success;
}

/**
* Forward a debug message to the configured logger.
*
* Thin wrapper that passes $msg unchanged to $this->logger->debug()
* and returns nothing.
*
* @deprecated use $this->logger->debug() instead
*
* @param mixed $msg message handed as-is to the logger's debug() method
*/
protected function dbg($msg)
{
$this->logger->debug($msg);
}

/**
* Dump debug info.
*
* The body is intentionally empty: calling this method does nothing.
* NOTE(review): presumably kept only so existing callers keep working;
* confirm there are none before removing it.
*
* @deprecated log messages are now gathered by the logger (Monolog), so this is no longer needed
*/
protected function dump_dbg()
{
}

/**
* Run any post-process modifications to article content as necessary.
*
* @param \DOMElement $articleContent
*/
public function postProcessContent(\DOMElement $articleContent)
public function postProcessContent($articleContent)
{
if ($this->convertLinksToFootnotes && !preg_match('/\bwiki/', $this->url)) {
$this->addFootnotes($articleContent);
Expand Down Expand Up @@ -462,7 +481,7 @@ protected function prepDocument()
*
* @param \DOMElement $articleContent
*/
public function addFootnotes(\DOMElement $articleContent)
public function addFootnotes($articleContent)
{
$footnotesWrapper = $this->dom->createElement('footer');
$footnotesWrapper->setAttribute('class', 'readability-footnotes');
Expand Down Expand Up @@ -526,7 +545,7 @@ public function addFootnotes(\DOMElement $articleContent)
*
* @param \DOMElement $articleContent
*/
public function prepArticle(\DOMElement $articleContent)
public function prepArticle($articleContent)
{
$this->logger->debug($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.');

Expand Down Expand Up @@ -623,7 +642,7 @@ public function prepArticle(\DOMElement $articleContent)
*
* @param \DOMElement $node
*/
protected function initializeNode(\DOMElement $node)
protected function initializeNode($node)
{
if (!isset($node->tagName)) {
return;
Expand Down Expand Up @@ -694,7 +713,7 @@ protected function initializeNode(\DOMElement $node)
*
* @return \DOMElement|bool
*/
protected function grabArticle(\DOMElement $page = null)
protected function grabArticle($page = null)
{
if (!$page) {
$page = $this->dom;
Expand Down Expand Up @@ -743,8 +762,7 @@ protected function grabArticle(\DOMElement $page = null)
continue;
}

// XML_TEXT_NODE
if ($childNode->nodeType == 3) {
if ($childNode->nodeType === XML_TEXT_NODE) {
$p = $this->dom->createElement('p');
$p->innerHTML = $childNode->nodeValue;
$p->setAttribute('data-readability-styled', 'true');
Expand All @@ -770,7 +788,7 @@ protected function grabArticle(\DOMElement $page = null)
continue;
}

$grandParentNode = ($parentNode->parentNode instanceof \DOMElement) ? $parentNode->parentNode : null;
$grandParentNode = $parentNode->parentNode instanceof \DOMElement ? $parentNode->parentNode : null;
$innerText = $this->getInnerText($nodesToScore[$pt]);

// If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
Expand Down Expand Up @@ -1051,7 +1069,7 @@ protected function grabArticle(\DOMElement $page = null)
*
* @return string
*/
public function getInnerText(\DOMElement $e = null, $normalizeSpaces = true, $flattenLines = false)
public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false)
{
if (null === $e || !isset($e->textContent) || $e->textContent === '') {
return '';
Expand All @@ -1073,7 +1091,7 @@ public function getInnerText(\DOMElement $e = null, $normalizeSpaces = true, $fl
*
* @param \DOMElement $e
*/
public function cleanStyles(\DOMElement $e)
public function cleanStyles($e)
{
if (!is_object($e)) {
return;
Expand Down Expand Up @@ -1121,7 +1139,7 @@ public function getWordCount($text)
*
* @return int
*/
public function getLinkDensity(\DOMElement $e, $excludeExternal = false)
public function getLinkDensity($e, $excludeExternal = false)
{
$links = $e->getElementsByTagName('a');
$textLength = mb_strlen($this->getInnerText($e, true, true));
Expand Down Expand Up @@ -1150,7 +1168,7 @@ public function getLinkDensity(\DOMElement $e, $excludeExternal = false)
*
* @return int
*/
protected function weightAttribute(\DOMElement $element, $attribute)
protected function weightAttribute($element, $attribute)
{
if (!$element->hasAttribute($attribute)) {
return 0;
Expand Down Expand Up @@ -1185,7 +1203,7 @@ protected function weightAttribute(\DOMElement $element, $attribute)
*
* @return int
*/
public function getWeight(\DOMElement $e)
public function getWeight($e)
{
if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
return 0;
Expand All @@ -1205,7 +1223,7 @@ public function getWeight(\DOMElement $e)
*
* @param \DOMElement $node
*/
public function killBreaks(\DOMElement $node)
public function killBreaks($node)
{
$html = $node->innerHTML;
$html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
Expand All @@ -1221,7 +1239,7 @@ public function killBreaks(\DOMElement $node)
* @param \DOMElement $e
* @param string $tag
*/
public function clean(\DOMElement $e, $tag)
public function clean($e, $tag)
{
$currentItem = null;
$targetList = $e->getElementsByTagName($tag);
Expand Down Expand Up @@ -1257,7 +1275,7 @@ public function clean(\DOMElement $e, $tag)
* @param \DOMElement $e
* @param string $tag
*/
public function cleanConditionally(\DOMElement $e, $tag)
public function cleanConditionally($e, $tag)
{
if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
return;
Expand Down Expand Up @@ -1370,7 +1388,7 @@ public function cleanConditionally(\DOMElement $e, $tag)
*
* @param \DOMElement $e
*/
public function cleanHeaders(\DOMElement $e)
public function cleanHeaders($e)
{
for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
$headers = $e->getElementsByTagName('h'.$headerIndex);
Expand Down
29 changes: 19 additions & 10 deletions tests/ReadabilityTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -22,40 +22,47 @@ private function getReadability($html, $url = null, $parser = 'libxml', $useTidy
return $readability;
}

/**
* @requires extension tidy
*/
public function testConstructDefault()
{
$readability = $this->getReadability('');

$this->assertNull($readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
}

/**
* @requires extension tidy
*/
public function testConstructSimple()
{
$readability = $this->getReadability('<html/>', 'http://0.0.0.0');
$readability->init();

$this->assertEquals('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
$this->assertEquals('<html/>', $readability->original_html);
$this->assertTrue($readability->tidied);

$this->assertTrue($this->logHandler->hasDebugThatContains('Parsing URL: http://0.0.0.0'));
$this->assertTrue($this->logHandler->hasDebugThatContains('Tidying document'));
$this->assertTrue($this->logHandler->hasDebugThatContains('Light clean enabled.'));
}

public function testConstructDefaultWithoutTidy()
{
$readability = $this->getReadability('', null, 'libxml', false);
$readability->init();

$this->assertNull($readability->url);
$this->assertEquals('', $readability->original_html);
$this->assertFalse($readability->tidied);

$this->assertTrue($this->logHandler->hasDebugThatContains('Parsing URL: '));
$this->assertFalse($this->logHandler->hasDebugThatContains('Tidying document'));
$this->assertTrue($this->logHandler->hasDebugThatContains('Light clean enabled.'));
$this->assertInstanceOf('DomDocument', $readability->dom);
}

public function testConstructSimpleWithoutTidy()
{
$readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'libxml', false);
$readability->init();

$this->assertEquals('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
$this->assertEquals('<html/>', $readability->original_html);
$this->assertFalse($readability->tidied);
}
Expand Down Expand Up @@ -447,6 +454,8 @@ public function testPostFilters()

public function testPreFilters()
{
$this->markTestSkipped('Won\'t work until loadHtml() is moved in init() instead of __construct()');

$readability = $this->getReadability('<div>'.str_repeat('<p>This <b>is</b> the awesome and WONDERFUL content :)</p>', 7).'</div>', 'http://0.0.0.0');
$readability->addPreFilter('!<b[^>]*>(.*?)</b>!is', '');

Expand Down