Skip to content

Commit

Permalink
fixup! JSHtml
Browse files (browse the repository at this point in the history)
  • Loading branch information
jtojnar committed Mar 18, 2024
1 parent ccdd166 commit 732e4df
Showing 1 changed file with 27 additions and 9 deletions.
36 changes: 27 additions & 9 deletions src/Readability.php
Original file line number | Diff line number | Diff line change
Expand Up @@ -287,6 +287,7 @@ public function init(): bool

if (null === $articleContent) {
$this->success = false;
/** @var JSLikeHTMLElement */
$articleContent = $this->dom->createElement('div');
$articleContent->setAttribute('class', 'readability-content');
$articleContent->setInnerHtml('<p>Sorry, Readability was unable to parse this page for content.</p>');
Expand All @@ -302,7 +303,9 @@ public function init(): bool

// without tidy the body can (sometimes) be wiped, so re-create it
if (false === isset($this->body->childNodes)) {
$this->body = $this->dom->createElement('body');
/** @var JSLikeHTMLElement */
$body = $this->dom->createElement('body');
$this->body = $body;
}

// Clear the old HTML, insert the new content.
Expand Down Expand Up @@ -335,19 +338,23 @@ public function postProcessContent(\DOMElement $articleContent): void
*/
public function addFootnotes(\DOMElement $articleContent): void
{
/** @var JSLikeHTMLElement */
$footnotesWrapper = $this->dom->createElement('footer');
$footnotesWrapper->setAttribute('class', 'readability-footnotes');
$footnotesWrapper->setInnerHtml('<h3>References</h3>');
$articleFootnotes = $this->dom->createElement('ol');
$articleFootnotes->setAttribute('class', 'readability-footnotes-list');
$footnotesWrapper->appendChild($articleFootnotes);
/** @var \DOMNodeList<JSLikeHTMLElement> */
$articleLinks = $articleContent->getElementsByTagName('a');
$linkCount = 0;

for ($i = 0; $i < $articleLinks->length; ++$i) {
$articleLink = $articleLinks->item($i);
$footnoteLink = $articleLink->cloneNode(true);
/** @var JSLikeHTMLElement */
$refLink = $this->dom->createElement('a');
/** @var JSLikeHTMLElement */
$footnote = $this->dom->createElement('li');
$linkDomain = @parse_url($footnoteLink->getAttribute('href'), \PHP_URL_HOST);
if (!$linkDomain && isset($this->url)) {
Expand Down Expand Up @@ -609,6 +616,7 @@ public function killBreaks(JSLikeHTMLElement $node): void
*/
public function clean(JSLikeHTMLElement $e, string $tag): void
{
/** @var \DOMNodeList<JSLikeHTMLElement> */
$targetList = $e->getElementsByTagName($tag);
$isEmbed = ('audio' === $tag || 'video' === $tag || 'iframe' === $tag || 'object' === $tag || 'embed' === $tag);

Expand Down Expand Up @@ -645,6 +653,7 @@ public function cleanConditionally(JSLikeHTMLElement $e, string $tag): void
return;
}

/** @var \DOMNodeList<JSLikeHTMLElement> */
$tagsList = $e->getElementsByTagName($tag);
$curTagsLength = $tagsList->length;

Expand Down Expand Up @@ -755,6 +764,7 @@ public function cleanConditionally(JSLikeHTMLElement $e, string $tag): void
public function cleanHeaders(JSLikeHTMLElement $e): void
{
for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) {
/** @var \DOMNodeList<JSLikeHTMLElement> */
$headers = $e->getElementsByTagName('h' . $headerIndex);

for ($i = $headers->length - 1; $i >= 0; --$i) {
Expand Down Expand Up @@ -823,6 +833,7 @@ protected function getArticleTitle(): JSLikeHTMLElement
$curTitle = $origTitle;
}

/** @var JSLikeHTMLElement */
$articleTitle = $this->dom->createElement('h1');
$articleTitle->setInnerHtml($curTitle);

Expand All @@ -840,7 +851,9 @@ protected function prepDocument(): void
* so we create a new body node and append it to the document.
*/
if (null === $this->body) {
$this->body = $this->dom->createElement('body');
/** @var JSLikeHTMLElement */
$body = $this->dom->createElement('body');
$this->body = $body;
$this->dom->documentElement->appendChild($this->body);
}

Expand Down Expand Up @@ -944,6 +957,7 @@ protected function grabArticle(?JSLikeHTMLElement $page = null): ?JSLikeHTMLElem
$xpath = new \DOMXPath($page);
}

/** @var \DOMNodeList<JSLikeHTMLElement> */
$allElements = $page->getElementsByTagName('*');

for ($nodeIndex = 0; $allElements->item($nodeIndex); ++$nodeIndex) {
Expand Down Expand Up @@ -986,6 +1000,7 @@ protected function grabArticle(?JSLikeHTMLElement $page = null): ?JSLikeHTMLElem
// (as in, where they contain no other block level elements).
if ('div' === $tagName) {
if (!preg_match($this->regexps['divToPElements'], $nodeContent)) {
/** @var JSLikeHTMLElement */
$newNode = $this->dom->createElement('p');

try {
Expand Down Expand Up @@ -1156,7 +1171,7 @@ protected function grabArticle(?JSLikeHTMLElement $page = null): ?JSLikeHTMLElem
}
}

/** @var \DOMNodeList<JSLikeHTMLElement> */
/** @var non-empty-array<JSLikeHTMLElement|null> */
$topCandidates = array_filter(
$topCandidates,
fn ($v, $idx) => 0 === $idx || null !== $v,
Expand All @@ -1169,18 +1184,21 @@ protected function grabArticle(?JSLikeHTMLElement $page = null): ?JSLikeHTMLElem
* We also have to copy the body node so it is something we can modify.
*/
if (null === $topCandidate || 0 === strcasecmp($topCandidate->tagName, 'body')) {
/** @var JSLikeHTMLElement */
$topCandidate = $this->dom->createElement('div');

if ($page instanceof \DOMDocument) {
if (!isset($page->documentElement)) {
/** @var ?JSLikeHTMLElement */
$documentElement = $page->documentElement;
if (null === $documentElement) {
// we don't have a body either? what a mess! :)
$this->logger->debug('The page has no body!');
} else {
$this->logger->debug('Setting body to a raw HTML of original page!');
$topCandidate->setInnerHtml($page->documentElement->getInnerHTML());
$page->documentElement->setInnerHtml('');
$topCandidate->setInnerHtml($documentElement->getInnerHTML());
$documentElement->setInnerHtml('');
$this->reinitBody();
$page->documentElement->appendChild($topCandidate);
$documentElement->appendChild($topCandidate);
}
} else {
$topCandidate->setInnerHtml($page->getInnerHTML());
Expand All @@ -1189,7 +1207,7 @@ protected function grabArticle(?JSLikeHTMLElement $page = null): ?JSLikeHTMLElem
}

$this->initializeNode($topCandidate);
} elseif ($topCandidate) {
} elseif (null !== $topCandidate) {
$alternativeCandidateAncestors = [];
foreach ($topCandidates as $candidate) {
if ((int) $candidate->getAttribute('readability') / (int) $topCandidate->getAttribute('readability') >= 0.75) {
Expand All @@ -1200,7 +1218,7 @@ protected function grabArticle(?JSLikeHTMLElement $page = null): ?JSLikeHTMLElem
}
if (\count($alternativeCandidateAncestors) >= 3) {
$parentOfTopCandidate = $topCandidate->parentNode;
while ('body' !== $parentOfTopCandidate->nodeName) {
while ('body' !== $parentOfTopCandidate->nodeName && $parentOfTopCandidate instanceof JSLikeHTMLElement) {
$listsContainingThisAncestor = 0;
for ($ancestorIndex = 0; $ancestorIndex < \count($alternativeCandidateAncestors) && $listsContainingThisAncestor < 3; ++$ancestorIndex) {
$listsContainingThisAncestor += (int) \in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true);
Expand Down

0 comments on commit 732e4df

Please sign in to comment.