Skip to content

Commit

Permalink
Code update thanks to Rector
Browse files Browse the repository at this point in the history
Also:
- cast some variables to improve PHPStan analysis
- catch `JsonException` when parsing JSON-LD
  • Loading branch information
j0k3r committed Jan 11, 2022
1 parent 06b388b commit cbc76b7
Show file tree
Hide file tree
Showing 9 changed files with 130 additions and 198 deletions.
76 changes: 40 additions & 36 deletions src/Extractor/ContentExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,10 @@ class ContentExtractor
{
/** @var Readability|null */
public $readability;
/** @var \DOMXPath|null */
private $xpath;
private ?\DOMXPath $xpath = null;
private ?string $html = null;
private array $config;
/** @var SiteConfig|null */
private $siteConfig = null;
private ?SiteConfig $siteConfig = null;
private ?string $title = null;
private ?string $language = null;
private array $authors = [];
Expand All @@ -35,10 +33,8 @@ class ContentExtractor
private ?string $date = null;
private bool $success = false;
private ?string $nextPageUrl = null;
/** @var LoggerInterface */
private $logger;
/** @var ConfigBuilder */
private $configBuilder;
private LoggerInterface $logger;
private ConfigBuilder $configBuilder;

/**
* @param array $config
Expand Down Expand Up @@ -74,8 +70,8 @@ public function __construct($config = [], LoggerInterface $logger = null, Config

$this->config = $resolver->resolve($config);

$this->logger = null === $logger ? new NullLogger() : $logger;
$this->configBuilder = null === $configBuilder ? new ConfigBuilder($this->config['config_builder'], $this->logger) : $configBuilder;
$this->logger = $logger ?? new NullLogger();
$this->configBuilder = $configBuilder ?? new ConfigBuilder($this->config['config_builder'], $this->logger);
}

public function setLogger(LoggerInterface $logger): void
Expand Down Expand Up @@ -489,9 +485,7 @@ public function process(string $html, string $url, SiteConfig $siteConfig = null
"//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]",
$this->readability->dom,
'Author found (rel="author"): {author}',
function ($element, $currentEntity) {
return $currentEntity + [trim($element)];
}
fn ($element, $currentEntity) => $currentEntity + [trim($element)]
);

$this->extractEntityFromQuery(
Expand All @@ -500,9 +494,7 @@ function ($element, $currentEntity) {
'//meta[@name="author"]/@content',
$this->readability->dom,
'Author found (meta name="author"): {author}',
function ($element, $currentEntity) {
return $currentEntity + [trim($element)];
}
fn ($element, $currentEntity) => $currentEntity + [trim($element)]
);

// Find date in pubdate marked time element
Expand Down Expand Up @@ -570,22 +562,25 @@ function ($element, $currentEntity) {
}
}

/** @var \DOMDocument */
$ownerDocument = $this->body->ownerDocument;

// prevent self-closing iframes
if ('iframe' === $this->body->tagName) {
if (!$this->body->hasChildNodes()) {
$this->body->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
$this->body->appendChild($ownerDocument->createTextNode('[embedded content]'));
}
} else {
foreach ($this->body->getElementsByTagName('iframe') as $e) {
if (!$e->hasChildNodes()) {
$e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
$e->appendChild($ownerDocument->createTextNode('[embedded content]'));
}
}
}

// prevent self-closing iframe when content is ONLY an iframe
if ('iframe' === $this->body->nodeName && !$this->body->hasChildNodes()) {
$this->body->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
$this->body->appendChild($ownerDocument->createTextNode('[embedded content]'));
}

// remove image lazy loading
Expand Down Expand Up @@ -811,8 +806,10 @@ private function wrapElements($elems = false, string $tag = 'div', string $logMe
$a = iterator_to_array($elems);
foreach ($a as $item) {
if (null !== $item && null !== $item->parentNode && $item instanceof \DOMElement) {
$newNode = $item->ownerDocument->createElement($tag);
$newNode->setInnerHtml($item->ownerDocument->saveXML($item));
/** @var \DOMDocument */
$ownerDocument = $item->ownerDocument;
$newNode = $ownerDocument->createElement($tag);
$newNode->setInnerHtml($ownerDocument->saveXML($item));

$item->parentNode->replaceChild($newNode, $item);
}
Expand Down Expand Up @@ -846,9 +843,11 @@ private function extractEntityFromQuery($entity, $detectEntity, $xpathExpression

// we define the default callback here
if (!\is_callable($returnCallback)) {
$returnCallback = function ($element) {
return trim($element);
};
$returnCallback = fn ($element) => trim($element);
}

if (!$this->xpath) {
return false;
}

// check for given css class
Expand Down Expand Up @@ -933,7 +932,7 @@ private function extractDate(bool $detectDate, string $cssClass, \DOMNode $node
*/
private function extractAuthor(bool $detectAuthor, \DOMNode $node = null): bool
{
if (false === $detectAuthor) {
if (false === $detectAuthor || !$this->xpath) {
return false;
}

Expand All @@ -945,6 +944,7 @@ private function extractAuthor(bool $detectAuthor, \DOMNode $node = null): bool
$elems = $this->xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $node);

if ($elems && $elems->length > 0) {
/** @var \DOMNode */
$author = $elems->item(0);
$fns = $this->xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);

Expand Down Expand Up @@ -980,7 +980,7 @@ private function extractAuthor(bool $detectAuthor, \DOMNode $node = null): bool
*/
private function extractBody(bool $detectBody, string $xpathExpression, \DOMNode $node = null, string $type): bool
{
if (false === $detectBody) {
if (false === $detectBody || !$this->xpath) {
return false;
}

Expand Down Expand Up @@ -1008,14 +1008,18 @@ private function extractBody(bool $detectBody, string $xpathExpression, \DOMNode
$this->body = $elems->item(0);

// prune (clean up elements that may not be content)
if ($this->siteConfig->prune() && null !== $this->readability) {
if ($this->siteConfig && $this->siteConfig->prune() && null !== $this->readability) {
$this->logger->info('Pruning content');
$this->readability->prepArticle($this->body);
}

return false;
}

if (!$this->readability) {
return false;
}

$this->body = $this->readability->dom->createElement('div');
$this->logger->info('{nb} body elems found', ['nb' => $elems->length]);
$len = 0;
Expand All @@ -1041,7 +1045,7 @@ private function extractBody(bool $detectBody, string $xpathExpression, \DOMNode
$this->logger->info('...element is child of another body element, skipping.');
} else {
// prune (clean up elements that may not be content)
if ($this->siteConfig->prune()) {
if ($this->siteConfig && $this->siteConfig->prune()) {
$this->logger->info('...pruning content');
$this->readability->prepArticle($elem);
}
Expand Down Expand Up @@ -1103,9 +1107,7 @@ private function extractEntityFromPattern(string $entity, string $pattern, $retu
{
// we define the default callback here
if (!\is_callable($returnCallback)) {
$returnCallback = function ($e) {
return trim($e);
};
$returnCallback = fn ($e) => trim($e);
}

$elems = $this->xpath->evaluate($pattern, $this->readability->dom);
Expand Down Expand Up @@ -1158,9 +1160,7 @@ private function extractMultipleEntityFromPattern(string $entity, string $patter
{
// we define the default callback here
if (!\is_callable($returnCallback)) {
$returnCallback = function ($e) {
return trim($e);
};
$returnCallback = fn ($e) => trim($e);
}

$elems = $this->xpath->evaluate($pattern, $this->readability->dom);
Expand Down Expand Up @@ -1221,7 +1221,7 @@ private function extractDefinedInformation(string $html)
$xpath = new \DOMXPath($doc);

$this->extractOpenGraph($xpath);
if (false === $this->siteConfig->skip_json_ld) {
if ($this->siteConfig && false === $this->siteConfig->skip_json_ld) {
$this->extractJsonLdInformation($xpath);
}
}
Expand Down Expand Up @@ -1352,7 +1352,11 @@ private function extractJsonLdInformation(\DOMXPath $xpath)
$candidateNames = [];

foreach ($scripts as $script) {
$data = json_decode(trim($script->nodeValue), true);
try {
$data = (array) json_decode(trim($script->nodeValue), true, 512, \JSON_THROW_ON_ERROR);
} catch (\JsonException $e) {
continue;
}

if (isset($data['@type']) && \in_array($data['@type'], ['Organization', 'WebSite', 'Person'], true)) {
if (isset($data['name'])) {
Expand Down
36 changes: 10 additions & 26 deletions src/Extractor/HttpClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,10 @@
*/
class HttpClient
{
/**
* @var array
*/
private $config;
/**
* @var HttpMethodsClient
*/
private $client;
/**
* @var LoggerInterface
*/
private $logger;
/**
* @var History
*/
private $responseHistory;
private array $config;
private HttpMethodsClient $client;
private LoggerInterface $logger;
private History $responseHistory;

/**
* @param Client $client Http client
Expand Down Expand Up @@ -245,7 +233,7 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
// (regex found here: https://stackoverflow.com/a/137831/569101)
preg_match_all('/<!--\[if\s(?:[^<]+|<(?!!\[endif\]-->))*<!\[endif\]-->/mi', $body, $matchesConditional);

if (isset($matchesConditional[0]) && \count($matchesConditional[0]) > 1) {
if (isset($matchesConditional[0]) && (is_countable($matchesConditional[0]) ? \count($matchesConditional[0]) : 0) > 1) {
foreach ($matchesConditional as $conditionalComment) {
$body = str_replace($conditionalComment, '', $body);
}
Expand All @@ -265,9 +253,7 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
// remove utm parameters & fragment
$uri = new Uri(str_replace('&amp;', '&', $effectiveUrl));
parse_str($uri->getQuery(), $query);
$queryParameters = array_filter($query, function ($k) {
return !(0 === stripos($k, 'utm_'));
}, \ARRAY_FILTER_USE_KEY);
$queryParameters = array_filter($query, fn ($k) => !(0 === stripos($k, 'utm_')), \ARRAY_FILTER_USE_KEY);
$effectiveUrl = (string) Uri::withQueryValues(new Uri($uri->withFragment('')->withQuery('')), $queryParameters);

$this->logger->info('Data fetched: {data}', ['data' => [
Expand All @@ -292,7 +278,7 @@ private function cleanupUrl(string $url): string
{
// rewrite part of urls to something more readable
foreach ($this->config['rewrite_url'] as $find => $action) {
if (false !== strpos($url, $find) && \is_array($action)) {
if (false !== strpos($url, (string) $find) && \is_array($action)) {
$url = strtr($url, $action);
}
}
Expand Down Expand Up @@ -438,9 +424,7 @@ private function getCookie(string $url, array $httpHeader = []): ?string
}

// see https://tools.ietf.org/html/rfc6265.html#section-4.2.1
return implode('; ', array_map(function ($name) use ($cookies) {
return $name . '=' . $cookies[$name];
}, array_keys($cookies)));
return implode('; ', array_map(fn ($name) => $name . '=' . $cookies[$name], array_keys($cookies)));
}

return null;
Expand Down Expand Up @@ -477,7 +461,7 @@ private function getAccept(string $url, array $httpHeader = [])
*/
private function headerOnlyType(array $headers): bool
{
$contentType = isset($headers['content-type']) ? $headers['content-type'] : '';
$contentType = $headers['content-type'] ?? '';

if (!preg_match('!\s*(([-\w]+)/([-\w\+]+))!im', strtolower($contentType), $match)) {
return false;
Expand Down Expand Up @@ -543,7 +527,7 @@ private function getUglyURL(string $url, string $html)
{
$found = false;
foreach ($this->config['ajax_triggers'] as $string) {
if (stripos($html, $string)) {
if (stripos($html, (string) $string)) {
$found = true;
break;
}
Expand Down
35 changes: 13 additions & 22 deletions src/Graby.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,29 +28,20 @@
class Graby
{
private bool $debug = false;
/** @var LoggerInterface */
private $logger;
private LoggerInterface $logger;
private string $logLevel = 'info';

private array $config = [];

/** @var HttpClient|null */
private $httpClient = null;
/** @var ContentExtractor|null */
private $extractor = null;
private HttpClient $httpClient;
private ContentExtractor $extractor;

/** @var ConfigBuilder */
private $configBuilder;
/** @var Punycode */
private $punycode;
private ConfigBuilder $configBuilder;
private Punycode $punycode;

private bool $imgNoReferrer = false;

/**
* @param array $config
* @param Client|null $client Http client
*/
public function __construct($config = [], Client $client = null, ConfigBuilder $configBuilder = null)
public function __construct(array $config = [], Client $client = null, ConfigBuilder $configBuilder = null)
{
$resolver = new OptionsResolver();
$resolver->setDefaults([
Expand Down Expand Up @@ -107,7 +98,7 @@ public function __construct($config = [], Client $client = null, ConfigBuilder $

if (null === $configBuilder) {
$configBuilder = new ConfigBuilder(
isset($this->config['extractor']['config_builder']) ? $this->config['extractor']['config_builder'] : [],
$this->config['extractor']['config_builder'] ?? [],
$this->logger
);
}
Expand Down Expand Up @@ -508,18 +499,18 @@ private function validateUrl(string $url): string

private function isUrlAllowed(string $url): bool
{
$allowedUrls = $this->getConfig('allowed_urls');
$blockedUrls = $this->getConfig('blocked_urls');
$allowedUrls = (array) $this->getConfig('allowed_urls');
$blockedUrls = (array) $this->getConfig('blocked_urls');

if (!empty($allowedUrls)) {
foreach ($allowedUrls as $allowurl) {
if (false !== stristr($url, $allowurl)) {
if (false !== stristr($url, (string) $allowurl)) {
return true;
}
}
} else {
foreach ($blockedUrls as $blockurl) {
if (false !== stristr($url, $blockurl)) {
if (false !== stristr($url, (string) $blockurl)) {
return false;
}
}
Expand Down Expand Up @@ -579,7 +570,7 @@ private function handleMimeAction(array $mimeInfo, string $effectiveUrl, array $
return null;
}

$body = isset($response['body']) ? $response['body'] : '';
$body = $response['body'] ?? '';

$infos = [
// at this point status will always be considered as 200
Expand Down Expand Up @@ -650,7 +641,7 @@ private function handleMimeAction(array $mimeInfo, string $effectiveUrl, array $
if ('text/plain' === $mimeInfo['mime']) {
$infos['html'] = '<pre>' .
$this->cleanupXss(
$this->convert2Utf8($body, isset($response['headers']) ? $response['headers'] : [])
$this->convert2Utf8($body, $response['headers'] ?? [])
) . '</pre>';
}

Expand Down

0 comments on commit cbc76b7

Please sign in to comment.