Skip to content

Commit

Permalink
Merge pull request #284 from j0k3r/feature/config-as-value-object
Browse files Browse the repository at this point in the history
Convert configuration to Value Object
  • Loading branch information
j0k3r committed Feb 2, 2022
2 parents 68aa280 + bb954af commit d395eeb
Show file tree
Hide file tree
Showing 13 changed files with 612 additions and 261 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,6 @@ $graby = new Graby(array(
),
'extractor' => array(
'default_parser' => 'libxml',
'allowed_parsers' => array('libxml', 'html5lib'),
// key is fingerprint (fragment to find in HTML)
// value is host name to use for site config lookup if fingerprint matches
// \s* match anything INCLUDING new lines
Expand Down
2 changes: 1 addition & 1 deletion maintenance/Rector/MockGrabyResponseRector.php
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ public function refactor(Node $node): ?Node
return null;
}

$url = $fetchUrls[0];
$url = (string) $fetchUrls[0];

// Add imports.
$this->useNodesToAddCollector->addUseImport(
Expand Down
2 changes: 1 addition & 1 deletion phpunit.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
>

<testsuites>
<testsuite name="Graby Test Suite">
<testsuite name="Graby">
<directory>./tests</directory>
</testsuite>
</testsuites>
Expand Down
68 changes: 18 additions & 50 deletions src/Extractor/ContentExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
use Psr\Log\LoggerInterface;
use Psr\Log\NullLogger;
use Readability\Readability;
use Symfony\Component\OptionsResolver\OptionsResolver;

/**
* Content Extractor.
Expand All @@ -21,7 +20,7 @@ class ContentExtractor
public $readability;
private ?\DOMXPath $xpath = null;
private ?string $html = null;
private array $config;
private ContentExtractorConfig $config;
private ?SiteConfig $siteConfig = null;
private ?string $title = null;
private ?string $language = null;
Expand All @@ -41,37 +40,10 @@ class ContentExtractor
*/
public function __construct($config = [], LoggerInterface $logger = null, ConfigBuilder $configBuilder = null)
{
$resolver = new OptionsResolver();
$resolver->setDefaults([
'default_parser' => 'libxml',
'allowed_parsers' => ['libxml', 'html5lib'],
// key is fingerprint (fragment to find in HTML)
// value is host name to use for site config lookup if fingerprint matches
// \s* match anything INCLUDING new lines
'fingerprints' => [
'/\<meta\s*content=([\'"])blogger([\'"])\s*name=([\'"])generator([\'"])/i' => 'fingerprint.blogspot.com',
'/\<meta\s*name=([\'"])generator([\'"])\s*content=([\'"])Blogger([\'"])/i' => 'fingerprint.blogspot.com',
'/\<meta\s*name=([\'"])generator([\'"])\s*content=([\'"])WordPress/i' => 'fingerprint.wordpress.com',
],
'config_builder' => [],
'readability' => [
'pre_filters' => [],
'post_filters' => [],
],
'src_lazy_load_attributes' => [
'data-src',
'data-lazy-src',
'data-original',
'data-sources',
'data-hi-res-src',
'data-srcset',
],
]);

$this->config = $resolver->resolve($config);
$this->config = new ContentExtractorConfig($config);

$this->logger = $logger ?? new NullLogger();
$this->configBuilder = $configBuilder ?? new ConfigBuilder($this->config['config_builder'], $this->logger);
$this->configBuilder = $configBuilder ?? new ConfigBuilder($this->config->getConfigBuilder(), $this->logger);
}

public function setLogger(LoggerInterface $logger): void
Expand Down Expand Up @@ -104,7 +76,7 @@ public function reset(): void
*/
public function findHostUsingFingerprints(string $html)
{
foreach ($this->config['fingerprints'] as $metaPattern => $host) {
foreach ($this->config->getFingerprints() as $metaPattern => $host) {
if (1 === preg_match($metaPattern, $html)) {
return $host;
}
Expand Down Expand Up @@ -133,7 +105,7 @@ public function buildSiteConfig(string $url, string $html = '', bool $addToCache

$configFingerprint = $this->configBuilder->buildForHost($fingerprintHost);

if (!empty($this->config['fingerprints']) && false !== $configFingerprint) {
if (!empty($this->config->getFingerprints()) && false !== $configFingerprint) {
$this->logger->info('Appending site config settings from {host} (fingerprint match)', ['host' => $fingerprintHost]);
$this->configBuilder->mergeConfig($config, $configFingerprint);

Expand Down Expand Up @@ -165,8 +137,8 @@ public function process(string $html, string $url, SiteConfig $siteConfig = null
}

// add lazyload information from siteconfig
if ($this->siteConfig->src_lazy_load_attr && !\in_array($this->siteConfig->src_lazy_load_attr, $this->config['src_lazy_load_attributes'], true)) {
$this->config['src_lazy_load_attributes'][] = $this->siteConfig->src_lazy_load_attr;
if ($this->siteConfig->src_lazy_load_attr && !\in_array($this->siteConfig->src_lazy_load_attr, $this->config->getSrcLazyLoadAttributes(), true)) {
$this->config->addSrcLazyLoadAttributes($this->siteConfig->src_lazy_load_attr);
}

$this->logger->debug('Actual site config', ['siteConfig' => $this->siteConfig]);
Expand All @@ -187,9 +159,9 @@ public function process(string $html, string $url, SiteConfig $siteConfig = null
// load and parse html
$parser = $this->siteConfig->parser();

if (!\in_array($parser, $this->config['allowed_parsers'], true)) {
$this->logger->info('HTML parser {parser} not listed, using {default_parser} instead', ['parser' => $parser, 'default_parser' => $this->config['default_parser']]);
$parser = $this->config['default_parser'];
if (!\in_array($parser, $this->config->getAllowedParsers(), true)) {
$this->logger->info('HTML parser {parser} not listed, using {default_parser} instead', ['parser' => $parser, 'default_parser' => $this->config->getDefaultParser()]);
$parser = $this->config->getDefaultParser();
}

$this->logger->info('Attempting to parse HTML with {parser}', ['parser' => $parser]);
Expand Down Expand Up @@ -586,7 +558,7 @@ public function process(string $html, string $url, SiteConfig $siteConfig = null
// remove image lazy loading
foreach ($this->body->getElementsByTagName('img') as $e) {
$hasAttribute = false;
foreach ($this->config['src_lazy_load_attributes'] as $attribute) {
foreach ($this->config->getSrcLazyLoadAttributes() as $attribute) {
if ($e->hasAttribute($attribute)) {
$hasAttribute = true;
}
Expand All @@ -611,7 +583,7 @@ public function process(string $html, string $url, SiteConfig $siteConfig = null
}

$attributes = [];
foreach ($this->config['src_lazy_load_attributes'] as $attribute) {
foreach ($this->config->getSrcLazyLoadAttributes() as $attribute) {
if ($e->hasAttribute($attribute)) {
$key = 'src';
if ('data-srcset' === $attribute) {
Expand Down Expand Up @@ -1072,16 +1044,12 @@ private function getReadability(string $html, string $url, string $parser, bool
{
$readability = new Readability($html, $url, $parser, $enableTidy);

if (isset($this->config['readability']['pre_filters']) && \is_array($this->config['readability']['pre_filters'])) {
foreach ($this->config['readability']['pre_filters'] as $filter => $replacer) {
$readability->addPreFilter($filter, $replacer);
}
foreach ($this->config->getReadability()['pre_filters'] as $filter => $replacer) {
$readability->addPreFilter($filter, $replacer);
}

if (isset($this->config['readability']['post_filters']) && \is_array($this->config['readability']['post_filters'])) {
foreach ($this->config['readability']['post_filters'] as $filter => $replacer) {
$readability->addPostFilter($filter, $replacer);
}
foreach ($this->config->getReadability()['post_filters'] as $filter => $replacer) {
$readability->addPostFilter($filter, $replacer);
}

return $readability;
Expand Down Expand Up @@ -1110,7 +1078,7 @@ private function extractEntityFromPattern(string $entity, string $pattern, $retu
$returnCallback = fn ($e) => trim($e);
}

if (!$this->xpath) {
if (!$this->xpath || !$this->readability) {
return false;
}

Expand Down Expand Up @@ -1167,7 +1135,7 @@ private function extractMultipleEntityFromPattern(string $entity, string $patter
$returnCallback = fn ($e) => trim($e);
}

if (!$this->xpath) {
if (!$this->xpath || !$this->readability) {
return false;
}

Expand Down
132 changes: 132 additions & 0 deletions src/Extractor/ContentExtractorConfig.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
<?php

namespace Graby\Extractor;

use Graby\OptionsResolver\ArrayStringOptionsTrait;
use Symfony\Component\OptionsResolver\Options;
use Symfony\Component\OptionsResolver\OptionsResolver;

/**
 * Configuration for ContentExtractor as a Value Object.
 *
 * The raw option array is resolved and validated once, in the constructor,
 * through Symfony's OptionsResolver; afterwards the values are exposed through
 * typed getters. The only mutator is addSrcLazyLoadAttributes().
 */
class ContentExtractorConfig
{
    use ArrayStringOptionsTrait;

    /** Parsers the extractor accepts; this list is fixed and not configurable. */
    private const ALLOWED_PARSERS = ['libxml', 'html5lib'];

    private string $default_parser;
    /** @var array<string, string> */
    private array $fingerprints;
    private array $config_builder;
    /** @var array{pre_filters: array, post_filters: array} */
    private array $readability;
    /** @var array<string> */
    private array $src_lazy_load_attributes;

    /**
     * Resolve the given options against the defaults and store them.
     *
     * Recognised keys: default_parser, fingerprints, config_builder,
     * readability (with pre_filters / post_filters) and
     * src_lazy_load_attributes. Validation failures (unknown key, wrong type,
     * default_parser outside ALLOWED_PARSERS) are reported by the
     * OptionsResolver.
     *
     * @param array $config User supplied options; missing keys fall back to the defaults below
     */
    public function __construct(array $config)
    {
        $resolver = new OptionsResolver();
        $resolver->setDefaults([
            'default_parser' => 'libxml',
            // key is fingerprint (fragment to find in HTML)
            // value is host name to use for site config lookup if fingerprint matches
            // \s* match anything INCLUDING new lines
            'fingerprints' => [
                '/\<meta\s*content=([\'"])blogger([\'"])\s*name=([\'"])generator([\'"])/i' => 'fingerprint.blogspot.com',
                '/\<meta\s*name=([\'"])generator([\'"])\s*content=([\'"])Blogger([\'"])/i' => 'fingerprint.blogspot.com',
                '/\<meta\s*name=([\'"])generator([\'"])\s*content=([\'"])WordPress/i' => 'fingerprint.wordpress.com',
            ],
            'config_builder' => [],
            'src_lazy_load_attributes' => [
                'data-src',
                'data-lazy-src',
                'data-original',
                'data-sources',
                'data-hi-res-src',
                'data-srcset',
            ],
        ]);

        $resolver->setAllowedValues('default_parser', self::ALLOWED_PARSERS);

        $resolver->setAllowedTypes('default_parser', 'string');
        $resolver->setAllowedTypes('fingerprints', 'array');
        $resolver->setAllowedTypes('config_builder', 'array');
        $resolver->setAllowedTypes('src_lazy_load_attributes', 'string[]');

        // "readability" is a nested option: its defaults and allowed types are
        // declared on a dedicated sub-resolver, which is why it is not listed
        // in the setDefaults() call above.
        $resolver->setDefault('readability', static function (OptionsResolver $readabilityResolver): void {
            $readabilityResolver->setDefaults([
                'pre_filters' => [],
                'post_filters' => [],
            ]);
            $readabilityResolver->setAllowedTypes('pre_filters', 'array');
            $readabilityResolver->setAllowedTypes('post_filters', 'array');
        });

        // validateArray() is provided by ArrayStringOptionsTrait, which is not
        // visible from this file.
        // NOTE(review): presumably it rejects arrays that are not string => string maps - confirm against the trait.
        $resolver->setNormalizer('readability', function (Options $options, $value) {
            $this->validateArray($value, 'readability[pre_filters]', 'pre_filters');
            $this->validateArray($value, 'readability[post_filters]', 'post_filters');

            return $value;
        });
        $resolver->setNormalizer('fingerprints', function (Options $options, $value) {
            $this->validateArray($value, 'fingerprints');

            return $value;
        });

        $resolved = $resolver->resolve($config);

        // Explicit assignment (instead of a dynamic `$this->$key` loop) keeps
        // every property write visible to static analysis and guarantees that
        // only the declared properties are ever set.
        $this->default_parser = $resolved['default_parser'];
        $this->fingerprints = $resolved['fingerprints'];
        $this->config_builder = $resolved['config_builder'];
        $this->readability = $resolved['readability'];
        $this->src_lazy_load_attributes = $resolved['src_lazy_load_attributes'];
    }

    /**
     * Parser to use when no other parser is selected; always one of ALLOWED_PARSERS.
     */
    public function getDefaultParser(): string
    {
        return $this->default_parser;
    }

    /**
     * Fixed list of accepted parser names.
     *
     * @return array<string>
     */
    public function getAllowedParsers(): array
    {
        return self::ALLOWED_PARSERS;
    }

    /**
     * Map of regex pattern (searched in the HTML) to the host name used for site config lookup.
     *
     * @return array<string, string>
     */
    public function getFingerprints(): array
    {
        return $this->fingerprints;
    }

    /**
     * Raw "config_builder" options, returned as provided (only checked to be an array).
     */
    public function getConfigBuilder(): array
    {
        return $this->config_builder;
    }

    /**
     * Readability options; both the "pre_filters" and "post_filters" keys are always present.
     *
     * @return array{pre_filters: array, post_filters: array}
     */
    public function getReadability(): array
    {
        return $this->readability;
    }

    /**
     * Attribute names listed in the "src_lazy_load_attributes" option.
     *
     * @return array<string>
     */
    public function getSrcLazyLoadAttributes(): array
    {
        return $this->src_lazy_load_attributes;
    }

    /**
     * Register an additional lazy-load attribute name.
     *
     * The call is idempotent: an attribute that is already registered is not
     * appended a second time, so callers do not need their own duplicate check.
     */
    public function addSrcLazyLoadAttributes(string $attribute): void
    {
        if (\in_array($attribute, $this->src_lazy_load_attributes, true)) {
            return;
        }

        $this->src_lazy_load_attributes[] = $attribute;
    }
}

0 comments on commit d395eeb

Please sign in to comment.