Skip to content

Commit

Permalink
Convert configuration to Value Object
Browse files Browse the repository at this point in the history
Mostly to enforce the type of each configuration parameter.
Also enforce the types of the input configuration using `setAllowedTypes`.
  • Loading branch information
j0k3r committed Jan 24, 2022
1 parent 92e09fc commit 789d220
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 54 deletions.
2 changes: 1 addition & 1 deletion phpunit.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
>

<testsuites>
<testsuite name="Graby Test Suite">
<testsuite name="Graby">
<directory>./tests</directory>
</testsuite>
</testsuites>
Expand Down
69 changes: 16 additions & 53 deletions src/Extractor/HttpClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,13 @@
use Psr\Http\Message\ResponseInterface;
use Psr\Log\LoggerInterface;
use Psr\Log\NullLogger;
use Symfony\Component\OptionsResolver\OptionsResolver;

/**
* HttpClient will make sure to retrieve the right content with the right url.
*/
class HttpClient
{
private array $config;
private HttpClientConfig $config;
private HttpMethodsClient $client;
private LoggerInterface $logger;
private History $responseHistory;
Expand All @@ -36,43 +35,7 @@ class HttpClient
*/
public function __construct(Client $client, $config = [], LoggerInterface $logger = null)
{
$resolver = new OptionsResolver();
$resolver->setDefaults([
'ua_browser' => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2',
'default_referer' => 'http://www.google.co.uk/url?sa=t&source=web&cd=1',
'rewrite_url' => [
'docs.google.com' => ['/Doc?' => '/View?'],
'tnr.com' => ['tnr.com/article/' => 'tnr.com/print/article/'],
'.m.wikipedia.org' => ['.m.wikipedia.org' => '.wikipedia.org'],
'm.vanityfair.com' => ['m.vanityfair.com' => 'www.vanityfair.com'],
],
// Prevent certain file/mime types
// HTTP responses which match these content types will
// be returned without body.
'header_only_types' => [
'image',
'audio',
'video',
],
// URLs ending with one of these extensions will
// prompt client to send a HEAD request first
// to see if returned content type matches $headerOnlyTypes.
'header_only_clues' => ['mp3', 'zip', 'exe', 'gif', 'gzip', 'gz', 'jpeg', 'jpg', 'mpg', 'mpeg', 'png', 'ppt', 'mov'],
// User Agent strings - mapping domain names
'user_agents' => [],
// AJAX triggers to search for.
// for AJAX sites, e.g. Blogger with its dynamic views templates.
'ajax_triggers' => [
"<meta name='fragment' content='!'",
'<meta name="fragment" content="!"',
"<meta content='!' name='fragment'",
'<meta content="!" name="fragment"',
],
// number of redirection allowed until we assume request won't be complete
'max_redirect' => 10,
]);

$this->config = $resolver->resolve($config);
$this->config = new HttpClientConfig($config);

if (null === $logger) {
$logger = new NullLogger();
Expand All @@ -90,7 +53,7 @@ public function __construct(Client $client, $config = [], LoggerInterface $logge
new ErrorPlugin(),
],
[
'max_restarts' => $this->config['max_redirect'],
'max_restarts' => $this->config->getMaxRedirect(),
]
),
Psr17FactoryDiscovery::findRequestFactory()
Expand Down Expand Up @@ -119,7 +82,7 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
$url = $this->cleanupUrl($url);

$method = 'get';
if (!$skipTypeVerification && !empty($this->config['header_only_types']) && $this->possibleUnsupportedType($url)) {
if (!$skipTypeVerification && !empty($this->config->getHeaderOnlyTypes()) && $this->possibleUnsupportedType($url)) {
$method = 'head';
}

Expand Down Expand Up @@ -147,7 +110,7 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
/** @var ResponseInterface $response */
$response = $this->client->$method($url, $headers);
} catch (LoopException $e) {
$this->logger->info('Endless redirect: ' . ($this->config['max_redirect'] + 1) . ' on "{url}"', ['url' => $url]);
$this->logger->info('Endless redirect: ' . ($this->config->getMaxRedirect() + 1) . ' on "{url}"', ['url' => $url]);

return [
'effective_url' => $url,
Expand Down Expand Up @@ -277,8 +240,8 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
private function cleanupUrl(string $url): string
{
// rewrite part of urls to something more readable
foreach ($this->config['rewrite_url'] as $find => $action) {
if (false !== strpos($url, (string) $find) && \is_array($action)) {
foreach ($this->config->getRewriteUrl() as $find => $action) {
if (false !== strpos($url, $find) && \is_array($action)) {
$url = strtr($url, $action);
}
}
Expand Down Expand Up @@ -319,7 +282,7 @@ private function possibleUnsupportedType(string $url): bool
return false;
}

return \in_array($ext, $this->config['header_only_clues'], true);
return \in_array($ext, $this->config->getHeaderOnlyClues(), true);
}

/**
Expand All @@ -332,7 +295,7 @@ private function possibleUnsupportedType(string $url): bool
*/
private function getUserAgent(string $url, array $httpHeader = []): string
{
$ua = $this->config['ua_browser'];
$ua = $this->config->getUaBrowser();

if (!empty($httpHeader['user-agent'])) {
$this->logger->info('Found user-agent "{user-agent}" for url "{url}" from site config', ['user-agent' => $httpHeader['user-agent'], 'url' => $url]);
Expand All @@ -356,10 +319,10 @@ private function getUserAgent(string $url, array $httpHeader = []): string
}

foreach ($try as $h) {
if (isset($this->config['user_agents'][$h])) {
$this->logger->info('Found user-agent "{user-agent}" for url "{url}" from config', ['user-agent' => $this->config['user_agents'][$h], 'url' => $url]);
if (isset($this->config->getUserAgents()[$h])) {
$this->logger->info('Found user-agent "{user-agent}" for url "{url}" from config', ['user-agent' => $this->config->getUserAgents()[$h], 'url' => $url]);

return $this->config['user_agents'][$h];
return $this->config->getUserAgents()[$h];
}
}

Expand All @@ -378,7 +341,7 @@ private function getUserAgent(string $url, array $httpHeader = []): string
*/
private function getReferer(string $url, array $httpHeader = []): string
{
$default_referer = $this->config['default_referer'];
$default_referer = $this->config->getDefaultReferer();

if (!empty($httpHeader['referer'])) {
$this->logger->info('Found referer "{referer}" for url "{url}" from site config', ['referer' => $httpHeader['referer'], 'url' => $url]);
Expand Down Expand Up @@ -471,7 +434,7 @@ private function headerOnlyType(array $headers): bool
$match[2] = strtolower(trim($match[2]));

foreach ([$match[1], $match[2]] as $mime) {
if (\in_array($mime, $this->config['header_only_types'], true)) {
if (\in_array($mime, $this->config->getHeaderOnlyTypes(), true)) {
return true;
}
}
Expand Down Expand Up @@ -526,8 +489,8 @@ private function getMetaRefreshURL(string $url, string $html)
private function getUglyURL(string $url, string $html)
{
$found = false;
foreach ($this->config['ajax_triggers'] as $string) {
if (stripos($html, (string) $string)) {
foreach ($this->config->getAjaxTriggers() as $string) {
if (stripos($html, $string)) {
$found = true;
break;
}
Expand Down
130 changes: 130 additions & 0 deletions src/Extractor/HttpClientConfig.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
<?php

namespace Graby\Extractor;

use Symfony\Component\OptionsResolver\OptionsResolver;

/**
 * Value Object holding the settings used by HttpClient.
 *
 * Caller-supplied options are merged with the built-in defaults and
 * type-checked through an OptionsResolver; every resolved value is then
 * exposed through a dedicated typed getter.
 */
class HttpClientConfig
{
    private string $ua_browser;
    private string $default_referer;
    /** @var array<array<string, string>> */
    private array $rewrite_url;
    /** @var array<string> */
    private array $header_only_types;
    /** @var array<string> */
    private array $header_only_clues;
    private array $user_agents;
    /** @var array<string> */
    private array $ajax_triggers;
    private int $max_redirect;

    /**
     * Resolve the given options against the defaults and store the result.
     *
     * Any failure raised by OptionsResolver::resolve() (for instance a value
     * that does not match its allowed type) is propagated to the caller.
     *
     * @param array $config Options overriding the defaults, keyed by option name
     */
    public function __construct(array $config)
    {
        $resolved = self::buildResolver()->resolve($config);

        // Explicit assignments: the resolver only ever returns the options declared in buildResolver().
        $this->ua_browser = $resolved['ua_browser'];
        $this->default_referer = $resolved['default_referer'];
        $this->rewrite_url = $resolved['rewrite_url'];
        $this->header_only_types = $resolved['header_only_types'];
        $this->header_only_clues = $resolved['header_only_clues'];
        $this->user_agents = $resolved['user_agents'];
        $this->ajax_triggers = $resolved['ajax_triggers'];
        $this->max_redirect = $resolved['max_redirect'];
    }

    /**
     * Default browser User-Agent string.
     */
    public function getUaBrowser(): string
    {
        return $this->ua_browser;
    }

    /**
     * Default Referer value.
     */
    public function getDefaultReferer(): string
    {
        return $this->default_referer;
    }

    /**
     * URL rewrite rules.
     *
     * @return array<array<string, string>>
     */
    public function getRewriteUrl(): array
    {
        return $this->rewrite_url;
    }

    /**
     * Content types for which the response is returned without body.
     *
     * @return array<string>
     */
    public function getHeaderOnlyTypes(): array
    {
        return $this->header_only_types;
    }

    /**
     * URL extensions prompting a HEAD request before the real one.
     *
     * @return array<string>
     */
    public function getHeaderOnlyClues(): array
    {
        return $this->header_only_clues;
    }

    /**
     * User Agent strings mapped by domain name.
     */
    public function getUserAgents(): array
    {
        return $this->user_agents;
    }

    /**
     * Markup snippets searched for to detect AJAX sites.
     *
     * @return array<string>
     */
    public function getAjaxTriggers(): array
    {
        return $this->ajax_triggers;
    }

    /**
     * Number of redirections allowed before the request is considered incomplete.
     */
    public function getMaxRedirect(): int
    {
        return $this->max_redirect;
    }

    /**
     * Build the resolver declaring every supported option, its default value and its allowed type.
     */
    private static function buildResolver(): OptionsResolver
    {
        $defaults = [
            'ua_browser' => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2',
            'default_referer' => 'http://www.google.co.uk/url?sa=t&source=web&cd=1',
            'rewrite_url' => [
                'docs.google.com' => ['/Doc?' => '/View?'],
                'tnr.com' => ['tnr.com/article/' => 'tnr.com/print/article/'],
                '.m.wikipedia.org' => ['.m.wikipedia.org' => '.wikipedia.org'],
                'm.vanityfair.com' => ['m.vanityfair.com' => 'www.vanityfair.com'],
            ],
            // File/mime types to prevent: HTTP responses matching
            // one of these content types are returned without body.
            'header_only_types' => [
                'image',
                'audio',
                'video',
            ],
            // A URL ending with one of these extensions makes the client
            // send a HEAD request first, to check whether the returned
            // content type matches header_only_types.
            'header_only_clues' => ['mp3', 'zip', 'exe', 'gif', 'gzip', 'gz', 'jpeg', 'jpg', 'mpg', 'mpeg', 'png', 'ppt', 'mov'],
            // User Agent strings, mapped by domain name
            'user_agents' => [],
            // AJAX triggers to look for (AJAX sites, e.g. Blogger
            // with its dynamic views templates).
            'ajax_triggers' => [
                "<meta name='fragment' content='!'",
                '<meta name="fragment" content="!"',
                "<meta content='!' name='fragment'",
                '<meta content="!" name="fragment"',
            ],
            // how many redirections are allowed until we assume the request won't complete
            'max_redirect' => 10,
        ];

        $allowedTypes = [
            'ua_browser' => 'string',
            'default_referer' => 'string',
            'rewrite_url' => 'array',
            'header_only_types' => 'string[]',
            'header_only_clues' => 'string[]',
            'user_agents' => 'array',
            'ajax_triggers' => 'string[]',
            'max_redirect' => 'int',
        ];

        $resolver = new OptionsResolver();
        $resolver->setDefaults($defaults);

        foreach ($allowedTypes as $option => $type) {
            $resolver->setAllowedTypes($option, $type);
        }

        return $resolver;
    }
}

0 comments on commit 789d220

Please sign in to comment.