Skip to content

Commit

Permalink
Merge pull request #212 from shtrom/replace_string-before-redirect
Browse files Browse the repository at this point in the history
Replace string before redirect
  • Loading branch information
j0k3r committed Feb 2, 2022
2 parents abe816a + 515b69c commit fda6724
Show file tree
Hide file tree
Showing 7 changed files with 2,649 additions and 38 deletions.
78 changes: 55 additions & 23 deletions src/Extractor/ContentExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -131,30 +131,9 @@ public function process(string $html, string $url, SiteConfig $siteConfig = null
{
$this->reset();

$this->siteConfig = $siteConfig;
if (null === $this->siteConfig) {
$this->siteConfig = $this->buildSiteConfig($url, $html);
}
$this->prepareSiteConfig($html, $url, $siteConfig);

// add lazyload information from siteconfig
if ($this->siteConfig->src_lazy_load_attr && !\in_array($this->siteConfig->src_lazy_load_attr, $this->config->getSrcLazyLoadAttributes(), true)) {
$this->config->addSrcLazyLoadAttributes($this->siteConfig->src_lazy_load_attr);
}

$this->logger->debug('Actual site config', ['siteConfig' => $this->siteConfig]);

// do string replacements
if (!empty($this->siteConfig->find_string)) {
if (\count($this->siteConfig->find_string) === \count($this->siteConfig->replace_string)) {
$html = str_replace($this->siteConfig->find_string, $this->siteConfig->replace_string, $html, $count);
$this->logger->info('Strings replaced: {count} (find_string and/or replace_string)', ['count' => $count]);
} else {
$this->logger->info('Skipped string replacement - incorrect number of find-replace strings in site config');
}
unset($count);
}

$this->logger->debug('HTML after site config strings replacements', ['html' => $html]);
$html = $this->processStringReplacements($html, $url, $siteConfig);

// load and parse html
$parser = $this->siteConfig->parser();
Expand Down Expand Up @@ -626,6 +605,34 @@ public function process(string $html, string $url, SiteConfig $siteConfig = null
return $this->success;
}

/**
 * Apply the site config's find_string / replace_string pairs to an HTML body.
 *
 * The site config is prepared first (see prepareSiteConfig()), so this method
 * can be used independently of process().
 *
 * The replacement only happens when find_string is non-empty and holds the
 * same number of entries as replace_string; otherwise the HTML is returned
 * untouched and the skip is logged.
 *
 * @param string          $html       HTML body to run the replacements on
 * @param string          $url        URL handed to prepareSiteConfig() together with $html
 * @param SiteConfig|null $siteConfig Will avoid to recalculate the site config
 *
 * @return string $html with replacements performed
 */
public function processStringReplacements(string $html, string $url, ?SiteConfig $siteConfig = null): string
{
    // Same first step as process(), which keeps this method callable on its own.
    $this->prepareSiteConfig($html, $url, $siteConfig);

    if (!empty($this->siteConfig->find_string)) {
        $needles = $this->siteConfig->find_string;
        $substitutes = $this->siteConfig->replace_string;

        if (\count($needles) !== \count($substitutes)) {
            // Mismatched lists cannot be paired up, so nothing is replaced.
            $this->logger->info('Skipped string replacement - incorrect number of find-replace strings in site config');
        } else {
            $html = str_replace($needles, $substitutes, $html, $replaced);
            $this->logger->info('Strings replaced: {count} (find_string and/or replace_string)', ['count' => $replaced]);
        }
    }

    // Logged on every call, whether or not a replacement took place.
    $this->logger->debug('HTML after site config strings replacements', ['html' => $html]);

    return $html;
}

/**
* @return \DOMElement|\DOMNode|null
*/
Expand Down Expand Up @@ -712,6 +719,31 @@ protected function addAuthor(string $authorDirty): void
}
}

/**
 * Make sure $this->siteConfig is populated and registered with the extractor config.
 *
 * Behaviour:
 *  - a site config is already set and none is passed: nothing happens;
 *  - a site config is passed: it replaces the current one;
 *  - none is set and none is passed: one is built via buildSiteConfig($url, $html).
 *
 * Whenever the site config is (re)assigned, its src_lazy_load_attr is added to
 * the config's lazy-load attributes (unless already listed) and the resulting
 * site config is logged at debug level.
 *
 * @param string          $html       HTML forwarded to buildSiteConfig() when a config has to be built
 * @param string          $url        URL forwarded to buildSiteConfig() when a config has to be built
 * @param SiteConfig|null $siteConfig Will avoid to recalculate the site config
 */
private function prepareSiteConfig(string $html, string $url, ?SiteConfig $siteConfig = null): void
{
    $alreadyPrepared = null !== $this->siteConfig;

    // Keep the current site config when the caller has nothing to override it with.
    if ($alreadyPrepared && null === $siteConfig) {
        return;
    }

    // An explicitly given site config wins; otherwise build one for this url/html.
    $this->siteConfig = $siteConfig ?? $this->buildSiteConfig($url, $html);

    // add lazyload information from siteconfig
    $lazyLoadAttr = $this->siteConfig->src_lazy_load_attr;
    if ($lazyLoadAttr) {
        $knownAttrs = $this->config->getSrcLazyLoadAttributes();

        if (!\in_array($lazyLoadAttr, $knownAttrs, true)) {
            $this->config->addSrcLazyLoadAttributes($lazyLoadAttr);
        }
    }

    $this->logger->debug('Actual site config', ['siteConfig' => $this->siteConfig]);
}

/**
* Check if given node list exists and has length more than 0.
*
Expand Down
10 changes: 8 additions & 2 deletions src/Extractor/HttpClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,21 @@ class HttpClient
private HttpMethodsClient $client;
private LoggerInterface $logger;
private History $responseHistory;
private ?ContentExtractor $extractor;

/**
* @param Client $client Http client
* @param array $config
*/
public function __construct(Client $client, $config = [], LoggerInterface $logger = null)
public function __construct(Client $client, array $config = [], LoggerInterface $logger = null, ContentExtractor $extractor = null)
{
$this->config = new HttpClientConfig($config);

if (null === $logger) {
$logger = new NullLogger();
}

$this->logger = $logger;
$this->extractor = $extractor;

$this->responseHistory = new History();
$this->client = new HttpMethodsClient(
Expand Down Expand Up @@ -202,6 +204,10 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
}
}

if (null !== $this->extractor) {
$body = $this->extractor->processStringReplacements($body, $effectiveUrl);
}

// check for <meta name='fragment' content='!'/>
// for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
Expand Down
3 changes: 2 additions & 1 deletion src/Graby.php
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ public function __construct(array $config = [], Client $client = null, ConfigBui
$this->httpClient = new HttpClient(
$client ?: new PluginClient(HttpClientDiscovery::find(), [new CookiePlugin(new CookieJar())]),
$this->config->getHttpClient(),
$this->logger
$this->logger,
$this->extractor
);

$this->punycode = new Punycode();
Expand Down
4 changes: 2 additions & 2 deletions tests/GrabyFunctionalTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ public function testRealFetchContent(): void
$this->assertSame('Trying using method "{method}" on url "{url}"', $records[13]['message']);
$this->assertSame('get', $records[13]['context']['method']);
$this->assertSame('Use default referer "{referer}" for url "{url}"', $records[15]['message']);
$this->assertSame('Data fetched: {data}', $records[16]['message']);
$this->assertSame('Looking for site config files to see if single page link exists', $records[18]['message']);
$this->assertSame('Data fetched: {data}', $records[18]['message']);
$this->assertSame('Looking for site config files to see if single page link exists', $records[20]['message']);
}

public function testRealFetchContent2(): void
Expand Down
21 changes: 11 additions & 10 deletions tests/GrabyTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public function dataForFetchContent(): array

$test = (string) file_get_contents($file->getRealpath());

preg_match('/-----URL-----\s*(.*?)\s*-----URL_EFFECTIVE-----\s*(.*?)\s*-----HEADER-----\s*(.*?)\s*-----LANGUAGE-----\s*(.*?)\s*-----AUTHOR-----\s*(.*?)\s*-----TITLE-----\s*(.*?)\s*-----SUMMARY-----\s*(.*?)\s*-----RAW_CONTENT-----\s*(.*?)\s*-----PARSED_CONTENT-----\s*(.*)/sx', $test, $match);
preg_match('/-----URL-----\s*(.*?)\s*-----URL_EFFECTIVE-----\s*(.*?)\s*-----HEADER-----\s*(.*?)\s*-----LANGUAGE-----\s*(.*?)\s*-----AUTHOR-----\s*(.*?)\s*-----TITLE-----\s*(.*?)\s*-----SUMMARY-----\s*(.*?)\s*-----RAW_CONTENT-----\s*(.*?)\s*(------RAW_CONTENT2-----\s*(.*?)\s*)?----PARSED_CONTENT-----\s*(.*)\s*/sx', $test, $match);

$tests[] = [
$match[1], // url
Expand All @@ -46,7 +46,8 @@ public function dataForFetchContent(): array
$match[6], // title
$match[7], // summary
$match[8], // raw content
$match[9], // parsed content
$match[10], // raw content2
$match[11], // parsed content
];
}

Expand All @@ -56,11 +57,11 @@ public function dataForFetchContent(): array
/**
* @dataProvider dataForFetchContent
*/
public function testFetchContent(string $url, string $urlEffective, string $header, string $language, string $author, string $title, string $summary, string $rawContent, string $parsedContent): void
public function testFetchContent(string $url, string $urlEffective, string $header, string $language, string $author, string $title, string $summary, string $rawContent, string $rawContent2, string $parsedContent): void
{
$httpMockClient = new HttpMockClient();
$httpMockClient->addResponse(new Response(200, ['Content-Type' => $header], $rawContent));
$httpMockClient->addResponse(new Response(200, ['Content-Type' => $header], $rawContent));
$httpMockClient->addResponse(new Response(200, ['Content-Type' => $header], (!empty($rawContent2)) ? $rawContent2 : $rawContent));

$graby = new Graby([
'xss_filter' => false,
Expand All @@ -77,22 +78,22 @@ public function testFetchContent(string $url, string $urlEffective, string $head

$this->assertCount(11, $res);

$this->assertSame($urlEffective, $res['url'], 'Same url');
$this->assertSame($title, $res['title'], 'Same title');
$this->assertSame($summary, $res['summary'], 'Same summary');

if ($language) {
$this->assertSame($language, $res['language']);
} else {
$this->assertEmpty($res['language']);
$this->assertEmpty($res['language'], 'language not empty; got ' . $res['language']);
}

if ($author) {
$this->assertSame([$author], $res['authors']);
} else {
$this->assertEmpty($res['authors']);
$this->assertEmpty($res['authors'], 'authors not empty; got ' . $res['language']);
}

$this->assertSame($urlEffective, $res['url'], 'Same url');
$this->assertSame($title, $res['title'], 'Same title');
$this->assertSame($summary, $res['summary'], 'Same summary');

$this->assertSame($parsedContent, $res['html'], 'Same html');

$this->assertStringContainsString('text/html', $res['headers']['content-type']);
Expand Down
12 changes: 12 additions & 0 deletions tests/fixtures/site_config/lifehacker.com.au.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Generated by FiveFilters.org's web-based selection tool
# Place this file inside your site_config/custom/ folder
# Source: https://siteconfig.fivefilters.org/grab.php?url=https%3A%2F%2Fwww.lifehacker.com.au%2F2019%2F03%2Fwe-tried-mark-zuckerbergs-tricks-for-looking-taller-in-photos%2F

body: //div[contains(concat(' ',normalize-space(@class),' '),' main__content ')]
author: //div[@class='meta__author']/a

# Avoid redirecting to 'nojs' page
find_string: <meta http-equiv="refresh"
replace_string: <meta norefresh

test_url: https://www.lifehacker.com.au/2019/03/we-tried-mark-zuckerbergs-tricks-for-looking-taller-in-photos/

0 comments on commit fda6724

Please sign in to comment.