Skip to content

Commit

Permalink
Add ability to process prefetched content
Browse files Browse the repository at this point in the history
A new setContentAsPrefetched() method lets callers hand Graby the content of
a page that was fetched beforehand; fetchContent() then processes that content instead of fetching the url.

Taking Wallabag as an example, this makes it possible to send the
content of a page (through a browser extension, for example) without
making any network call to fetch the page.

Signed-off-by: Kevin Decherf <kevin@kdecherf.com>
  • Loading branch information
Kdecherf committed Feb 3, 2022
1 parent fda6724 commit 31135a7
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 5 deletions.
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,22 @@ array(

The `date` result is the same as displayed in the content. If `date` is not `null` in the result, we recommend parsing it with [`date_parse`](http://php.net/date_parse) (this is what we use to validate that the date is correct).

### Retrieve content from a prefetched page

If you want to extract content from a page you fetched outside of Graby, you can call `setContentAsPrefetched()` before calling `fetchContent()`, e.g.:

``` php
use Graby\Graby;

$article = 'http://www.bbc.com/news/entertainment-arts-32547474';

$input = '<html>[...]</html>';

$graby = new Graby();
$graby->setContentAsPrefetched($input);
$result = $graby->fetchContent($article);
```

### Cleanup content

Since version 1.9.0, you can also send HTML content to be cleaned up in the same way Graby cleans content retrieved from a URL. The URL is still needed to convert links to absolute, etc.
Expand Down
33 changes: 28 additions & 5 deletions src/Graby.php
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ class Graby
private Punycode $punycode;
private bool $imgNoReferrer = false;

private ?string $prefetchedContent = null;

public function __construct(array $config = [], Client $client = null, ConfigBuilder $configBuilder = null)
{
$this->config = new GrabyConfig($config);
Expand Down Expand Up @@ -103,6 +105,11 @@ public function reloadConfigFiles(): void
$this->configBuilder->loadConfigFiles();
}

/**
 * Provide (or clear) page content that was fetched outside of Graby.
 *
 * While a prefetched content is set, the fetch step uses it as the response
 * body instead of requesting the url over HTTP. The value is stored on the
 * instance and stays there until it is replaced or cleared through this
 * method (NOTE(review): confirm nothing else resets it), so pass `null` to
 * go back to regular network fetching when reusing the same instance.
 *
 * @param string|null $content Raw HTML of the page, or null to clear a previously set content
 */
public function setContentAsPrefetched(?string $content): void
{
    $this->prefetchedContent = $content;
}

/**
* Fetch content from the given url and return a readable content.
*
Expand Down Expand Up @@ -208,6 +215,18 @@ public function cleanupHtml($contentBlock, string $url): string
return trim($this->cleanupXss((string) $html));
}

/**
 * Wrap the prefetched content in a response array, so it can be used in
 * place of the array returned by the HTTP client.
 *
 * The given url is reported as the effective url, the content type is
 * always `text/html` and the status is always 200.
 *
 * @param string $url Url the prefetched content belongs to
 *
 * @return array{body: string|null, effective_url: string, headers: array<string, string>, status: int}
 */
private function getResponseForPrefetchedContent(string $url): array
{
    // no real response exists, so the headers are synthesized
    $headers = ['content-type' => 'text/html'];

    return [
        'body' => $this->prefetchedContent,
        'effective_url' => $url,
        'headers' => $headers,
        'status' => 200,
    ];
}

/**
* Do fetch content from an url.
*
Expand All @@ -218,9 +237,13 @@ private function doFetchContent(string $url): array
$url = $this->validateUrl($url);
$siteConfig = $this->configBuilder->buildFromUrl($url);

$this->logger->info('Fetching url: {url}', ['url' => $url]);

$response = $this->httpClient->fetch($url, false, $siteConfig->http_header);
if (null === $this->prefetchedContent) {
$this->logger->info('Fetching url: {url}', ['url' => $url]);
$response = $this->httpClient->fetch($url, false, $siteConfig->http_header);
} else {
$this->logger->info('Content provided as prefetched for url: {url}', ['url' => $url]);
$response = $this->getResponseForPrefetchedContent($url);
}

$effectiveUrl = $response['effective_url'];
$effectiveUrl = str_replace(' ', '%20', $effectiveUrl);
Expand Down Expand Up @@ -264,7 +287,7 @@ private function doFetchContent(string $url): array

// check site config for single page URL - fetch it if found
$isSinglePage = false;
if ($this->config->getSinglepage() && ($singlePageResponse = $this->getSinglePage($html, $effectiveUrl))) {
if ($this->config->getSinglepage() && null === $this->prefetchedContent && ($singlePageResponse = $this->getSinglePage($html, $effectiveUrl))) {
$isSinglePage = true;
$effectiveUrl = $singlePageResponse['effective_url'];

Expand Down Expand Up @@ -305,7 +328,7 @@ private function doFetchContent(string $url): array

// Deal with multi-page articles
$isMultiPage = (!$isSinglePage && $extractResult && null !== $this->extractor->getNextPageUrl());
if ($this->config->getMultipage() && $isMultiPage) {
if ($this->config->getMultipage() && null === $this->prefetchedContent && $isMultiPage) {
$this->logger->info('Attempting to process multi-page article');
// store first page to avoid parsing it again (previous url content is in `$contentBlock`)
$multiPageUrls = [$effectiveUrl];
Expand Down
17 changes: 17 additions & 0 deletions tests/GrabyTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -1342,6 +1342,23 @@ public function testWithTooLongHtmlJitFail(): void
$this->assertNotSame('No title found', $res['title']);
}

/**
 * Prefetched content must be extracted under the given url without
 * sending any HTTP request.
 */
public function testPrefetchedContent(): void
{
    $httpMockClient = new HttpMockClient();
    $graby = new Graby([
        'debug' => true,
    ], $httpMockClient);

    $input = '<html><body><h1>This is my awesome article</h1><article><p>' . str_repeat('This is an awesome text with some links, here there are the awesome', 7) . '</p></article></body></html>';

    $graby->setContentAsPrefetched($input);
    $res = $graby->fetchContent('https://example.com/prefetched-content');

    $this->assertSame('This is my awesome article', $res['title']);
    $this->assertSame('https://example.com/prefetched-content', $res['url']);
    $this->assertStringContainsString('here there are the awesome', $res['html']);

    // the whole point of prefetched content is to skip the network:
    // the mocked client must not have seen any request
    $this->assertCount(0, $httpMockClient->getRequests());
}

/**
* Return an instance of graby with a mocked Guzzle client returning data from a predefined file.
*/
Expand Down

0 comments on commit 31135a7

Please sign in to comment.