Skip to content
This repository has been archived by the owner on Nov 29, 2018. It is now read-only.

Commit

Permalink
feat(admin): add option to clear all cached URLs by domain/w
Browse files Browse the repository at this point in the history
  • Loading branch information
hypeJunction committed May 31, 2017
1 parent 69d8486 commit 2164593
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 19 deletions.
13 changes: 13 additions & 0 deletions actions/admin/scraper/clear.php
@@ -0,0 +1,13 @@
<?php

/**
 * Admin action: remove every cached scraper record whose URL matches
 * the submitted "domain" input.
 *
 * When no domain is submitted the action ends immediately with an OK response.
 */

$needle = get_input('domain');
if (empty($needle)) {
	return elgg_ok_response();
}

$scraper = \hypeJunction\Scraper\ScraperService::getInstance();

// find() yields the matching URLs; each one is deleted through the service
foreach ($scraper->find($needle) as $cached_url) {
	$scraper->delete($cached_url);
}
78 changes: 59 additions & 19 deletions classes/hypeJunction/Scraper/ScraperService.php
Expand Up @@ -30,7 +30,7 @@ class ScraperService {
* Constructor
*
* @param Parser $parser Parser
* @param Pool $cache Cache
* @param Pool $cache Cache
*/
public function __construct(Parser $parser, Pool $cache) {
$this->parser = $parser;
Expand All @@ -49,18 +49,21 @@ public static function getInstance() {
$cache = $routes_cache = is_memcache_available() ? new Memcache() : new FileCache();
self::$_instance = new self($parser, $cache);
}

return self::$_instance;
}

/**
* Get scraped data
*
*
* @param string $url URL
*
* @return array|void
*/
public function get($url) {
if (!$this->parser->isValidUrl($url)) {
elgg_log(__METHOD__ . ' expects a valid URL: ' . $url);

return null;
}

Expand All @@ -80,12 +83,35 @@ public function get($url) {
return $row ? unserialize($row->data) : null;
}

/**
 * Find scraped resources
 *
 * Looks up rows in the scraper_data table whose URL contains the given
 * query as a substring and returns the matching URLs.
 *
 * The query is bound as a named parameter (in line with save() and delete())
 * rather than being interpolated into the SQL string. SQL LIKE wildcards
 * (% and _) contained in the query are not escaped and keep their
 * pattern-matching meaning.
 *
 * @param string $query Query to match against
 *
 * @return string[] Matching URLs; empty array when nothing matches
 */
public function find($query) {

	$dbprefix = elgg_get_config('dbprefix');

	$rows = get_data("
		SELECT url FROM {$dbprefix}scraper_data
		WHERE url LIKE :pattern
	", null, [
		':pattern' => '%' . (string) $query . '%',
	]);

	if (empty($rows) || !is_array($rows)) {
		// Nothing matched, or the DB layer did not return a row set
		return [];
	}

	return array_map(function ($elem) {
		return $elem->url;
	}, $rows);
}

/**
* Parse and scrape a URL
*
* @param string $url URL
* @param bool $flush Flush existing URL data
* @param bool $recurse Recurse into subresources
*
* @return array|false
*/
public function parse($url, $flush = false, $recurse = true) {
Expand All @@ -94,6 +120,7 @@ public function parse($url, $flush = false, $recurse = true) {

if (!$this->parser->isValidUrl($url)) {
elgg_log("Invalid URL: $url");

return false;
}

Expand All @@ -115,6 +142,7 @@ public function parse($url, $flush = false, $recurse = true) {

if (!$response instanceof \GuzzleHttp\Psr7\Response || $response->getStatusCode() != 200) {
$this->save($url, false);

return false;
}

Expand All @@ -127,9 +155,10 @@ public function parse($url, $flush = false, $recurse = true) {
$content_length = array_shift($content_length);
}

if ((int) $content_length > $max_upload) {
if ((int)$content_length > $max_upload) {
// Large images eat up memory
$this->save($url, false);

return false;
}

Expand All @@ -144,6 +173,7 @@ public function parse($url, $flush = false, $recurse = true) {

if (!$data) {
$this->save($url, false);

return false;
}

Expand All @@ -169,13 +199,14 @@ public function parse($url, $flush = false, $recurse = true) {
break;
}

$data = elgg_trigger_plugin_hook('parse', 'framework:scraper', array(
$data = elgg_trigger_plugin_hook('parse', 'framework:scraper', [
'url' => $url,
), $data);
], $data);

elgg_log("URL data parsed: " . print_r($data, true));

$this->save($url, $data);

return $data;
}

Expand All @@ -184,6 +215,7 @@ public function parse($url, $flush = false, $recurse = true) {
*
* @param string $url URL
* @param array $data Data
*
* @return boolean
*/
public function save($url, $data = false) {
Expand All @@ -200,13 +232,14 @@ public function save($url, $data = false) {
ON DUPLICATE KEY UPDATE
data = :data
", [
':url' => (string) $url,
':url' => (string)$url,
':data' => serialize($data),
':hash' => sha1($url),
]);

if ($result) {
$this->cache->put(sha1($url), $data);

return true;
}

Expand All @@ -217,6 +250,7 @@ public function save($url, $data = false) {
* Delete URL data from DB and cache
*
* @param string $url URL
*
* @return bool
*/
public function delete($url) {
Expand All @@ -233,24 +267,25 @@ public function delete($url) {
}
}
}

$this->cache->invalidate(sha1($url));

$dbprefix = elgg_get_config('dbprefix');
$result = delete_data("
DELETE FROM {$dbprefix}scraper_data
WHERE url = :url
", [
':url' => (string) $url,
':url' => (string)$url,
]);

return (bool) $result;
return (bool)$result;
}

/**
* Saves an image on Elgg's filestore
*
* @param string $url URL of the image
*
* @return \ElggFile|false
*/
public function saveImageFromUrl($url) {
Expand Down Expand Up @@ -286,11 +321,12 @@ public function saveImageFromUrl($url) {
$tmp->write($raw_bytes);
$tmp->close();
unset($raw_bytes);

//@Todo - looks like we need some way to check this in core
// instead of elgg_save_resized_image() OOMing
if (!$this->hasMemoryToResize($tmp->getFilenameOnFilestore())) {
$tmp->delete();

return false;
}

Expand All @@ -299,6 +335,7 @@ public function saveImageFromUrl($url) {
$imagesize = getimagesize($tmp->getFilenameOnFilestore());
if (!$imagesize || $imagesize[0] < $lower_threashold || $imagesize[0] > $upper_threshold) {
$tmp->delete();

return false;
}

Expand Down Expand Up @@ -328,27 +365,28 @@ public function saveImageFromUrl($url) {

/**
* Parse thumbnails from scraped data
*
*
* @param array $data Data
*
* @return array
*/
public function parseThumbs(array $data = []) {
$assets = [];
$thumbnails = (array) elgg_extract('thumbnails', $data, []);
$icons = (array) elgg_extract('icons', $data, []);
$thumbnails = (array)elgg_extract('thumbnails', $data, []);
$icons = (array)elgg_extract('icons', $data, []);

// Try 3 images and choose the one with highest dimensions
$thumbnails = array_filter(array_unique(array_merge($thumbnails, $icons)));
$thumbs_parsed = 0;
foreach ($thumbnails as $thumbnail) {
$thumbnail = elgg_normalize_url($thumbnail);
$asset = $this->parse($thumbnail, false, false);

if ($asset) {
$thumbs_parsed++;
$assets[] = $asset;
}

if ($thumbs_parsed == 5) {
break;
}
Expand All @@ -359,6 +397,7 @@ public function parseThumbs(array $data = []) {
if ($a['width'] == $b['width'] && $a['height'] == $b['height']) {
return 0;
}

return ($a['width'] > $b['width'] || $a['height'] > $b['height']) ? -1 : 1;
});

Expand Down Expand Up @@ -396,22 +435,23 @@ public static function getHttpClientConfig() {

return elgg_trigger_plugin_hook('http:config', 'framework:scraper', null, $config);
}

/**
* Do we estimate that we have enough memory available to resize an image?
*
*
* @param string $source - the source path of the file
*
* @return bool
*/
public function hasMemoryToResize($source) {
$imginfo = getimagesize($source);
$requiredMemory1 = ceil($imginfo[0] * $imginfo[1] * 5.35);
$requiredMemory2 = ceil($imginfo[0] * $imginfo[1] * ($imginfo['bits'] / 8) * $imginfo['channels'] * 2.5);
$requiredMemory = (int) max($requiredMemory1, $requiredMemory2);
$requiredMemory = (int)max($requiredMemory1, $requiredMemory2);

$mem_avail = elgg_get_ini_setting_in_bytes('memory_limit');
$mem_used = memory_get_usage();

$mem_avail = $mem_avail - $mem_used - 20971520; // 20 MB buffer, yeah arbitrary but necessary

return $mem_avail > $requiredMemory;
Expand Down
7 changes: 7 additions & 0 deletions languages/en.php
Expand Up @@ -31,4 +31,11 @@
'scraper:refetch:confirm' => 'Refetch will erase existing URL information, preview images and any modifications you have made to it',
'scraper:refetch:success' => 'URL has been successfully refetched',
'scraper:refetch:error' => 'URL could not be refetched',

'admin:scraper:cache' => 'Scraper Cache',
'admin:scraper:cache:domain' => 'Cached Domain/URL',
'admin:scraper:cache:find' => 'Find Cached URLs',
'admin:scraper:cache:clear' => 'Clear Cache',
'admin:scraper:cache:no_results' => 'No URLs were cached in this domain',

];
9 changes: 9 additions & 0 deletions start.php
Expand Up @@ -41,6 +41,7 @@
elgg_register_plugin_hook_handler('register', 'menu:scraper:card', [Menus::class, 'setupCardMenu']);
elgg_register_action('admin/scraper/edit', __DIR__ . '/actions/admin/scraper/edit.php', 'admin');
elgg_register_action('admin/scraper/refetch', __DIR__ . '/actions/admin/scraper/refetch.php', 'admin');
elgg_register_action('admin/scraper/clear', __DIR__ . '/actions/admin/scraper/clear.php', 'admin');

// Admin
elgg_register_menu_item('page', array(
Expand All @@ -51,6 +52,14 @@
'section' => 'develop'
));

elgg_register_menu_item('page', array(
'name' => 'scraper:cache',
'href' => 'admin/scraper/cache',
'text' => elgg_echo('admin:scraper:cache'),
'context' => 'admin',
'section' => 'develop'
));

elgg_register_ajax_view('output/card');
});

Expand Down
41 changes: 41 additions & 0 deletions views/default/admin/scraper/cache.php
@@ -0,0 +1,41 @@
<?php

/**
 * Admin page: search the scraper cache by domain/URL fragment.
 *
 * Always renders the search form. When a "domain" input is present, the
 * matching cached URLs are rendered as cards, followed by a button linking
 * to the action that clears them. A "no results" notice is shown instead
 * when nothing matches.
 */

$search = get_input('domain');

$form_attrs = [
	'method' => 'GET',
	'action' => current_page_url(),
	'disable_security' => true,
];
$form_vars = [
	'domain' => $search,
];

echo elgg_view_form('admin/scraper/cache', $form_attrs, $form_vars);

if (!$search) {
	// Nothing to look up until a domain has been submitted
	return;
}

$matches = \hypeJunction\Scraper\ScraperService::getInstance()->find($search);

if (!$matches) {
	$notice = elgg_echo('admin:scraper:cache:no_results');
	echo elgg_format_element('p', [
		'class' => 'elgg-no-results',
	], $notice);

	return;
}

foreach ($matches as $cached_url) {
	echo elgg_format_element('div', [], elgg_view('output/card', [
		'href' => $cached_url,
	]));
}

// Link to the clear action, carrying the same domain filter
$clear_link = [
	'class' => 'elgg-button elgg-button-delete',
	'text' => elgg_echo('admin:scraper:cache:clear'),
	'href' => elgg_http_add_url_query_elements('action/admin/scraper/clear', [
		'domain' => $search,
	]),
	'is_action' => true,
];

echo elgg_view('output/url', $clear_link);
15 changes: 15 additions & 0 deletions views/default/forms/admin/scraper/cache.php
@@ -0,0 +1,15 @@
<?php

/**
 * Form body for the scraper cache search: a single text field for the
 * domain/URL fragment, pre-filled from the current "domain" input, and a
 * submit button placed in the form footer.
 */

$domain_field = [
	'#type' => 'text',
	'#label' => elgg_echo('admin:scraper:cache:domain'),
	'name' => 'domain',
	'value' => get_input('domain'),
];

echo elgg_view_field($domain_field);

$submit_field = [
	'#type' => 'submit',
	'value' => elgg_echo('admin:scraper:cache:find'),
];

elgg_set_form_footer(elgg_view_field($submit_field));

0 comments on commit 2164593

Please sign in to comment.