Skip to content

Commit

Permalink
Use content batching
Browse files Browse the repository at this point in the history
  • Loading branch information
cyberpower678 committed Jun 4, 2023
1 parent 104745e commit 0249206
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 6 deletions.
20 changes: 17 additions & 3 deletions app/src/Core/APII.php
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,13 @@ class API {
* @access public
*/
public $content = "";
/**
* Stores the page content retrieval time for the page being analyzed
*
* @var int
* @access public
*/
public $contentFetchTime = 0;
/**
* Stores the revids of the page's history
*
Expand Down Expand Up @@ -192,11 +199,18 @@ class API {
* @copyright Copyright (c) 2015-2023, Maximilian Doerr, Internet Archive
* @author Maximilian Doerr (Cyberpower678)
*/
public function __construct( $page, $pageid, $config ) {
public function __construct( $page, $pageid, $config, $cachedContent = false ) {
$this->page = $page;
$this->pageid = $pageid;
$this->config = $config;
$this->content = self::getPageText( $page );
if( $cachedContent === false ) {
$this->content = self::getPageText( $page );
$this->contentFetchTime = time();
}
else {
$this->content = $cachedContent['wikitext'];
$this->contentFetchTime = $cachedContent['time'];
}
if( $config['rate_limit'] != 0 ) self::$rateLimit = $config['rate_limit'];
else self::$rateLimit = false;

Expand Down Expand Up @@ -5288,7 +5302,7 @@ public static function flushMetrics() {
*/
public function closeResources() {
	// Release the database connection if it is still attached, so that calling this
	// method a second time does not dereference null.
	if( $this->db !== null ) {
		$this->db->closeResource();
		$this->db = null;
	}

	// Close the shared cURL handle only when it is actually open. Handles are resources
	// up to PHP 7.4 and CurlHandle objects from PHP 8.0 on, where is_resource() returns
	// false for them, so both forms are checked. Calling curl_close() unconditionally on
	// an already-cleared (null) handle would raise an error instead.
	$handle = self::$globalCurl_handle;
	if( is_resource( $handle ) || $handle instanceof \CurlHandle ) curl_close( $handle );
	self::$globalCurl_handle = null;
}
Expand Down
15 changes: 13 additions & 2 deletions app/src/Core/parse.php
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@ public function analyzePage( &$modifiedLinks = [], $webRequest = false, &$editEr
$otheradded = 0;
$analyzed = 0;
$newlyArchived = [];
$timestamp = date( "Y-m-d\TH:i:s\Z" );
$history = [];
$toCheck = [];
$toCheckMeta = [];
Expand Down Expand Up @@ -847,9 +846,21 @@ public function analyzePage( &$modifiedLinks = [], $webRequest = false, &$editEr
$pageModified ) {
$revid =
API::edit( $this->commObject->page, $newtext,
$this->commObject->getConfigText( "maineditsummary", $magicwords ), false, $timestamp,
$this->commObject->getConfigText( "maineditsummary", $magicwords ), false,
date( "Y-m-d\TH:i:s\Z", $this->commObject->contentFetchTime ),
true, false, "", $editError
);
if( strpos( $editError, "editconflict" ) !== false ) {
$tmp = APIICLASS;
$commObject = new $tmp( $this->commObject->page, $this->commObject->pageid, $this->commObject->config );
$tmp = PARSERCLASS;
$parser = new $tmp( $commObject );
$stats = $parser->analyzePage();
$commObject->closeResources();
$parser = $commObject = null;

return $stats;
}
} else $magicwords['logstatus'] = "posted";
if( isset( $revid ) ) {
$magicwords['diff'] = str_replace( "api.php", "index.php", API ) . "?diff=prev&oldid=$revid";
Expand Down
21 changes: 20 additions & 1 deletion app/src/deadlink.php
Original file line number Diff line number Diff line change
Expand Up @@ -208,15 +208,34 @@
echo "Round $iteration: Fetched " . count( $pages ) . " articles!!\n\n";
}

//Create page retrieval batches
//Start from an empty list each time this runs: $batches is turned into an array of chunks
//below, so appending page IDs to a leftover value from an earlier round would mix chunk
//arrays with raw page IDs before chunking again.
$batches = [];
foreach( $pages as $tpage ) {
	$batches[] = $tpage['pageid'];
}

//Split the page IDs into groups no larger than the limit reported by API::getTitlesLimit()
$batches = array_chunk( $batches, API::getTitlesLimit() );

$contentBatch = [];
$nextIndex = 0;

//Begin page analysis
foreach( $pages as $tid => $tpage ) {
if( empty( $contentBatch ) ) {
$contentBatch = API::getBatchText( $batches[$nextIndex], 'pageid' );
$nextIndex++;
$fetchTime = time();
}
$pagesAnalyzed++;
$runpagecount++;
API::enableProfiling();
$tmp = APIICLASS;
$commObject = new $tmp( $tpage['title'], $tpage['pageid'], $config );
$commObject =
new $tmp( $tpage['title'], $tpage['pageid'], $config,
[ 'wikitext' => $contentBatch[$tpage['pageid']], 'time' => $fetchTime ]
);
$tmp = PARSERCLASS;
$parser = new $tmp( $commObject );
unset( $contentBatch[$tpage['pageid']] );
$stats = $parser->analyzePage();
$commObject->closeResources();
$parser = $commObject = null;
Expand Down

0 comments on commit 0249206

Please sign in to comment.