Skip to content

Commit

Permalink
fix: get rid of multiple fetchQueueItem executions for the same item
Browse files Browse the repository at this point in the history
  • Loading branch information
Konstantin Bychkov committed Dec 24, 2018
1 parent bddcd71 commit dc96bc4
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 1 deletion.
13 changes: 12 additions & 1 deletion lib/crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -1160,9 +1160,13 @@ Crawler.prototype.queueURL = function(url, referrer, force) {
Crawler.prototype.fetchQueueItem = function(queueItem) {
var crawler = this;

crawler.fetchingQueueItem = true;

crawler.queue.update(queueItem.id, {
status: "spooled"
}, function(error, queueItem) {
crawler.fetchingQueueItem = false;

if (error) {
return crawler.emit("queueerror", error, queueItem);
}
Expand Down Expand Up @@ -1731,11 +1735,18 @@ Crawler.prototype.crawl = function() {
var crawler = this;

if (crawler._openRequests.length >= crawler.maxConcurrency ||
crawler.fetchingRobotsTxt) {
crawler.fetchingRobotsTxt || crawler.fetchingQueueItem) {
return crawler;
}

// The flag means the fetching process begins which includes finding of oldest unfetched item and
// updating its status to `spooled`. It is required to avoid multiple fetching of the same item
// at defined interval in case of slow queue implementation (DB, for example)
crawler.fetchingQueueItem = true;

crawler.queue.oldestUnfetchedItem(function(error, queueItem) {
crawler.fetchingQueueItem = false;

if (error) {
// Do nothing
} else if (queueItem) {
Expand Down
30 changes: 30 additions & 0 deletions test/reliability.js
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,36 @@ describe("Crawler reliability", function() {
localCrawler.start();
});

it("should only fetch queue item once", function(done) {
    var crawler = makeCrawler("http://127.0.0.1:3000/");
    var delay = crawler.interval * 2;
    var fetchedUrls = [];

    var originalOldestUnfetchedItem = crawler.queue.oldestUnfetchedItem;
    var originalUpdate = crawler.queue.update;

    // Simulate a slow queue backend (e.g. a database) by deferring both
    // queue operations past the crawl interval, so the interval fires
    // again while a fetch is still being spooled.
    crawler.queue.oldestUnfetchedItem = function(callback) {
        setTimeout(originalOldestUnfetchedItem.bind(this, callback), delay);
    };

    crawler.queue.update = function(id, updates, callback) {
        setTimeout(originalUpdate.bind(this, id, updates, callback), delay);
    };

    crawler.queueURL("http://127.0.0.1:3000/forbidden");

    // Record every URL whose fetch actually starts; a duplicate fetch of
    // the same queue item would inflate this list.
    crawler.on("fetchstart", function(queueItem) {
        fetchedUrls.push(queueItem.url);
    });

    crawler.on("complete", function() {
        fetchedUrls.length.should.equal(8);
        done();
    });

    crawler.start();
});

describe("when stopping the crawler", function() {

it("should not terminate open connections unless asked", function(done) {
Expand Down

0 comments on commit dc96bc4

Please sign in to comment.