Hcr helps you scrape data from web pages. It can crawl an entire site recursively, and it supports rate-limiting requests, adding custom headers, and converting HTML into plain objects shaped however you like. Each example below includes an options object that you can modify and reuse. The callback passed to the constructor is the default callback for all methods.
`hcr` is available on npm. To install it, run:

```sh
$ npm install hcr
```
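All of the examples below assume the package has been required and a default callback defined. The callback signature here is an assumption (a Node-style `(err, result)` pair), not something this README documents, so treat it as a sketch:

```js
var Crawler = require('hcr');

// Assumed Node-style signature; adjust to whatever hcr actually passes.
var callback = function (err, result) {
  if (err) return console.error(err);
  console.log(result);
};
```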
Fetch a list of pages with `getPage`:

```js
var opts = {
  reservoir: 100,                       // allow up to 100 requests...
  reservoirRefreshInterval: 60 * 1000   // ...per 60-second window (ms)
};
var crawler = new Crawler(opts, callback);
crawler.getPage(['site1.com', 'site2.com']);
```
Extract structured data with `toObject`. Each key in the mapping object names a field; `selector` picks an element, `func` names the method to call on it, and `args` supplies that method's arguments (here, jQuery-style `text()` and `attr('src')`):

```js
var opts = {
  reservoir: 100,
  reservoirRefreshInterval: 60 * 1000
};
var crawler = new Crawler(opts, callback);
var object = {
  'Name': {
    selector: '#name',   // take the text of #name
    func: 'text'
  },
  'Image': {
    selector: '#image',  // take the src attribute of #image
    func: 'attr',
    args: ['src']
  }
};
crawler.toObject(['site1.com', 'site2.com'], object);
```
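For a page where `#name` contains the text "Alice" and `#image` has `src="/alice.png"`, the result for that page would plausibly look like this (the exact output shape is an assumption, not documented behavior):

```js
{
  'Name': 'Alice',      // from $('#name').text()
  'Image': '/alice.png' // from $('#image').attr('src')
}
```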
`recursiveToObject` applies the same mapping, but follows links and crawls the whole site recursively:

```js
var opts = {
  reservoir: 100,
  reservoirRefreshInterval: 60 * 1000
};
var crawler = new Crawler(opts, callback);
var object = {
  'Name': {
    selector: '#name',
    func: 'text'
  },
  'Image': {
    selector: '#image',
    func: 'attr',
    args: ['src']
  }
};
crawler.recursiveToObject(['site1.com', 'site2.com'], object);
```
Instead of calling a method with `func`, you can read a DOM property directly with `prop`:

```js
var opts = {
  reservoir: 100,
  reservoirRefreshInterval: 60 * 1000
};
var crawler = new Crawler(opts, callback);
var object = {
  'Name': {
    selector: '#span',
    prop: 'textContent'   // read the element's textContent property
  }
};
crawler.recursiveToObject(['site1.com', 'site2.com'], object);
```
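For simple elements the two styles should produce similar output, assuming `func` maps to jQuery-style methods and `prop` to raw DOM properties (an assumption based on the names, not documented behavior):

```js
var viaFunc = { selector: '#span', func: 'text' };        // like $('#span').text()
var viaProp = { selector: '#span', prop: 'textContent' }; // like element.textContent
```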
`recursiveRegexToObject` works like `recursiveToObject` but also takes a regular expression (here, one that matches uppercase letters):

```js
var opts = {
  reservoir: 100,
  reservoirRefreshInterval: 60 * 1000
};
var crawler = new Crawler(opts, callback);
var regex = /[A-Z]/g;   // matches every uppercase letter
var object = {
  'Name': {
    selector: '#span',
    prop: 'textContent'
  }
};
crawler.recursiveRegexToObject(['site1.com', 'site2.com'], regex, object);
```
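As a reminder of what that particular pattern does on its own (plain JavaScript, independent of hcr):

```js
'Hello World'.match(/[A-Z]/g); // => ['H', 'W']
```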
The crawler emits a `completed` event when crawling finishes:

```js
var opts = {
  reservoir: 100,
  reservoirRefreshInterval: 60 * 1000
};
var crawler = new Crawler(opts, callback);
var doneCallback = function () {
  // crawling done
};
crawler.on('completed', doneCallback);
```
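A plausible end-to-end flow combining the pieces above (the exact ordering of handler registration and kickoff is an assumption; check the hcr docs):

```js
var crawler = new Crawler(opts, callback);

// Register the completion handler before starting the crawl.
crawler.on('completed', function () {
  console.log('crawling done');
});

// Kick off a recursive crawl with the extraction mapping from above.
crawler.recursiveToObject(['site1.com'], object);
```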