hcr

hcr helps you grab data from web pages. It can crawl an entire site recursively, and it supports limiting requests, adding custom headers, and converting HTML into an object of your choosing.

Dependencies

Getting Started

There is an example config that you can modify and use. The callback argument passed to the constructor is the default callback for all crawl functions.
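The option names suggest a token-bucket style rate limiter (as in the popular Bottleneck library), so a config like the one below would cap the crawler at roughly 100 requests per minute. Treat the exact semantics as an assumption; they are not spelled out here:

var opts = {
	reservoir: 100,                      // assumed: max requests per refresh window
	reservoirRefreshInterval: 60 * 1000  // assumed: window length in milliseconds
};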

Installation

hcr is available on npm. To install it, type:

$ npm install hcr

Usage

Create a crawler and fetch pages with getPage:

var Crawler = require('hcr');

var opts = {
	reservoir: 100,
	reservoirRefreshInterval: 60 * 1000
};

var callback = function () {
	// default callback for all crawl functions
};

var crawler = new Crawler(opts, callback);

crawler.getPage(['site1.com', 'site2.com']);
Use toObject to extract data into an object. Each key names an output field; selector is a CSS selector, and func is the method applied to the matched element ('text', or 'attr' with its args):

var object = {
	'Name': {
		selector: '#name',
		func: 'text'
	},
	'Image': {
		selector: '#image',
		func: 'attr',
		args: ['src']
	}
};

crawler.toObject(['site1.com', 'site2.com'], object);
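For intuition: on a page containing <div id="name">Alice</div> and <img id="image" src="avatar.png">, the mapping above should produce something like { Name: 'Alice', Image: 'avatar.png' } for that page. The exact result shape is not documented here, so treat this as an assumption.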
recursiveToObject takes the same arguments but follows links and crawls the site recursively:

var object = {
	'Name': {
		selector: '#name',
		func: 'text'
	},
	'Image': {
		selector: '#image',
		func: 'attr',
		args: ['src']
	}
};

crawler.recursiveToObject(['site1.com', 'site2.com'], object);
Instead of func, a field can specify prop to read a property of the matched element directly:

var object = {
	'Name': {
		selector: '#span',
		prop: 'textContent'
	}
};

crawler.recursiveToObject(['site1.com', 'site2.com'], object);
recursiveRegexToObject additionally takes a regular expression as its second argument:

var regex = /[A-Z]/g;

var object = {
	'Name': {
		selector: '#span',
		prop: 'textContent'
	}
};

crawler.recursiveRegexToObject(['site1.com', 'site2.com'], regex, object);
The crawler emits a completed event when crawling finishes:

var doneCallback = function () {
	// crawling done
};

crawler.on('completed', doneCallback);
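Putting the pieces together, here is a minimal end-to-end sketch using only the calls shown above. The callback's arguments are an assumption (they are not documented here), so check the hcr source for the actual signature:

var Crawler = require('hcr');

var crawler = new Crawler({
	reservoir: 100,
	reservoirRefreshInterval: 60 * 1000
}, function (result) {
	// assumed: invoked once per crawled page with its extracted data
	console.log(result);
});

crawler.on('completed', function () {
	console.log('crawling done');
});

crawler.recursiveToObject(['site1.com'], {
	'Title': {
		selector: 'title',
		func: 'text'
	}
});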
