crawler.py
#!/usr/bin/python
# Gearman crawler worker: fetches pages, stores raw HTML in RethinkDB,
# deduplicates URLs with bloomd and pushes newly seen URLs onto a Redis queue.
import sys
import re
import urlparse

import yaml
import redis
import requests
import urlnorm
import gearman
import rethinkdb as r
from pybloomd import BloomdClient

# Load configuration.
with open("parameters.yml", "r") as parameter_file:
    parameters = yaml.safe_load(parameter_file)

# File extensions we never want to crawl.
except_url_suffixes = ["js", "css", "json", "png", "jpg", "gif", "woff2", "xml", "rss"]

# RethinkDB connection for storing raw crawl results.
rethink = r.connect(parameters['rethinkdb_server']['host'],
                    parameters['rethinkdb_server']['port']).repl()
rethink.use(parameters['rethinkdb_server']['database'])
raw_result_table = parameters['rethinkdb_server']['tables']['raw_result']

# Redis queue for discovered URLs, Gearman worker for incoming crawl jobs,
# and a bloomd filter acting as the URL frontier (seen set).
redis_client = redis.Redis(parameters['redis_server']['host'])
gm_worker = gearman.GearmanWorker(parameters['gearman_server']['hosts'])
bloom_client = BloomdClient(parameters['bloomd_servers'])
url_frontier = bloom_client.create_filter('url_frontier')

# Extract href targets from anchor tags.
linkregex = re.compile(r'<a\s+[^>]*?href=[\'"](.*?)[\'"]', re.IGNORECASE)
crawler_headers = parameters['crawler_headers']
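
# The script expects a parameters.yml file next to it. A minimal sketch,
# inferred only from the keys read above; the hosts, ports and header values
# below are placeholders, not part of this repository:
#
#   rethinkdb_server:
#     host: localhost
#     port: 28015
#     database: crawler
#     tables:
#       raw_result: raw_result
#   redis_server:
#     host: localhost
#   gearman_server:
#     hosts: ["localhost:4730"]
#   bloomd_servers: ["localhost:8673"]
#   crawler_headers:
#     User-Agent: "my-crawler/0.1"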

def url_pre_norm(link, base):
    """Turn a relative or fragment link into an absolute URL, using the
    parsed URL (urlparse result) of the page it was found on."""
    if link.startswith('/'):
        # Root-relative link: keep only the host.
        link = 'http://' + base[1] + link
    elif link.startswith('#'):
        # Fragment link: append to the current page path.
        link = 'http://' + base[1] + base[2] + link
    elif not link.startswith('http'):
        # Relative link without a leading slash.
        link = 'http://' + base[1] + '/' + link
    return link

def task_listener_crawler(gearman_worker, gearman_job):
    """Gearman task: fetch one URL, store the result and enqueue new links."""
    url = gearman_job.data
    url_frontier.add(url)
    base_parts = urlparse.urlparse(url)
    print "Crawling", url
    response = requests.get(url, headers=crawler_headers)
    print "Downloaded page"
    if response.status_code == 200:
        # requests already decodes the body to unicode via response.encoding.
        raw_data = response.text
        r.table(raw_result_table).insert(
            {'url': url, 'raw': raw_data, 'status': 200},
            conflict="replace").run(rethink)
        # Extract, normalize and deduplicate outgoing links.
        links = linkregex.findall(raw_data)
        for link in links:
            pre_norm_url = url_pre_norm(link, base_parts)
            norm_url = urlnorm.norm(pre_norm_url)
            norm_parts = urlparse.urlparse(norm_url)
            ext_url = norm_parts.path.split(".")[-1].lower()
            # bloomd's add() returns True only if the URL was not seen before.
            if ext_url not in except_url_suffixes and url_frontier.add(norm_url):
                print "Add", norm_url, "to redis queue"
                redis_client.rpush("urls:enqueued", norm_url)
        print "Done"
        return "ok"
    else:
        r.table(raw_result_table).insert(
            {'url': url, 'status': response.status_code},
            conflict="replace").run(rethink)
        return "fail"

def main(argv):
    print "I'm a crawler worker"
    # Register the 'crawler' task and serve Gearman jobs forever.
    gm_worker.register_task('crawler', task_listener_crawler)
    gm_worker.work()

if __name__ == "__main__":
    main(sys.argv)
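
# A minimal sketch of how a seed URL could be handed to this worker, assuming
# the stock python-gearman client and the same gearman_server host list; the
# seed URL below is just a placeholder:
#
#   import gearman
#   client = gearman.GearmanClient(parameters['gearman_server']['hosts'])
#   client.submit_job('crawler', 'http://example.com/', background=True)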