base.py
from eventlet import GreenPool
from crawley.multiprogramming.pool import ThreadPool
from re import compile as re_compile
from urllib2 import urlparse
from crawley import config
from crawley.http.managers import RequestManager
from crawley.extractors import XPathExtractor
from crawley.exceptions import AuthenticationError
from crawley.utils import url_matcher
user_crawlers = []


class CrawlerMeta(type):
    """
    This metaclass adds the user's crawlers to a list
    used by the CLI commands.
    Abstract base crawlers won't be added.
    """

    def __init__(cls, name, bases, dct):

        if not hasattr(cls, '__module__') or not cls.__module__.startswith(config.CRAWLEY_ROOT_DIR):
            user_crawlers.append(cls)

        super(CrawlerMeta, cls).__init__(name, bases, dct)


Pools = {
    'greenlets': {'pool': GreenPool, 'max_concurrency': config.MAX_GREEN_POOL_SIZE},
    'threads': {'pool': ThreadPool, 'max_concurrency': config.MAX_THREAD_POOL_SIZE},
}
class BaseCrawler(object):
    """
    User's crawlers must inherit from this class. They may
    override some methods and must define the start_urls list,
    the scrapers and the maximum crawling depth.
    """

    __metaclass__ = CrawlerMeta

    start_urls = []
    """ A list containing the start urls for the crawler """

    allowed_urls = []
    """ A list of urls allowed to be crawled """

    black_list = []
    """ A list of blocked urls which will never be crawled """

    scrapers = []
    """ A list of scraper classes """

    max_depth = -1
    """ The maximum crawling recursion level """

    max_concurrency_level = None
    """ The maximum coroutines concurrency level """

    requests_delay = config.REQUEST_DELAY
    """ The average delay time between requests """

    requests_deviation = config.REQUEST_DEVIATION
    """ The deviation applied to the requests delay """

    extractor = None
    """ The extractor class. Default is XPathExtractor """

    post_urls = []
    """
    The POST data for the urls. A list of tuples containing (url, data_dict).
    Example: ("http://www.mypage.com/post_url", {'page' : '1', 'color' : 'blue'})
    """

    login = None
    """
    The login data. A tuple of (url, login_dict).
    Example: ("http://www.mypage.com/login", {'user' : 'myuser', 'pass' : 'mypassword'})
    """

    search_all_urls = True
    """
    If the scrapers don't return any urls then the crawler will search for urls
    in the current page itself, depending on this [search_all_urls] attribute.
    """

    _url_regex = re_compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
    def __init__(self, sessions=None, settings=None):
        """
        Initializes the crawler.

        params:
            sessions: Database or Documents persistent sessions
            settings: the project settings; its SHOW_DEBUG_INFO attribute
                indicates if the crawler logs debug info to stdout
        """

        if sessions is None:
            sessions = []

        self.sessions = sessions
        self.debug = getattr(settings, 'SHOW_DEBUG_INFO', True)
        self.settings = settings

        if self.extractor is None:
            self.extractor = XPathExtractor

        self.extractor = self.extractor()

        pool_type = getattr(settings, 'POOL', 'greenlets')
        pool = Pools[pool_type]

        if self.max_concurrency_level is None:
            self.max_concurrency_level = pool['max_concurrency']

        self.pool = pool['pool'](self.max_concurrency_level)
        self.request_manager = RequestManager(settings=settings, delay=self.requests_delay, deviation=self.requests_deviation)

        self._initialize_scrapers()

    def _initialize_scrapers(self):
        """
        Instantiates all the scraper classes
        """

        self.scrapers = [scraper_class(settings=self.settings) for scraper_class in self.scrapers]
    def _make_request(self, url, data=None):
        """
        Returns the response object from a request.

        params:
            data: if this param is present it makes a POST request.
        """

        return self.request_manager.make_request(url, data, self.extractor)

    def _get_response(self, url, data=None):
        """
        Returns the response data from a request.

        params:
            data: if this param is present it makes a POST request.
        """

        for pattern, post_data in self.post_urls:
            if url_matcher(url, pattern):
                data = post_data

        return self._make_request(url, data)

    def _manage_scrapers(self, response):
        """
        Checks if some scraper is suited for data extraction on the current url.
        If so, it delegates the scraping task to the scraper object and commits
        the sessions.
        """

        scraped_urls = []

        for scraper in self.scrapers:
            urls = scraper.try_scrape(response)

            if urls is not None:
                self._commit()
                scraped_urls.extend(urls)

        return scraped_urls

    def _commit(self):
        """
        Makes a commit in all sessions
        """

        for session in self.sessions:
            session.commit()

    def _search_in_urls_list(self, urls_list, url, default=True):
        """
        Searches for a url in a list of url patterns
        """

        if not urls_list:
            return default

        for pattern in urls_list:
            if url_matcher(url, pattern):
                return True

        return False

    def _validate_url(self, url):
        """
        Validates that the url matches the crawler's [allowed_urls] list and is not in its [black_list].
        """

        return self._search_in_urls_list(self.allowed_urls, url) and not self._search_in_urls_list(self.black_list, url, default=False)
    def _fetch(self, url, depth_level=0):
        """
        Recursive url fetching.

        params:
            url: the url to crawl
            depth_level: the current recursion level, bounded by [max_depth]
        """

        if not self._validate_url(url):
            return

        if self.debug:
            print "-" * 80
            print "crawling -> %s" % url

        try:
            response = self._get_response(url)
        except Exception, ex:
            self.on_request_error(url, ex)
            return

        if self.debug:
            print "-" * 80

        urls = self._manage_scrapers(response)

        if not urls:
            if self.search_all_urls:
                urls = self.get_urls(response)
            else:
                return

        for new_url in urls:

            if depth_level >= self.max_depth and self.max_depth != -1:
                return

            self.pool.spawn_n(self._fetch, new_url, depth_level + 1)

    def _login(self):
        """
        If the target pages are hidden behind a login then
        pass through it first.

        self.login can be None or a tuple containing
        (login_url, params_dict)
        """

        if self.login is None:
            return

        url, data = self.login

        if self._get_response(url, data) is None:
            raise AuthenticationError("Can't login")

    def start(self):
        """
        Crawler's run method
        """

        self.on_start()
        self._login()

        for url in self.start_urls:
            self.pool.spawn_n(self._fetch, url, depth_level=0)

        self.pool.waitall()
        self.on_finish()
    # Overridables

    def get_urls(self, response):
        """
        Returns a list of urls found in the current html page
        """

        urls = []

        for url_match in self._url_regex.finditer(response.raw_html):
            urls.append(url_match.group(0))

        tree = XPathExtractor().get_object(response.raw_html)

        for link_tag in tree.xpath("//a"):

            if not 'href' in link_tag.attrib:
                continue

            url = link_tag.attrib["href"]

            if not self._url_regex.match(url):
                parsed_url = urlparse.urlparse(response.url)
                new_url = "%s://%s%s" % (parsed_url.scheme, parsed_url.netloc, url)
                urls.append(new_url)

        return urls

    # Events section

    def on_start(self):
        """
        Override this method to do some work when the crawler starts.
        """
        pass

    def on_finish(self):
        """
        Override this method to do some work when the crawler finishes.
        """
        pass

    def on_request_error(self, url, ex):
        """
        Override this method to customize the request error handler.
        """

        if self.debug:
            print "Request to %s returned error: %s" % (url, ex)