import logging
import os
import sys
import tempfile
import time
import requests
from .cache import CachingSession, FileCache # noqa
if sys.version_info[0] < 3: # pragma: no cover
from urllib2 import urlopen as urllib_urlopen
from urllib2 import URLError as urllib_URLError
_str_type = unicode
else: # pragma: no cover
from urllib.request import urlopen as urllib_urlopen
from urllib.error import URLError as urllib_URLError
_str_type = str
__version__ = '1.0.0'
_user_agent = ' '.join(('scrapelib', __version__, requests.utils.default_user_agent()))
class NullHandler(logging.Handler):
    def emit(self, record):
        pass


_log = logging.getLogger('scrapelib')
_log.addHandler(NullHandler())
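# the NullHandler keeps the library silent by default; a consuming
# application can opt in to scrapelib's debug output through the standard
# logging API, e.g. (illustrative, not part of this module):
#
#   logging.basicConfig(level=logging.DEBUG)  # surfaces the _log.debug() calls below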
class HTTPMethodUnavailableError(requests.RequestException):
    """
    Raised when the supplied HTTP method is invalid or not supported
    by the HTTP backend.
    """
def __init__(self, message, method):
super(HTTPMethodUnavailableError, self).__init__(message)
self.method = method
class HTTPError(requests.HTTPError):
    """
    Raised when urlopen encounters a 4xx or 5xx error code and the
    raise_errors option is true.
    """
def __init__(self, response, body=None):
message = '%s while retrieving %s' % (response.status_code, response.url)
super(HTTPError, self).__init__(message)
self.response = response
self.body = body or self.response.text
class FTPError(requests.HTTPError):
def __init__(self, url):
message = 'error while retrieving %s' % url
super(FTPError, self).__init__(message)
class ThrottledSession(requests.Session):
def _throttle(self):
now = time.time()
diff = self._request_frequency - (now - self._last_request)
        if diff > 0:
            _log.debug("sleeping for %fs" % diff)
            time.sleep(diff)
            self._last_request = time.time()
        else:
            self._last_request = now
    @property
    def requests_per_minute(self):
        return self._requests_per_minute

    @requests_per_minute.setter
    def requests_per_minute(self, value):
        if value > 0:
            self._throttled = True
            self._requests_per_minute = value
            self._request_frequency = 60.0 / value
            self._last_request = 0
        else:
            self._throttled = False
            self._requests_per_minute = 0
            self._request_frequency = 0.0
            self._last_request = 0
    def request(self, method, url, **kwargs):
        if self._throttled:
            self._throttle()
        return super(ThrottledSession, self).request(method, url, **kwargs)
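# worked example of the throttle math above (illustrative numbers only):
# requests_per_minute=30 gives _request_frequency = 60.0 / 30 = 2.0 seconds;
# if the previous request finished 0.5s ago, _throttle() sleeps for the
# remaining 1.5s, so sustained traffic never exceeds 30 requests per minute.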
# this object exists because Requests assumes it can call
# resp.raw._original_response.msg.getheaders() and we need to cope with that
class DummyObject(object):
def getheaders(self, name):
return ''
def get_all(self, name, default):
return default
_dummy = DummyObject()
_dummy._original_response = DummyObject()
_dummy._original_response.msg = DummyObject()
class FTPAdapter(requests.adapters.BaseAdapter):
def send(self, request, stream=False, timeout=None, verify=False, cert=None, proxies=None):
if request.method != 'GET':
raise HTTPMethodUnavailableError("FTP requests do not support method '%s'" %
request.method, request.method)
        try:
            real_resp = urllib_urlopen(request.url, timeout=timeout)
            # we're going to fake a requests.Response with this
            resp = requests.Response()
            resp.status_code = 200
            resp.url = request.url
            resp.headers = {}
            resp._content = real_resp.read()
            resp.raw = _dummy
            return resp
        except urllib_URLError:
            raise FTPError(request.url)
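# minimal usage sketch for the adapter above (illustrative only; the URL is
# hypothetical). Scraper mounts this adapter automatically, but any
# requests.Session can do the same:
#
#   session = requests.Session()
#   session.mount('ftp://', FTPAdapter())
#   resp = session.get('ftp://ftp.example.com/file.txt')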
class RetrySession(requests.Session):
def __init__(self):
super(RetrySession, self).__init__()
self._retry_attempts = 0
self.retry_wait_seconds = 10
    # retry_attempts is a property so that it can't go negative
    @property
    def retry_attempts(self):
        return self._retry_attempts

    @retry_attempts.setter
    def retry_attempts(self, value):
        self._retry_attempts = max(value, 0)
def accept_response(self, response, **kwargs):
return response.status_code < 400
def request(self, method, url, retry_on_404=False, **kwargs):
# the retry loop
tries = 0
exception_raised = None
while tries <= self.retry_attempts:
exception_raised = None
            try:
                resp = super(RetrySession, self).request(method, url,
                                                         **kwargs)
                # break from loop on an accepted response
                if self.accept_response(resp) or (
                        resp.status_code == 404 and not retry_on_404):
                    break
            except (requests.HTTPError, requests.ConnectionError,
                    requests.Timeout) as e:
                # SSL errors are fatal and never retried
                if isinstance(e, requests.exceptions.SSLError):
                    raise
                exception_raised = e

            # if we're going to retry, sleep first
            tries += 1
            if tries <= self.retry_attempts:
                # twice as long each time
                wait = (self.retry_wait_seconds * (2 ** (tries - 1)))
                _log.debug('sleeping for %s seconds before retry' % wait)
                time.sleep(wait)

        # out of the loop, either an exception was raised or we had a success
        if exception_raised:
            raise exception_raised
        return resp
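# illustrative retry timeline (example numbers, not defaults): with
# retry_attempts=3 and retry_wait_seconds=10, a persistently failing request
# is attempted 4 times with sleeps of 10s, 20s, and 40s in between, the wait
# doubling each retry per the 2 ** (tries - 1) factor above.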
# compose sessions, order matters (cache then throttle then retry)
class Scraper(CachingSession, ThrottledSession, RetrySession):
    """
    Scraper is the most important class provided by scrapelib (and generally
    the only one to be instantiated directly). It provides a large number
    of options allowing for customization.

    Usage is generally just creating an instance with the desired options and
    then using the :meth:`urlopen` & :meth:`urlretrieve` methods of that
    instance.

    :param raise_errors: set to True to raise a :class:`HTTPError`
        on 4xx or 5xx response
    :param requests_per_minute: maximum requests per minute (0 for
        unlimited, defaults to 60)
    :param retry_attempts: number of times to retry if timeout occurs or
        page returns a (non-404) error
    :param retry_wait_seconds: number of seconds to retry after first failure,
        subsequent retries will double this wait
    """
def __init__(self, raise_errors=True, requests_per_minute=60, retry_attempts=0,
retry_wait_seconds=5, header_func=None):
super(Scraper, self).__init__()
self.mount('ftp://', FTPAdapter())
# added by this class
self.raise_errors = raise_errors
# added by ThrottledSession
self.requests_per_minute = requests_per_minute
# added by RetrySession
self.retry_attempts = retry_attempts
self.retry_wait_seconds = retry_wait_seconds
# added by this class
self._header_func = header_func
# added by CachingSession
self.cache_storage = None
self.cache_write_only = True
# non-parameter options
self.timeout = None
self.user_agent = _user_agent
        # statistics structure
        self.reset_stats()

    def reset_stats(self):
self.stats = {}
self.stats['total_requests'] = 0
self.stats['total_time'] = 0
self.stats['average_time'] = None
    @property
    def user_agent(self):
        return self.headers['User-Agent']

    @user_agent.setter
    def user_agent(self, value):
        self.headers['User-Agent'] = value
    @property
    def disable_compression(self):
        return self.headers['Accept-Encoding'] == 'text/*'

    @disable_compression.setter
    def disable_compression(self, value):
# disabled: set encoding to text/*
if value:
self.headers['Accept-Encoding'] = 'text/*'
        # enabled: if currently text/*, restore the default encodings,
        # otherwise leave unmodified
elif self.headers.get('Accept-Encoding') == 'text/*':
self.headers['Accept-Encoding'] = 'gzip, deflate, compress'
    def request(self, method, url, **kwargs):
        _log.info("{0} - {1}".format(method.upper(), url))
# apply global timeout
timeout = kwargs.pop('timeout', self.timeout)
        if self._header_func:
            headers = requests.structures.CaseInsensitiveDict(
                self._header_func(url))
        else:
            headers = {}
        kwarg_headers = kwargs.pop('headers', {})
        headers = requests.sessions.merge_setting(
            headers, self.headers,
            dict_class=requests.structures.CaseInsensitiveDict)
        headers = requests.sessions.merge_setting(
            kwarg_headers, headers,
            dict_class=requests.structures.CaseInsensitiveDict)
_start_time = time.time()
        resp = super(Scraper, self).request(method, url, timeout=timeout,
                                            headers=headers, **kwargs)
self.stats['total_requests'] += 1
self.stats['total_time'] += (time.time() - _start_time)
self.stats['average_time'] = self.stats['total_time'] / self.stats['total_requests']
if self.raise_errors and not self.accept_response(resp):
raise HTTPError(resp)
return resp
def urlretrieve(self, url, filename=None, method='GET', body=None, dir=None, **kwargs):
        """
        Save result of a request to a file, similarly to
        :func:`urllib.urlretrieve`.

        If an error is encountered, this may raise any of the scrapelib
        exceptions.

        A filename may be provided or :meth:`urlretrieve` will safely create a
        temporary file. If a directory is provided, a file will be given a
        random name within the specified directory. Either way, it is the
        responsibility of the caller to ensure that the temporary file is
        deleted when it is no longer needed.

        :param url: URL for request
        :param filename: optional name for file
        :param method: any valid HTTP method, but generally GET or POST
        :param body: optional body for request, to turn parameters into
            an appropriate string use :func:`urllib.urlencode()`
        :param dir: optional directory to place file in
        :returns filename, response: tuple with filename for saved
            response (will be same as given filename if one was given,
            otherwise will be a temp file in the OS temp directory) and
            a :class:`Response` object that can be used to inspect the
            response headers.
        """
result = self.request(method, url, data=body, **kwargs)
result.code = result.status_code # backwards compat
        if not filename:
            fd, filename = tempfile.mkstemp(dir=dir)
            f = os.fdopen(fd, 'wb')
        else:
            f = open(filename, 'wb')

        f.write(result.content)
        f.close()

        return filename, result
_default_scraper = Scraper(requests_per_minute=0)
def urlopen(url, method='GET', body=None, **kwargs): # pragma: no cover
return _default_scraper.urlopen(url, method, body, **kwargs)
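# end-to-end usage sketch (illustrative only; the URLs are hypothetical):
#
#   s = Scraper(requests_per_minute=10, retry_attempts=2)
#   resp = s.get('http://example.com')  # throttled and retried; raises
#                                       # HTTPError on 4xx/5xx responses
#   filename, resp = s.urlretrieve('http://example.com/data.csv',
#                                  filename='data.csv')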