## Spiders详解
　　参考：[Scrapy 官方文档：Spiders](https://docs.scrapy.org/en/latest/topics/spiders.html)<br>
　　Scrapy提供了Spider、CrawlSpider (XMLFeedSpider、CSVFeedSpider、SitemapSpider此处未做注解)。<br>
　　Scrapy_redis提供了RedisSpider、RedisCrawlSpider爬虫采集类。<br>
### 一、Spider源码详解

In [1]:

class object_ref(object):
    """Base class that registers each new instance in ``live_refs``.

    Inherit from this class (instead of ``object``) to keep a record of
    live instances.
    """

    __slots__ = ()

    def __new__(cls, *args, **kwargs):
        # Constructor arguments are accepted here but not forwarded to
        # object.__new__.
        instance = object.__new__(cls)
        # Record the current time() value for this instance under its class.
        created_at = time()
        live_refs[cls][instance] = created_at
        return instance

    
class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from this
    class.
    """

    # Name of the spider. A truthy name must come from either this class
    # attribute or the ``name`` constructor argument.
    name = None
    # Optional dict of per-spider settings. update_settings() reads it from
    # the class and applies it with priority 'spider', so it has to be
    # declared as a class attribute.
    custom_settings = None

    def __init__(self, name=None, **kwargs):
        """Set the spider name and copy extra keyword arguments onto the
        instance.

        Raises ValueError when no name is passed and the class does not
        define a truthy ``name``.
        """
        if name is None:
            if not getattr(self, 'name', None):
                raise ValueError("%s must have a name" % type(self).__name__)
        else:
            self.name = name
        # Every extra keyword argument becomes an instance attribute.
        self.__dict__.update(kwargs)
        if not hasattr(self, 'start_urls'):
            # Default to an empty list of start URLs.
            self.start_urls = []

    @property
    def logger(self):
        """Return a LoggerAdapter named after the spider that carries the
        spider itself in its ``extra`` mapping."""
        return logging.LoggerAdapter(
            logging.getLogger(self.name), {'spider': self})

    def log(self, message, level=logging.DEBUG, **kw):
        """Log the given message at the given log level

        This helper wraps a log call to the logger within the spider, but you
        can use it directly (e.g. Spider.logger.info('msg')) or use any other
        Python logger too.
        """
        self.logger.log(level, message, **kw)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Instantiate the spider with ``args``/``kwargs`` and attach it to
        ``crawler``."""
        instance = cls(*args, **kwargs)
        instance._set_crawler(crawler)
        return instance

    def _set_crawler(self, crawler):
        """Store the crawler and its settings, and hook ``close`` to the
        ``spider_closed`` signal."""
        self.crawler = crawler
        self.settings = crawler.settings
        crawler.signals.connect(self.close, signals.spider_closed)

    def start_requests(self):
        """Yield one initial request per entry in ``start_urls``.

        When a subclass overrides the deprecated ``make_requests_from_url``
        a warning is issued and that method builds the requests; otherwise
        each URL becomes ``Request(url, dont_filter=True)``.
        """
        cls = self.__class__
        if not method_is_overridden(cls, Spider, 'make_requests_from_url'):
            for url in self.start_urls:
                # No callback is passed here.
                # NOTE(review): Scrapy presumably falls back to self.parse
                # for requests without a callback -- confirm.
                yield Request(url, dont_filter=True)
            return

        message = (
            "Spider.make_requests_from_url method is deprecated; it "
            "won't be called in future Scrapy releases. Please "
            "override Spider.start_requests method instead (see %s.%s)." % (
                cls.__module__, cls.__name__
            )
        )
        warnings.warn(message)
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def make_requests_from_url(self, url):
        """ This method is deprecated. """
        return Request(url, dont_filter=True)

    def parse(self, response):
        """Placeholder callback; this base implementation always raises
        NotImplementedError, so subclasses have to provide their own."""
        owner = self.__class__.__name__
        raise NotImplementedError('{}.parse callback is not defined'.format(owner))

    @classmethod
    def update_settings(cls, settings):
        """Merge ``custom_settings`` (or an empty dict) into ``settings``
        with priority 'spider'."""
        overrides = cls.custom_settings or {}
        settings.setdict(overrides, priority='spider')

    @classmethod
    def handles_request(cls, request):
        """Return the result of ``url_is_from_spider`` for the request URL.

        NOTE(review): presumably this compares the URL against the spider's
        allowed domains -- confirm in url_is_from_spider.
        """
        return url_is_from_spider(request.url, cls)

    @staticmethod
    def close(spider, reason):
        """Call ``spider.closed(reason)`` when the spider defines a callable
        ``closed`` attribute; otherwise do nothing.

        _set_crawler connects this handler to the ``spider_closed`` signal,
        so defining ``closed`` on a spider is the way to run code at
        shutdown.
        """
        hook = getattr(spider, 'closed', None)
        if callable(hook):
            return hook(reason)

    def __str__(self):
        return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))

    __repr__ = __str__

NameError: name 'object_ref' is not defined

### 二、CrawlSpider源码详解

In [None]:
class CrawlSpider(Spider):
    """Generic spider that extracts and follows links according to ``rules``."""

    # Rule objects describing which links to extract and how to handle them.
    rules = ()

    def __init__(self, *a, **kw):
        super(CrawlSpider, self).__init__(*a, **kw)
        # Build the per-instance copies of the rules.
        self._compile_rules()

    def parse(self, response):
        """Handle a response with ``parse_start_url`` and follow its links."""
        return self._parse_response(
            response, self.parse_start_url, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        """Hook used as the callback in ``parse``; returns an empty list
        here and can be overridden to produce results."""
        return []

    def process_results(self, response, results):
        """Hook applied to callback output; returns ``results`` unchanged."""
        return results

    def _build_request(self, rule, link):
        """Build the follow-up request for an extracted link.

        ``rule`` is stored in ``meta['rule']``; _requests_to_follow passes
        the rule's index in ``self._rules`` and _response_downloaded uses it
        to look the rule up again.
        """
        request = Request(url=link.url, callback=self._response_downloaded)
        request.meta.update(rule=rule, link_text=link.text)
        return request

    def _requests_to_follow(self, response):
        """Yield a processed request for every new link the rules extract
        from ``response``; yields nothing for non-HTML responses."""
        if isinstance(response, HtmlResponse):
            # Links already scheduled from this response by an earlier rule.
            visited = set()
            for index, rule in enumerate(self._rules):
                extracted = rule.link_extractor.extract_links(response)
                fresh = [link for link in extracted if link not in visited]
                if fresh and rule.process_links:
                    fresh = rule.process_links(fresh)
                for link in fresh:
                    visited.add(link)
                    # Rule must provide _process_request(request, response).
                    yield rule._process_request(
                        self._build_request(index, link), response)

    def _response_downloaded(self, response):
        """Callback for followed links: dispatch to the originating rule."""
        origin = self._rules[response.meta['rule']]
        return self._parse_response(
            response, origin.callback, origin.cb_kwargs, origin.follow)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        """Yield the callback's processed output, then the follow-up
        requests when both ``follow`` and ``self._follow_links`` are truthy."""
        if callback:
            processed = self.process_results(
                response, callback(response, **cb_kwargs) or ())
            for produced in iterate_spider_output(processed):
                yield produced

        if follow and self._follow_links:
            for produced in self._requests_to_follow(response):
                yield produced

    def _compile_rules(self):
        """Shallow-copy the class-level rules and compile each copy against
        this spider."""
        compiled = []
        for template in self.rules:
            compiled.append(copy.copy(template))
        self._rules = compiled
        for rule in compiled:
            rule._compile(self)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and read CRAWLSPIDER_FOLLOW_LINKS (default
        True) into ``_follow_links``."""
        instance = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
        instance._follow_links = follow_links
        return instance

    
class Rule(object):
    """Describes how links are extracted from a response and how the
    resulting requests are handled.

    Parameters
    ----------
    link_extractor : object, optional
        Link extractor to use; falls back to ``_default_link_extractor``.
    callback : optional
        Callback for responses of followed links.
    cb_kwargs : dict, optional
        Keyword arguments passed to ``callback``; defaults to ``{}``.
    follow : bool, optional
        Whether to keep following links from matched responses. When left
        as ``None`` it defaults to ``not callback``.
    process_links : optional
        Hook applied to the list of extracted links.
    process_request : optional
        Hook applied to every request built from a link; falls back to
        ``_identity``.
    """

    def __init__(self, link_extractor=None, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=None):
        self.link_extractor = link_extractor or _default_link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        self.process_request = process_request or _identity
        # Filled in by _compile(); number of arguments process_request takes.
        self.process_request_argcount = None
        self.follow = follow if follow is not None else not callback

    def _compile(self, spider):
        """Resolve callback and hooks through ``_get_method`` against
        ``spider`` and record how many arguments ``process_request`` accepts.

        Emits a ScrapyDeprecationWarning when ``process_request`` accepts a
        single argument.
        """
        self.callback = _get_method(self.callback, spider)
        self.process_links = _get_method(self.process_links, spider)
        self.process_request = _get_method(self.process_request, spider)
        self.process_request_argcount = len(get_func_args(self.process_request))
        if self.process_request_argcount == 1:
            msg = 'Rule.process_request should accept two arguments (request, response), accepting only one is deprecated'
            warnings.warn(msg, category=ScrapyDeprecationWarning, stacklevel=2)

    def _process_request(self, request, response):
        """Call ``process_request`` with the arguments it supports.

        Kept for backward compatibility: hooks recorded as taking a single
        argument receive only ``request``; all others receive
        ``(request, response)``.
        """
        if self.process_request_argcount == 1:
            args = [request]
        else:
            args = [request, response]
        return self.process_request(*args)
    
    

### 三、RedisSpider源码

In [None]:
class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieved using the LPOP command (list queue).
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider through the parent ``from_crawler`` and then
        set up its redis connection."""
        obj = super(RedisSpider, cls).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj



### 四、RedisCrawlSpider源码

In [None]:
class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from.
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
        Default Redis key where to fetch start URLs from.
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieved using the LPOP command.
    REDIS_ENCODING : str (default: "utf-8")
        Default encoding to use when decoding messages from redis queue.

    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider through the parent ``from_crawler`` and then
        set up its redis connection."""
        obj = super(RedisCrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


### 五、RedisMixin 源码

In [2]:
class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""
    # Redis key the start URLs are read from; resolved in setup_redis().
    redis_key = None
    # Maximum number of requests produced per next_requests() call.
    redis_batch_size = None
    # Encoding used to decode messages read from redis.
    redis_encoding = None

    # Redis client placeholder.
    server = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        Does nothing when ``self.server`` is already set.

        Raises
        ------
        ValueError
            If no crawler is available, the resolved redis key is blank, or
            the batch size cannot be converted to an integer.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings
        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )

        # The key is a %-template; fill in the spider name.
        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

        # Pass the values explicitly instead of self.__dict__: an attribute
        # defined only at class level (e.g. redis_encoding on a subclass) is
        # not in the instance dict and would break message formatting.
        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                         {
                             'redis_key': self.redis_key,
                             'redis_batch_size': self.redis_batch_size,
                             'redis_encoding': self.redis_encoding,
                         })

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        """Returns a request to be scheduled or none."""
        # The queue is read with SPOP when configured as a set, else LPOP.
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        # TODO: Use redis pipeline execution.
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        """Returns a Request instance from data coming from Redis.

        By default, ``data`` is an encoded URL. You can override this method to
        provide your own message decoding.

        Parameters
        ----------
        data : bytes
            Message from redis.

        """
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)

    def schedule_next_requests(self):
        """Schedules a request if available"""
        # TODO: While there is capacity, schedule a batch of redis requests.
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        # Always raised, so the idle spider is never closed from here.
        raise DontCloseSpider
