#### 1. 简介
- 爬虫框架
- 异步下载

#### 2. 流程
- 1. 创建一个scrapy项目
    - `scrapy startproject mySpider`
- 2. 生成一个爬虫
    - `scrapy genspider itcast 'itcast.cn'`
- 3. 提取数据
    - `完善spider`
- 4. 保存数据
    - pipeline中保存数据

#### 3. 启动
- scrapy crawl SPIDER_NAME

#### 4. spider demo

In [None]:
class ItcastSpider(scrapy.Spider):
    """Demo spider: fetch one page and pass the first matched heading on as an item."""

    name = 'itcast'  # spider name, used when starting the crawl
    allowed_domains = ['itcast.cn']  # domains the spider is allowed to crawl
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']  # entry URL

    def parse(self, response):
        """Extract data from the response handed over by the downloader middleware.

        Collects the text of every ``h3`` under ``div.tea_con``, prints the
        whole list, and yields the first entry wrapped in a dict.
        """
        headings = response.xpath("//div[@class='tea_con']//h3/text()").extract()
        print("---------->:", headings)

        # A callback may only yield Request objects, items (dict / Item) or
        # None; the original yielded a bare str, which Scrapy rejects, and
        # indexed [0] unconditionally, which raises IndexError on no match.
        if headings:
            yield {'name': headings[0]}  # handed from the spider to the pipeline
        
# 在选择器中提取字符串
# 1. extract() 返回一个含有字符串数据的列表
# 2. extract_first() 返回列表中的第一个字符串

# 注意：
# 1. spider 中的parse方法名不能修改
# 2. 需要爬取的url地址必须属于 allowed_domains 范围内的链接
# 3. response.xpath() 返回的是一个含有selector对象的列表

#### 5. pipeline demo

In [None]:
# demo
class MyspiderPipeline(object):
    """Demo item pipeline showing the open / close / process hooks.

    The original used ``//`` for the trailing comments on the ``def`` lines,
    which is a syntax error in Python; they are now docstrings.
    """

    def open_spider(self, spider):
        """Run when the spider is opened; executed only once."""

    def close_spider(self, spider):
        """Run when the spider is closed; executed only once."""

    def process_item(self, item, spider):
        """Set ``item['hello']`` and return the item.

        Args:
            item: mapping-like item yielded by the spider; mutated in place.
            spider: the spider that produced the item (unused here).

        Returns:
            The same ``item`` object.
        """
        item['hello'] = 'word'
        return item
    
# 开启 pipeline， 在setting中设置开启
# Maps each pipeline's dotted import path to its order value.
# NOTE(review): the project above was created as `mySpider`; confirm the
# package name in this path matches the real module name (case-sensitive).
ITEM_PIPELINES = {"myspider.pipelines.MyspiderPipeline": 300}

# 注意：
# 1. pipeline 权重越小，优先级越高
# 2. process_item 方法名不能修改

#### 6. logging 模块
- 配置： 在 setting 中 :
    - 设置日志级别：LOG_LEVEL = 'INFO'
    - 指定日志文件：LOG_FILE = './log.log', 设置后终端不会显示

##### 6.1 设置日志输出样式（普通项目）

In [None]:
# Configure the root logger once: write DEBUG-and-above records to
# myapp.log, truncating the file on each run (filemode 'w').
logging.basicConfig(
    # where the records go
    filename='myapp.log',
    filemode='w',
    # which records are kept
    level=logging.DEBUG,
    # how each record is rendered
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
)

- logging.basicConfig函数各参数:
    - filename: 指定日志文件名
    - filemode: 和open函数的mode参数意义相同，指定日志文件的打开模式，'w'或'a'
    - format: 指定输出的格式和内容，format可以输出很多有用信息，如上例所示:
         - %(levelno)s: 打印日志级别的数值
         - %(levelname)s: 打印日志级别名称
         - %(pathname)s: 打印当前执行程序的路径，其实就是sys.argv[0]
         - %(filename)s: 打印当前执行程序名
         - %(funcName)s: 打印日志的当前函数
         - %(lineno)d: 打印日志的当前行号
         - %(asctime)s: 打印日志的时间
         - %(thread)d: 打印线程ID
         - %(threadName)s: 打印线程名称
         - %(process)d: 打印进程ID
         - %(message)s: 打印日志信息
    - datefmt: 指定时间格式，同time.strftime()
    - level: 设置日志级别，默认为logging.WARNING
    - stream: 指定日志的输出流，可以指定输出到sys.stderr、sys.stdout或者文件，默认输出到sys.stderr；注意：Python 3 中 stream 和 filename 不能同时指定，否则会抛出 ValueError（Python 2 中同时指定时 stream 被忽略）

#### 7. 翻页

In [None]:
# 翻页demo

def parse(self, response):
    """Yield one dict per table row, then follow the "next page" link.

    Args:
        response: page response exposing ``xpath()``.

    Yields:
        A dict with ``title``, ``position`` and ``publish_date`` for every
        row, followed by a ``scrapy.Request`` for the next page when one
        exists.
    """
    # The first and last <tr> are dropped (presumably the header and the
    # pagination row -- confirm against the page markup).
    rows = response.xpath("//table[@class='tablelist']//tr")[1:-1]
    for tr in rows:
        yield {
            'title': tr.xpath("./td[1]/a/text()").extract_first(),
            'position': tr.xpath("./td[2]/text()").extract_first(),
            'publish_date': tr.xpath("./td[5]/text()").extract_first(),
        }

    # Locate the URL of the next page.
    next_url = response.xpath("//a[@id='next']/@href").extract_first()
    # extract_first() returns None when the link is absent; the original
    # concatenated unconditionally and raised TypeError in that case.
    # 'javascript:;' is treated as "no further page".
    if next_url and next_url != 'javascript:;':
        yield scrapy.Request(
            'https://hr.tencent.com/' + next_url,
            callback=self.parse,
        )

# scrapy.Request常用参数为： 
#   callback： 指定传入的url交给哪个解析函数去处理
#   meta： 实现在不同的解析函数中传递数据， meta默认会携带部分信息， 比如：下载延迟、请求深度等
#   dont_filter: 让scrapy的去重不会过滤当前url，scrapy默认有url去重功能，对需要重复请求的url有重要作用


#### 8. scrapy.item

In [None]:
# 定义Item
class TencentItem(scrapy.Item):
    """Item declaring the three fields filled in by the spider."""

    title = scrapy.Field()
    position = scrapy.Field()
    publish_date = scrapy.Field()

# 处理 Item
class TencentPipeline(object):
    """Pipeline that stores ``TencentItem`` instances and passes every item on."""

    def process_item(self, item, spider):
        """Print and insert the item when it is a ``TencentItem``.

        Items of any other type are returned untouched so later pipelines
        can handle them.
        """
        # Guard clause: only TencentItem instances are stored here.
        if not isinstance(item, TencentItem):
            return item

        print(item)
        # NOTE(review): `collection` is defined outside this block --
        # presumably a database collection handle; confirm.
        collection.insert(dict(item))
        return item

In [None]:
# 带详情页-处理
class YgSpider(scrapy.Spider):
    """List-plus-detail demo: build an item on the list page, finish it on the detail page."""

    name = 'yg'
    allowed_domains = ['sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4']

    def parse(self, response):
        """Yield one detail-page request per list row, then the next list page.

        The partially filled item travels to ``parse_detail`` through the
        request's ``meta`` dict.
        """
        # Group by table row.
        tr_list = response.xpath("//div[@id='morelist']//table[2]//table//tr")
        for tr in tr_list:
            href = tr.xpath("./td[2]/a[@class='news14']/@href").extract_first()
            if not href:
                # Rows without a news14 link have no detail page; passing
                # None to scrapy.Request raises, so skip them.
                continue

            item = SunItem()
            item['title'] = tr.xpath("./td[2]/a[@class='news14']/@title").extract_first()
            item['href'] = href
            item['publish_date'] = tr.xpath("./td[last()]/text()").extract_first()
            yield scrapy.Request(
                # urljoin leaves absolute URLs unchanged and resolves
                # relative ones against the current page.
                response.urljoin(href),
                callback=self.parse_detail,
                meta={'item': item},
            )

        # Next page.
        next_url = response.xpath("//div[@class='pagination']/a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Add the text content and image URLs to the item and yield it."""
        item = response.meta['item']

        item['content'] = response.xpath("//div[@class='c1 text14_2']/div[@class='contentext']/text()").extract()
        image_paths = response.xpath("//div[@class='c1 text14_2']//img/@src").extract()
        # Image paths are prefixed with the site root, as in the original.
        item['content_img'] = ['http://wz.sun0769.com' + path for path in image_paths]
        yield item


#### 9. scrapy.shell
- 使用方法：
    - scrapy shell www.baidu.com
- response.url : 当前响应的url地址
- response.request.url : 当前响应对应的请求url地址
- response.headers : 响应头
- response.body : 响应体，默认byte类型
- response.request.headers : 当前响应的请求头

#### 10. scrapy setting 文件
- 使用：
    - 在 spider 中： self.settings.get('key')
    - 在 pipeline 中： spider.settings.get('key')

#### 11. scrapy crawlspider 文件
- 创建爬虫： scrapy genspider -t crawl cf circ.gov.cn

##### 11.1 demo

In [None]:
class CfSpider(CrawlSpider):
    """CrawlSpider demo: follow list pages via rules and parse each detail page."""

    name = 'cf'
    allowed_domains = ['circ.gov.cn']
    start_urls = ['http://bxjg.circ.gov.cn/web/site0/tab5240/module14430/page1.htm']

    rules = (
        # LinkExtractor: link extractor, pulls matching URLs out of a response.
        # callback: the response for each extracted URL is handed to this method.
        # follow: whether the response for that URL is run through `rules`
        #         again to extract further URLs.
        Rule(LinkExtractor(allow=r'/web/site0/tab5240/info\d+\.htm'), callback='parse_item'),
        Rule(LinkExtractor(allow=r'/web/site0/tab5240/module14430/page\d+\.htm'), follow=True),
    )

    def parse_item(self, response):
        """Extract the title and publish date from a detail page and yield them.

        Raises:
            IndexError: if either pattern is not found in the page (same as
                the original behaviour).
        """
        # Decode once instead of once per field.
        html = response.body.decode()

        item = {}
        item['title'] = re.findall("<!--TitleStart-->(.*?)<!--TitleEnd-->", html)[0]
        # Raw string: "\d" in a normal string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        item['publish_date'] = re.findall(r"发布时间：20\d{2}-\d{2}-\d{2}", html)[0]
        print(item)
        # The original built the item but never handed it on, so it never
        # reached the item pipelines.
        yield item

#### 12. scrapy 模拟登录

In [None]:
# demo1
class Gt2Spider(scrapy.Spider):
    """Login demo 1: let ``FormRequest.from_response`` fill in the login form."""

    name = 'gt2'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        """Submit the login form found in ``response``."""
        # NOTE(review): credentials are hard-coded in source; consider
        # loading them from settings or the environment instead.
        credentials = {'login': 'layueyu', 'password': 'Bing199128'}
        login_request = scrapy.FormRequest.from_response(
            response,  # the <form> is located automatically in the response
            formdata=credentials,
            callback=self.after_login,
        )
        yield login_request

    def after_login(self, response):
        """Save the post-login page to ``a.html`` and print the user-name matches."""
        with open('a.html', 'w', encoding='utf-8') as f:
            html = response.body.decode()
            f.write(html)
        matches = re.findall('layueyu', html)
        print(matches)
        
# demo2
class GtSpider(scrapy.Spider):
    """Login demo 2: build the POST body by hand from the login page's hidden inputs."""

    name = 'gt'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        """Read the hidden form inputs and POST them with the credentials."""
        authenticity_token = response.xpath("//input[@name='authenticity_token']/@value").extract_first()
        utf8 = response.xpath("//input[@name='utf8']/@value").extract_first()
        commit = response.xpath("//input[@name='commit']/@value").extract_first()

        # NOTE(review): credentials are hard-coded in source; consider
        # loading them from settings or the environment instead.
        form_fields = dict(
            login='layueyu',
            password='Bing199128',
            authenticity_token=authenticity_token,
            commit=commit,
            utf8=utf8,
        )
        # extract_first() yields None for inputs missing from the page, and
        # FormRequest cannot encode None values -- drop those fields.
        post_data = {key: value for key, value in form_fields.items() if value is not None}

        yield scrapy.FormRequest(
            'https://github.com/session',
            formdata=post_data,
            callback=self.after_login,
        )

    def after_login(self, response):
        """Save the post-login page to ``a.html`` and print the user-name matches."""
        with open('a.html', 'w', encoding='utf-8') as f:
            html = response.body.decode()
            f.write(html)
        print(re.findall('layueyu', html))

#### 13. 下载中间件

In [None]:
# 示例
class LoginDownloaderMiddleware(object):
    """Skeleton downloader middleware; every hook passes its input through.

    Not all methods need to be defined. If a method is missing, Scrapy acts
    as if the middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and connect its ``spider_opened`` handler."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Called for each request that goes through the downloader middleware.

        Must either:
        - return None: continue processing this request
        - return a Response object
        - return a Request object
        - raise IgnoreRequest: process_exception() methods of installed
          downloader middleware will be called

        A custom User-Agent could be added here.
        """
        return None

    def process_response(self, request, response, spider):
        """Called with the response returned from the downloader.

        Must either:
        - return a Response object
        - return a Request object
        - raise IgnoreRequest
        """
        return response

    def process_exception(self, request, exception, spider):
        """Called when a download handler or another middleware's process_request() raises.

        Must either:
        - return None: continue processing this exception
        - return a Response object: stops the process_exception() chain
        - return a Request object: stops the process_exception() chain
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
        

# 添加自定义UA
class RandomUserAgent(object):
    """Downloader middleware that sets a randomly chosen User-Agent on each request."""

    def process_request(self, request, spider):
        """Pick one entry from ``USER_AGENTS`` and store it in the request headers."""
        # USER_AGENTS is defined outside this block.
        request.headers['User-Agent'] = random.choice(USER_AGENTS)

# 添加代理， 在request的meta信息中添加proxy字段
# 代理形式： 协议+ip地址+端口
class ProxyMiddleware(object):
    """Downloader middleware that routes every request through a fixed proxy."""

    def process_request(self, request, spider):
        """Store the proxy address under the ``proxy`` key of ``request.meta``."""
        proxy_url = 'http://127.0.0.1:1234'  # scheme + IP address + port
        request.meta['proxy'] = proxy_url