In [5]:
# 不使用多线程获取

import os
import re
from urllib import request
import requests
from lxml import etree


def parse_url(url, headers):
    response = requests.get(url, headers=headers)
    html_str = response.text
    print("获取成功：{}".format(url))
    return html_str


def extract_imgs(html_str):
    html = etree.HTML(html_str)
    #     print(html)
    imgs = html.xpath(
        '//div[@class="col-sm-9 center-wrap"]//img[@class!="gif"]')
    for img in imgs:
        alt = img.get('alt')  # 获取图片描述
        alt = re.sub(r'[\?？。\.！!:,，]*', "", alt)  # 去掉图片描述里面的不规范字符，避免文件存储时出现错误
        src = img.get('data-original')  # 获取图片url
        suffix = os.path.splitext(src)[
            -1]  # 获取图片后缀 (os.path.splitext: splite extention 可以切割文件扩展名，返回一个列表)
        file_name = alt + suffix  #  设置文件名，用于保存文件
        request.urlretrieve(
            src, "../crawl_file/images/" +
            file_name)  # request.urlretriece(url, file_name)  读取网页，并保存
    print("保存成功")


def main(page):
    url = "http://www.doutula.com/article/list/?page={}".format(page)
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.3"
    }
    html_str = parse_url(url, headers)
    extract_imgs(html_str)


if __name__ == "__main__":
    for page in range(1, 600):
        main(page)

获取成功：http://www.doutula.com/article/list/?page=1
保存成功
获取成功：http://www.doutula.com/article/list/?page=2
保存成功
获取成功：http://www.doutula.com/article/list/?page=3
保存成功
获取成功：http://www.doutula.com/article/list/?page=4
保存成功
获取成功：http://www.doutula.com/article/list/?page=5
保存成功
获取成功：http://www.doutula.com/article/list/?page=6
保存成功
获取成功：http://www.doutula.com/article/list/?page=7
保存成功
获取成功：http://www.doutula.com/article/list/?page=8
保存成功
获取成功：http://www.doutula.com/article/list/?page=9
保存成功
获取成功：http://www.doutula.com/article/list/?page=10
保存成功
获取成功：http://www.doutula.com/article/list/?page=11
保存成功
获取成功：http://www.doutula.com/article/list/?page=12
保存成功
获取成功：http://www.doutula.com/article/list/?page=13
保存成功
获取成功：http://www.doutula.com/article/list/?page=14
保存成功
获取成功：http://www.doutula.com/article/list/?page=15
保存成功
获取成功：http://www.doutula.com/article/list/?page=16
保存成功
获取成功：http://www.doutula.com/article/list/?page=17
保存成功
获取成功：http://www.doutula.com/article/list/?page=18
保存成功
获取成功：http://www.dou

KeyboardInterrupt: 

### 使用多线程队列方式实现表情包获取

In [2]:
import os
import re
from urllib import request
from lxml import etree
import threading
from queue import Queue
import requests


class Producer(threading.Thread):
    """生产url数据"""
    def __init__(self, page_queue, imgs_queue, *args, **kwargs):
        """重写父类__init__（）方法"""
        super().__init__(*args, **kwargs)  # 继承父类__init__()
        self.page_queue = page_queue
        self.imgs_queue = imgs_queue
        self.headers = {
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36",
            "Referer": "https://pos.baidu.com/wh/o.htm?ltr="
        }

    def parse_url(self):
        url = self.page_queue.get()
        try:
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                print("解析成功：{}".format(url))
                return response.text
            return None
        except requests.RequestException:
            print("解析失败")
            return None

    def extract_imgs(self, html_str):
        try:
            html = etree.HTML(html_str)
        except ValueError:
            return
        print("提取成功")
        imgs = html.xpath(
            '//div[@class="col-sm-9 center-wrap"]//img[@class!="gif"]')
        for img in imgs:
            alt = img.get('alt')  # 获取图片描述
            alt = re.sub(r'[\?？。\.！!:,，]*', "", alt)  # 去掉图片描述里面的不规范字符，避免文件存储时出现错误
            src = img.get('data-original')  # 获取图片url
            suffix = os.path.splitext(src)[-1]  # 获取图片后缀 (os.path.splitext: splite extention 可以切割文件扩展名，返回一个列表)
            file_name = alt + suffix  #  设置文件名，用于保存文件
            self.imgs_queue.put((src, "../crawl_file/images/" + file_name))

    def run(self):
        """创建线程，实现线程主程序"""
        while True:
            if self.page_queue.empty():
                break
            # 发送请求，获取数据
            html_str = self.parse_url()
#             print(html_str)
            # 提取数据,并put()到imgs_queue队列中
            self.extract_imgs(html_str)
        
        
class Consumer(threading.Thread):
    """保存数据"""
    def __init__(self, page_queue, imgs_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.imgs_queue = imgs_queue
        
    def run(self):
        """创建线程，实现线程主程序"""
        while True:
            if self.page_queue.empty() and self.imgs_queue.empty():
                break
            scr, file_name = self.imgs_queue.get()
            request.urlretrieve(scr, file_name)


def main():
    page_queue = Queue()  # 页码队列
    imgs_queue = Queue(1000)  # 图片队列
    url = "http://www.doutula.com/article/list/?page={}"  #

    # 往页码队列里面添加url
    for i in range(1, 601):  # 往队列里面添加600页表情包url地址
        page_queue.put(url.format(i))

    for i in range(5):
        t = Producer(page_queue, imgs_queue)
        t.start()

    for i in range(5):
        t = Consumer(page_queue, imgs_queue)
        t.start()


if __name__ == "__main__":
    main()

解析成功：http://www.doutula.com/article/list/?page=4
解析成功：http://www.doutula.com/article/list/?page=2
解析成功：http://www.doutula.com/article/list/?page=3
解析成功：http://www.doutula.com/article/list/?page=1
解析成功：http://www.doutula.com/article/list/?page=5
提取成功
提取成功
提取成功
提取成功
提取成功
解析成功：http://www.doutula.com/article/list/?page=8
提取成功
解析成功：http://www.doutula.com/article/list/?page=9
提取成功
解析成功：http://www.doutula.com/article/list/?page=12
提取成功
解析成功：http://www.doutula.com/article/list/?page=7
提取成功
解析成功：http://www.doutula.com/article/list/?page=6
提取成功
解析成功：http://www.doutula.com/article/list/?page=13
解析成功：http://www.doutula.com/article/list/?page=10
提取成功
提取成功
解析成功：http://www.doutula.com/article/list/?page=14
提取成功
解析成功：http://www.doutula.com/article/list/?page=15
提取成功
解析成功：http://www.doutula.com/article/list/?page=11
提取成功


Exception in thread Thread-16:
Traceback (most recent call last):
  File "D:\anaconda\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
  File "<ipython-input-2-af8d59f0af48>", line 56, in run
    self.extract_imgs(html_str)
  File "<ipython-input-2-af8d59f0af48>", line 35, in extract_imgs
    html = etree.HTML(html_str)
  File "src/lxml/etree.pyx", line 3170, in lxml.etree.HTML
  File "src/lxml/parser.pxi", line 1876, in lxml.etree._parseMemoryDocument
ValueError: can only parse strings

Exception in thread Thread-18:
Traceback (most recent call last):
  File "D:\anaconda\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
  File "<ipython-input-2-af8d59f0af48>", line 56, in run
    self.extract_imgs(html_str)
  File "<ipython-input-2-af8d59f0af48>", line 35, in extract_imgs
    html = etree.HTML(html_str)
  File "src/lxml/etree.pyx", line 3170, in lxml.etree.HTML
  File "src/lxml/parser.pxi", line 1876, in lxml.etree._parseMemoryDocument
ValueError: can only