# In [None]:
import requests
from urllib.parse import urlencode
from pymongo import MongoClient
import time


class Toutiao(object):
    """Crawl Toutiao search results for a keyword and store image data in MongoDB.

    For every search result that carries images, a document of the form
    ``{"title": ..., "image_list": ...}`` is inserted into the ``toutiao``
    collection of the ``TouTiao`` database on the local MongoDB server.
    """

    # Seconds to wait for the search API before a request is abandoned.
    REQUEST_TIMEOUT = 10
    # Number of results requested per page; also the step of the "offset" param.
    PAGE_SIZE = 20
    # Pause between two consecutive page requests, in seconds.
    PAGE_DELAY = 2

    def __init__(self, name):
        """Prepare request headers and the MongoDB collection.

        Args:
            name: Search keyword whose results should be crawled.
        """
        self.name = name  # keyword of the search to crawl
        self.base_url = "https://www.toutiao.com/api/search/content/?"
        self.headers = {
            # NOTE(review): hard-coded session cookie; it will presumably
            # expire and need refreshing.
            "cookie":
            "tt_webid=6736143139453601287; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6736143139453601287; csrftoken=1173d1e59061616313ba56775c3f5118; _ga=GA1.2.73970871.1568380906; s_v_web_id=9d78d95a5883b8e8a9363aa0386c9e3a",
            # Build the referer from the actual keyword instead of a fixed one
            # (for "街拍" this yields exactly the previously hard-coded URL).
            "referer":
            "https://www.toutiao.com/search/?" + urlencode({"keyword": name}),
            "user-agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36",
            "x-requested-with": "XMLHttpRequest"
        }
        # Initialise the MongoDB client and select the "toutiao" collection.
        client = MongoClient(host='localhost', port=27017)
        db = client['TouTiao']
        self.collection = db['toutiao']

    def parse_url(self, url):
        """Send a GET request and return the decoded JSON body.

        Args:
            url: Fully built request URL.

        Returns:
            The parsed JSON response, or ``None`` when the request fails, the
            status code is not 200, or the body is not valid JSON.
        """
        print(url)
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            # Covers connection errors, timeouts, invalid URLs, ...
            print("Error is ", e.args)
            return None
        if response.status_code != 200:
            return None
        try:
            return response.json()
        except ValueError as e:
            # The body was not JSON (e.g. an HTML anti-crawler page).
            print("Error is ", e.args)
            return None

    def get_data(self, html_json):
        """Yield one image record per search result that contains images.

        Args:
            html_json: Decoded JSON dictionary returned by the search API.

        Yields:
            ``{"title": ..., "image_list": ...}`` for every item whose
            ``has_image`` flag is truthy. Nothing is yielded when the response
            is not a dictionary, reports a falsy ``count`` or has no ``data``.
        """
        if not isinstance(html_json, dict) or not html_json.get('count'):
            return
        for item in html_json.get('data') or []:
            if item.get('has_image', None):  # skip results without images
                yield {
                    'title': item.get('title', None),
                    # Image entries of this result (presumably each one
                    # carries a "url" key - confirm against the API).
                    'image_list': item.get('image_list'),
                }

    def _save(self, img):
        """Insert one image record into MongoDB and report the outcome."""
        # Imported here because the file only imports MongoClient at top level.
        from pymongo.errors import DuplicateKeyError, PyMongoError
        try:
            self.collection.insert_one(img)
        except DuplicateKeyError:
            print("保存过了")
        except PyMongoError as e:
            # Any other database failure is reported instead of being
            # mislabelled as "already saved".
            print("Error is ", e)
        else:
            print('保存成功')

    def run(self):
        """Crawl page after page until the API reports no more results.

        Stops early when a page cannot be fetched or decoded, instead of
        crashing on the missing response.
        """
        has_more = True  # assume there is at least one page of results
        page = 0  # zero-based page counter
        while has_more:
            # Build the request URL for the current page.
            self.params = {
                "aid": "24",
                "app_name": "web_search",
                "offset": page * self.PAGE_SIZE,
                "format": "json",
                "keyword": self.name,
                "autoload": "true",
                "count": str(self.PAGE_SIZE),
                "en_qc": "1",
                "cur_tab": "1",
                "from": "search_tab",
                "pd": "synthesis",
                # Millisecond timestamp, mimicking the browser's ajax request.
                "timestamp": int(time.time() * 1000)
            }
            page += 1
            url = self.base_url + urlencode(self.params)
            html_json = self.parse_url(url)
            if not isinstance(html_json, dict):
                # Request failed or returned something unusable: stop crawling.
                break
            # Always refresh the flag so an empty page cannot loop forever.
            has_more = html_json.get('has_more')
            for img in self.get_data(html_json):
                self._save(img)
            if has_more:
                time.sleep(self.PAGE_DELAY)

# In [None]:
if __name__ == "__main__":
    # Crawl the search results for the keyword "街拍" and store them in MongoDB.
    toutiao_jiepai = Toutiao("街拍")
    # NOTE(review): run() has no return statement, so html_json is always None.
    html_json = toutiao_jiepai.run()

# In [None]:
import functools
import os
import re
import threading
import time
import urllib
import urllib.request
from queue import Queue

from pymongo import MongoClient


def decorator(func):
    """Wrap *func* so that every call prints how long it took.

    Args:
        func: Callable to time.

    Returns:
        A wrapper that forwards all arguments to *func*, prints the elapsed
        time in seconds and returns *func*'s result unchanged. The wrapper
        keeps the name and docstring of *func*.
    """
    @functools.wraps(func)  # preserve __name__/__doc__ of the wrapped callable
    def wrap(*args, **kwargs):
        # perf_counter is monotonic, so clock adjustments cannot skew the result.
        start = time.perf_counter()
        ret = func(*args, **kwargs)
        end = time.perf_counter()
        print("spend time : {:.8f}".format(end - start))
        return ret
    return wrap

class ProduceUrl(threading.Thread):
    """Producer thread: reads image records from MongoDB and queues downloads.

    For every image of every stored record, the image URL is put on ``url_q``
    and the matching target file path on ``filepath_q``.
    """

    # Characters removed from titles so they can be used as file names;
    # path separators are included so a title cannot point into another folder.
    _INVALID_CHARS = re.compile(r'[:：，。,\.?？！!\|"”“/\\]')
    # Shared by all producers: keeps the (url, path) pair in the same order on
    # both queues when several producers run at the same time.
    _pair_lock = threading.Lock()

    def __init__(self, filepath_q, url_q, collection, *args, **kwargs):
        """Store the queues and the collection to read from.

        Args:
            filepath_q: Queue receiving the target file path of each image.
            url_q: Queue receiving the source URL of each image.
            collection: MongoDB collection holding the crawled records.
            *args, **kwargs: Forwarded to ``threading.Thread``.
        """
        super().__init__(*args, **kwargs)
        self.filepath_q = filepath_q
        self.url_q = url_q
        self.collection = collection

    @decorator
    def run(self):
        """Queue a (url, file path) pair for every image in the collection."""
        # Fetch every record without its "_id" field.
        result = self.collection.aggregate([{"$project": {"_id": 0}}])
        for img in result:
            title = img.get('title')
            if not isinstance(title, str):
                # Without a title no file name can be derived; skip the record.
                continue
            file_name = self._INVALID_CHARS.sub("", title)
            for index, image in enumerate(img.get('image_list') or [], start=1):
                src = image.get('url')
                if not src:
                    continue  # nothing to download for this entry
                file_path = "../crawl_file/images/toutiao/" + file_name + str(index) + ".jpg"
                # Both puts happen under one lock so the n-th URL always
                # corresponds to the n-th file path.
                with self._pair_lock:
                    self.url_q.put(src)
                    self.filepath_q.put(file_path)
                

class ConsumerUrl(threading.Thread):
    """Consumer thread: downloads queued image URLs to their target paths."""

    # Shared by all consumers: a URL and its file path are taken from the two
    # queues as one unit, so concurrent consumers cannot mix up pairs.
    _pair_lock = threading.Lock()
    # Seconds to sleep between two checks of an empty queue.
    _POLL_INTERVAL = 0.1

    def __init__(self, filepath_q, url_q, *args, idle_timeout=10.0, **kwargs):
        """Store the queues to consume from.

        Args:
            filepath_q: Queue providing the target file path of each image.
            url_q: Queue providing the source URL of each image.
            *args, **kwargs: Forwarded to ``threading.Thread``.
            idle_timeout: Seconds the queues may stay empty before the thread
                concludes that no more work is coming and stops.
        """
        super().__init__(*args, **kwargs)
        self.filepath_q = filepath_q
        self.url_q = url_q
        self.idle_timeout = idle_timeout

    def _next_pair(self):
        """Return the next ``(url, file_path)`` pair, or ``None`` if none is ready."""
        from queue import Empty
        with self._pair_lock:
            try:
                src = self.url_q.get_nowait()
            except Empty:
                return None
            try:
                # The path is queued right after its URL, so it should arrive
                # almost immediately.
                file_path = self.filepath_q.get(timeout=self.idle_timeout)
            except Empty:
                print("Error is ", "no file path queued for " + str(src))
                return None
        return src, file_path

    def _download(self, src, file_path):
        """Download *src* to *file_path* unless that file already exists."""
        if os.path.exists(file_path):  # the file name tells us it was downloaded before
            return
        directory = os.path.dirname(file_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        try:
            urllib.request.urlretrieve(src, file_path)
        except (OSError, ValueError) as e:
            # URLError/HTTPError are OSError subclasses; ValueError covers
            # malformed URLs. One bad image must not stop the whole thread.
            print("Error is ", e)
            return
        print('保存成功' + file_path)

    @decorator
    def run(self):
        """Download queued images until the queues stay empty for ``idle_timeout``."""
        idle_since = None  # moment the queue was first seen empty, if any
        while True:
            pair = self._next_pair()
            if pair is None:
                now = time.monotonic()
                if idle_since is None:
                    idle_since = now
                elif now - idle_since >= self.idle_timeout:
                    break
                # Never block forever on an empty queue; poll instead so the
                # thread can terminate and join() returns.
                time.sleep(self._POLL_INTERVAL)
                continue
            idle_since = None
            self._download(*pair)
        

                    
                    
def main():
    """Download all images referenced in the ``toutiao`` MongoDB collection.

    Starts one producer thread that queues (url, file path) pairs and ten
    consumer threads that download them, then waits for all of them.
    """
    # Connect to the database and select the collection.
    client = MongoClient(host='localhost', port=27017)
    try:
        collection = client['TouTiao']['toutiao']
        # Queues for target file paths and source URLs.
        filepath_q = Queue(500)
        url_q = Queue(1000)
        # A single producer thread: every producer walks the whole collection,
        # so running several would queue each image more than once.
        threads = [ProduceUrl(filepath_q, url_q, collection)]
        threads.extend(ConsumerUrl(filepath_q, url_q) for _ in range(10))
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
    finally:
        # Release the database connection even if a thread could not be started.
        client.close()

# Start the download pipeline only when this code is executed as a script.
if __name__ == "__main__":
    main()
    