协程(coroutine)
- 协程是实现并发编程的一种方式。多线程/多进程是解决并发问题的经典模型之一，在早期的互联网世界中起到了举足轻重的作用。
- 在Python 2中，协程是通过生成器来实现的，例如以下代码（示例沿用了基于生成器的写法，但使用了f-string，需要在Python 3.6及以上版本中运行）：

In [6]:
# Python对协程的支持是通过generator实现的。

# 在generator中，我们不但可以通过for循环来迭代，还可以不断调用next()函数获取由yield语句返回的下一个值。

# 但是Python的yield不但可以返回一个值，它还可以接收调用者发出的参数。

# 来看例子：

# 传统的生产者-消费者模型是一个线程写消息，一个线程取消息，通过锁机制控制队列和等待，但一不小心就可能死锁。

# 如果改用协程，生产者生产消息后，直接通过yield跳转到消费者开始执行，待消费者执行完毕后，切换回生产者继续生产，效率极高：

def consumer():
    """Generator-based coroutine that consumes values sent by a producer.

    The first ``next()``/``send(None)`` primes the generator and yields an
    empty string.  Every truthy value sent afterwards is printed and
    acknowledged by yielding ``'200 OK'``.  Sending a falsy value ends the
    generator (the caller sees ``StopIteration``).
    """
    # The priming step hands back '' before any message has been consumed.
    item = yield ''
    while item:
        print(f'[CONSUMER] Consuming {item}...')
        # Acknowledge the message and wait for the next one.
        item = yield '200 OK'


def produce(c):
    """Drive the consumer generator ``c`` with the values 1 through 5.

    The generator is primed with ``send(None)``, then each produced value is
    sent to it and the value it yields back is printed.  The generator is
    closed once all values have been sent.
    """
    # Prime the generator so it is parked at its first ``yield``.
    c.send(None)
    for item in range(1, 6):
        print(f'[PRODUCER] Producing {item}...')
        # send() resumes the consumer and returns whatever it yields next.
        print(f'[PRODUCER] Consumer return: {c.send(item)}')
    c.close()


# Create the consumer generator and hand it to the producer, which drives it.
c = consumer()
produce(c)


[PRODUCER] Producing 1...
[CONSUMER] Consuming 1...
[PRODUCER] Consumer return: 200 OK
[PRODUCER] Producing 2...
[CONSUMER] Consuming 2...
[PRODUCER] Consumer return: 200 OK
[PRODUCER] Producing 3...
[CONSUMER] Consuming 3...
[PRODUCER] Consumer return: 200 OK
[PRODUCER] Producing 4...
[CONSUMER] Consuming 4...
[PRODUCER] Consumer return: 200 OK
[PRODUCER] Producing 5...
[CONSUMER] Consuming 5...
[PRODUCER] Consumer return: 200 OK


Python 3.7以后：使用async/await和asyncio实现协程

In [1]:
# 先看一个例子
import time


def crawl_page(url):
    """Simulate crawling ``url`` by blocking for a number of seconds.

    The sleep duration is the integer that follows the last underscore in
    ``url`` (for example ``'url_3'`` sleeps for 3 seconds).  A start message
    is printed before sleeping and an ``OK`` message afterwards.
    """
    print(f'crawling {url}')
    # Everything after the last '_' is the number of seconds to wait.
    _, _, suffix = url.rpartition('_')
    time.sleep(int(suffix))
    print(f'OK {url}')


def main(urls):
    """Crawl every entry of ``urls`` one after another, in order."""
    for target in urls:
        # Each call blocks until the page is done, so total time is the sum.
        crawl_page(target)


# IPython %time magic: run main() once over the four URLs and report timing.
%time main(['url_1', 'url_2', 'url_3', 'url_4'])


crawling url_1
OK url_1
crawling url_2
OK url_2
crawling url_3
OK url_3
crawling url_4
OK url_4
CPU times: total: 15.6 ms
Wall time: 10 s


In [8]:
# 引入协程
import asyncio


async def crawl_page(url):
    """Simulate crawling ``url`` with a non-blocking sleep.

    The sleep duration is the integer that follows the last underscore in
    ``url``.  A start message is printed before sleeping and an ``OK``
    message afterwards.
    """
    print(f'crawling {url}')
    # Everything after the last '_' is the number of seconds to wait.
    _, _, suffix = url.rpartition('_')
    await asyncio.sleep(int(suffix))
    print(f'OK {url}')


async def main(urls):
    """Await ``crawl_page`` for each entry of ``urls``, strictly in sequence."""
    for target in urls:
        # Awaiting the coroutine directly finishes it before the next starts.
        await crawl_page(target)

# Top-level await, as used from this notebook cell: runs main() over the four URLs.
await(main(['url_1', 'url_2', 'url_3', 'url_4']))


crawling url_1
OK url_1
crawling url_2
OK url_2
crawling url_3
OK url_3
crawling url_4
OK url_4


运行时间仍然是10秒。下面进行优化：引入任务（Task）。

In [9]:
import asyncio


async def crawl_page(url):
    """Pretend to crawl ``url``: print, sleep without blocking, print again.

    The number after the last underscore of ``url`` is the sleep time in
    seconds.
    """
    print(f'crawling {url}')
    seconds = int(url.rsplit('_', 1)[-1])
    await asyncio.sleep(seconds)
    print(f'OK {url}')


async def main(urls):
    """Crawl all ``urls`` concurrently by wrapping each crawl in a task.

    Every task is created before any of them is awaited; the tasks are then
    awaited one by one in the order of ``urls``.
    """
    pending = []
    for url in urls:
        pending.append(asyncio.create_task(crawl_page(url)))
    for job in pending:
        await job

# Top-level await from the notebook cell: run the task-based main() over the four URLs.
await(main(['url_1', 'url_2', 'url_3', 'url_4']))


crawling url_1
crawling url_2
crawling url_3
crawling url_4
OK url_1
OK url_2
OK url_3
OK url_4


执行tasks的另一种做法：使用asyncio.gather

In [10]:
import asyncio


async def crawl_page(url):
    """Print a start line, sleep asynchronously, then print an ``OK`` line.

    The delay in seconds is parsed from the text after the final ``'_'`` in
    ``url``.
    """
    print(f'crawling {url}')
    parts = url.split('_')
    delay = int(parts[-1])
    await asyncio.sleep(delay)
    print(f'OK {url}')


async def main(urls):
    """Crawl all ``urls`` concurrently and wait for every crawl via ``gather``."""
    # Unpacking the generator creates all tasks, in order, before gather runs.
    await asyncio.gather(
        *(asyncio.create_task(crawl_page(target)) for target in urls)
    )

# Top-level await from the notebook cell: run the gather-based main() over the four URLs.
await(main(['url_1', 'url_2', 'url_3', 'url_4']))


crawling url_1
crawling url_2
crawling url_3
crawling url_4
OK url_1
OK url_2
OK url_3
OK url_4


解密协程运行时

In [12]:
import asyncio


async def worker_1():
    """Print a start line, sleep for one second, then print a done line."""
    delay_seconds = 1
    print('worker_1 start')
    await asyncio.sleep(delay_seconds)
    print('worker_1 done')


async def worker_2():
    """Print a start line, sleep for two seconds, then print a done line."""
    delay_seconds = 2
    print('worker_2 start')
    await asyncio.sleep(delay_seconds)
    print('worker_2 done')


async def main():
    """Await ``worker_1`` and then ``worker_2`` directly, one after the other.

    No tasks are created, so the second worker only starts once the first
    has finished.  A progress line is printed after each worker completes.
    """
    print('before await')
    steps = (
        (worker_1, 'awaited worker_1'),
        (worker_2, 'awaited worker_2'),
    )
    for worker, message in steps:
        await worker()
        print(message)

# Top-level await from the notebook cell: run the sequential worker demo.
await main()


before await
worker_1 start
worker_1 done
awaited worker_1
worker_2 start
worker_2 done
awaited worker_2


In [15]:
import asyncio


async def worker_1():
    """Announce the start, pause for one second without blocking, announce the end."""
    pause = 1
    print('worker_1 start')
    await asyncio.sleep(pause)
    print('worker_1 done')


async def worker_2():
    """Announce the start, pause for two seconds without blocking, announce the end."""
    pause = 2
    print('worker_2 start')
    await asyncio.sleep(pause)
    print('worker_2 done')


async def main():
    """Schedule both workers as tasks first, then await them in order.

    Both tasks are created before ``'before await'`` is printed; each task is
    then awaited in turn, with a progress line printed after each one.
    """
    first = asyncio.create_task(worker_1())
    second = asyncio.create_task(worker_2())
    print('before await')
    for scheduled, message in (
        (first, 'awaited worker_1'),
        (second, 'awaited worker_2'),
    ):
        await scheduled
        print(message)

# Top-level await from the notebook cell: run the task-based worker demo.
await main()


before await
worker_1 start
worker_2 start
worker_1 done
awaited worker_1
worker_2 done
awaited worker_2


In [1]:
import asyncio


async def worker_1():
    """Sleep for one second without blocking, then return ``1``."""
    outcome = 1
    await asyncio.sleep(1)
    return outcome


async def worker_2():
    """Sleep for two seconds, then fail with ``ZeroDivisionError``.

    The division by zero is evaluated only after the sleep has finished, so
    this coroutine never actually returns a value.
    """
    await asyncio.sleep(2)
    numerator, denominator = 2, 0
    return numerator / denominator


async def worker_3():
    """Sleep for three seconds without blocking, then return ``3``."""
    outcome = 3
    await asyncio.sleep(3)
    return outcome


async def main():
    """Run three workers as tasks, cancel the third, and print all outcomes.

    The three tasks are created up front.  After a two-second sleep the third
    task is cancelled, then all three are gathered with
    ``return_exceptions=True`` so that exceptions are collected in the result
    list instead of being raised.  The result list is printed.
    """
    task_1, task_2, task_3 = [
        asyncio.create_task(worker())
        for worker in (worker_1, worker_2, worker_3)
    ]

    await asyncio.sleep(2)
    task_3.cancel()

    outcomes = await asyncio.gather(
        task_1, task_2, task_3, return_exceptions=True
    )
    print(outcomes)

# Top-level await from the notebook cell: run the cancel/gather demo.
await main()


[1, ZeroDivisionError('division by zero'), CancelledError()]


协程实现一个生产者消费者模型

In [1]:
import random
import asyncio


async def consumer(queue, id):
    """Take values from ``queue`` forever, printing each one under the label ``id``.

    After each value the coroutine sleeps for one second.  The loop has no
    exit condition of its own.
    """
    while True:
        item = await queue.get()
        print(f'{id} get a val: {item}')
        await asyncio.sleep(1)


async def producer(queue, id):
    """Put five random integers from 1 to 10 on ``queue``, one per second.

    Each value is printed under the label ``id`` right after it has been
    enqueued.
    """
    remaining = 5
    while remaining > 0:
        number = random.randint(1, 10)
        await queue.put(number)
        print(f'{id} set a val: {number}')
        await asyncio.sleep(1)
        remaining -= 1


async def main():
    """Run two consumers and two producers against one shared queue.

    All four coroutines are scheduled as tasks (consumers first).  After ten
    seconds the two consumer tasks are cancelled, and all four tasks are then
    gathered with ``return_exceptions=True`` so the cancellations are
    collected rather than raised.
    """
    queue = asyncio.Queue()

    consumers = [
        asyncio.create_task(consumer(queue, label))
        for label in ('consumer_1', 'consumer_2')
    ]
    producers = [
        asyncio.create_task(producer(queue, label))
        for label in ('producer_1', 'producer_2')
    ]

    await asyncio.sleep(10)
    # The consumers loop forever, so they have to be stopped explicitly.
    for running in consumers:
        running.cancel()

    await asyncio.gather(*consumers, *producers, return_exceptions=True)

# Top-level await from the notebook cell: run the producer/consumer demo.
await main()


producer_1 set a val: 7
producer_2 set a val: 3
consumer_1 get a val: 7
consumer_2 get a val: 3
producer_1 set a val: 4
producer_2 set a val: 6
consumer_2 get a val: 4
consumer_1 get a val: 6
producer_1 set a val: 9
producer_2 set a val: 10
consumer_1 get a val: 9
consumer_2 get a val: 10
producer_1 set a val: 8
producer_2 set a val: 6
consumer_2 get a val: 8
consumer_1 get a val: 6
producer_1 set a val: 5
producer_2 set a val: 5
consumer_1 get a val: 5
consumer_2 get a val: 5


实战：豆瓣电影爬虫（抓取“即将上映”页面）

In [None]:
# 同步版本
import requests
from bs4 import BeautifulSoup


def main():
    """Print name, date text and first image URL for each listed movie.

    Synchronous version: the listing page is fetched first, then each
    movie's detail page is fetched one at a time with ``requests``.

    Raises:
        RuntimeError: if the listing page has no ``div#showing-soon``
            section (for example when the request was blocked or the page
            layout changed).
    """
    url = "https://movie.douban.com/cinema/later/beijing/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    # Without a timeout, requests waits indefinitely on a stalled server.
    timeout = 10

    init_page = requests.get(url, headers=headers, timeout=timeout).content
    init_soup = BeautifulSoup(init_page, "lxml")

    all_movies = init_soup.find("div", id="showing-soon")
    if all_movies is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on the next line.
        raise RuntimeError('could not find the "showing-soon" section in the listing page')

    for each_movie in all_movies.find_all("div", class_="item"):
        all_a_tag = each_movie.find_all("a")
        all_li_tag = each_movie.find_all("li")

        movie_name = all_a_tag[1].text
        url_to_fetch = all_a_tag[0]['href']
        movie_date = all_li_tag[0].text

        response_item = requests.get(url_to_fetch, headers=headers, timeout=timeout).content
        soup_item = BeautifulSoup(response_item, "lxml")
        img_tag = soup_item.find("img")
        # A detail page without any <img> prints None instead of crashing.
        img_src = img_tag.get("src") if img_tag is not None else None

        print(f'{movie_name} {movie_date} {img_src}')


# IPython %time magic: run the synchronous crawler once and report timing.
%time main()


In [None]:
# 协程版本
import asyncio
import aiohttp

from bs4 import BeautifulSoup


async def fetch_content(url):
    """Fetch ``url`` with aiohttp and return the response body as text.

    A new ``ClientSession`` is opened for each call and closed again before
    returning.  A desktop browser ``User-Agent`` header is sent with the
    request.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    # ``ssl=False`` replaces the deprecated ``verify_ssl=False`` spelling.
    # NOTE(review): this disables TLS certificate verification — acceptable
    # for a demo crawler, but it should not be copied into production code.
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(headers=header, connector=connector) as session:
        async with session.get(url) as resp:
            return await resp.text()


async def main():
    """Print name, date text and first image URL for each listed movie.

    Coroutine version: the listing page is fetched first, then all movie
    detail pages are fetched concurrently with ``asyncio.gather``.

    Raises:
        RuntimeError: if the listing page has no ``div#showing-soon``
            section (for example when the request was blocked or the page
            layout changed).
    """
    url = "https://movie.douban.com/cinema/later/beijing/"

    init_page = await fetch_content(url)
    init_soup = BeautifulSoup(init_page, "lxml")

    all_movies = init_soup.find("div", id="showing-soon")
    if all_movies is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on the next line.
        raise RuntimeError('could not find the "showing-soon" section in the listing page')

    movie_names, urls_to_fetch, movie_dates = [], [], []
    for each_movie in all_movies.find_all("div", class_="item"):
        all_a_tag = each_movie.find_all("a")
        all_li_tag = each_movie.find_all("li")

        movie_names.append(all_a_tag[1].text)
        urls_to_fetch.append(all_a_tag[0]['href'])
        movie_dates.append(all_li_tag[0].text)

    # gather() returns results in argument order, so pages line up with the
    # name and date lists built above.
    pages = await asyncio.gather(*(fetch_content(link) for link in urls_to_fetch))

    for movie_name, movie_date, page in zip(movie_names, movie_dates, pages):
        soup_item = BeautifulSoup(page, 'lxml')
        img_tag = soup_item.find("img")
        # A detail page without any <img> prints None instead of crashing.
        img_src = img_tag.get("src") if img_tag is not None else None

        print(f'{movie_name} {movie_date} {img_src}')

# Top-level await from the notebook cell: run the coroutine-based crawler.
await main()
