In [2]:
# 一个简单的爬虫栗子
import time
 
def crawl_page(url):
    print('crawling {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    time.sleep(sleep_time)
    print('OK {}'.format(url))

def main(urls):
    for url in urls:
        crawl_page(url)

%time main(['url_1', 'url_2', 'url_3', 'url_4'])



crawling url_1
OK url_1
crawling url_2
OK url_2
crawling url_3
OK url_3
crawling url_4
OK url_4
CPU times: user 6.05 ms, sys: 3.01 ms, total: 9.07 ms
Wall time: 10 s


In [12]:
# 并发化, 使用协程

import asyncio

async def crawl_page(url):
    print('crawling {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    await asyncio.sleep(sleep_time)
    print('OK {}'.format(url))

async def main(urls):
    for url in urls:
        await crawl_page(url)

await main(['url_1', 'url_2', 'url_3', 'url_4'])

crawling url_1
OK url_1
crawling url_2
OK url_2
crawling url_3
OK url_3
crawling url_4
OK url_4


In [14]:
import asyncio

async def crawl_page(url):
    print('crawling {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    await asyncio.sleep(sleep_time)
    print('OK {}'.format(url))

async def main(urls):
    """
    我们要等所有任务都结束才行，用for task in tasks: await task 即可。
    """
    tasks = [asyncio.create_task(crawl_page(url)) for url in urls]
    for task in tasks:
        await task

await main(['url_1', 'url_2', 'url_3', 'url_4'])

crawling url_1
crawling url_2
crawling url_3
crawling url_4
OK url_1
OK url_2
OK url_3
OK url_4


In [17]:
# 另外一种做法
import asyncio

async def crawl_page(url):
    print('crawling {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    await asyncio.sleep(sleep_time)
    print('OK {}'.format(url))

async def main(urls):
    tasks = [asyncio.create_task(crawl_page(url)) for url in urls]
    """
    要注意的是，*tasks 解包列表，将列表变成了函数的参数；与之对应的是， ** dict 将字典变成了函数的参数。
    """
    await asyncio.gather(*tasks)

await main(['url_1', 'url_2', 'url_3', 'url_4'])



crawling url_1
crawling url_2
crawling url_3
crawling url_4
OK url_1
OK url_2
OK url_3
OK url_4


In [18]:
# 深入代码底层
import asyncio

async def worker_1():
    print('worker_1 start')
    await asyncio.sleep(1)
    print('worker_1 done')

async def worker_2():
    print('worker_2 start')
    await asyncio.sleep(2)
    print('worker_2 done')

async def main():
    print('before await')
    await worker_1()
    print('await worker_1')
    await worker_2()
    print('await worker_2')

await main()

before await
worker_1 start
worker_1 done
await worker_1
worker_2 start
worker_2 done
await worker_2


In [21]:
import asyncio

async def worker_1():
    print('worker_1 start')
    await asyncio.sleep(1)
    print('worker_1 done')

async def worker_2():
    print('worker_2 start')
    await asyncio.sleep(2)
    print('worker_2 done')

async def main():
    task1 = asyncio.create_task(worker_1())
    task2 = asyncio.create_task(worker_2())

    print('before await')
    await task1
    print('awaited worker1')
    await task2
    print('awaited worker2')

await main()


before await
worker_1 start
worker_2 start
worker_1 done
awaited worker1
worker_2 done
awaited worker2


In [22]:
"""
如果我们想给某些协程任务限定运行时间，一旦超时就取消，又该怎么做呢？再进一步，如果某些协程运行时出现错误，又该怎么处理呢？
"""

import asyncio

async def worker_1():
    await asyncio.sleep(1)
    return 1

async def worker_2():
    await asyncio.sleep(2)
    return 2/0

async def worker_3():
    await asyncio.sleep(3)
    return 3

async def main():
    task_1 = asyncio.create_task(worker_1())
    task_2 = asyncio.create_task(worker_2())
    task_3 = asyncio.create_task(worker_3())

    await asyncio.sleep(2)
    task_3.cancel()

    """
    return_exceptions = True, 
    如果不设置这个参数，错误就会完整地 throw 到我们这个执行层，
    从而需要 try except 来捕捉，这也就意味着其他还没被执行的任务会被全部取消掉。
    为了避免这个局面，我们将 return_exceptions 设置为 True 即可。
    """
    res = await asyncio.gather(task_1, task_2, task_3, return_exceptions=True)
    print(res)

await main()



[1, ZeroDivisionError('division by zero'), CancelledError('')]


In [23]:
# 协程实现生产者消费者模型

import asyncio
import random

async def consumer(queue, id):
    while True:
        val = await queue.get()
        print('{} get a val: {}'.format(id, val))
        await asyncio.sleep(1)

async def producer(queue, id):
    for i in range(5):
        val = random.randint(1, 10)
        await queue.put(val)
        print('{} put a val: {}'.format(id, val))
        await asyncio.sleep(1)

async def main():
    queue = asyncio.Queue()

    consumer_1 = asyncio.create_task(consumer(queue, 'consumer_1'))
    consumer_2 = asyncio.create_task(consumer(queue, 'consumer_2'))

    producer_1 = asyncio.create_task(producer(queue, 'producer_1'))
    producer_2 = asyncio.create_task(producer(queue, 'producer_2'))

    await asyncio.sleep(10)
    consumer_1.cancel()
    consumer_2.cancel()

    await asyncio.gather(consumer_1, consumer_2, producer_1, producer_2, return_exceptions=True)


await main()

producer_1 put a val: 10
producer_2 put a val: 6
consumer_1 get a val: 10
consumer_2 get a val: 6
producer_1 put a val: 7
producer_2 put a val: 2
consumer_1 get a val: 7
consumer_2 get a val: 2
producer_1 put a val: 9
producer_2 put a val: 2
consumer_1 get a val: 9
consumer_2 get a val: 2
producer_1 put a val: 1
producer_2 put a val: 6
consumer_1 get a val: 1
consumer_2 get a val: 6
producer_1 put a val: 6
producer_2 put a val: 1
consumer_1 get a val: 6
consumer_2 get a val: 1


In [37]:
# 豆瓣今日推荐电影爬虫
# https://movie.douban.com/cinema/later/beijing/

import requests
from bs4 import BeautifulSoup

def main():
    url = "https://movie.douban.com/cinema/later/beijing/"
    head = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
        'Referer':'https://time.geekbang.org/column/article/101855',
        'Connection':'keep-alive'
    }
    init_page = requests.get(url, headers=head).content
    init_soup = BeautifulSoup(init_page, 'lxml')

    all_movies = init_soup.find('div', id = 'showing-soon')
    for each_movie in all_movies.find_all('div', class_='item'):
        all_a_tag = each_movie.find_all('a')
        all_li_tag = each_movie.find_all('li')

        movie_name = all_a_tag[1].text
        url_to_fetch = all_a_tag[1]['href']
        movie_date = all_li_tag[0].text

        response_item = requests.get(url_to_fetch, headers=head).content
        soup_item = BeautifulSoup(response_item, 'lxml')
        img_tag = soup_item.find('img')
        
        # print(movie_name, movie_date, response_item)

        print('{} {} {}'.format(movie_name, movie_date, img_tag['src']))

%time main()




最初的梦想 01月07日 https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2766852789.jpg
魔法满屋 01月07日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2807936075.jpg
一江春水 01月07日 https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2779904200.jpg
独家头条 01月07日 https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2812894770.jpg
冰上时刻 01月07日 https://img2.doubanio.com/view/photo/s_ratio_poster/public/p2791542382.jpg
屋内有人 01月07日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2755165106.jpg
萌鸡小队：萌闯新世界 01月08日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2781907425.jpg
追梦少年 01月08日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2674282554.jpg
农民院士 01月09日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2799267344.jpg
黑客帝国：矩阵重启 01月14日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2761285836.jpg
东北虎 01月14日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2812275146.jpg
汪汪队立大功大电影 01月14日 https://img3.do

In [47]:
import asyncio
import aiohttp

from bs4 import BeautifulSoup

async def fetch_content(url):
    header  = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
        'Referer':'https://time.geekbang.org/column/article/101855',
        'Connection':'keep-alive'
    }
    async with aiohttp.ClientSession(
        headers=header, connector=aiohttp.TCPConnector(ssl=False)
    ) as session:
        async with session.get(url) as response:
            print(response)
            return await response.text()

async def main():
    url = "https://movie.douban.com/cinema/later/beijing/"
    header  = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
        'Referer':'https://time.geekbang.org/column/article/101855',
        'Connection':'keep-alive'
    }
    init_page = await fetch_content(url)
    init_soup = BeautifulSoup(init_page, 'lxml')

    movie_names, urls_to_fetch, movie_dates = [], [], []

    all_movies = init_soup.find('div', id='showing-soon')

    for each_movie in all_movies.find_all('div', class_='item'):
        all_a_tag = each_movie.find_all('a')
        all_li_tag = each_movie.find_all('li')

        movie_names.append(all_a_tag[1].text)
        urls_to_fetch.append(all_a_tag[1]['href'])
        movie_dates.append(all_li_tag[0].text)

    tasks = [fetch_content(url) for url in urls_to_fetch]
    pages = await asyncio.gather(*tasks)

    for movie_name, movie_date, page in zip(movie_names, movie_dates, pages):
        soup_item = BeautifulSoup(page, 'lxml')
        img_tag = soup_item.find('img')

        print('{} {} {}'.format(movie_name, movie_date, img_tag['src']))

await main()

<ClientResponse(https://movie.douban.com/cinema/later/beijing/) [200 OK]>
<CIMultiDictProxy('Date': 'Fri, 31 Dec 2021 17:59:09 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Keep-Alive': 'timeout=30', 'Set-Cookie': 'bid=f-x3-ngLg84; Expires=Sat, 31-Dec-22 17:59:09 GMT; Domain=.douban.com; Path=/', 'X-DOUBAN-NEWBID': 'f-x3-ngLg84', 'Server': 'dae', 'Strict-Transport-Security': 'max-age=15552000', 'X-Content-Type-Options': 'nosniff', 'Content-Encoding': 'gzip')>



AttributeError: 'NoneType' object has no attribute 'find_all'

In [53]:

import asyncio
import aiohttp

from bs4 import BeautifulSoup

async def fetch_content(url, header):
    async with aiohttp.ClientSession(
        headers=header, connector=aiohttp.TCPConnector(ssl=False)
    ) as session:
        async with session.get(url) as response:
            return await response.text()

async def main():
    url = "https://movie.douban.com/cinema/later/beijing/"
    header  = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
        'Referer':'https://time.geekbang.org/column/article/101855',
        'Connection':'keep-alive'
    }
    init_page = await fetch_content(url, header)
    init_soup = BeautifulSoup(init_page, 'lxml')

    movie_names, urls_to_fetch, movie_dates = [], [], []

    all_movies = init_soup.find('div', id="showing-soon")
    for each_movie in all_movies.find_all('div', class_="item"):
        all_a_tag = each_movie.find_all('a')
        all_li_tag = each_movie.find_all('li')

        movie_names.append(all_a_tag[1].text)
        urls_to_fetch.append(all_a_tag[1]['href'])
        movie_dates.append(all_li_tag[0].text)

    tasks = [fetch_content(url) for url in urls_to_fetch]
    pages = await asyncio.gather(*tasks)

    for movie_name, movie_date, page in zip(movie_names, movie_dates, pages):
        soup_item = BeautifulSoup(page, 'lxml')
        img_tag = soup_item.find('img')

        print('{} {} {}'.format(movie_name, movie_date, img_tag['src']))

await main()


AttributeError: 'NoneType' object has no attribute 'find_all'

In [55]:
import time
import aiohttp
import asyncio

from bs4 import BeautifulSoup

now = lambda: time.perf_counter()

async def fetchHtmlText(url):
    async with aiohttp.ClientSession(
        headers = {'users-agent': 'Mozilla/5.0'},
        connector=aiohttp.TCPConnector(ssl=False)
    ) as session:
        async with session.get(url) as response:
            return await response.text()

async def main():
    url = 'https://movie.douban.com/cinema/later/beijing/'
    html = await fetchHtmlText(url)
    soup = BeautifulSoup(html, 'html.parser')

    divs = soup.find_all('div', class_='item mod')
    urls = list(map(lambda x: x.a.img['src', divs]))
    names = list(map(lambda x: x.h3.a.string, divs))
    dats = list(map(lambda x: x.ul.li.string, divs))

    lis = zip(names, dats, urls)
    for i in lis:
        print("{0:{3}^25}\t{1:{3}^10}\t{2:{3}^}".format(i[0], i[1], i[2], chr(12288)))

start = now()


SyntaxError: invalid syntax (3745030874.py, line 22)