# 加速：非同步爬蟲

* 了解非同步爬蟲加速原理與實作

## 作業目標

* 比較一下非同步爬蟲跟多線程爬蟲的差異是什麼？各自的優缺點為何？

In [1]:
import requests
import _thread
import time

urls = ['https://morvanzhou.github.io/']*10

startTime = time.time()

for url in urls:
    _thread.start_new_thread( requests.get, (url, ) )

finishTime = time.time()
print(finishTime - startTime) # 利用 thread 的爬蟲所需時間

0.0009975433349609375


In [2]:
import aiohttp, asyncio
import nest_asyncio
nest_asyncio.apply()

URL = 'https://morvanzhou.github.io/'

async def job(session):
    response = await session.get(URL)                               #等待並切換
    return str(response.url)

async def main(loop):
    async with aiohttp.ClientSession() as session:                  #官網推薦建立Session的形式,也可以直接用request
        tasks = [loop.create_task(job(session)) for _ in range(10)]
        finished, unfinished = await asyncio.wait(tasks)            #收集完成的結果,會返回完成的和沒完成的,等全部都完成了才返回
        all_results = [r.result() for r in finished]                #獲取所有結果
        print(all_results)


t1 = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
# loop.close()
print("Async total time:", time.time() - t1)

['https://morvanzhou.github.io/', 'https://morvanzhou.github.io/', 'https://morvanzhou.github.io/', 'https://morvanzhou.github.io/', 'https://morvanzhou.github.io/', 'https://morvanzhou.github.io/', 'https://morvanzhou.github.io/', 'https://morvanzhou.github.io/', 'https://morvanzhou.github.io/', 'https://morvanzhou.github.io/']
Async total time: 0.16390228271484375
