# 加速：非同步爬蟲

* 了解非同步爬蟲加速原理與實作

### Python 中的非同步爬蟲

In [3]:
import aiohttp
import asyncio
import nest_asyncio
nest_asyncio.apply()

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def main():
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, 'http://python.org')
        print(html)

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()

Python Packaging</a></li>
                                
                                <li>
<time datetime="2020-03-31T15:00:00.000001+00:00"><span class="say-no-more">2020-</span>03-31</time>
 <a href="http://feedproxy.google.com/~r/PythonSoftwareFoundationNews/~3/ft6nIyIxM-w/psfs-projected-2020-financial-outcome.html">PSF&#39;s Projected 2020 Financial Outcome</a></li>
                                
                            </ul>
                        </div><!-- end .shrubbery -->

                    </div>

                    <div class="medium-widget event-widget last">
                        
                        <div class="shrubbery">
                        
                            <h2 class="widget-title"><span aria-hidden="true" class="icon-calendar"></span>Upcoming Events</h2>
                            <p class="give-me-more"><a href="/events/calendars/" title="More Events">More</a></p>
                            
                            <ul class

RuntimeError: Cannot close a running event loop

### 比較一下

In [4]:
import requests, time

URL = 'https://morvanzhou.github.io/'

def normal():
    for i in range(2):
        r = requests.get(URL)
        url = r.url
        print(url)

t1 = time.time()
normal()
print("Normal total time:", time.time()-t1)


https://morvanzhou.github.io/
https://morvanzhou.github.io/
Normal total time: 1.2544188499450684


In [5]:
import aiohttp, asyncio
import nest_asyncio
nest_asyncio.apply()


async def job(session):
    response = await session.get(URL)                               #等待並切換
    return str(response.url)

async def main(loop):
    async with aiohttp.ClientSession() as session:                  #官網推薦建立Session的形式,也可以直接用request
        tasks = [loop.create_task(job(session)) for _ in range(2)]
        finished, unfinished = await asyncio.wait(tasks)            #收集完成的結果,會返回完成的和沒完成的,等全部都完成了才返回
        all_results = [r.result() for r in finished]                #獲取所有結果
        print(all_results)

t1 = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
# loop.close()
print("Async total time:", time.time() - t1)

['https://morvanzhou.github.io/', 'https://morvanzhou.github.io/']
Async total time: 0.5904998779296875


## 作業目標

* 比較一下非同步爬蟲跟多線程爬蟲的差異是什麼？各自的優缺點為何？

In [None]:
# conda install -c conda-forge aiohttp
#https://anaconda.org/conda-forge/aiohttp
https://docs.python.org/zh-cn/3/library/asyncio.html

# conda install -c bjrn nest_asyncio
#https://www.cnpython.com/pypi/nest-asyncio
#https://anaconda.org/bjrn/nest_asyncio


# https://morvanzhou.github.io/tutorials/data-manipulation/scraping/4-02-asyncio/