## 異步爬蟲
* 使用 `aiohttp` & `asyncio`
* 安裝 
    * `asyncio` 為 Python 3.4+ 內建的標準函式庫, 不需要 (也不建議) `pip install asyncio`
    * `pip install aiohttp`
* jupyter notebook 報錯 Runtime Error 解法:
    * `pip install nest_asyncio`
        * import之後 `nest_asyncio.apply()`
    * 確認套件 `tornado` 版本, >= 5 會有衝突
        * `pip install tornado==4.5.3`
        

In [None]:
import pandas as pd
import json
import time
import requests
import aiohttp
import asyncio
import nest_asyncio
nest_asyncio.apply()
import warnings
warnings.filterwarnings('ignore')

### data
* 使用104的資料, 抽28個網址當作測試, 目標是提取每家公司的職缺數




In [None]:
# Every test URL is the same 104.com.tw job-list endpoint followed by a
# company identifier, so the list is assembled from those two parts.
_JOBLIST_ENDPOINT = 'https://www.104.com.tw/company/ajax/joblist/'
_COMPANY_IDS = (
    'cu5l2yg', '18tfytrc', 'cit8qrk', '1a2x6bkort',
    '1a2x6bjzjj', 'aldnfbc', '20816qw', '1a2x6bl5f2',
    '1a2x6bkz5g', '1a2x6bkvj6', 'fuw5umg', 'xtxy0wo',
    'xjf9b4o', 'yvhv2k0', '2a6xnbk', '1a2x6bj4l8',
    '4bn0uio', '18kefvtc', 'aetgx1c', '1a2x6bkgbz',
    '1a2x6bjimk', '1a2x6bkq8q', '9k1zdso', '5fpgamw',
    '13hn9jxk', 'clw1usg', '1a2x6bknb6', '1a2x6bivq9',
)

# The 28 sample URLs, in the same order as the identifiers above.
test_url = [_JOBLIST_ENDPOINT + company_id for company_id in _COMPANY_IDS]

### 使用 requests

In [None]:
# Baseline: fetch every URL one after another with blocking `requests` calls
# and report how long the whole sequential run takes.
t1 = time.perf_counter()  # interval clock; unaffected by system clock adjustments
res_list = []
for url in test_url:
    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept to preserve the original behaviour; confirm it is really needed.
    # The (connect, read) timeout stops one stalled host from hanging the run.
    response = requests.get(url, verify=False, timeout=(5, 30))
    response.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
    cnt = response.json()['data']['totalCount']
    res_list.append({'cnt': cnt, 'url': url})
print("requests total time:", time.perf_counter() - t1)
"""requests total time: 1.955094575881958"""

### 使用asyncio & aiohttp

In [None]:
async def fetch_coroutine(client, url):
    """Fetch one job-list URL and extract its total job count.

    Args:
        client: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``raise_for_status()`` and an awaitable
            ``text()`` (an ``aiohttp.ClientSession`` in this notebook).
        url: Address of the JSON endpoint to request.

    Returns:
        A dict ``{'cnt': <data.totalCount from the JSON body>, 'url': url}``.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
        KeyError: If the JSON lacks ``data`` or ``data.totalCount``.
        Exception: Whatever ``raise_for_status()`` raises for an HTTP error.
    """
    # `async with` guarantees the response (and its pooled connection) is
    # released even if the status check or body read raises.
    async with client.get(url) as response:  # awaits here, letting other tasks run
        response.raise_for_status()
        body = await response.text()

    payload = json.loads(body)
    return {
        'cnt': payload['data']['totalCount'],
        'url': url,
    }

async def main(loop=None):
    """Fetch every URL in ``test_url`` concurrently and return the results.

    Args:
        loop: Unused. Accepted only so existing ``main(loop)`` calls keep
            working; the session binds to the currently running event loop.

    Returns:
        A list with one ``fetch_coroutine`` result dict per URL, in the same
        order as ``test_url``.

    Raises:
        Exception: The first exception raised by any ``fetch_coroutine`` call.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
    # `conn_timeout=` and `loop=` are deprecated ClientSession arguments;
    # ClientTimeout is the supported way to set the 5 s connect limit.
    # total=300 keeps aiohttp's default overall limit explicit.
    timeout = aiohttp.ClientTimeout(total=300, connect=5)
    async with aiohttp.ClientSession(headers=headers, timeout=timeout) as client:
        # One coroutine per URL; gather schedules them all concurrently.
        tasks = [fetch_coroutine(client, url) for url in test_url]
        # asyncio.wait() rejects bare coroutines on Python 3.11+ and returns an
        # unordered set; gather accepts coroutines, keeps input order, and
        # copes with an empty list.
        all_results = await asyncio.gather(*tasks)
        return list(all_results)

In [None]:
# Run the concurrent crawl on the notebook's event loop and report the elapsed
# time, for comparison with the sequential `requests` run.
t1 = time.perf_counter()  # interval clock; unaffected by system clock adjustments
# Inside Jupyter the loop is already running; nest_asyncio.apply() (called with
# the imports) is what allows run_until_complete() to be re-entered here.
loop = asyncio.get_event_loop()
all_results = loop.run_until_complete(main(loop))
print("Async total time:", time.perf_counter() - t1)
"""Async total time: 0.18433618545532227"""