In [0]:
#참고1(비동기 크롤링 구조): https://medium.com/@mjhans83/%ED%8C%8C%EC%9D%B4%EC%8D%AC%EC%9C%BC%EB%A1%9C-%ED%81%AC%EB%A1%A4%EB%A7%81-%ED%95%98%EA%B8%B0-2-d643ddafb635
#참고2(asyncio.Semaphore) : https://docs.python.org/ko/3/library/asyncio-sync.html
#참고3: https://hwangheek.github.io/2019/asynchronous-python/
#참고4(Async 메커니즘): https://tech.ssut.me/python-3-play-with-asyncio/

In [2]:
!pip install fake_useragent
!pip install aiohttp



In [0]:
import asyncio
import ast
import aiohttp
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from random import randint
import traceback
from time import time
import re

In [0]:
# Shared module-level state.
# `lock` serializes access to the shared result lists; async_page_crawling
# holds it while calling append_list.
lock = asyncio.Lock()
h_list = []  # append_list appends code strings (cut to at most 10 characters) here
g_list = []  # append_list appends stripped description strings here
uri = ''  # NOTE(review): not referenced anywhere in the visible code - confirm it is still needed

def main():
    """Run the full crawl, then export the collected rows to CSV.

    Drives ``fetch_crawling`` to completion on the asyncio event loop and
    prints the elapsed wall-clock time. When ``h_list`` is non-empty
    afterwards, the collected data is converted to a DataFrame and written to
    ``h_g_20200216.csv``. The duration of that second phase is printed in
    either case.
    """
    crawl_started = time()
    # Obtain the event loop (the scheduler) and block until the crawl
    # coroutine has finished. The loop is left open; close() is not called.
    event_loop = asyncio.get_event_loop()
    crawl_job = fetch_crawling(h_list, g_list)
    event_loop.run_until_complete(crawl_job)
    crawl_elapsed = time() - crawl_started
    print('전체 실행시간: {0:.3f}초'.format(crawl_elapsed))

    save_started = time()
    if h_list:
        frame = createDataFrame()
        writeCSV(frame, file_name="h_g_20200216.csv")
    save_elapsed = time() - save_started
    print('파일 저장시간: {0:.3f}초'.format(save_elapsed))

In [0]:
async def fetch_crawling(h_list, g_list):
    """Crawl every single-digit prefix (0 through 9), one after another.

    Each prefix is awaited to completion before the next one starts, and the
    time spent on it is printed. ``h_list`` and ``g_list`` are accepted but
    not used inside this function.
    """
    for h_prefix in range(10):
        started_at = time()
        # A fresh one-slot semaphore per prefix; async_crawling_start acquires
        # it on entry.
        prefix_gate = asyncio.Semaphore(value=1)
        first_page_request = {'page': 1}

        await async_crawling_start(h_prefix, prefix_gate, first_page_request)

        elapsed = time() - started_at
        print('[h_prefix:',h_prefix,'] 실행 시간: {0:.3f}초'.format(elapsed))

async def async_crawling_start(h_prefix, semaphore, req_param):
    """Crawl all pages for one prefix while holding ``semaphore``.

    Requests the first page, stores it through ``append_list`` and then hands
    the remaining pages to ``async_page_crawling``.

    Args:
        h_prefix: Prefix being crawled; used in log output and forwarded to
            the request helpers.
        semaphore: Object with an awaitable ``acquire()`` and a ``release()``.
            It is held for the whole call and always released, also when an
            exception propagates.
        req_param: Request parameter dict forwarded to the request helpers.

    Raises:
        Any exception other than ``RecursionError`` raised by the helpers
        propagates to the caller. ``RecursionError`` is logged and swallowed.
    """
    await semaphore.acquire()
    try:
        print("[h_prefix:", h_prefix, "] START!")
        res_data = await async_request_post(h_prefix, req_param)

        # TODO: read both values from res_data['paginationInfo'] once the real
        # response is wired in; they are hard-coded placeholders until then.
        last_page = 1
        totalRecordCount = 1
        print("total page number:" + str(last_page) + ", totalRecordCount:", totalRecordCount)

        if last_page > 0:
            await append_list(res_data)
            await async_page_crawling(h_prefix, req_param, last_page)

        await asyncio.sleep(0.05)
        print("[h_prefix:", h_prefix, "] END!")
    except RecursionError:
        # Report the prefix that failed (the message previously formatted an
        # undefined name, which raised NameError inside the handler).
        print("[{}] RECURSION ERROR (MULTI CRAWLING)".format(h_prefix))
        await asyncio.sleep(0.05)
    finally:
        # Release on every exit path so a failing prefix cannot leave the
        # semaphore permanently held.
        semaphore.release()

async def async_request_post(h_prefix, req_param):
    """Placeholder for the POST request: wait, log the parameters, return dummy data.

    No network call is made here and ``h_prefix`` is not used. The return
    value is always the fixed string ``"1 /t test"``.
    """
    simulated_latency = 2.0
    await asyncio.sleep(simulated_latency)  # asyncio.sleep is itself a native coroutine
    print('req_param:', req_param)

    # NOTE(review): the real response presumably is a dict with
    # 'paginationInfo' and 'uls_over' entries - confirm when implementing.
    return "1 /t test"

async def async_page_crawling(h_prefix, req_param, last_page):
    """Fetch pages ``last_page`` down to 2 and store each response.

    Nothing is requested when ``last_page`` is 1 or less. Pages are processed
    in a loop rather than by recursion, so the number of pages is not limited
    by the interpreter's recursion depth.

    Args:
        h_prefix: Prefix being crawled; used in log output and forwarded to
            ``async_request_post``.
        req_param: Request parameter dict. Its ``"page"`` entry is overwritten
            in place before every request.
        last_page: Highest page number to fetch.
    """
    page = last_page
    while page > 1:
        print("[{} - {}] STARTED".format(h_prefix, page))
        req_param["page"] = page

        res_data = await async_request_post(h_prefix, req_param)

        # Keep list writes serialized (ordered access to the shared lists).
        async with lock:
            await append_list(res_data)
        page -= 1

In [0]:
async def append_list(res):
    """Append every row of a response to the module-level ``h_list`` / ``g_list``.

    Response parsing is currently stubbed out: ``res`` is not read, the item
    list is always empty and the reported last page is always 0.

    Returns:
        tuple: ``(last_page, avg_word_cnt)``. The average number of
        space-separated words per description is 0 when there are no items.
    """
    # TODO: take these from res['paginationInfo']['lastPageNo'] and
    # res['uls_over']['itemList'] once real responses are available.
    last_page = 0
    item_list = []

    digit_runs = re.compile('[0-9]*')
    word_total = 0
    for item in item_list:
        # Keep only the digits of the code field, cut to 10 characters.
        raw_code = str(item['DTRM_HS_SGN_CN']).strip()
        h_list.append(''.join(digit_runs.findall(raw_code))[0:10])

        description = str(item['ENGL_CMDT_DESC']).strip()
        g_list.append(description)
        word_total += len(description.split(" "))

    avg_word_cnt = word_total / len(item_list) if item_list else 0
    print('len(h_list):', len(h_list))
    return last_page, avg_word_cnt

def createDataFrame(): #h_list, g_list
    """Combine the module-level ``h_list`` and ``g_list`` into one DataFrame.

    Prints the first rows and the row count of the result.

    Returns:
        pandas.DataFrame: column ``HS`` built from ``h_list`` and column
        ``GD`` built from ``g_list``, placed side by side by row position.
        Empty lists yield an empty frame that still has both columns.
    """
    import pandas as pd

    # Naming the column in the constructor (instead of assigning ``.columns``
    # afterwards) also works for an empty list, where the frame would
    # otherwise have zero columns and the assignment would raise ValueError.
    h_df = pd.DataFrame(h_list, columns=['HS'])
    g_df = pd.DataFrame(g_list, columns=['GD'])
    h_g_df = pd.concat([h_df, g_df], axis=1)

    print(h_g_df.head())
    print(len(h_g_df))
    return h_g_df

def writeCSV(dataFrame, file_name, mode='w'):
    """Write ``dataFrame`` to ``file_name`` as CSV without the index column.

    ``mode`` is forwarded to ``DataFrame.to_csv`` (default ``'w'``). The
    number of rows is printed after writing.
    """
    csv_options = {'mode': mode, 'index': False}
    dataFrame.to_csv(file_name, **csv_options)
    row_count = len(dataFrame)
    print('export csv success! row:', row_count)

In [30]:
# Entry point: run the crawl and the CSV export defined in main().
main()

[h_prefix: 0 ] START!
req_param: {'page': 1}
total page number:1, totalRecordCount: 1
len(h_list): 0
[h_prefix: 0 ] END!
[h_prefix: 0 ] 실행 시간: 2.056초
[h_prefix: 1 ] START!
req_param: {'page': 1}
total page number:1, totalRecordCount: 1
len(h_list): 0
[h_prefix: 1 ] END!
[h_prefix: 1 ] 실행 시간: 2.053초
[h_prefix: 2 ] START!
req_param: {'page': 1}
total page number:1, totalRecordCount: 1
len(h_list): 0
[h_prefix: 2 ] END!
[h_prefix: 2 ] 실행 시간: 2.053초
[h_prefix: 3 ] START!
req_param: {'page': 1}
total page number:1, totalRecordCount: 1
len(h_list): 0
[h_prefix: 3 ] END!
[h_prefix: 3 ] 실행 시간: 2.054초
[h_prefix: 4 ] START!
req_param: {'page': 1}
total page number:1, totalRecordCount: 1
len(h_list): 0
[h_prefix: 4 ] END!
[h_prefix: 4 ] 실행 시간: 2.053초
[h_prefix: 5 ] START!
req_param: {'page': 1}
total page number:1, totalRecordCount: 1
len(h_list): 0
[h_prefix: 5 ] END!
[h_prefix: 5 ] 실행 시간: 2.053초
[h_prefix: 6 ] START!
req_param: {'page': 1}
total page number:1, totalRecordCount: 1
len(h_list): 0