### 크롤링 속도향상

- re(Regular Expression: 정규표현식), requests 모듈 네트워크 크롤링 기본
- thread, concurrent.futures 모듈 사용 속도 개선

In [1]:
# 모듈 임포트
import requests
import re
import time

# 스레드
import threading
# 비동기처리 고수준 인테페이스 모듈
from concurrent.futures import ThreadPoolExecutor

#### 크롤링 예제 사이트
- https://search.daum.net/search?w=img&nil_search=btn&DA=NTB&enc=utf8&q=%EA%B3%A0%EC%96%91%EC%9D%B4

#### 이미지 저장함수

In [2]:
# 이미지 저장함수
def save_image(idx, img_url):
    res = requests.get(img_url).content # 이미지 바이트값 

    with open(f'./img/{idx}.jpg', mode='wb') as f:
        f.write(res)
        print(f'saved image with {idx}')

In [None]:
# 이미지 저장 테스트
save_image(9999, 'https://th.bing.com/th/id/OIP.Y9h80DtXEhUNVqMCm8cn8wHaEJ?w=333&h=187&c=7&r=0&o=5&dpr=1.3&pid=1.7')

saved image with 9999


#### 다음 고양이 이미지 검색URL 추출 후 저장

- 다음은 Beautifulsoup 같은 모듈로 DOM 추출이 조금 힘듬
- 그래서 정규식으로 추출 진행

In [None]:
reg = re.compile('oimgurl: ".{0,300}", cpid')  # oimgurl: *** , cpid 사이에 들어있는 문자열을 추출

url = 'https://search.daum.net/search?w=img&nil_search=btn&DA=NTB&enc=utf8&q=%EA%B3%A0%EC%96%91%EC%9D%B4' # 검색결과 웹 페이지
html = requests.get(url) # 정적으로 URL 요청
html_raw_data = str(html.content)

html_raw_data

'b\'<!doctype html>\\n<html xmlns="http://www.w3.org/1999/xhtml" lang="ko">\\n<head profile="http://a9.com/-/spec/opensearch/1.1/">\\n<meta http-equiv="content-Type" content="text/html;charset=utf-8" />\\n<meta http-equiv="X-UA-Compatible" content="IE=edge" />\\n<meta name="autocomplete" content="off" />\\n<meta name="referrer" content="always">\\n<meta name="format-detection" content="telephone=no" />\\n<meta property="og:title" content="\\xea\\xb3\\xa0\\xec\\x96\\x91\\xec\\x9d\\xb4 &ndash; Daum \\xea\\xb2\\x80\\xec\\x83\\x89" />\\n<meta property="og:url" content="https://search.daum.net/search?w=img&amp;nil_search=btn&amp;DA=NTB&amp;enc=utf8&amp;q=%EA%B3%A0%EC%96%91%EC%9D%B4" />\\n<meta property="og:description" content="Daum \\xea\\xb2\\x80\\xec\\x83\\x89\\xec\\x97\\x90\\xec\\x84\\x9c \\xea\\xb3\\xa0\\xec\\x96\\x91\\xec\\x9d\\xb4\\xec\\x97\\x90 \\xeb\\x8c\\x80\\xed\\x95\\x9c \\xec\\xb5\\x9c\\xec\\x8b\\xa0\\xec\\xa0\\x95\\xeb\\xb3\\xb4\\xeb\\xa5\\xbc \\xec\\xb0\\xbe\\xec\\x95\\x84\\x

In [12]:
reg_iter = reg.finditer(html_raw_data)

for idx, data in enumerate(reg_iter):
    img_url = data.group().split('oimgurl:')[1].replace(', cpid', '').replace('"', '') # 필요없는 요소 제거하고 URL만 남기는 처리
    print(img_url)

 http://cfile138.uf.daum.net/R682x0/121A8B1C4B3DBAF43D2636
 http://cfile138.uf.daum.net/R682x0/121A8B1C4B3DBAF43D2636
 https://blog.kakaocdn.net/dn/vhHFC/btseHkjDwtY/4aErOpBMk08mKnseT3OSkK/img.webp
 https://blog.kakaocdn.net/dn/vhHFC/btseHkjDwtY/4aErOpBMk08mKnseT3OSkK/img.webp
 https://blog.kakaocdn.net/dn/K8ZqE/btsFlSiaOIJ/AQ2qHUr330RJhKFUtvS7t1/img.png
 https://blog.kakaocdn.net/dn/K8ZqE/btsFlSiaOIJ/AQ2qHUr330RJhKFUtvS7t1/img.png
 https://blog.kakaocdn.net/dn/czc6Am/btsKw0tBw7k/RwXK40jendWas888gXm510/img.webp
 https://blog.kakaocdn.net/dn/czc6Am/btsKw0tBw7k/RwXK40jendWas888gXm510/img.webp
 https://images.mypetlife.co.kr/content/uploads/2019/12/09151754/shutterstock_331765481.jpg
 https://images.mypetlife.co.kr/content/uploads/2019/12/09151754/shutterstock_331765481.jpg
 https://t1.daumcdn.net/cfile/tistory/2739C74852BC617203
 https://t1.daumcdn.net/cfile/tistory/2739C74852BC617203
 https://blog.kakaocdn.net/dn/kH6TF/btsGNqDFaZb/zQZgvXLLhPSnh7OHxzmSVk/img.png
 https://blog.kakaocdn.ne

#### 일반 이미지 저장시간 측정

In [13]:
start_time = time.time()    # 시작시간 체크

reg_iter = reg.finditer(html_raw_data)

for idx, data in enumerate(reg_iter):
    img_url = data.group().split('oimgurl:')[1].replace(', cpid', '').replace('"', '') # 필요없는 요소 제거하고 URL만 남기는 처리
    # print(img_url)
    save_image(idx, img_url)

end_time = time.time()     # 종료시간 체크
print(f'저장 소요시간 => {end_time - start_time} 초.')

saved image with 0
saved image with 1
saved image with 2
saved image with 3
saved image with 4
saved image with 5
saved image with 6
saved image with 7
saved image with 8
saved image with 9
saved image with 10
saved image with 11
saved image with 12
saved image with 13
saved image with 14
saved image with 15
saved image with 16
saved image with 17
saved image with 18
saved image with 19
saved image with 20
saved image with 21
saved image with 22
saved image with 23
saved image with 24
saved image with 25
saved image with 26
saved image with 27
saved image with 28
saved image with 29
saved image with 30
saved image with 31
saved image with 32
saved image with 33
saved image with 34
saved image with 35
saved image with 36
saved image with 37
saved image with 38
saved image with 39
saved image with 40
saved image with 41
saved image with 42
saved image with 43
saved image with 44
saved image with 45
saved image with 46
saved image with 47
saved image with 48
saved image with 49
saved imag

##### 일반 저장방식으로 81.76초 소요

#### 스레드 방식으로 저장

- 기존 소스 그대로 활용

In [14]:
start_time = time.time()    # 시작시간 체크

reg_iter = reg.finditer(html_raw_data)
thred_list = []   # 1. thread 처리를 위한 리스트 

for idx, data in enumerate(reg_iter):
    img_url = data.group().split('oimgurl:')[1].replace(', cpid', '').replace('"', '') # 필요없는 요소 제거하고 URL만 남기는 처리
    # 2. 스레드 처리할 스레드 변수 선언, target에 기존 사용하던 함수입력, args에 기존 함수에 사용하던 매개변수 갯수만큼 튜플로 입력
    worker = threading.Thread(target=save_image, args=(idx, img_url))
    worker.start()   # 3. 스레드 시작
    thred_list.append(worker)  # 4. 리스트 추가

for thread in thred_list:  # 5. 스레드 종료
    thread.join()   

end_time = time.time()     # 종료시간 체크
print(f'저장 소요시간 => {end_time - start_time} 초.')

saved image with 0
saved image with 1
saved image with 16
saved image with 17
saved image with 28
saved image with 29
saved image with 9saved image with 8

saved image with 5
saved image with 25
saved image with 2
saved image with 24
saved image with 11
saved image with 3
saved image with 4
saved image with 20
saved image with 7saved image with 10
saved image with 19

saved image with 14
saved image with 21
saved image with 12
saved image with 50
saved image with 51
saved image with 52
saved image with 53
saved image with 6
saved image with 18
saved image with 15
saved image with 13
saved image with 26
saved image with 32
saved image with 27
saved image with 56
saved image with 33
saved image with 57
saved image with 58
saved image with 59
saved image with 60
saved image with 61
saved image with 34
saved image with 37
saved image with 23
saved image with 22
saved image with 35
saved image with 36
saved image with 41
saved image with 69
saved image with 68
saved image with 40
saved imag

##### 스레드 저장방식으로 13초 소요

#### concurrent.futures 모듈 사용 병렬처리

In [15]:
start_time = time.time()    # 시작시간 체크

reg_iter = reg.finditer(html_raw_data)

# concurrent.futures에 ThreadPoolExecutor에 포함시키면 됨
with ThreadPoolExecutor(max_workers=16) as executor:
    for idx, data in enumerate(reg_iter):
        img_url = data.group().split('oimgurl:')[1].replace(', cpid', '').replace('"', '') # 필요없는 요소 제거하고 URL만 남기는 처리
        # print(img_url)
        executor.submit(save_image, idx, img_url)

end_time = time.time()     # 종료시간 체크
print(f'저장 소요시간 => {end_time - start_time} 초.')

saved image with 0saved image with 1

saved image with 16
saved image with 17
saved image with 14
saved image with 6
saved image with 5
saved image with 19
saved image with 2
saved image with 15
saved image with 10
saved image with 9
saved image with 7
saved image with 11
saved image with 8
saved image with 3
saved image with 18
saved image with 28
saved image with 29
saved image with 4
saved image with 13
saved image with 12
saved image with 20
saved image with 21
saved image with 22
saved image with 23
saved image with 24
saved image with 25
saved image with 36
saved image with 26
saved image with 33
saved image with 30
saved image with 32
saved image with 35
saved image with 27
saved image with 34
saved image with 31
saved image with 50
saved image with 51
saved image with 53
saved image with 52
saved image with 37
saved image with 56
saved image with 57
saved image with 58
saved image with 60
saved image with 59
saved image with 61
saved image with 38
saved image with 39
saved imag

##### concurrent.futures 사용 동시저장 처리시간은 13.98초 소요