# Crawler - Crawling

**link를 찾는 것**이 가장 중요하다

search space 내에서 정보를 탐색하는 방법(인공지능)
- 어디로 움직일 지를 선택하는 방법(DFS/BFS)

```python
while 더이상 방문할 곳이 없을때까지 # 봇이 안 죽게끔
  URL Pool => 앞으로 봇이 방문해야 할 페이지들의 목록
  # 방문해도 될까? robotparser
  seed = URL.pop() -> 이번에 봇이 방문할 링크
  (thread -> Bot.seed())
  request(seed) -> response -> 1) 다른 링크, 2) 페이지 내용 분석(Scraping)
  link 추출(link가 어디에 있는지, link 어떻게 생겼는지) # 많은 링크를 찾아야 함
  정규화(URL형태로)
  방문한 적이 있는지? #전략;
  # 다음번 방문 전, 적당한 delay
  없으면 URL Pool 추가
```

In [None]:
# 크롤러:  링크 수집기 + 색인을 위한 데이터 수집(스크래핑)
#        ---
#        * A
# * iframe(블로그)[src]
#       form
#   img, video, audio
# script, stile... (src)
# (불특정다수)
# %검색어% : 검색어가 들어가있는 모든 게시물, 순위화 및 관련성 측정 불가
# PageRank: Link 중 누가 중요한지 순위를 매김 => 모든 페이지가 동일하지 않다

In [None]:
# A -> B -> C -> A
# A(B에 있는 내용이 좋거나 중요하다는 것을 암시)

In [None]:
# O -> ? (DFS, BFS)
#                  X  (검색결과)
#       X    X    X     X     X
#  X  O  O  O ...     어디가 끝인지 알 수 없음

In [36]:
from requests import request, get
from requests.compat import urljoin, urlparse, urlunparse
from bs4 import BeautifulSoup
import re

In [37]:
# robotparser
# 도메인을 받아서,
# robots.txt 확인
# agent:
# [Dis]Allow: _______ 규칙

# 새로운 netloc(domain) 만나면, /robots.txt
# 동일한                만나면, /robots.txt
# robots = {'domain':['path', 'path', ...]}
robots = dict()

# Remote Disconnected 방지 or 상대 사이트에서 bot 감지 위해서 내 User-agent
headers = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Whale/3.26.244.21 Safari/537.36'}

#            netloc, path(disallow에 있는지 검사할 대상)
def canFetch(host, path):
    # 반드시 https로 시작해야지만 정상적인 URL, 이를 위한 검사
    if not re.search('^https?', host):
        return 'Scheme 오류'

    url = host+'/' if host[-1] != '/' else host
    # URL 가장 마지막에 /robots.txt를 붙이기 위해 필요
    # www.naver.comrobots.txt => www.naver.com/robots.txt
    url += 'robots.txt'

    # urlparse(https://www.naver.com).netloc => k=www.naver.com
    k = urlparse(url).netloc
    # k not in robots{'www.naver.com':['path', 'path', ...]}
    # k(www.naver.com)가 dict에 없을 경우,
    if k not in robots.keys():
        resp = get(url, headers=headers)

        # netloc/robots.txt에 접근X (없어서, 서버가 응답X)
        if resp.status_code != 200:
            print(resp.status_code)
            return True

        # if k not in robots.keys():
        robots[k] = re.findall('^disallow:\s*(.+)$', resp.text, re.IGNORECASE|re.MULTILINE)

    # dict[k] = 'path', 'path', ...]
    # path in 'path', 'path', ...]
    if path in robots[k]:
        return False
    # 명시적으로 거부하지 않았기에, opt-out(blacklist)에 의해서 True
    else:
        # path.split('/') # 나중에 완성
        return True

In [None]:
from time import sleep

URLs = list()   # 앞으로 방문해야 할 URL 목록
Seens = list()  # 기존에 방문한 URL 목록

# 시작점이 되는 seed 주소 한 개 추가
URLs.append('https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=%EC%B9%B4%EB%A6%AC%EB%82%98')

# 앞으로 방문할 목록이 빌 때까지(더 이상 방문할 URL이 없을 때까지)
while URLs:
    # BFS(관련성)를 위한 Queue:0, DFS(세부적)를 구현한다면 Stack:-1
    seed = URLs.pop(0) # pop()하면 순차적으로 꺼내기
    Seens.append(seed) # seed는 실제 방문을 하던, 오류나 중복 등으로 방문하지 않던, 방문한 목록에 추가

    components = urlparse(seed) # robotsParser의 parameter를 위해

    # (scheme, netloc, path, params, qs, fragment)
    # canfetch('scheme://netloc', 'path')
    if canFetch('://'.join(components[:2]), components.path) == False:
        print('가져가면 안돼요')

    # Request - Response
    resp = get(seed, headers=headers)

    if resp.status_code != 200:
        # 400 or 500 error
        # 500번대는 ServerError, 나중에 다시 시도
        if resp.status_code == 500:
            URLs.append(seed)
        else:
        # 400번대는 ClientError, 방문하지 않도록
            print('Response 없음')
            continue # 밑에 실행 안하고, 다음 while => URLs.pop() -> 다음 주소로 이동

    # text/*, application/*, image/*, ...
    # Hyperlink in TEXT/HTML
    # HTML.Response.Headers에 TEXT/HTML인지 확인
    if not re.search(r'text/html', resp.headers['content-type']):
        continue

    # HTML -> DOM
    dom = BeautifulSoup(resp.text, 'html.parser')
    # A(href 속성이 있는), IFRAME(src 속성이 있는)
    for link in dom.select('a[href], iframe[src]'):
        # href = 만약 href 속성이 있으면 href, 아니면 src
        href = link.attrs['href'] if link.has_attr('href') else link.attrs['src']
        if not re.match(r'#|javascript', href):
            # http://다른주소
            # #top
            # javascript:func()
            # mailto:
            # tel:
            nurl = urljoin(seed, href)

            # URL 정규화, /다른페이지 => http://netloc/다른페이지
            if nurl not in Seens and \
               nurl not in URLs: # 한번이라도 방문한적이 없으면(Seens와 URLs 모두)
                # 앞으로 방문할 URL
                URLs.append(nurl)

    # 너무 많은 Traffic 발생시키지 않기 위해
    # sleep(1) # <= 난수

418
Response 없음
가져가면 안돼요
418
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
가져가면 안돼요
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음
Response 없음


KeyboardInterrupt: 

In [None]:
len(URLs), len(Seens)

(1145, 117)

In [None]:
Seens

['https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
 'https://www.naver.com',
 'https://search.naver.com/search.naver?ssc=tab.image.all&where=image&sm=tab_jum&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
 'https://search.naver.com/search.naver?ssc=tab.news.all&where=news&sm=tab_jum&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
 'https://search.naver.com/search.naver?ssc=tab.blog.all&sm=tab_jum&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
 'https://search.naver.com/search.naver?ssc=tab.cafe.all&sm=tab_jum&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
 'https://search.naver.com/search.naver?ssc=tab.kin.kqna&where=kin&sm=tab_jum&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
 'https://search.naver.com/search.naver?ssc=tab.influencer.chl&where=influencer&sm=tab_jum&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
 'https://search.naver.com/search.naver?ssc=tab.video.all&where=video&sm=tab_jum&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
 'https://search.shopping.naver.com/search/all?where=all&frm

In [None]:
seed, resp.headers, resp.status_code, resp.reason

In [None]:
get(seed, headers={'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Whale/3.26.244.21 Safari/537.36'})

In [None]:
# BFS => [1], [2, 3, 4], [5, 6, 7] => Queue(FIFO)
# DFS => [1], [4] [7, 6, 5], [3] [2] => Stack(LIFO)
#               1
#       2       3       4
#                     5 6 7

In [None]:
[1,2,3,4].pop(0), [1,2,3,4].pop(-1)

In [None]:
    # <a href="">
    # <button>
    # <form>
    # <iframe src="">
    # <audio>
    # <video>

## Focused Crawling

In [None]:
# 웹은 굉장히 크기 때문에 심각한 문제가 없을 때까지 크롤러는 계속 돈다
# => 안 끝남 => 전략이 필요(BFS, DFS, Focused Crawling)
# 무엇을 집중적으로 수집할 것인지 한정지어놓고, 그 안에서만 크롤링하도록

In [None]:
# 1. Depth 제한, 너무 멀리 벗어나지 않도록
# 1     2       3
# A -> 500 -> 10000
#      501 ->

In [None]:
# 2. Domain 제한
# A -> 500.filter(naver.com)
#      100 -> filter(naver.com)

In [None]:
# 3. Depth, Domain 모두 제한
# 4. 영역(HTML Tag) 제한

### 1. 깊이 제한

In [None]:
URLs = list()
Seens = list()

URLs.append({
    'url':'https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
    'depth':1
})

# Focused Crawling - 깊이 제한
limit = 3

while URLs:
    seed = URLs.pop(-1) # Stack, DFS
    Seens.append(seed)

    # 특정 깊이까지만 탐색
    if seed['depth'] > limit:
        continue

    components = urlparse(seed['url'])

    if canFetch('://'.join(components[:2]), components.path) == False:
        print('가져가면 안돼요')

    resp = get(seed['url'], headers=headers)

    if resp.status_code != 200:

        if resp.status_code == 500:
            URLs.append(seed['url'])
        else:
            print('Response 없음')
            continue

    if not re.search(r'text/html', resp.headers['content-type']):
        continue

    dom = BeautifulSoup(resp.text, 'html.parser')
    for link in dom.select('a[href], iframe[src]'):
        href = link.attrs['href'] if link.has_attr('href') else link.attrs['src']
        if not re.match(r'#|javascript', href):
            nurl = urljoin(seed['url'], href)

            if nurl not in [s['url'] for s in Seens] and \
               nurl not in [s['url'] for s in URLs]:
                URLs.append({'url':nurl, 'depth':seed['depth']+1})

    # 너무 많은 Traffic 발생시키지 않기 위해
    # sleep(1) # <= 난수

404
가져가면 안돼요
404
404


KeyboardInterrupt: 

In [None]:
len(URLs), len(Seens)

(237, 566)

In [None]:
Seens

[{'url': 'https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
  'depth': 1},
 {'url': 'https://www.naver.com/more.html', 'depth': 2},
 {'url': 'https://help.naver.com/', 'depth': 3},
 {'url': 'https://www.navercorp.com', 'depth': 4},
 {'url': 'https://help.naver.com/service/30016/contents/18033?lang=ko',
  'depth': 4},
 {'url': 'https://policy.naver.com/rules/youthpolicy.html', 'depth': 4},
 {'url': 'https://help.naver.com/index.help?lang=ko', 'depth': 4},
 {'url': 'https://policy.naver.com/policy/service_group.html', 'depth': 3},
 {'url': 'http://help.naver.com/', 'depth': 4},
 {'url': 'https://policy.naver.com/policy/emreject.html', 'depth': 4},
 {'url': 'https://www.navercorp.com/nhn/company/proposalGuide.nhn',
  'depth': 4},
 {'url': 'http://recruit.navercorp.com/', 'depth': 4},
 {'url': 'http://www.navercorp.com/', 'depth': 4},
 {'url': 'https://green.naver.com/', 'depth': 4},
 {'url': 'https://help.naver.com/support/re

### 2. 도메인 제한

In [None]:
URLs = list()
Seens = list()

URLs.append({
    'url':'https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
    'depth':1
})

limit = 3
# Focused Crawling - 도메인 제한
allow = [re.compile(re.escape('www.naver.com')),
         re.compile(r'naver.com$')]

while URLs:
    seed = URLs.pop(0)
    Seens.append(seed)

    # Focused Crawling
    # 깊이 제한(이것을 포함한다면 3번도 구현 완료)
    if seed['depth'] > limit:
      continue

    # 도메인 제한
    if sum([m for m in
            map(lambda r: True if r.search(urlparse(seed['url']).netloc) else False,
                allow)]) == 0:
      # generator <= map(함수, iterator)
      # map(함수(iterator 객체):re.search() => Obj or None)
      # seed['URL'] = 'blog.naver.com'
      # => [re.search(www.naver.com, blog.naver.com) = False,
      #     re.search(naver.com$, blog.naver.com) = True]
      # sum([False, True]) = 1
      # if == 0이면 -> Whitelist(해당 도메인만 방문)
      #     > 0이면 -> Blacklist(해당 도메인 제외하고 나머지 방문)
      continue

    components = urlparse(seed['url'])

    if canFetch('://'.join(components[:2]), components.path) == False:
        print('가져가면 안돼요')

    resp = get(seed['url'], headers=headers)

    if resp.status_code != 200:
        if resp.status_code == 500:
            URLs.append(seed['url'])
        else:
            print('Response 없음')
            continue

    if not re.search(r'text/html', resp.headers['content-type']):
        continue

    dom = BeautifulSoup(resp.text, 'html.parser')
    for link in dom.select('a[href], iframe[src]'):
        href = link.attrs['href'] if link.has_attr('href') else link.attrs['src']
        if not re.match(r'#|javascript', href):
            nurl = urljoin(seed['url'], href)

            if nurl not in [s['url'] for s in Seens] and \
               nurl not in [s['url'] for s in URLs]:
                URLs.append({'url':nurl, 'depth':seed['depth']+1})


    # 너무 많은 Traffic 발생시키지 않기 위해
    # sleep(1) # <= 난수

418
Response 없음
가져가면 안돼요
418
Response 없음


KeyboardInterrupt: 

In [None]:
len(URLs), len(Seens)

(694, 15)

In [None]:
Seens

[{'url': 'https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
  'depth': 1},
 {'url': 'https://www.naver.com', 'depth': 2},
 {'url': 'https://search.naver.com/search.naver?ssc=tab.image.all&where=image&sm=tab_jum&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
  'depth': 2},
 {'url': 'https://search.naver.com/search.naver?ssc=tab.news.all&where=news&sm=tab_jum&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
  'depth': 2},
 {'url': 'https://search.naver.com/search.naver?ssc=tab.blog.all&sm=tab_jum&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
  'depth': 2},
 {'url': 'https://search.naver.com/search.naver?ssc=tab.cafe.all&sm=tab_jum&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
  'depth': 2},
 {'url': 'https://search.naver.com/search.naver?ssc=tab.kin.kqna&where=kin&sm=tab_jum&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
  'depth': 2},
 {'url': 'https://search.naver.com/search.naver?ssc=tab.influencer.chl&where=influencer&sm=tab_jum&query=%EC%B9%B4%EB%A6%AC%EB%82%98',
  'depth':

### 3. HTML Tag 제한

In [38]:
URLs = list()
Seens = list()

URLs.append({
    'url':'https://news.naver.com/',
    'depth':1
})

limit = 3
allow = [re.compile(re.escape('news.naver.com')),
         re.compile(r'naver.com$')]

while URLs:
    seed = URLs.pop(0)
    Seens.append(seed)

    if seed['depth'] > limit:
      continue

    if sum([m for m in
            map(lambda r: True if r.search(urlparse(seed['url']).netloc) else False,
                allow)]) == 0:
      continue

    components = urlparse(seed['url'])

    if canFetch('://'.join(components[:2]), components.path) == False:
        print('가져가면 안돼요')

    resp = get(seed['url'], headers=headers)

    if resp.status_code != 200:
        if resp.status_code == 500:
            URLs.append(seed['url'])
        else:
            print('Response 없음')
            continue

    print(seed['url'])
    if not re.search(r'text/html', resp.headers['content-type']):
        continue

    # HTML Tag 제한 => Whitelist: 어떤 것들이 반드시 있어야 함
    dom = BeautifulSoup(resp.text, 'html.parser')
    # tagname:ul, class:Nlnb_menu_list의 자손 중 li
    # li의 자식 중 tagname:a, class:Nitem_link, 속성 href가 있는 링크
    for link in dom.select('ul.Nlnb_menu_list li > a.Nitem_link[href]'):
        href = link.attrs['href'] if link.has_attr('href') else link.attrs['src']
        if not re.match(r'#|javascript', href):
            nurl = urljoin(seed['url'], href)
            if nurl not in [s['url'] for s in Seens] and \
               nurl not in [s['url'] for s in URLs]:
                URLs.append({'url':nurl, 'depth':seed['depth']+1})

    # 해당 section에서의 뉴스 링크
    for link in dom.select('div.sa_text > a[href]'):
        href = link.attrs['href']
        if not re.match(r'#|javascript', href):
            nurl = urljoin(seed['url'], href)
            if nurl not in [s['url'] for s in Seens] and \
               nurl not in [s['url'] for s in URLs]:
                URLs.append({'url':nurl, 'depth':seed['depth']+1})

가져가면 안돼요
https://news.naver.com/
가져가면 안돼요
https://news.naver.com/?viewType=pc
https://news.naver.com/section/100
https://news.naver.com/section/101
https://news.naver.com/section/102
https://news.naver.com/section/103
https://news.naver.com/section/105
https://news.naver.com/section/104
https://news.naver.com/main/ranking/popularDay.naver
https://news.naver.com/newspaper/home?viewType=pc
https://news.naver.com/opinion/home
https://news.naver.com/main/tv/index.naver?mid=tvh
https://news.naver.com/factcheck/main
https://media.naver.com/algorithm
https://news.naver.com/ombudsman/errorArticleList
https://news.naver.com/main/ranking/popularDay.naver?mid=etc&sid1=111
https://news.naver.com/newspaper/home
https://news.naver.com/main/opinion/home.naver
https://news.naver.com/main/factcheck/main.naver
https://n.news.naver.com/mnews/article/666/0000067149
https://n.news.naver.com/mnews/article/014/0005321886
https://n.news.naver.com/mnews/article/003/0013122263
https://n.news.naver.com/mnews/art

KeyboardInterrupt: 

In [39]:
len(URLs), len(Seens)

(243, 42)

In [40]:
Seens

[{'url': 'https://news.naver.com/', 'depth': 1},
 {'url': 'https://news.naver.com/?viewType=pc', 'depth': 2},
 {'url': 'https://news.naver.com/section/100', 'depth': 2},
 {'url': 'https://news.naver.com/section/101', 'depth': 2},
 {'url': 'https://news.naver.com/section/102', 'depth': 2},
 {'url': 'https://news.naver.com/section/103', 'depth': 2},
 {'url': 'https://news.naver.com/section/105', 'depth': 2},
 {'url': 'https://news.naver.com/section/104', 'depth': 2},
 {'url': 'https://news.naver.com/main/ranking/popularDay.naver', 'depth': 2},
 {'url': 'https://news.naver.com/newspaper/home?viewType=pc', 'depth': 2},
 {'url': 'https://news.naver.com/opinion/home', 'depth': 2},
 {'url': 'https://news.naver.com/main/tv/index.naver?mid=tvh', 'depth': 2},
 {'url': 'https://news.naver.com/factcheck/main', 'depth': 2},
 {'url': 'https://media.naver.com/algorithm', 'depth': 2},
 {'url': 'https://news.naver.com/ombudsman/errorArticleList', 'depth': 2},
 {'url': 'https://news.naver.com/main/ranki

### 링크-본문에서 사진까지 Scraping

In [41]:
URLs = list()
Seens = list()

URLs.append({
    'url':'https://news.naver.com/',
    'depth':1
})
# site => news.naver.com

limit = 4
allow = [re.compile(re.escape('news.naver.com')),
         re.compile(r'news.naver.com$'),
         re.compile(r'pstatic.net$')]
# news.naver.com or $

while URLs:
    seed = URLs.pop(-1)
    Seens.append(seed)

    if seed['depth'] > limit:
        continue

    if sum([m for m in
            map(lambda r:True if r.search(urlparse(seed['url']).netloc) else False,
                allow)]) == 0:
        continue

    components = urlparse(seed['url'])

    if canFetch('://'.join(components[:2]), components.path) == False:
        print('가져가면 안되요')

    resp = get(seed['url'], headers=headers)

    if resp.status_code != 200:
        if resp.status_code == 500:
            URLs.append(seed['url'])
        else:
            print('Response 없음')
            continue

    print(seed['url'])

    # 링크+본문 Scraping 영역
    if re.search(r'image/(?:jpeg|jpg|gif|png|bmp)', resp.headers['content-type']):
    # image
        name = re.sub(r'[?:;/$]', '', seed['url'])
        ext = re.search(r'image/(jpeg|jpg|gif|png|bmp)', resp.headers['content-type']).group(1)
        with open(name+'.'+ext, 'wb') as fp:
            fp.write(resp.content)

    elif re.search(r'text/html', resp.headers['content-type']):
    # text
        dom = BeautifulSoup(resp.text, 'html.parser')
        # HTML Tag 제한 =>

        # tagname:ul, class:Nlnb_menu_list 의 자손 중 li
        # li의 자식 중 tagname:a, class:Nitem_link, 속성 href가 있는 링크
        # section100, 101, ...
        for link in dom.select('ul.Nlnb_menu_list li > a.Nitem_link[href]'):
            href = link.attrs['href']
            if not re.match(r'#|javascript', href):
                nurl = urljoin(seed['url'], href)
                if nurl not in [s['url'] for s in Seens] and\
                   nurl not in [s['url'] for s in URLs]:
                    URLs.append({'url':nurl, 'depth':seed['depth']+1})

            # 해당 section에서의 뉴스링크
        for link in dom.select('div.sa_text > a[href]'):
            href = link.attrs['href']
            if not re.match(r'#|javascript', href):
                nurl = urljoin(seed['url'], href)
                if nurl not in [s['url'] for s in Seens] and\
                   nurl not in [s['url'] for s in URLs]:
                    URLs.append({'url':nurl, 'depth':seed['depth']+1})

        # Scraping
        if dom.select_one('#title_area, #dic_area'): # if None이면, 없다는뜻(뉴스 본문이 아님)
            # 본문 내 이미지 링크
            for link in dom.select('#dic_area img[data-src]'):
                href = link.attrs['data-src']
                if not re.match(r'#|javascript', href):
                    nurl = urljoin(seed['url'], href)
                    if nurl not in [s['url'] for s in Seens] and\
                       nurl not in [s['url'] for s in URLs]:
                        URLs.append({'url':nurl, 'depth':seed['depth']+1})

            title = dom.select_one('#title_area')
            content = dom.select_one('#dic_area')
            g = re.search(r'(\d{3})/(\d{5,})$', seed['url'])
            with open(g.group(1)+'-'+g.group(2)+'.txt', 'w', encoding='utf8') as fp:
                fp.write(title.text+'\n\n')
                fp.write(content.text)

가져가면 안되요
https://news.naver.com/
https://news.naver.com/ombudsman/errorArticleList
https://news.naver.com/factcheck/main
https://news.naver.com/main/tv/index.naver?mid=tvh
https://news.naver.com/opinion/home
https://news.naver.com/newspaper/home?viewType=pc
https://news.naver.com/main/ranking/popularDay.naver
https://news.naver.com/section/104
https://n.news.naver.com/mnews/article/001/0015268854
404
https://imgnews.pstatic.net/image/001/2025/03/17/PEP20250314044501009_P4_20250317102819863.jpg?type=w860
404
https://imgnews.pstatic.net/image/001/2025/03/17/PAF20250313067101009_P4_20250317102819855.jpg?type=w860
404
https://imgnews.pstatic.net/image/001/2025/03/17/PAP20250314043001009_P4_20250317102819850.jpg?type=w860
404
https://imgnews.pstatic.net/image/001/2025/03/17/PUP20250314004801009_P4_20250317102819845.jpg?type=w860
https://n.news.naver.com/mnews/article/448/0000514180
404
https://imgnews.pstatic.net/image/448/2025/03/17/2025031790082_0_20250317104510311.jpg?type=w860
https://n

KeyboardInterrupt: 

In [42]:
dom.select('#dic_area img')

[<img alt="우크라이나 볼로디미르 젤렌스키 대통령. AP 연합뉴스" class="_LAZY_LOADING _LAZY_LOADING_INIT_HIDE" data-src="https://imgnews.pstatic.net/image/081/2025/03/17/0003525647_001_20250317103219430.jpg?type=w860" id="img1" style="display: none;">
 </img>,
 <img alt="안드리 흐나토프 신임 우크라이나 참모총장. 우크라이나 대통령실 제공" class="_LAZY_LOADING _LAZY_LOADING_INIT_HIDE" data-src="https://imgnews.pstatic.net/image/081/2025/03/17/0003525647_002_20250317103219479.jpg?type=w860" id="img2" style="display: none;">
 </img>,
 <img alt="도널드 트럼프 미국 대통령과 블라디미르 푸틴 러시아 대통령. UPI/TASS 연합뉴스" class="_LAZY_LOADING _LAZY_LOADING_INIT_HIDE" data-src="https://imgnews.pstatic.net/image/081/2025/03/17/0003525647_003_20250317103219519.jpg?type=w860" id="img3" style="display: none;">
 </img>]

In [43]:
len(URLs), len(Seens)

(48, 26)

In [44]:
Seens

[{'url': 'https://news.naver.com/', 'depth': 1},
 {'url': 'https://news.naver.com/ombudsman/errorArticleList', 'depth': 2},
 {'url': 'https://media.naver.com/algorithm', 'depth': 2},
 {'url': 'https://news.naver.com/factcheck/main', 'depth': 2},
 {'url': 'https://news.naver.com/main/tv/index.naver?mid=tvh', 'depth': 2},
 {'url': 'https://news.naver.com/opinion/home', 'depth': 2},
 {'url': 'https://news.naver.com/newspaper/home?viewType=pc', 'depth': 2},
 {'url': 'https://news.naver.com/main/ranking/popularDay.naver', 'depth': 2},
 {'url': 'https://news.naver.com/section/104', 'depth': 2},
 {'url': 'https://n.news.naver.com/mnews/article/001/0015268854', 'depth': 3},
 {'url': 'https://imgnews.pstatic.net/image/001/2025/03/17/PEP20250314044501009_P4_20250317102819863.jpg?type=w860',
  'depth': 4},
 {'url': 'https://imgnews.pstatic.net/image/001/2025/03/17/PAF20250313067101009_P4_20250317102819855.jpg?type=w860',
  'depth': 4},
 {'url': 'https://imgnews.pstatic.net/image/001/2025/03/17/PA