In [7]:
import re
import requests
from bs4 import BeautifulSoup 
from urllib.parse import urljoin
from urllib.parse import urlparse
from tldextract import extract

In [10]:
# Breadth-first crawl starting from a single seed page.
#
# State kept at module level so later cells can inspect it:
#   wait_list   -- URLs discovered but not fetched yet (FIFO queue)
#   viewed_list -- URLs already taken off the queue, in visiting order
#   h2_answer   -- text of every <h2> found on successfully fetched pages

# Seconds to wait for a server before giving up on a page; without a timeout
# a single unresponsive host would block the crawl forever.
REQUEST_TIMEOUT = 10

wait_list = ["https://afuntw.github.io/demo-crawling/demo-page/ex4/index1.html"]
viewed_list = []
h2_answer = []

while wait_list:
    url = wait_list.pop(0)
    print('Current website: {}'.format(url))

    # Record the URL before fetching so a page that fails is never queued again.
    viewed_list.append(url)

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Do not harvest headings/links from 4xx/5xx error pages.
        response.raise_for_status()
    except requests.RequestException as error:
        print('Skip {}: {}'.format(url, error))
        print()
        continue

    soup = BeautifulSoup(response.text, 'lxml')

    for tag in soup.find_all('h2'):
        h2_answer.append(tag.text)

    # The page being crawled does not change per link, so analyse it once.
    root_url = extract(url)

    for link in soup.find_all('a', href=True):
        href = link['href'].strip()

        # Resolve relative paths against the current page, then drop the
        # fragment: "page.html" and "page.html#section" are the same document
        # and must not be requested twice.
        new_url = urljoin(url, href).split('#', 1)[0]

        # Skip pure in-page anchors; they point back at the current page.
        check_anchor = not href.startswith('#')

        # Skip javascript: pseudo-links. The colon is required so that a real
        # file such as "javascript-intro.html" is still followed.
        check_code = not re.match(r'javascript:', href, re.IGNORECASE)

        # Only http/https can be fetched. A relative href has no scheme of its
        # own, which is why the check runs on the resolved absolute URL.
        check_protocol = urlparse(new_url).scheme in ['http', 'https']

        # The actual filter condition.
        if check_anchor and check_code and check_protocol:

            # Split the new URL into subdomain / domain / suffix.
            current_url = extract(new_url)
            print('root_url extract: {}'.format(root_url))
            print('current_url extract: {}'.format(current_url))

            # Accept the "www" subdomain or the same subdomain as the current page.
            check_subdomain = (
                current_url.subdomain == 'www'
                or current_url.subdomain == root_url.subdomain
            )

            # Same site means same domain AND same public suffix; comparing the
            # domain alone would treat github.com and github.io as one site.
            check_domain = (
                root_url.domain == current_url.domain
                and root_url.suffix == current_url.suffix
            )

            if check_domain and check_subdomain:

                # Queue the link only if it is neither waiting nor already visited.
                if new_url not in wait_list and new_url not in viewed_list:
                    wait_list.append(new_url)

    print('Get h2 tags: {}'.format(h2_answer))
    print('URL wait list: {}'.format(wait_list))
    print('URL viewed list: {}'.format(viewed_list))
    print()

Current website: https://afuntw.github.io/demo-crawling/demo-page/ex4/index1.html
root_url extract: ExtractResult(subdomain='afuntw', domain='github', suffix='io')
current_url extract: ExtractResult(subdomain='afuntw', domain='github', suffix='io')
root_url extract: ExtractResult(subdomain='afuntw', domain='github', suffix='io')
current_url extract: ExtractResult(subdomain='afuntw', domain='github', suffix='io')
root_url extract: ExtractResult(subdomain='afuntw', domain='github', suffix='io')
current_url extract: ExtractResult(subdomain='www', domain='facebook', suffix='com')
Get h2 tags: ['Home Heading 1', 'Home Heading 2', 'Home Heading 3', "First featurette heading. It'll blow your mind.", "Oh yeah, it's that good. See for yourself.", 'And lastly, this one. Checkmate.']
URL wait list: ['https://afuntw.github.io/demo-crawling/demo-page/ex4/index2.html', 'https://afuntw.github.io/demo-crawling/demo-page/ex4/index3.html']
URL viewed list: ['https://afuntw.github.io/demo-crawling/demo-p