In [None]:
import os
import re
import requests
from bs4 import BeautifulSoup

# Root of the Project Gutenberg site; used as the base for links scraped from its pages.
base_url = "https://www.gutenberg.org"
# Browse-by-language index page for the "zh" language code.
language_page = f"{base_url}/browse/languages/zh"

# Matches a single character in the range U+4E00..U+9FFF; used to decide
# whether a link title contains Chinese text.
chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]')

# Directory that receives the downloaded .txt files. It is created as soon as
# this module runs, and an already-existing directory is not an error.
output_dir = "gutenberg_books"
os.makedirs(output_dir, exist_ok=True)

def fetch_book_list(language_page):
    """Return the Chinese-titled links found on a Gutenberg language page.

    Args:
        language_page: URL of the browse-by-language index page to scrape.

    Returns:
        A list of ``{"title": str, "url": str}`` dicts, in page order, one per
        distinct URL. Only ``<a href>`` elements that are direct children of an
        ``<li>`` and whose text matches ``chinese_char_pattern`` are kept.
        ``url`` is the href resolved against ``base_url``.

    Raises:
        requests.RequestException: if the page cannot be fetched (including a
            timeout) or the server answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang the whole run.
    response = requests.get(language_page, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    books = []
    seen_urls = set()
    for link in soup.select("li > a[href]"):
        title = link.text.strip()
        if not chinese_char_pattern.search(title):
            # Navigation/menu links and non-Chinese titles are skipped here.
            continue

        # urljoin handles absolute hrefs and hrefs without a leading slash,
        # both of which plain string concatenation would turn into bad URLs.
        url = urljoin(base_url, link["href"])
        if url in seen_urls:
            # The same link can appear more than once on the page; downloading
            # it again would only rewrite the same file.
            continue
        seen_urls.add(url)
        books.append({"title": title, "url": url})
    return books

def sanitize_filename(name):
    """Replace characters that are not allowed in file names with ``_``.

    Each of ``\\ / * ? : " < > |`` and each ASCII control character
    (U+0000..U+001F, e.g. newline or tab inside a scraped title) is replaced
    by a single underscore. All other characters, including non-ASCII text,
    are returned unchanged.

    Args:
        name: The raw title to turn into a file-name stem.

    Returns:
        The sanitized string; same length as ``name``.
    """
    # Control characters are added to the original reserved-character set
    # because they are just as invalid in Windows file names.
    return re.sub(r'[\\/*?:"<>|\x00-\x1f]', "_", name)

def download_book(book):
    """Download one book's "Plain Text UTF-8" edition and save it as a .txt file.

    The book's landing page is fetched and searched for an ``<a href>`` whose
    text is exactly ``Plain Text UTF-8``. If found, the linked file is
    downloaded and written, as raw bytes, to
    ``<output_dir>/<sanitized title>.txt``. If no such link exists the book is
    skipped without any message.

    Any exception raised along the way is caught and reported with a
    "下載失敗" line so that one bad book does not abort a batch run.

    Args:
        book: A dict with ``"title"`` and ``"url"`` keys, as produced by
            ``fetch_book_list``.

    Returns:
        None.
    """
    from urllib.parse import urljoin

    # NOTE(review): two books with the same title map to the same file path,
    # so the later download overwrites the earlier one.
    try:
        # A timeout keeps a stalled connection from hanging the whole batch.
        response = requests.get(book["url"], timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Locate the plain-text download link. `string=` is the current name
        # of the keyword; the older `text=` spelling is deprecated in
        # BeautifulSoup and emits a warning on every call.
        download_link = soup.find("a", href=True, string="Plain Text UTF-8")
        if not download_link:
            # No plain-text edition offered for this book: skip it quietly.
            return

        # Resolve the href against the page it was found on, so absolute and
        # relative hrefs both produce a valid URL.
        book_url = urljoin(book["url"], download_link["href"])
        book_response = requests.get(book_url, timeout=30)
        book_response.raise_for_status()

        # Build a file-system-safe name and store the payload byte-for-byte.
        sanitized_title = sanitize_filename(book["title"])
        file_path = os.path.join(output_dir, f"{sanitized_title}.txt")
        with open(file_path, "wb") as file:
            file.write(book_response.content)
        print(f"已下載: {book['title']}")
    except Exception as e:
        # Deliberately broad: this is the per-book boundary of a best-effort
        # batch job, so every failure is reported and the caller moves on.
        print(f"下載失敗: {book['title']} - {e}")

def main():
    """Fetch the book list from ``language_page`` and download each entry in order.

    Prints one line before the downloads start and one after the last
    download attempt has returned.
    """
    catalog = fetch_book_list(language_page)
    print("開始下載內容")

    for entry in catalog:
        download_book(entry)

    print("下載完成")

# Start the download run only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



開始抓取書籍列表...
找到 538 本書籍，開始下載內容...


  download_link = soup.find("a", href=True, text="Plain Text UTF-8")


無法找到書籍內容: About
          ▾
無法找到書籍內容: About Project Gutenberg
無法找到書籍內容: Collection Development
無法找到書籍內容: Contact Us
無法找到書籍內容: History & Philosophy
無法找到書籍內容: Permissions & License
無法找到書籍內容: Privacy Policy
無法找到書籍內容: Terms of Use
無法找到書籍內容: Search and Browse
      	  ▾
無法找到書籍內容: Book Search
無法找到書籍內容: Bookshelves
無法找到書籍內容: Frequently Downloaded
無法找到書籍內容: Offline Catalogs
無法找到書籍內容: Help
          ▾
無法找到書籍內容: All help topics →
無法找到書籍內容: Copyright How-To
無法找到書籍內容: Errata, Fixes and Bug Reports
無法找到書籍內容: File Formats
無法找到書籍內容: Frequently Asked Questions
無法找到書籍內容: Policies →
無法找到書籍內容: Public Domain eBook Submission
無法找到書籍內容: Submitting Your Own Work
無法找到書籍內容: Tablets, Phones and eReaders
無法找到書籍內容: The Attic →
無法找到書籍內容: Donate
已下載: 豆棚閒話 -> gutenberg_books\豆棚閒話.txt
已下載: 戲中戲 -> gutenberg_books\戲中戲.txt
已下載: 比目魚 -> gutenberg_books\比目魚.txt
已下載: 比目魚 -> gutenberg_books\比目魚.txt
已下載: Study of Inner Cultivation -> gutenberg_books\Study of Inner Cultivation.txt
已下載: 三字經 -> gutenberg_books\三字經.txt
已下載: 山水情 -

KeyboardInterrupt: 