# In [None]:
import requests as req
from bs4 import BeautifulSoup as bs
import os
import random
import re

# Create the download folder. exist_ok=True makes this a no-op when the
# directory is already there and avoids the check-then-create race of a
# separate os.path.exists() test.
os.makedirs('downloads', exist_ok=True)

# Matches one character in the CJK Unified Ideographs block (U+4E00-U+9FFF).
# Compiled once at module level so repeated has_chinese() calls reuse it.
_CHINESE_CHAR_RE = re.compile(r'[\u4e00-\u9fff]')


def has_chinese(text: str) -> bool:
    """Return True if *text* contains at least one Chinese character.

    A "Chinese character" here means a code point in the range
    U+4E00-U+9FFF; characters outside that range are not detected.

    Args:
        text: The string to inspect.

    Returns:
        True when at least one character of *text* falls in the range,
        False otherwise (including for the empty string).
    """
    return _CHINESE_CHAR_RE.search(text) is not None

# Listing page for Chinese-language books.
url = "https://www.gutenberg.org/browse/languages/zh"
# The timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops an HTTP error page from being parsed as the listing.
res = req.get(url, timeout=30)
res.raise_for_status()
soup = bs(res.text, "lxml")

# Collect every book link whose title contains Chinese characters.
book_links = []
seen_ids = set()
for link in soup.find_all('a', href=True):
    # Only accept hrefs that end in "/ebooks/<digits>" so the captured id is
    # always numeric; other paths under /ebooks/ would otherwise produce a
    # bogus download URL later on.
    match = re.search(r'/ebooks/(\d+)/?$', link['href'])
    if match is None:
        continue
    book_id = match.group(1)
    title = link.text.strip()
    # Skip non-Chinese titles, and skip ids already collected so a book that
    # is linked more than once on the page is only downloaded once.
    if book_id in seen_ids or not has_chinese(title):
        continue
    seen_ids.add(book_id)
    book_links.append({'id': book_id, 'title': title})

print(f"找到 {len(book_links)} 本中文書籍")

# Download every collected book (no random sampling is applied).
selected_books = book_links

for book in selected_books:
    # Build the download link; the response is decoded as UTF-8 below.
    download_url = f'https://www.gutenberg.org/files/{book["id"]}/{book["id"]}-0.txt'

    # Drop characters that are not letters, digits, space, '-' or '_' so the
    # title is usable as a file name. Fall back to the book id if nothing
    # survives, which would otherwise yield a file named just ".txt".
    safe_title = ''.join(c for c in book['title'] if c.isalnum() or c in (' ', '-', '_'))
    file_path = os.path.join('downloads', f'{safe_title or book["id"]}.txt')

    try:
        # The timeout prevents one stalled request from blocking the whole run.
        book_content = req.get(download_url, timeout=30)
        # Without this check an HTTP error page (e.g. a 404 for a book that
        # has no "-0.txt" file) would be saved and reported as a success.
        book_content.raise_for_status()
        book_content.encoding = 'utf-8'

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(book_content.text)
    except (req.RequestException, OSError) as e:
        # Best effort: report the failure and continue with the next book.
        print(f'下載 {book["title"]} 失敗: {str(e)}')
    else:
        print(f'成功下載: {book["title"]}')

print('\n下載完成！')