In [4]:
import os
import time
import requests
import asyncio
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin, urlparse
from playwright.async_api import async_playwright

BASE_URL = "https://docs.streamlit.io/develop"
SAVE_ROOT = "/Users/migu/Desktop/資料庫/gen_ai_try/ai_metadata/streamlit"
CRAWL_DELAY = 1.5

async def scrape_main_content_as_markdown(url, save_path):
    """從單一頁面爬主內文，轉成 markdown 儲存"""
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    markdown_lines = [
        "---",
        f"title: Streamlit Documentation Capture",
        f"url: {url}",
        f"date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "---\n"
    ]

    main = soup.find('main')
    if not main:
        print(f"⚠️ 找不到主內容: {url}")
        return

    for el in main.find_all(['h1','h2','h3','h4','h5','h6','p','pre','code','ul','ol','li']):
        if el.name.startswith('h'):
            lvl = int(el.name[1])
            markdown_lines.append(f"{'#'*lvl} {el.get_text(strip=True)}\n")
        elif el.name == 'p':
            text = el.get_text(strip=True)
            if text:
                markdown_lines.append(text + "\n")
        elif el.name == 'pre':
            code = el.get_text()
            markdown_lines.append(f"```python\n{code}\n```\n")
        elif el.name == 'code':
            inline = el.get_text(strip=True)
            if inline:
                markdown_lines.append(f"`{inline}`\n")
        elif el.name in ['ul','ol']:
            for li in el.find_all('li'):
                li_txt = li.get_text(strip=True)
                markdown_lines.append(f"- {li_txt}")

    content = "\n".join(markdown_lines)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"✅ 已完成: {save_path}")

async def expand_all_details(page):
    """展開所有 <details> 標籤下尚未打開的側欄節點"""
    while True:
        details = await page.query_selector_all("nav details:not([open])")
        if not details:
            break
        print(f"🔵 尚有 {len(details)} 個節點未展開，正在展開...")
        for d in details:
            try:
                summ = await d.query_selector("summary")
                if summ:
                    await summ.click()
                    await page.wait_for_timeout(300)
            except Exception as e:
                print(f"⚠️ 展開失敗: {e}")

async def get_target_links(page):
    """在側欄展開完畢後，撈出所有 /develop/xxx/yyy/zzz 格式的連結"""
    links = set()
    a_tags = await page.query_selector_all("nav a.menu__link")
    for a in a_tags:
        href = await a.get_attribute("href")
        if href and href.startswith("/develop"):
            parts = urlparse(href).path.strip("/").split("/")
            if len(parts) == 4:
                full = urljoin(BASE_URL, href)
                links.add(full)
    return list(links)

async def main():
    # 1. 啟動並展開側欄
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(BASE_URL, wait_until="networkidle")
        await page.wait_for_selector("nav")
        await expand_all_details(page)
        print("🔵 側欄已完全展開")

        # 2. 抓取所有目標小頁連結
        links = await get_target_links(page)
        print(f"🔵 共找到 {len(links)} 個小頁面，開始爬取...")

        await browser.close()

    # 3. 逐頁爬取並存檔
    for idx, url in enumerate(links):
        path = urlparse(url).path.lstrip("/")
        save_path = os.path.join(SAVE_ROOT, f"{path}.md")
        try:
            await scrape_main_content_as_markdown(url, save_path)
        except Exception as e:
            print(f"❌ 爬取失敗: {url} - {e}")
        if idx < len(links) - 1:
            await asyncio.sleep(CRAWL_DELAY)

    print("🎉 全部完成！")

# 若在 Notebook/Colab 執行：
await main()
# 若在 .py 腳本執行，請改為：
# if __name__ == "__main__":
#     asyncio.run(main())


🔵 側欄已完全展開
🔵 共找到 0 個小頁面，開始爬取...
🎉 全部完成！
