In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from urllib.parse import urljoin, urlparse
import sqlite3
import os
import time

# ====== 參數設定 ======
BASE_URL   = "https://googleapis.github.io/python-genai/"
START_PAGE = "index.html"
OUTPUT_DIR = "python_genai_docs"
DB_PATH    = "python_genai_docs.db"

# 建立輸出資料夾
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 建立 SQLite 資料庫 & table
conn = sqlite3.connect(DB_PATH)
cur  = conn.cursor()
cur.execute("""
CREATE TABLE IF NOT EXISTS pages (
    url        TEXT PRIMARY KEY,
    title      TEXT,
    file_path  TEXT,
    scraped_at TEXT
)
""")
conn.commit()

session = requests.Session()
visited  = set()
to_visit = [urljoin(BASE_URL, START_PAGE)]

def is_valid_url(url: str) -> bool:
    """
    檢查 URL 是否屬於我們要爬的範圍：
    - 同一網域、同一路徑前綴
    - 剔除靜態資源 (.css/.js/.png...)
    """
    p = urlparse(url)
    b = urlparse(BASE_URL)
    if p.netloc != b.netloc: return False
    if not p.path.startswith(b.path): return False
    if any(p.path.endswith(ext) for ext in (".css",".js",".png",".jpg",
                                            ".svg",".gif",".ico")):
        return False
    return True

# ====== 開始爬取 ======
while to_visit:
    url = to_visit.pop(0)
    if url in visited:
        continue
    visited.add(url)
    print(f"🔍 Scraping {url}")

    resp = session.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # 取第一個 <h1> 或 <title> 當標題
    title_tag = soup.find("h1") or soup.find("title")
    title     = title_tag.get_text(strip=True) if title_tag else url

    # 擷取主要內容
    container = (soup.find("main")
                 or soup.find(attrs={"role":"main"})
                 or soup.body)
    markdown = md(str(container), heading_style="ATX")

    # 產生安全的檔名
    safe = "".join(c if c.isalnum() or c in "-_" else "_" 
                   for c in title.lower())[:50]
    fname = f"{safe}.md"
    fpath = os.path.join(OUTPUT_DIR, fname)

    # 寫入 .md
    with open(fpath, "w", encoding="utf-8") as f:
        f.write(f"# {title}\n\n{markdown}")

    # 存入資料庫
    scraped_time = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
    cur.execute("""
    INSERT OR REPLACE INTO pages (url, title, file_path, scraped_at)
    VALUES (?, ?, ?, ?)
    """, (url, title, fpath, scraped_time))
    conn.commit()

    # 掃描頁面中所有連結，排入待抓清單
    for a in soup.find_all("a", href=True):
        href = a["href"].split("#")[0]  # 去除錨點
        if not href:
            continue
        new_url = urljoin(url, href)
        if is_valid_url(new_url) and new_url not in visited and new_url not in to_visit:
            to_visit.append(new_url)

conn.close()
print("✅ 全部完成！")


🔍 Scraping https://googleapis.github.io/python-genai/index.html
🔍 Scraping https://googleapis.github.io/python-genai/genai.html
🔍 Scraping https://googleapis.github.io/python-genai/_sources/index.rst.txt
🔍 Scraping https://googleapis.github.io/python-genai/_sources/genai.rst.txt
✅ 全部完成！
