In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import sqlite3
import os

# --- 1. Pages to scrape; edit these lists as needed ---
# Pages whose path is under /gemini-api/docs/.
_DOCS_URLS = [
    "https://ai.google.dev/gemini-api/docs/openai",
    "https://ai.google.dev/gemini-api/docs/text-generation",
    "https://ai.google.dev/gemini-api/docs/image-understanding",
    "https://ai.google.dev/gemini-api/docs/document-processing?lang=python",
    "https://ai.google.dev/gemini-api/docs/thinking",
    "https://ai.google.dev/gemini-api/docs/structured-output?lang=python",
    "https://ai.google.dev/gemini-api/docs/long-context",
    "https://ai.google.dev/gemini-api/docs/code-execution?lang=python",
    "https://ai.google.dev/gemini-api/docs/grounding?lang=python",
    "https://ai.google.dev/gemini-api/docs/prompting-strategies",
    "https://ai.google.dev/gemini-api/docs/caching?lang=python",
    "https://ai.google.dev/gemini-api/docs/tokens?lang=python",
    "https://ai.google.dev/gemini-api/docs/files",
]

# Pages whose path is under /api/.
_API_URLS = [
    "https://ai.google.dev/api/models",
    "https://ai.google.dev/api/generate-content",
]

# Full work list, in scrape order: the /gemini-api/docs/ pages first, then
# the /api/ pages.
URLS = _DOCS_URLS + _API_URLS

# --- 2. Output locations ---
# Directory that receives one Markdown file per page.
OUTPUT_DIR = "docs"
# SQLite database file that stores one row per scraped URL.
DB_PATH = "docs.db"

# DDL for the `pages` table. `url` is UNIQUE, and `scraped_at` defaults to
# the time the row is inserted.
_CREATE_PAGES_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS pages (
    id           INTEGER PRIMARY KEY AUTOINCREMENT,
    url          TEXT UNIQUE,
    title        TEXT,
    content_md   TEXT,
    file_path    TEXT,
    scraped_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""

# Create the output directory; it is fine if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 3. Open the SQLite database and make sure the table exists ---
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
cur.execute(_CREATE_PAGES_TABLE_SQL)
conn.commit()

# --- 4. Main flow: for each URL, fetch the page, convert it to Markdown,
#        write a .md file, and upsert a row into the database ---

# Seconds to wait on connect/read before giving up. Without an explicit
# timeout, requests.get() can block indefinitely on a stalled server.
REQUEST_TIMEOUT = 30

# File names already written during this run, so that two pages whose titles
# sanitize to the same name do not silently overwrite each other's file.
used_filenames = set()

try:
    for url in URLS:
        print(f"Processing {url} ...")
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Any HTTP error status aborts the run (same as before); rows for
        # pages processed so far have already been committed.
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        # Prefer <main>, fall back to <body>, and finally to the whole parsed
        # document so a body-less page is never converted from str(None).
        container = soup.find("main") or soup.body or soup
        # Use the first <h1> as the title; if it is missing or has no text,
        # use the URL instead so the title and file name are never empty.
        h1 = soup.find("h1")
        title = (h1.get_text(strip=True) if h1 else "") or url

        # Convert to Markdown (ATX-style "#" headings).
        markdown = md(str(container), heading_style="ATX")

        # Build a file name from the lower-cased title: keep alphanumerics,
        # "-" and "_", and replace every other character with "_".
        safe_name = "".join(
            c if c.isalnum() or c in "-_" else "_" for c in title.lower()
        )
        filename = f"{safe_name}.md"
        # Disambiguate with a numeric suffix when an earlier page in this run
        # already claimed the same file name.
        suffix = 2
        while filename in used_filenames:
            filename = f"{safe_name}_{suffix}.md"
            suffix += 1
        used_filenames.add(filename)

        file_path = os.path.join(OUTPUT_DIR, filename)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(f"# {title}\n\n")
            f.write(markdown)

        # Upsert into SQLite: insert a new row, or refresh the existing row
        # (including scraped_at) when this URL was stored before.
        cur.execute("""
    INSERT INTO pages (url, title, content_md, file_path)
    VALUES (?, ?, ?, ?)
    ON CONFLICT(url) DO UPDATE
      SET title      = excluded.title,
          content_md = excluded.content_md,
          file_path  = excluded.file_path,
          scraped_at = CURRENT_TIMESTAMP
    """, (url, title, markdown, file_path))
        # Commit per page so completed work survives a later failure.
        conn.commit()

        print(f" → Saved to `{file_path}` and DB.")
finally:
    # Always release the database connection, even when a request, parse,
    # or write step raises part-way through the list.
    conn.close()

print("All done.")


Processing https://ai.google.dev/gemini-api/docs/openai ...
 → Saved to `docs/openai_compatibility.md` and DB.
Processing https://ai.google.dev/gemini-api/docs/text-generation ...
 → Saved to `docs/text_generation.md` and DB.
Processing https://ai.google.dev/gemini-api/docs/image-understanding ...
 → Saved to `docs/image_understanding.md` and DB.
Processing https://ai.google.dev/gemini-api/docs/document-processing?lang=python ...
 → Saved to `docs/document_understanding.md` and DB.
Processing https://ai.google.dev/gemini-api/docs/thinking ...
 → Saved to `docs/gemini_thinking.md` and DB.
Processing https://ai.google.dev/gemini-api/docs/structured-output?lang=python ...
 → Saved to `docs/generate_structured_output_with_the_gemini_api.md` and DB.
Processing https://ai.google.dev/gemini-api/docs/long-context ...
 → Saved to `docs/long_context.md` and DB.
Processing https://ai.google.dev/gemini-api/docs/code-execution?lang=python ...
 → Saved to `docs/code_execution.md` and DB.
Processing 