In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# Crawl configuration: site root, target board, and how many index pages to fetch.
base_url = "https://www.ptt.cc"
board = "CheerGirlsTW"
page_count = 3

# Make sure the output directory exists (no error if it is already there).
os.makedirs("data", exist_ok=True)

# Shared HTTP session; the "over18" cookie is set to get past the
# age-verification step.
session = requests.Session()
session.cookies.set("over18", "1")

def get_latest_index():
    """Return the index number of the board's newest listing page.

    Fetches ``{base_url}/bbs/{board}/index.html`` and reads the second link
    in the paging button group (the "previous page" button). Its href looks
    like ``/bbs/<board>/index1234.html``; the newest page is that number
    plus one.

    Returns:
        int: Index of the newest page, or 1 when the previous-page button
        carries no href.

    Raises:
        requests.HTTPError: If the board index responds with an error status.
        RuntimeError: If the paging buttons or the expected
            ``index<N>.html`` file name cannot be found.
        ValueError: If the text between ``index`` and ``.html`` is not an
            integer.
    """
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    res = session.get(f"{base_url}/bbs/{board}/index.html", timeout=10)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    paging_links = soup.select("div.btn-group-paging a")
    if len(paging_links) < 2:
        raise RuntimeError(
            f"Paging buttons not found on the index page of board {board!r}"
        )

    prev_href = paging_links[1].get("href")
    if not prev_href:
        # NOTE(review): presumably the previous-page button has no href only
        # when the board has a single page -- confirm against PTT markup.
        return 1

    # Parse only the final path component so that a board name containing
    # "index" cannot confuse the extraction.
    filename = prev_href.rsplit("/", 1)[-1]
    if not (filename.startswith("index") and filename.endswith(".html")):
        raise RuntimeError(f"Unexpected previous-page link: {prev_href!r}")

    prev_index = int(filename[len("index"):-len(".html")])
    return prev_index + 1

def get_articles_from_page(url):
    """Collect the article rows listed on one board index page.

    Args:
        url: Absolute URL of the index page to fetch.

    Returns:
        list[list[str]]: One ``[title, author, push, link]`` entry per
        ``div.r-ent`` row that has a title link. Rows without one (the
        original code treated these as deleted articles) are skipped.
        An empty list is returned when the page responds with an error
        status.
    """
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    res = session.get(url, timeout=10)
    if not res.ok:
        # Keep the best-effort behavior (no rows for a bad page) but make the
        # failure visible instead of silently parsing an error page.
        print(f"⚠️ 無法取得頁面（HTTP {res.status_code}）：{url}")
        return []

    soup = BeautifulSoup(res.text, "html.parser")
    page_articles = []

    for entry in soup.select("div.r-ent"):
        title_tag = entry.select_one("div.title a")
        push_tag = entry.select_one("div.nrec")
        author_tag = entry.select_one("div.author")

        # Explicit checks replace the former bare ``except``, so real bugs
        # and KeyboardInterrupt are no longer swallowed.
        if title_tag is None or push_tag is None or author_tag is None:
            continue
        href = title_tag.get("href")
        if not href:
            continue

        page_articles.append([
            title_tag.text.strip(),
            author_tag.text.strip(),
            push_tag.text.strip(),
            base_url + href,
        ])

    return page_articles

# Look up the index number of the newest page on the board.
latest_index = get_latest_index()

# Never request more pages than exist: index numbers start at 1, so going
# further back than `latest_index` pages would produce index0, index-1, ...
pages_to_fetch = max(0, min(page_count, latest_index))

# Crawl from the newest page backwards.
all_articles = []
for i in range(pages_to_fetch):
    page_index = latest_index - i
    page_url = f"{base_url}/bbs/{board}/index{page_index}.html"
    print(f"🔍 正在爬第 {i+1} 頁：{page_url}")
    all_articles.extend(get_articles_from_page(page_url))

# Save as CSV. The file name follows `page_count` instead of hard-coding
# "3pages", so it stays accurate when the setting changes (unchanged for 3).
# "utf-8-sig" writes a BOM at the start of the file.
output_path = f"data/cheergirls_articles_{page_count}pages.csv"
df = pd.DataFrame(all_articles, columns=["標題", "作者", "推文", "連結"])
df.to_csv(output_path, index=False, encoding="utf-8-sig")

print(f"✅ 完成！共爬到 {len(df)} 篇文章，已儲存到 {output_path}")


🔍 正在爬第 1 頁：https://www.ptt.cc/bbs/CheerGirlsTW/index342.html
🔍 正在爬第 2 頁：https://www.ptt.cc/bbs/CheerGirlsTW/index341.html
🔍 正在爬第 3 頁：https://www.ptt.cc/bbs/CheerGirlsTW/index340.html
✅ 完成！共爬到 48 篇文章，已儲存到 data/cheergirls_articles_3pages.csv
