In [9]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import re

class PTTPost:
    """A single post taken from a PTT board index, together with its body text."""

    def __init__(self, title, link, date, push_count, content=""):
        """Store the post's fields as given; ``content`` defaults to an empty string."""
        self.title, self.link, self.date = title, link, date
        self.push_count = push_count
        self.content = content

    def to_dict(self):
        """Return the post as a dict keyed by the CSV column headers.

        Key order is: push count, title, link, date, body text.
        """
        columns = ("推文數", "標題", "連結", "日期", "內文")
        values = (self.push_count, self.title, self.link, self.date, self.content)
        return dict(zip(columns, values))

        
class PTTSpider:
    """Crawl a PTT board's index pages and collect posts with their article text.

    Starting at ``/bbs/<board>/index.html`` the spider follows the "上頁"
    (previous page) link for up to ``max_pages`` index pages. Every listed
    post is fetched individually so its body text can be stored and searched.
    Collected posts accumulate in ``self.posts``.
    """

    BASE_URL = "https://www.ptt.cc"

    def __init__(self, board, max_pages=5):
        """Create a spider for ``board``.

        Args:
            board: Board name as it appears in the URL path.
            max_pages: Maximum number of index pages to visit in ``crawl``.
        """
        self.board = board
        self.max_pages = max_pages
        self.session = requests.Session()
        self.session.cookies.set('over18', '1')  # cookie for PTT's over-18 confirmation
        self.posts = []

    def _fetch_page(self, url):
        """GET ``url`` and return it parsed as BeautifulSoup.

        Returns:
            The parsed document, or ``None`` when the request raises a
            ``requests.RequestException`` or the status code is not 200.
            Callers must check for ``None``.
        """
        print(f" Fetching: {url}")
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
            }

            res = self.session.get(url, headers=headers, timeout=10)
            if res.status_code != 200:
                print(f" 抓取失敗：HTTP {res.status_code} - {url}")
                print("狀態碼：", res.status_code)
                print("回傳頁面前 300 字：", res.text[:300])
                return None  # let the caller decide whether to continue
            res.encoding = 'utf-8'
            return BeautifulSoup(res.text, "html.parser")
        except requests.RequestException as e:
            print(f" 請求錯誤：{e}")
            return None

    @staticmethod
    def _parse_push_count(push_text):
        """Convert the text of an index entry's ``div.nrec`` cell to an int.

        Mapping:
            "爆"     -> 100
            "XX"     -> -100
            "X<n>"   -> -n   (e.g. "X3" -> -3)
            digits   -> that number
            other    -> 0    (including the empty string)

        NOTE(review): "XX" is treated as the negative counterpart of "爆";
        presumably "X<n>" denotes tens of net boos on PTT rather than units --
        confirm before relying on the magnitude of negative counts.
        """
        if push_text == "爆":
            return 100
        if push_text == "XX":
            return -100
        # fullmatch, not match: "X3abc" must not reach int("3abc").
        if re.fullmatch(r'X\d+', push_text):
            return -int(push_text[1:])
        try:
            return int(push_text)
        except ValueError:
            return 0

    def _parse_posts(self, soup, keyword_filter=None):
        """Append every matching post listed on one index page to ``self.posts``.

        Entries without a title link and titles containing "[公告]" are
        skipped. Each remaining post's article page is fetched for its body.

        Args:
            soup: Parsed index page; ``None`` is accepted and ignored.
            keyword_filter: Optional iterable of keywords. When given, a post
                is kept only if at least one keyword occurs
                (case-insensitively) in its title + body.
        """
        if soup is None:
            return

        for entry in soup.select("div.r-ent"):
            title_tag = entry.select_one("div.title a")
            if title_tag is None:
                continue

            title = title_tag.text.strip()
            # Skip board announcements.
            if "[公告]" in title:
                continue

            push_tag = entry.select_one("div.nrec")
            date_tag = entry.select_one("div.date")

            link = self.BASE_URL + title_tag['href']
            date = date_tag.text.strip() if date_tag is not None else ""
            push_text = push_tag.text.strip() if push_tag is not None else "0"
            push_count = self._parse_push_count(push_text)

            content = self._fetch_article_content(link)

            if keyword_filter:
                haystack = (title + content).lower()  # lowered once per post
                if not any(keyword.lower() in haystack for keyword in keyword_filter):
                    continue

            self.posts.append(PTTPost(title, link, date, push_count, content))

    def _get_next_page_url(self, soup):
        """Return the absolute URL behind the "上頁" paging button, or ``None``.

        ``None`` is returned when no such button exists or when the button
        carries no ``href`` attribute.
        """
        for btn in soup.select("div.btn-group-paging a"):
            if "上頁" in btn.text:
                href = btn.get("href")
                return self.BASE_URL + href if href else None
        return None

    def crawl(self, keyword_filter=None):
        """Visit up to ``max_pages`` index pages, newest first, collecting posts.

        Crawling stops early when a page cannot be fetched or when there is
        no previous page to follow. Posts gathered before the stop are kept
        in ``self.posts``.
        """
        url = f"{self.BASE_URL}/bbs/{self.board}/index.html"
        for _ in range(self.max_pages):
            soup = self._fetch_page(url)
            if soup is None:
                # _fetch_page has already reported the failure.
                break
            self._parse_posts(soup, keyword_filter)
            url = self._get_next_page_url(soup)
            if not url:
                break

    def save_to_csv(self, filename):
        """Write ``self.posts`` to ``filename`` as UTF-8 CSV with a header row."""
        with open(filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=["推文數", "標題", "連結", "日期", "內文"])
            writer.writeheader()
            for post in self.posts:
                writer.writerow(post.to_dict())
        print(f" CSV 檔案已儲存：{filename}")

    def _fetch_article_content(self, url):
        """Fetch one article page and return its body text.

        The text is the stripped, non-empty strings found under
        ``#main-content`` in document order, joined by newlines, after the
        ``article-metaline`` / ``article-metaline-right`` divs are removed.
        Collection stops at the first ``span.f2`` whose text contains "發信站";
        if no such span exists, all text under ``#main-content`` is returned.

        Returns:
            The body text, or ``""`` when the page cannot be fetched, has no
            ``#main-content`` element, or parsing fails (best effort).
        """
        try:
            soup = self._fetch_page(url)
            if soup is None:
                return ""
            main_content = soup.select_one("#main-content")
            if main_content is None:
                return ""

            # Step 1: locate the first <span class="f2"> that mentions "發信站".
            cut_node = next(
                (
                    span
                    for span in main_content.find_all("span", class_="f2")
                    if "發信站" in span.text
                ),
                None,
            )

            # Step 2: drop the metadata divs so their text is not collected.
            for tag in main_content.find_all(["div"], class_=["article-metaline", "article-metaline-right"]):
                tag.decompose()

            # Step 3: collect text in document order, stopping at the cut node.
            # .descendants walks the whole subtree, not only direct children.
            text_lines = []
            for node in main_content.descendants:
                # Identity check: cheaper than bs4's deep `==`, and never
                # true when cut_node is None.
                if node is cut_node:
                    break
                if isinstance(node, str):
                    line = node.strip()
                    if line:
                        text_lines.append(line)

            return "\n".join(text_lines)

        except Exception as e:
            # Deliberately broad: one unparsable article must not abort the crawl.
            print(f"⚠️ 無法讀取文章：{url}，錯誤：{e}")
            return ""

In [10]:
if __name__ == "__main__":
    # Keywords are matched case-insensitively against each post's title + body.
    # A wider example set: ["LINE", "蝦皮", "pChome", "優惠"]
    keywords = ["LINE"]
    spider = PTTSpider("Lifeismoney", max_pages=5)
    try:
        spider.crawl(keyword_filter=keywords)
    except Exception as e:
        print(f"❌ 程式錯誤：{e}")
        # Raise SystemExit directly instead of calling exit(1): `exit` is an
        # interactive helper, and under IPython/Jupyter it asks the kernel to
        # shut down rather than raising, so execution could still fall through
        # to save_to_csv below.
        raise SystemExit(1)
    spider.save_to_csv("static.csv")


Fetching: https://www.ptt.cc/bbs/Lifeismoney/index.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.1744377914.A.7F1.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.1744387246.A.135.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.1744389532.A.A0F.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.1744389682.A.879.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.1744399107.A.0EC.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.1744402075.A.8BB.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.1744403665.A.116.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.1744415834.A.A1A.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.1744417653.A.67B.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.1744426820.A.981.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.1744429854.A.C96.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.1744429921.A.8AC.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.1744434866.A.28B.html
Fetching: https://www.ptt.cc/bbs/Lifeismoney/M.

In [11]:
import pandas as pd
# Read the CSV written by the crawl cell back into a DataFrame.
df = pd.read_csv('static.csv', encoding="utf-8")  
# Show the first 10 rows
print(df.head(10))

   推文數                                     標題  \
0   70            [情報] LINE購物驚喜紅包 滿$199元回饋30元   
1    4              line禮物 折9元，最低1元換麥香（飲料-分眾）   
2    9  Re: [情報] LINE MOBILE吃到飽232元、輕量40元 中華線   
3    6  Re: [情報] LINE MOBILE吃到飽232元、輕量40元 中華線   
4    0             [情報] PChome明天搶拉麵道3入$9拿鐵$18   
5   10                 [情報] PChome Line導購4%回饋   
6   55                     [情報] line 貼圖表情貼 1折   
7    6            [情報] Line禮物一點送好友好運最高可拿5200點   
8    9                       [情報] 麥當勞轉蜜系列買5送5   
9   15                [情報] apple line導購限定商品7%   

                                                  連結    日期  \
0  https://www.ptt.cc/bbs/Lifeismoney/M.174437791...  4/11   
1  https://www.ptt.cc/bbs/Lifeismoney/M.174438968...  4/12   
2  https://www.ptt.cc/bbs/Lifeismoney/M.174439910...  4/12   
3  https://www.ptt.cc/bbs/Lifeismoney/M.174440366...  4/12   
4  https://www.ptt.cc/bbs/Lifeismoney/M.174445710...  4/12   
5  https://www.ptt.cc/bbs/Lifeismoney/M.174429706...  4/10   
6  https://www.ptt.cc/bbs/

In [12]:
# Ask the `file` utility what it detects static.csv as.
!file static.csv

# Write mac_format.csv: the bytes EF BB BF (the UTF-8 byte-order mark) followed by
# the contents of static.csv. This relies on the shell's `echo` honouring -n/-e.
# NOTE(review): presumably for spreadsheet apps that need a BOM to detect UTF-8 -- confirm.
!(echo -ne '\xEF\xBB\xBF'; cat static.csv) > mac_format.csv

static.csv: CSV text
