# **Crawler - One click - PTT**


In [None]:
# @title **請輸入爬蟲目標**

# @markdown > 爬蟲目標 `target` 如下圖所示，大小寫須完全一致，且無空白
# @markdown ![](https://github.com/hsiangjenli/2023-summer-intern/blob/4915c49d0ec599841617f4c731015393f247c0a4/images/crawler_ptt_demo_url.png?raw=true)

# @markdown ---

# Board name to crawl. It is substituted verbatim into the board index URL by
# the fetching cells, so it has to match the board name in the PTT URL exactly.
target = 'Stock' # @param ["Finance", "creditcard", "Bank_Service", "MobilePay", "Stock"] {allow-input: true}
# Page count for the crawl; it is used as the bound of the index-page loop in
# the crawl cell further down.
pages = 2 # @param {type:"slider", min:0, max:30, step:1}

In [None]:
from google.colab import drive
import os

# Mount Google Drive into the Colab filesystem; the export cell at the end of
# the notebook writes its Excel file underneath this mount point.
drive.mount("/content/drive/")

In [None]:
import re
import requests
from datetime import datetime
from bs4 import BeautifulSoup

# URL template for a board index page. Formatting it with an empty `page`
# requests the board's default index page ("index.html").
# NOTE(review): the crawl cell further down repeats this same fetch.
url = "https://www.ptt.cc/bbs/{target}/index{page}.html"
r = requests.get(url=url.format(page="", target=target))
soup = BeautifulSoup(markup=r.text, features="lxml")

In [None]:
def get_article_urls(soup: str) -> list:
    """Collect the absolute article URLs listed on a parsed board index page.

    Args:
        soup: Parsed index page. Despite the ``str`` annotation, the body
            calls ``soup.find_all``, so a BeautifulSoup-like object is what
            callers actually pass.

    Returns:
        One ``"https://www.ptt.cc" + href`` string per ``div.title`` entry
        that contains an anchor, in document order. Entries without an anchor
        are skipped.
    """
    site_root = "https://www.ptt.cc"
    title_divs = soup.find_all("div", attrs={"class": "title"})
    return [site_root + entry.a["href"] for entry in title_divs if entry.a]

def get_last_page_number(soup: "BeautifulSoup") -> "int | None":
    """Derive the number of the index page currently being viewed.

    The page's own number is not read directly. Instead the "上頁" (previous
    page) button is located, the number is taken from its
    ``index<N>.html`` href, and ``N + 1`` is returned.

    Args:
        soup: Parsed board index page (an object exposing ``find_all``).

    Returns:
        The previous-page number plus one, or ``None`` when no "上頁" button
        with an ``index<N>.html`` href is present.
    """
    # Anchor on "index<N>.html" rather than taking the first digit run in the
    # href: board names may contain digits (e.g. /bbs/AKB48/index123.html),
    # and a bare r"\d+" would pick up the "48" from the board name.
    re_page_number = r"index(\d+)\.html"
    for link in soup.find_all("a", attrs={"class": "btn wide"}):
        if "上頁" not in link.text:
            continue
        found = re.search(re_page_number, link.get("href") or "")
        if found:
            return int(found.group(1)) + 1
    return None

def EmptyConentHandler(func):
    """Decorator that turns any ordinary failure of ``func`` into ``""``.

    The wrapped function is called with the original arguments. If it raises
    an ``Exception`` subclass, the error is suppressed and an empty string is
    returned instead, so a single malformed article does not abort the crawl.

    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately NOT suppressed,
    so interrupting the kernel still stops the run.

    Args:
        func: The callable to protect.

    Returns:
        A wrapper with the same call signature and metadata as ``func``.
    """
    # Local import keeps this block self-contained without touching the
    # notebook's import cell.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Best-effort scraping: missing or malformed content becomes "".
            return ""
    return wrapper

# @markdown 1. 首先，抓取討論文章的 title
@EmptyConentHandler
def get_article_title(soup: str) -> str:
    """Read the article title from the page's ``og:title`` meta tag.

    Args:
        soup: Parsed article page. Annotated ``str``, but the body calls
            ``soup.find``, so a BeautifulSoup-like object is expected.

    Returns:
        The tag's ``content`` attribute. Any failure (for example the tag
        being absent) is converted to ``""`` by ``EmptyConentHandler``.
    """
    og_title_tag = soup.find("meta", attrs={"property": "og:title"})
    return og_title_tag["content"]

# @markdown 2. 抓取 title 中的討論類別 e.g. 請益、心得、新聞
# @markdown ![](https://github.com/hsiangjenli/2023-summer-intern/blob/master/images/crawler_ptt_demo_category.png?raw=true)
def get_article_category(string) -> str:
    """Extract the first square-bracketed tag from a title.

    Args:
        string: Title text to inspect.

    Returns:
        The text inside the first ``[...]`` pair (non-greedy), or ``""`` when
        the title contains no bracketed tag.
    """
    first_tag = re.search(r"\[(.*?)\]", string)
    return first_tag.group(1) if first_tag else ""

# @markdown 3. 抓取文章發布時間
def get_article_datetime(soup: "BeautifulSoup") -> "datetime | None":
    """Find the article's timestamp among its metadata values.

    Every ``span.article-meta-value`` is tried in document order against the
    format ``"%a %b %d %H:%M:%S %Y"`` (e.g. ``"Mon Jul 10 12:34:56 2023"``).
    The first one that parses wins.

    Args:
        soup: Parsed article page (an object exposing ``find_all``).

    Returns:
        The parsed ``datetime``, or ``None`` when no metadata value matches
        the format.
    """
    datetime_format = "%a %b %d %H:%M:%S %Y"
    for candidate in soup.find_all("span", attrs={"class": "article-meta-value"}):
        try:
            # Parse once and return immediately; the other metadata values
            # (author, board, title) are expected to fail here.
            return datetime.strptime(candidate.text, datetime_format)
        except (ValueError, TypeError):
            continue
    return None

# @markdown 4. 抓取文章內文
@EmptyConentHandler
def get_article_content(soup: "BeautifulSoup") -> str:
    """Extract the article body text, without the header line.

    The text of the first ``div.bbs-screen.bbs-content`` is searched for a
    span that starts at the header (作者 … 看板 … 標題 … 時間 …) and runs up to,
    but not including, the "※ 發信站:" footer marker. The header text is then
    removed from that span.

    Args:
        soup: Parsed article page (an object exposing ``find_all``).

    Returns:
        The body text, or ``""`` when the header/footer pattern is not found.
        Any other failure (e.g. the content div being absent) is also turned
        into ``""`` by ``EmptyConentHandler``.
    """
    # NOTE(review): each header field is limited to 30 characters by the
    # pattern, so longer fields will not match — confirm this is intended.
    re_content = r"(作者.{1,30}看板.{1,30}標題.{1,30}時間.{1,30}=?)[\s\S]+(?=※ 發信站:)"
    raw_content = soup.find_all("div", attrs={"class": "bbs-screen bbs-content"})[0].text

    # Keep the last match, mirroring the original loop's final assignment.
    last_match = None
    for last_match in re.finditer(re_content, raw_content):
        pass

    # Explicit "no match" result instead of relying on a swallowed
    # UnboundLocalError.
    if last_match is None:
        return ""

    full_content = last_match.group(0)
    meta_info = last_match.group(1)
    return full_content.replace(meta_info, "")

# @markdown 5. 抓取文章回覆
def get_article_comments(soup: str) -> list:
    """Gather the text of every reply ("push") on an article page.

    Args:
        soup: Parsed article page. Annotated ``str``, but the body calls
            ``soup.find_all``, so a BeautifulSoup-like object is expected.

    Returns:
        One string per ``div.push`` element, in document order, with all
        newline characters removed.
    """
    push_divs = soup.find_all("div", attrs={"class": "push"})
    return [push.text.replace("\n", "") for push in push_divs]

## **統整在一起**

In [None]:
import re
import time
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

# URL template for a board index page. Formatting it with an empty `page`
# requests the board's default index page ("index.html").
url = "https://www.ptt.cc/bbs/{target}/index{page}.html"
r = requests.get(url.format(target=target, page=""))
soup = BeautifulSoup(r.text, "lxml")

index_page_urls = get_article_urls(soup=soup)
last_page_number = get_last_page_number(soup=soup)

outputs = []
urls = []
urls.extend(index_page_urls)

# The page fetched above is number `last_page_number`, so the remaining
# `pages - 1` pages are last_page_number - 1, last_page_number - 2, ...
# Starting the offset at 1 (not 2) keeps the page right before the newest one
# from being skipped. When no page number could be determined, only the page
# fetched above is crawled.
if last_page_number is not None:
  for i in range(1, pages):
    page = last_page_number - i
    if page < 1:
      # Ran past the first numbered page; nothing older to request.
      break
    r = requests.get(url.format(target=target, page=page))
    soup = BeautifulSoup(r.text, "lxml")
    urls.extend(get_article_urls(soup=soup))

# A distinct loop name keeps the `url` template above intact after this cell.
for article_url in urls:
  r = requests.get(article_url)
  soup = BeautifulSoup(r.text, "lxml")
  # Parse the title once and reuse it for the category lookup.
  title = get_article_title(soup=soup)
  output = {
    "url": article_url,
    "title": title,
    "category": get_article_category(string=title),
    "date": get_article_datetime(soup=soup),
    "content": get_article_content(soup=soup),
    "comments": get_article_comments(soup=soup)
    }
  outputs.append(output)
  print(f'---\n{output["title"]} {output["url"]}')
  # Short pause between article requests.
  time.sleep(0.2)

In [None]:
# Export to Excel.
# `DataFrame.to_excel` returns None, so build the frame first and keep it in
# `df`; chaining the call would leave `df` bound to None.
df = pd.DataFrame(outputs)
df.to_excel(f"/content/drive/MyDrive/crawler_{target}.xlsx")