In [31]:
from bs4 import BeautifulSoup
import requests
import datetime

def day_delta(date, today=None):
    """Return how long ago the most recent occurrence of a month/day date was.

    The input carries no year, so the date is resolved to the latest
    occurrence that is not in the future relative to `today`.

    Args:
        date: A string of the form 'MM/DD' (e.g. '11/08').
        today: Reference day as a `datetime.date`. Defaults to the current
            local date; mainly useful for making the result reproducible.

    Returns:
        A non-negative `datetime.timedelta` equal to `today` minus the
        resolved date.

    Raises:
        ValueError: If `date` is not a valid 'MM/DD' calendar date.
    """
    if today is None:
        today = datetime.date.today()
    month, day = map(int, date.split('/'))

    # Normally at most two years are tried (this year, then last year).
    # Feb 29 only exists in leap years, so keep stepping back until a year
    # that contains it is found; leap years are never more than 8 years apart.
    for year in range(today.year, today.year - 9, -1):
        try:
            candidate = datetime.date(year, month, day)
        except ValueError:
            if (month, day) != (2, 29):
                raise  # genuinely invalid date such as '13/40'
            continue
        if candidate <= today:
            return today - candidate
    raise ValueError('cannot resolve date %r relative to %s' % (date, today))

def lineNotifyMessage(token, msg):
    """Post `msg` to the LINE Notify endpoint and return the HTTP status code.

    Args:
        token: LINE Notify access token, sent as a Bearer credential.
        msg: Text of the notification.

    Returns:
        The integer HTTP status code of the response.

    Raises:
        requests.exceptions.RequestException: On connection failure or if
            the server does not answer within the timeout.
    """
    headers = {
        "Authorization": "Bearer " + token,
        "Content-Type": "application/x-www-form-urlencoded"
    }
    payload = {'message': msg}

    r = requests.post(
        "https://notify-api.line.me/api/notify",
        headers=headers,
        # Send the message as the form-encoded request body, matching the
        # declared Content-Type, instead of putting it in the URL query string.
        data=payload,
        # Without a timeout requests can block indefinitely.
        timeout=10)
    return r.status_code

# Index page of the PTT "studyabroad" board where the crawl starts.
url = 'https://www.ptt.cc/bbs/studyabroad/index.html'
# Browser-like user agent sent with every request to the board.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}
# A timeout keeps the script from hanging forever on a stalled connection.
res = requests.get(url, headers=headers, timeout=10)
# Fail with a clear HTTP error instead of an obscure IndexError further down
# when the server returns an error page.
res.raise_for_status()

# Name the parser explicitly so parsing does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser" warning).
soup = BeautifulSoup(res.text, 'html.parser')
# One 'div.r-ent' element per post entry on the index page; the list is
# reversed so entries are walked bottom-to-top (presumably newest first).
chunks = soup.select('div.r-ent')
chunks.reverse()
# The second paging button is taken as the link to the previous (older) page.
prev_page = 'https://www.ptt.cc' + soup.select("div#action-bar-container div.btn-group-paging a")[1]["href"]

# Posts older than this many days end the crawl.
max_days = 3
# Collected posts, one dict per post, in the order they were visited.
results = []

while True:
    done = False

    for chunk in chunks:
        # Get title
        title = chunk.select('div.title')[0].text.strip()
        if '公告' in title or '刪除' in title or '留學版個人化搜尋小工具(新增所有領域)' in title: # Skip redundant posts
            continue

        # Get date
        date = chunk.select('div.date')[0].text.strip()
        if day_delta(date) > datetime.timedelta(days=max_days): # Exit if finding post is too old
            done = True
            break

        # Get author
        author = chunk.select('div.author')[0].text.strip()

        # Get the number of pushes; the element is absent when the post has
        # none. An explicit None check replaces the former bare `except:`,
        # which would also have hidden unrelated errors and Ctrl-C.
        push_tag = chunk.select_one('span.hl')
        push_cnt = push_tag.text.strip() if push_tag is not None else '0'

        # Get link
        link = 'https://www.ptt.cc' + chunk.select('div.title a')[0]['href']

        results.append({
            'title': title,
            'author': author,
            'date': date,
            'push_cnt': push_cnt,
            'link': link
        })

    # Stop once an old-enough post was seen, or when there is no older page
    # left to visit.
    if done or prev_page is None:
        break

    # Crawl the previous page
    res = requests.get(prev_page, headers=headers, timeout=10)
    res.raise_for_status()

    # Explicit parser: keeps results independent of which parsers are installed.
    soup = BeautifulSoup(res.text, 'html.parser')
    chunks = soup.select('div.r-ent')
    chunks.reverse()
    # Update prev_page; a paging button without an href means there is no
    # older page, so record None instead of raising KeyError.
    prev_href = soup.select("div#action-bar-container div.btn-group-paging a")[1].get("href")
    prev_page = 'https://www.ptt.cc' + prev_href if prev_href else None

# Build the digest: one line per post, each preceded by a newline, in the
# form "<push count> <title> <date> <link>". str.join avoids repeated
# string concatenation inside a loop.
message = ''.join(
    f"\n{result['push_cnt']} {result['title']} {result['date']} {result['link']}"
    for result in results
)
# Placeholder: replace with a real LINE Notify access token before running.
token = 'YOUR_TOKEN'
print(message)
# Only call the API when there is something to report; an empty message
# would just be a wasted request.
if results:
    print(lineNotifyMessage(token, message))


0 [問題] 空大出國讀碩 11/08 https://www.ptt.cc/bbs/studyabroad/M.1667919528.A.8E7.html
4 [選校] 2023 Fall ME轉CS 極低GPA  選校請益 11/08 https://www.ptt.cc/bbs/studyabroad/M.1667916372.A.CB6.html
0 [租屋] （代po）法國里昂交換租房 11/08 https://www.ptt.cc/bbs/studyabroad/M.1667901429.A.D7F.html
0 [尋人] ASU 2023 Spring 11/08 https://www.ptt.cc/bbs/studyabroad/M.1667886109.A.2B8.html
0 [情報] 2022感恩節 留美碩博士線上講座 11/08 https://www.ptt.cc/bbs/studyabroad/M.1667857741.A.DD7.html
0 [情報] 南加大 USC 經濟學與計量經濟碩士 11/11 11/07 https://www.ptt.cc/bbs/studyabroad/M.1667806906.A.2F3.html
10 [選校] 2023fall MIS 中高GPA 轉領域選校 11/07 https://www.ptt.cc/bbs/studyabroad/M.1667757723.A.52C.html
4 [問題] 有指定問題的SOP寫作方向 11/06 https://www.ptt.cc/bbs/studyabroad/M.1667731279.A.CA0.html
0 [租屋] 租屋請益 11/06 https://www.ptt.cc/bbs/studyabroad/M.1667730233.A.590.html
5 [情報] George Mason University CS招生 11/06 https://www.ptt.cc/bbs/studyabroad/M.1667695913.A.317.html
37 Re: [心得] CS 末班車果然已經開走了 11/05 https://www.ptt.cc/bbs/studyabroad/M.1667657295.A.9A7.html
16 [情報