In [1]:
import datetime
import numpy as np
import pandas as pd
import requests
import random
import time
import re
import jieba as jb
from bs4 import BeautifulSoup

In [2]:
# *** Choose the day from which the one-week crawl window starts ***
# NOTE(review): the prompt text mentions the insurance board, while the crawl
# cell of this notebook builds a Gossiping board URL -- confirm which is meant.
while True:
    try:
        print("請問要從哪一天開始抓取保險版一週的文章?")
        year = int(input('年(Year):'))
        month = int(input('月(month):'))
        day = int(input('日(day):'))
        datetime_date = datetime.date(year, month, day)
        break
    except (ValueError, OverflowError):
        # int() rejects non-numeric text and datetime.date rejects impossible
        # dates with these exceptions.  Anything else (notably
        # KeyboardInterrupt from a kernel interrupt) must propagate so the
        # prompt loop can actually be stopped.
        print("請輸入正確的年份、月份、日期")

# One week: the chosen day followed by the six days before it (newest first).
# strftime yields zero-padded 'MM/DD'; the leading '0' is stripped so the
# strings match the date format used on the PTT listing page.
date_range = [
    (datetime_date - datetime.timedelta(days=offset)).strftime('%m/%d').lstrip('0')
    for offset in range(7)
]

print("您選擇的日期範圍為",date_range)

請問要從哪一天開始抓取保險版一週的文章?
年(Year):2021
月(month):12
日(day):15
您選擇的日期範圍為 ['12/15', '12/14', '12/13', '12/12', '12/11', '12/10', '12/09']


In [3]:
def get_ppt_page(url, timeout=10):
    """Download a page and return its HTML text.

    Args:
        url: URL of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout.  Without
            a timeout a stalled connection would block the crawl forever.

    Returns:
        The response body as text, or None when the HTTP status is not 200
        (a message with the URL is printed in that case).
    """
    # The 'over18' cookie records the answer to the age-confirmation check.
    resp = requests.get(
        url=url,
        cookies={'over18': '1'},
        timeout=timeout,
    )
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text

In [4]:
def _parse_push_count(push_count):
    """Convert the text of an entry's 'nrec' cell into an integer score.

    Empty text maps to 0, plain digits to their integer value, '爆' to 100,
    any text starting with 'X' to -100, and anything else to 0.
    """
    if not push_count:
        return 0
    try:
        return int(push_count)
    except ValueError:
        # Not a number: the cell holds a marker instead of a count.
        if push_count == '爆':
            return 100
        if push_count.startswith('X'):
            return -100
        return 0


def get_pageinfo(resdata,domain_url):
    """Extract the articles of one board listing page that fall in the week.

    Reads the notebook-level ``date_range`` list to decide which entries
    (``<div class="r-ent">``) to keep.

    Args:
        resdata: HTML text of a listing page, or None when the download failed.
        domain_url: Prefix prepended to each article's relative link.

    Returns:
        A tuple ``(prev_url, df, whether_prev)``:
            prev_url: href of the second link in the paging button group
                (the "previous page" button), or None when it is missing.
            df: DataFrame with columns date, push_num, title, href.
            whether_prev: False when the caller should stop paging, i.e. the
                first entry on this page is outside ``date_range``, there is
                no previous-page link, or ``resdata`` is None.
    """
    columns = ['date', 'push_num', 'title', 'href']

    # A failed download arrives as None; report "nothing here, stop paging"
    # instead of crashing inside the HTML parser.
    if resdata is None:
        return None, pd.DataFrame(columns=columns), False

    soup = BeautifulSoup(resdata, 'html5lib')

    # Previous-page button: second <a> of the paging group.  Use .get() and a
    # length check so a missing group or an <a> without href yields None
    # rather than an exception.
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    paging_links = paging_div.find_all('a') if paging_div is not None else []
    prev_url = paging_links[1].get('href') if len(paging_links) > 1 else None

    pptdata = []
    whether_prev = True
    for position, entry in enumerate(soup.find_all('div', 'r-ent')):
        post_date = entry.find('div', 'date').text.strip()
        in_week = post_date in date_range

        # Only the first entry of the page drives the stop decision: if it is
        # already outside the week, the caller does not fetch the previous page.
        # NOTE(review): this assumes the first entry is the oldest on the page
        # and that the newest page already overlaps the week -- confirm.
        if position == 0 and not in_week:
            whether_prev = False

        if not in_week:
            continue

        # An entry without a hyperlink has no reachable article; skip it.
        link = entry.find('a')
        if link is None:
            continue

        pptdata.append({
            'date': post_date,
            'push_num': _parse_push_count(entry.find('div', 'nrec').text),
            'title': link.text,
            'href': domain_url + link['href'],
        })

    # Without a previous-page link there is nowhere left to go.
    if prev_url is None:
        whether_prev = False

    df = pd.DataFrame(pptdata, columns=columns)

    return prev_url,df,whether_prev

In [5]:
if __name__ == '__main__':
    # Board to crawl.
    # NOTE(review): the original comment here (and the date prompt) refer to
    # the insurance board, but this path is the Gossiping board -- confirm.
    domain_url = 'https://www.ptt.cc'
    Gossiping = '/bbs/Gossiping/index.html'

    # Start at the board's index page and keep following the "previous page"
    # link for as long as get_pageinfo says older pages are still needed.
    web_url = domain_url + Gossiping
    frames = []
    while True:
        #print(web_url)
        ppt_page = get_ppt_page(web_url)
        if ppt_page is None:
            # Download failed (non-200 status): stop rather than hand None
            # to the parser.
            break
        prev_href, df_next, whether_prev = get_pageinfo(ppt_page, domain_url)
        frames.append(df_next)
        if not whether_prev or prev_href is None:
            break
        web_url = domain_url + prev_href

    # Concatenate once at the end instead of re-copying the frame every page.
    if frames:
        df = pd.concat(frames)
    else:
        df = pd.DataFrame(columns=['date', 'push_num', 'title', 'href'])

    # Order newest day first.  The dates are 'M/DD' strings, so a plain string
    # sort would misplace single-digit months (e.g. '9/30' sorts above
    # '12/15') and weeks that cross a year boundary.  date_range is already
    # ordered newest -> oldest, so rank each row by its position in it; the
    # stable sort keeps the crawl order within a day.
    day_rank = {day_label: rank for rank, day_label in enumerate(date_range)}
    df = (
        df.assign(_rank=df['date'].map(day_rank))
          .sort_values('_rank', kind='stable')
          .drop(columns='_rank')
          .reset_index(drop=True)
    )

In [6]:
# Display the crawled articles (columns: date, push_num, title, href).
df

Unnamed: 0,date,push_num,title,href
0,12/15,12,[協尋] 12/13 高雄楠陽高架橋下機車專用道 北上,https://www.ptt.cc/bbs/Gossiping/M.1639509137....
1,12/15,-100,[公告] 禁止發電影爆雷文至12/19 23:59:59止,https://www.ptt.cc/bbs/Gossiping/M.1639530614....
2,12/09,20,[協尋] 求行車記錄器宜蘭縣冬山路12/3油罐車禍,https://www.ptt.cc/bbs/Gossiping/M.1639035420....
3,12/09,13,[協尋] 求行車記錄器新北市汐止弘道街車禍,https://www.ptt.cc/bbs/Gossiping/M.1638983770....
