In [1]:
import os
import time
import json
import subprocess
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import random

def fetch_page(url):
    """Download ``url`` by shelling out to ``curl`` with browser-like headers.

    Args:
        url: Address handed to curl as its target.

    Returns:
        The text curl wrote to stdout when it exits with status 0,
        otherwise ``None``.
    """
    request_headers = (
        'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language: zh-CN,zh;q=0.9,en;q=0.8',
        'sec-fetch-dest: document',
        'sec-fetch-mode: navigate',
        'sec-fetch-site: none',
        'sec-fetch-user: ?1',
        'upgrade-insecure-requests: 1',
        'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
    )

    # Expand every header into its own "-H <header>" argument pair.
    command = ['curl', url]
    for header in request_headers:
        command.extend(('-H', header))

    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode != 0:
        return None
    return completed.stdout

def fetch_article(url, max_redirects=10):
    """Fetch an article with ``curl -i``, following redirects manually.

    The response headers are inspected for a ``Location`` header; when one is
    present the request is repeated against the redirect target. Redirects
    are followed iteratively and capped so that a redirect loop cannot
    recurse without bound.

    Args:
        url: Address of the article to download.
        max_redirects: Maximum number of redirects to follow before giving up.

    Returns:
        The response body (everything after the first blank line of curl's
        output) when curl exits with status 0, otherwise ``None``. ``None``
        is also returned when the redirect limit is exceeded.
    """
    # Local import keeps this block self-contained; relative Location values
    # (e.g. "/article/...") must be resolved against the URL that produced them.
    from urllib.parse import urljoin

    # NOTE(review): the cookie below is a hard-coded session value and
    # presumably expires; confirm how it is meant to be refreshed.
    extra_args = [
        '-H', 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        '-H', 'accept-language: zh-CN,zh;q=0.9,en;q=0.8',
        '-b', '_awl=2.1740198990.5-404e3f6e73e257dac52922cf63761642-6763652d617369612d6561737431-7; _chartbeat2=.1705476051042.1740198990957.0000000100100001.CPMAl_EV33CBYEv5GlgcVZDhfqVU.1; _cb_svref=external; dicbo_id=%7B%22dicbo_fetch%22%3A1740198991413%7D; datadome=KsKBUmnX5wD9DYXmSxENthQPyM4B3zge5fMF5mI1e__ZOO15Sim_GaHji473RrW3ArTLjguYiu~UFv1GTFmQ9Qbnf2N9vml8uCd3wrAiRmrk0Nly~LBKRZvcYUGmfawK',
        '-H', 'priority: u=0, i',
        '-H', 'sec-ch-device-memory: 8',
        '-H', 'sec-ch-ua: "Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
        '-H', 'sec-ch-ua-arch: "arm"',
        '-H', 'sec-ch-ua-full-version-list: "Not(A:Brand";v="99.0.0.0", "Google Chrome";v="133.0.6943.127", "Chromium";v="133.0.6943.127"',
        '-H', 'sec-ch-ua-mobile: ?0',
        '-H', 'sec-ch-ua-model: ""',
        '-H', 'sec-ch-ua-platform: "macOS"',
        '-H', 'sec-fetch-dest: document',
        '-H', 'sec-fetch-mode: navigate',
        '-H', 'sec-fetch-site: none',
        '-H', 'sec-fetch-user: ?1',
        '-H', 'upgrade-insecure-requests: 1',
        '-H', 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
    ]

    current_url = url
    # One initial request plus at most ``max_redirects`` follow-ups.
    for _attempt in range(max_redirects + 1):
        result = subprocess.run(
            ['curl', '-i', current_url, *extra_args],
            capture_output=True,
            text=True,
        )
        # With text=True line endings are normalised to "\n", so the header
        # block ends at the first empty line.
        headers, _, body = result.stdout.partition("\n\n")

        redirect_target = None
        for line in headers.split("\n"):
            if line.lower().startswith("location: "):
                redirect_target = line.split(": ", 1)[1].strip()
                break

        if not redirect_target:
            return body if result.returncode == 0 else None

        current_url = urljoin(current_url, redirect_target)
        print(current_url)

    # Redirect limit exceeded: treat it like any other failed fetch.
    return None

def parse_archive_page(html):
    """Extract absolute article URLs from an archive listing page.

    Only anchors carrying ``data-testid="TitleLink"`` whose ``href`` starts
    with ``/article/`` are kept; each is prefixed with the Reuters origin.

    Args:
        html: Markup of the archive page.

    Returns:
        A list of absolute article URLs in the order the anchors were found
        (duplicates are not removed).
    """
    soup = BeautifulSoup(html, 'html.parser')
    articles = []

    for link in soup.find_all('a', {'data-testid': 'TitleLink'}):
        href = link.get('href')
        # ``get`` returns None for an anchor without an href attribute;
        # skip those instead of failing on ``None.startswith``.
        if href and href.startswith("/article/"):
            articles.append("https://www.reuters.com" + href)

    return articles

def parse_article_page(html):
    """Pull the title, date fields, body text and tags out of an article page.

    Args:
        html: Markup of the article page.

    Returns:
        A dict with the keys ``title``, ``date``, ``time``, ``updated``,
        ``body`` and ``tags``. The three date fields are empty strings unless
        at least three date-line elements are present; ``tags`` is a list of
        strings and every other value is a string.
    """
    soup = BeautifulSoup(html, 'html.parser')

    title_tag = soup.find('title')
    if title_tag:
        title = title_tag.get_text(strip=True)
    else:
        title = ""

    # The first three date-line elements map to date, time and updated.
    date_nodes = soup.find_all(class_='date-line__date___kNbY')
    if len(date_nodes) >= 3:
        date, time_, updated = (
            node.get_text(strip=True) for node in date_nodes[:3]
        )
    else:
        date = time_ = updated = ""

    body_nodes = soup.find_all(class_='article-body__content__17Yit')
    body = "".join(node.get_text(strip=True) for node in body_nodes)

    tag_nodes = soup.find_all(attrs={'aria-label': 'Tags'})
    tags = [node.get_text(strip=True) for node in tag_nodes]

    return {
        "title": title,
        "date": date,
        "time": time_,
        "updated": updated,
        "body": body,
        "tags": tags
    }

def save_articles_links(current_date, article_links):
    """Write one day's article links to ``articles_links/YYYYMMDD.txt``.

    The output directory is created when missing and an existing file for
    the same date is overwritten. One link is written per line.

    Args:
        current_date: Object with ``strftime`` (e.g. ``datetime``) that names
            the output file.
        article_links: Iterable of links. Sets are written in sorted order so
            the file content is reproducible between runs; any other iterable
            keeps its own order.
    """
    # Format the date as YYYYMMDD for the file name.
    date_str = current_date.strftime("%Y%m%d")

    # Create the output directory; exist_ok avoids the check-then-create race.
    output_dir = "articles_links"
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, f"{date_str}.txt")

    # Set iteration order is arbitrary, so sort sets for a stable file.
    if isinstance(article_links, (set, frozenset)):
        ordered_links = sorted(article_links)
    else:
        ordered_links = list(article_links)

    # Write the links to the file, one per line.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{link}\n" for link in ordered_links)

    print(f"Saved {len(ordered_links)} links to {output_file}")
    
def main(start_date=None, end_date=None):
    """Crawl the Reuters archive day by day and save each day's article links.

    For every date in the inclusive range, archive pages are fetched one after
    another until a page yields fewer than two links or a fetch fails. The
    collected links are de-duplicated and written via ``save_articles_links``.
    Random sleeps are inserted between days and between pages.

    Args:
        start_date: First day to crawl; defaults to ``datetime(2019, 2, 9)``.
        end_date: Last day to crawl (inclusive); defaults to
            ``datetime(2019, 12, 31)``.
    """
    base_url = "https://www.reuters.com/archive/{}/{}/{}/"
    if start_date is None:
        start_date = datetime(2019, 2, 9)
    if end_date is None:
        end_date = datetime(2019, 12, 31)

    current_date = start_date
    while current_date <= end_date:
        # Pause between days.
        time.sleep(random.uniform(60, 100))
        yyyy_mm = current_date.strftime("%Y-%m")
        dd = current_date.strftime("%d")
        day_label = current_date.strftime("%Y-%m-%d")
        print(f'current_date={day_label}')
        page = 1

        article_links = set()
        # Base delay before retrying the first page; grows while retries repeat.
        retry_sleep_time = 60

        while True:
            archive_url = base_url.format(yyyy_mm, dd, page)
            html = fetch_page(archive_url)
            if not html:
                # Fetch failed or returned nothing: stop paging for this day.
                break

            links = parse_archive_page(html)
            article_links.update(links)

            if len(links) < 2:
                if page == 1:
                    # Too few links on the very first page is treated as being
                    # blocked: discard what was collected and retry page 1.
                    article_links.clear()

                    # Each consecutive block adds another 60 seconds of sleep.
                    retry_sleep_time += 60
                    print(f'Sleep for {retry_sleep_time} seconds before retry')
                    time.sleep(retry_sleep_time)
                    continue

                # A later page with too few links marks the end of the day.
                print('not links')
                break
            else:
                # A normal page resets the retry delay.
                retry_sleep_time = 60

            page += 1
            if page % 10 == 0:
                print(f'{page} pages for {day_label}, total num is {len(article_links)}')
            time.sleep(random.uniform(1, 15))
        print(f'Totally {page} pages for {day_label}, total num is {len(article_links)}')
        save_articles_links(current_date, article_links)

        current_date += timedelta(days=1)


# Start the crawl only when run directly (script or notebook cell),
# not as a side effect of importing this module.
if __name__ == "__main__":
    main()

current_date=2019-02-05
10 pages for 2019-02-05, total num is 62
20 pages for 2019-02-05, total num is 124
30 pages for 2019-02-05, total num is 164
40 pages for 2019-02-05, total num is 225
50 pages for 2019-02-05, total num is 285
60 pages for 2019-02-05, total num is 336
70 pages for 2019-02-05, total num is 391
80 pages for 2019-02-05, total num is 440
90 pages for 2019-02-05, total num is 482
100 pages for 2019-02-05, total num is 534
110 pages for 2019-02-05, total num is 588
120 pages for 2019-02-05, total num is 633
130 pages for 2019-02-05, total num is 685
not links
Totally 131 pages for 2019-02-05, total num is 686
Saved 686 links to articles_links/20190205.txt
current_date=2019-02-06
10 pages for 2019-02-06, total num is 53
20 pages for 2019-02-06, total num is 99
30 pages for 2019-02-06, total num is 150
40 pages for 2019-02-06, total num is 209
50 pages for 2019-02-06, total num is 274
60 pages for 2019-02-06, total num is 325
70 pages for 2019-02-06, total num is 371
80 

KeyboardInterrupt: 