In [None]:
import os
import time
import json
import subprocess
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

def fetch_page(url):
    """Download ``url`` by invoking the ``curl`` executable.

    A fixed set of browser-like request headers is sent with the request.

    Returns:
        The text curl wrote to standard output when it exited with
        status 0, otherwise ``None``.
    """
    request_headers = (
        'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language: zh-CN,zh;q=0.9,en;q=0.8',
        'sec-fetch-dest: document',
        'sec-fetch-mode: navigate',
        'sec-fetch-site: none',
        'sec-fetch-user: ?1',
        'upgrade-insecure-requests: 1',
        'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
    )

    # Each header becomes its own "-H <header>" argument pair.
    command = ['curl', url]
    for header in request_headers:
        command.extend(('-H', header))

    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode != 0:
        return None
    return completed.stdout

def fetch_article(url, max_redirects=10):
    """Download an article with ``curl -i`` and follow redirects by hand.

    The response headers are included in curl's output (``-i``); if they
    contain a ``Location`` header, the target is fetched instead.

    Args:
        url: Address of the article to download.
        max_redirects: How many further redirects may still be followed.
            A negative value means the budget is used up, in which case
            nothing is fetched.

    Returns:
        The response body as text, or ``None`` when curl exits with a
        non-zero status or the redirect budget is exhausted.
    """
    from urllib.parse import urljoin

    # Bound the recursion so a redirect loop cannot end in RecursionError.
    if max_redirects < 0:
        return None

    curl_command = [
        'curl', '-i', url,
        '-H', 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        '-H', 'accept-language: zh-CN,zh;q=0.9,en;q=0.8',
        '-b', '_awl=2.1740198990.5-404e3f6e73e257dac52922cf63761642-6763652d617369612d6561737431-7; _chartbeat2=.1705476051042.1740198990957.0000000100100001.CPMAl_EV33CBYEv5GlgcVZDhfqVU.1; _cb_svref=external; dicbo_id=%7B%22dicbo_fetch%22%3A1740198991413%7D; datadome=KsKBUmnX5wD9DYXmSxENthQPyM4B3zge5fMF5mI1e__ZOO15Sim_GaHji473RrW3ArTLjguYiu~UFv1GTFmQ9Qbnf2N9vml8uCd3wrAiRmrk0Nly~LBKRZvcYUGmfawK',
        '-H', 'priority: u=0, i',
        '-H', 'sec-ch-device-memory: 8',
        '-H', 'sec-ch-ua: "Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
        '-H', 'sec-ch-ua-arch: "arm"',
        '-H', 'sec-ch-ua-full-version-list: "Not(A:Brand";v="99.0.0.0", "Google Chrome";v="133.0.6943.127", "Chromium";v="133.0.6943.127"',
        '-H', 'sec-ch-ua-mobile: ?0',
        '-H', 'sec-ch-ua-model: ""',
        '-H', 'sec-ch-ua-platform: "macOS"',
        '-H', 'sec-fetch-dest: document',
        '-H', 'sec-fetch-mode: navigate',
        '-H', 'sec-fetch-site: none',
        '-H', 'sec-fetch-user: ?1',
        '-H', 'upgrade-insecure-requests: 1',
        '-H', 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36'
    ]

    result = subprocess.run(curl_command, capture_output=True, text=True)
    # A failed transfer has no trustworthy headers, so stop before parsing.
    if result.returncode != 0:
        return None

    # In text mode subprocess converts line endings to "\n", so the blank
    # line that separates headers from body shows up as "\n\n".
    headers, _, body = result.stdout.partition("\n\n")

    for line in headers.split("\n"):
        name, _, value = line.partition(":")
        location = value.strip()
        # The space after the colon is optional, so compare the name only.
        if name.strip().lower() == "location" and location:
            # Location may be relative; resolve it against the current URL.
            redirected_url = urljoin(url, location)
            print(redirected_url)
            return fetch_article(redirected_url, max_redirects - 1)

    return body

def parse_archive_page(html):
    """Collect the article URLs linked from an archive listing page.

    Anchors carrying ``data-testid="TitleLink"`` whose ``href`` starts with
    ``/article/`` are turned into absolute ``https://www.reuters.com`` URLs.

    Args:
        html: Markup of the archive page.

    Returns:
        A list of absolute article URLs in document order, each URL
        appearing once.
    """
    soup = BeautifulSoup(html, 'html.parser')
    articles = []
    seen = set()

    for link in soup.find_all('a', {'data-testid': 'TitleLink'}):
        href = link.get('href')
        # An anchor without an href yields None; skip it instead of crashing.
        if not href or not href.startswith("/article/"):
            continue

        article_url = "https://www.reuters.com" + href
        # The same article can be linked more than once on a page; keep the
        # first occurrence so it is not downloaded repeatedly.
        if article_url in seen:
            continue
        seen.add(article_url)
        articles.append(article_url)

    return articles

def parse_article_page(html):
    """Extract the fields of interest from an article page.

    Args:
        html: Markup of the article page.

    Returns:
        A dict with the keys ``title``, ``date``, ``time``, ``updated``,
        ``body`` and ``tags``. ``tags`` is a list of strings; every other
        value is a string, empty when the matching element is absent.
    """
    document = BeautifulSoup(html, 'html.parser')

    title_tag = document.find('title')
    headline = title_tag.get_text(strip=True) if title_tag else ""

    # The first three date-line elements are stored as date, time and
    # updated; with fewer than three matches all of them stay empty.
    date_nodes = document.find_all(class_='date-line__date___kNbY')
    if len(date_nodes) >= 3:
        published, clock, revised = (
            node.get_text(strip=True) for node in date_nodes[:3]
        )
    else:
        published = clock = revised = ""

    # Text of every body element, concatenated without a separator.
    body_parts = []
    for node in document.find_all(class_='article-body__content__17Yit'):
        body_parts.append(node.get_text(strip=True))
    text = "".join(body_parts)

    labels = []
    for node in document.find_all(attrs={'aria-label': 'Tags'}):
        labels.append(node.get_text(strip=True))

    return {
        "title": headline,
        "date": published,
        "time": clock,
        "updated": revised,
        "body": text,
        "tags": labels
    }

def save_article(article, date_str):
    """Write ``article`` as JSON to ``articles/<date_str>/<title>.json``.

    The file name is derived from the article title: ``/`` and NUL are
    replaced with ``_``, the stem is cut to 250 UTF-8 bytes, and an empty
    or missing title falls back to ``untitled``. An existing file with the
    same name is overwritten.

    Args:
        article: JSON-serialisable mapping; its ``title`` entry names the file.
        date_str: Name of the sub-folder under ``articles``.
    """
    folder = f"articles/{date_str}"
    os.makedirs(folder, exist_ok=True)

    stem = (article.get('title') or '').replace('/', '_').replace('\0', '_')
    # File names are commonly capped at 255 bytes; keep 5 for ".json" and
    # drop any multi-byte character split by the cut.
    stem = stem.encode('utf-8')[:250].decode('utf-8', errors='ignore')
    if not stem:
        # Without a title the file would be the hidden, ambiguous ".json".
        stem = "untitled"

    file_name = os.path.join(folder, f"{stem}.json")
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(article, f, ensure_ascii=False, indent=4)

def main(start_date=datetime(2023, 1, 1), end_date=datetime(2023, 1, 2), delay=5):
    """Crawl the archive day by day and save every article found.

    For each day in the inclusive range, archive pages are requested with
    an increasing page number until a page cannot be fetched or lists no
    article links. Each article is downloaded, parsed and stored under
    ``articles/<YYYYMMDD>/``.

    Args:
        start_date: First day to crawl.
        end_date: Last day to crawl (inclusive).
        delay: Seconds to sleep after each article and after each page.
    """
    base_url = "https://www.reuters.com/archive/{}/{}/{}/"
    current_date = start_date

    while current_date <= end_date:
        yyyy_mm = current_date.strftime("%Y-%m")
        dd = current_date.strftime("%d")
        folder_name = current_date.strftime("%Y%m%d")
        page = 1
        # URLs already handled for this day, so a link that is listed more
        # than once is downloaded a single time.
        seen_urls = set()

        while True:
            archive_url = base_url.format(yyyy_mm, dd, page)
            print(archive_url)
            html = fetch_page(archive_url)
            if not html:
                break

            article_links = parse_archive_page(html)
            if not article_links:
                break

            for article_url in article_links:
                if article_url in seen_urls:
                    continue
                seen_urls.add(article_url)

                print(article_url)
                article_html = fetch_article(article_url)
                if article_html:
                    article_data = parse_article_page(article_html)
                    print(article_data['title'])
                    save_article(article_data, folder_name)
                time.sleep(delay)

            page += 1
            time.sleep(delay)

        current_date += timedelta(days=1)


# Guarded so that importing this code does not start a crawl; in a notebook
# or when run as a script __name__ is "__main__" and the crawl still runs.
if __name__ == "__main__":
    main()

https://www.reuters.com/archive/2023-01/01/1/
https://www.reuters.com/article/world/netanyahu-says-israel-not-bound-by-despicable-un-vote-idUSKBN2TF0AL/
https://jp.reuters.com/world/middle-east/netanyahu-says-israel-not-bound-by-despicable-un-vote-2022-12-31/
ページが見つかりません
https://www.reuters.com/article/world/netanyahu-says-israel-not-bound-by-despicable-un-vote-idUSKBN2TF0AL/
https://jp.reuters.com/world/middle-east/netanyahu-says-israel-not-bound-by-despicable-un-vote-2022-12-31/
ページが見つかりません
https://www.reuters.com/article/world/netanyahu-says-israel-not-bound-by-despicable-un-vote-idUSKBN2TF0AL/
https://jp.reuters.com/world/middle-east/netanyahu-says-israel-not-bound-by-despicable-un-vote-2022-12-31/
ページが見つかりません
https://www.reuters.com/article/sports/basketball/down-15-at-half-no-4-kansas-rallies-past-oklahoma-st-idUSMTZEICVR83JOY/
Down 15 at half, No. 4 Kansas rallies past Oklahoma St. | Reuters
https://www.reuters.com/article/sports/basketball/down-15-at-half-no-4-kansas-rallies-pa