In [4]:
import requests
from bs4 import BeautifulSoup
import time
import random
import json
import os
import re
from datetime import datetime, timedelta

# Pool of browser User-Agent strings; get_page_content() picks one at random
# (random.choice) for every request it sends.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1",
]

# Default input/output locations. The __main__ section rebinds INPUT_FILE and
# OUTPUT_DIR for each date it processes; LOG_FILE is shared by every run.
INPUT_FILE = "articles_links/20190105.txt"
OUTPUT_DIR = "reuters_articles/20190105"
LOG_FILE = "reuters_crawl_log.txt"
# exist_ok=True avoids the check-then-create race of os.path.exists() followed
# by os.makedirs() (another process could create the directory in between).
os.makedirs(OUTPUT_DIR, exist_ok=True)

def log_error(message):
    '''
    Append one timestamped line to the crawl log.

    The line has the form "YYYY-MM-DD HH:MM:SS - <message>" (local time) and
    is written to LOG_FILE, which is opened in append mode as UTF-8.

    Args:
        message: Text describing the error.
    '''
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')
    entry = f"{stamp} - {message}\n"
    with open(LOG_FILE, "a", encoding="utf-8") as log_handle:
        log_handle.write(entry)

def read_links(file_path):
    '''
    Load the article links stored one per line in a text file.

    Leading/trailing whitespace is stripped from each line and lines that are
    empty after stripping are skipped.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        The list of links in file order, or an empty list when the file does
        not exist (the problem is printed and logged via log_error).
    '''
    collected = []
    try:
        with open(file_path, "r", encoding="utf-8") as source:
            for raw_line in source:
                url = raw_line.strip()
                if url:
                    collected.append(url)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        log_error(f"File not found: {file_path}")
        return []
    return collected

def get_page_content(url):
    '''
    Download the page at ``url`` and return its text.

    The request carries browser-like headers with a User-Agent drawn at random
    from USER_AGENTS, follows redirects and times out after 15 seconds.

    Args:
        url: Address of the article page.

    Returns:
        The response body as text, or None when the request fails or the
        server answers with an error status (the failure is logged via
        log_error).
    '''
    chosen_agent = random.choice(USER_AGENTS)
    request_headers = {
        "User-Agent": chosen_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "Connection": "keep-alive",
    }
    try:
        resp = requests.get(
            url,
            headers=request_headers,
            timeout=15,
            allow_redirects=True,
        )
        # Turn 4xx/5xx answers into an exception so they are logged below.
        resp.raise_for_status()
        page_text = resp.text
    except requests.RequestException as exc:
        log_error(f"Request failed: {url}, error: {exc}")
        return None
    return page_text

def parse_article(html):
    '''
    Extract the fields of interest from an article page.

    Args:
        html: Markup of the article page.

    Returns:
        A dict with the keys "title", "date", "time", "updated", "body"
        (strings, empty when the element is not found) and "tags" (a list of
        strings).
    '''
    soup = BeautifulSoup(html, 'html.parser')

    # Title comes from the <title> element; empty string when it is absent.
    title_node = soup.find('title')
    title = title_node.get_text(strip=True) if title_node else ""

    # The first three date-line elements are taken as date, time and updated.
    # With fewer than three matches all three fields are left empty.
    date_nodes = soup.find_all(class_='date-line__date___kNbY')
    if len(date_nodes) < 3:
        date = time_ = updated = ""
    else:
        date, time_, updated = (
            node.get_text(strip=True) for node in date_nodes[:3]
        )

    # Body text: every matching element's text, concatenated with no separator.
    # NOTE(review): get_text(strip=True) uses an empty separator by default,
    # so text from adjacent child nodes may run together -- confirm this is
    # the intended output format.
    body = "".join(
        section.get_text(strip=True)
        for section in soup.find_all(class_='article-body__content__17Yit')
    )

    # Tags: drop the 'Suggested Topics:' label; an entry that is empty once
    # the label is removed is discarded.
    label = "Suggested Topics:"
    tags = []
    for node in soup.find_all(attrs={'aria-label': 'Tags'}):
        text = node.get_text(strip=True)
        if not text.startswith(label):
            tags.append(text)
            continue
        remainder = text.replace(label, "").strip()
        if remainder:
            tags.append(remainder)

    return {
        "title": title,
        "date": date,
        "time": time_,
        "updated": updated,
        "body": body,
        "tags": tags
    }

# Characters that must not appear in a file name: the punctuation reserved on
# Windows plus the ASCII control characters 0x00-0x1F (NUL makes open() fail
# on every platform; the rest are rejected by Windows).
_INVALID_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')


def sanitize_filename(title):
    '''
    Turn an article title into a string that is safe to use as a file name.

    Each reserved punctuation character (< > : " / \\ | ? *) and each ASCII
    control character (newline, tab, NUL, ...) is replaced with an underscore.
    Spaces and all other characters are kept.

    Args:
        title: Raw title text.

    Returns:
        The sanitized title, truncated to at most 200 characters.
    '''
    sanitized = _INVALID_FILENAME_CHARS.sub('_', title)
    return sanitized[:200]

def save_article(article_data, index):
    '''
    Write one parsed article to a JSON file inside OUTPUT_DIR.

    The file is named after the sanitized title, or "article_<index>" (index
    zero-padded to four digits) when the title is empty. If that name is
    already taken, "_1", "_2", ... is appended until a free name is found.

    Args:
        article_data: Dict describing the article; must contain "title".
        index: Number used for the fallback file name.
    '''
    title = article_data["title"]
    stem = sanitize_filename(title) if title else f"article_{index:04d}"
    stem_path = f"{OUTPUT_DIR}/{stem}"

    # Probe for the first name that does not exist yet.
    filename = f"{stem_path}.json"
    suffix = 0
    while os.path.exists(filename):
        suffix += 1
        filename = f"{stem_path}_{suffix}.json"

    with open(filename, "w", encoding="utf-8") as out:
        json.dump(article_data, out, ensure_ascii=False, indent=4)
    print(f"Saved article: {article_data['title']} to {filename}")

def process_articles(max_articles=100):
    '''
    Download, parse and save the articles listed in INPUT_FILE.

    The links are shuffled and then fetched one at a time. Every page that
    downloads successfully is parsed and saved; failed downloads are skipped
    (get_page_content logs them). A random pause of 1-3 seconds follows each
    link so requests are spaced out.

    Args:
        max_articles: Stop as soon as this many articles have been saved.
            Defaults to 100; pass None to process every link. A value below 1
            means nothing is fetched.
    '''
    if max_articles is not None and max_articles < 1:
        return

    links = read_links(INPUT_FILE)
    if not links:
        print("No link found, exit the process")
        return

    random.shuffle(links)

    print("Start to process...")
    succ_no = 0
    # The 1-based position doubles as the fallback file name for untitled pages.
    for i, link in enumerate(links, 1):
        html = get_page_content(link)
        if html:
            article_data = parse_article(html)
            save_article(article_data, i)
            succ_no += 1
            if max_articles is not None and succ_no >= max_articles:
                break

        # a random delay
        time.sleep(random.uniform(1, 3))

if __name__ == "__main__":
    # First and last day to crawl (both inclusive).
    start_date = datetime(2019, 1, 24)
    end_date = datetime(2019, 1, 25)

    # Nothing is processed when end_date lies before start_date.
    day_count = (end_date - start_date).days + 1
    for day_offset in range(day_count):
        current_date = start_date + timedelta(days=day_offset)
        date_str = current_date.strftime("%Y%m%d")

        # Rebind the module-level paths that process_articles() and
        # save_article() read, so they point at this day's files.
        INPUT_FILE = f"articles_links/{date_str}.txt"
        OUTPUT_DIR = f"reuters_articles/{date_str}"

        # exist_ok=True replaces the racy "check, then create" pair.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        process_articles()


当前输入文件: articles_links/20190121.txt
当前输出目录: reuters_articles/20190121
开始处理...
已保存文章: Erdogan - Turkey is ready to take over Syria's Manbij | Reuters 到 reuters_articles/20190121/Erdogan - Turkey is ready to take over Syria's Manbij _ Reuters.json
已保存文章: Oladipo, Pacers cruise by Hornets | Reuters 到 reuters_articles/20190121/Oladipo, Pacers cruise by Hornets _ Reuters.json
已保存文章: EXCLUSIVE-Modi considers cheap loans, other help for small Indian businesses - sources | Reuters 到 reuters_articles/20190121/EXCLUSIVE-Modi considers cheap loans, other help for small Indian businesses - sources _ Reuters.json
已保存文章: Modern China's birth rate falls to lowest ever | Reuters 到 reuters_articles/20190121/Modern China's birth rate falls to lowest ever _ Reuters.json
已保存文章: Rival groups demonstrate in Thailand as election tensions grow | Reuters 到 reuters_articles/20190121/Rival groups demonstrate in Thailand as election tensions grow _ Reuters.json
已保存文章: CRISIL ratings for Indian debt instruments-Ja

KeyboardInterrupt: 