In [26]:
import os
import requests
from bs4 import BeautifulSoup


def _meta_content(soup, **attrs):
    """Return the stripped ``content`` of the first <meta> tag matching ``attrs``, or ""."""
    tag = soup.find('meta', attrs=attrs)
    if tag and 'content' in tag.attrs:
        return tag['content'].strip()
    return ""


def _safe_filename(title):
    """Turn a page title into a name that is safe to use as a single path component."""
    # Path separators would make open() target a (missing) subdirectory and
    # control characters such as newlines are not valid in file names; every
    # other character is kept so ordinary titles map to the same file as before.
    cleaned = "".join(
        "_" if ch in "/\\" or ord(ch) < 32 else ch for ch in title
    ).strip()
    # A page without a <title> would otherwise be written to the hidden file ".txt".
    return cleaned or "untitled"


def save_article_text(url, directory="text_articles", timeout=30):
    """Download an article page and save its title, subtitle and body as a text file.

    The file is written to ``<directory>/<title>.txt`` (UTF-8) and contains the
    page title, a subtitle and the text of the first ``<article>`` element,
    separated by blank lines. The subtitle is the ``description`` meta tag,
    falling back to the Open Graph description when that is missing or empty.

    Args:
        url: Address of the article page to download.
        directory: Folder the text file is written to; created if missing.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For other request failures, such as a timeout.
        OSError: If the directory or file cannot be written.
    """
    # Set headers to mimic a web browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    # Without a timeout a stalled server would block the call forever
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    title_element = soup.find('title')
    title = title_element.text.strip() if title_element else ""

    subtitle = _meta_content(soup, name='description')
    if not subtitle:
        # Open Graph tags are written as <meta property="og:description" ...>;
        # the name= spelling is still checked for pages that use it.
        subtitle = (
            _meta_content(soup, property='og:description')
            or _meta_content(soup, name='og:description')
        )

    # Drop promotional elements (matched by class name) before reading the body
    unwanted_elements = soup.find_all(
        ['script', 'style', 'a', 'div', 'span'],
        class_=['follow-us', 'newsletter', 'advertisement'],
    )
    for element in unwanted_elements:
        element.extract()

    article_element = soup.find('article')
    text = article_element.text.strip() if article_element else ""

    # Concatenate the extracted strings
    article_text = f"{title}\n\n{subtitle}\n\n{text}"

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(directory, exist_ok=True)

    # Save article text to a file within the target directory
    filename = f"{directory}/{_safe_filename(title)}.txt"
    with open(filename, "w", encoding="utf-8") as file:
        file.write(article_text)

    print(f"Article saved: {filename}")

In [27]:
# Articles to download
urls = [
    "https://www.nytimes.com/2023/05/19/nyregion/weisselberg-trump-bragg-perjury.html",
    "https://www.nytimes.com/2023/05/19/opinion/republican-legislatures-abortion-trangender-education.html",
    "https://nymag.com/intelligencer/2023/05/andrew-yang-advises-vivek-ramaswamy-to-lean-into-memes.html"
]

# Save each article; a failure on one URL (HTTP error, timeout, unwritable
# file) is reported and must not stop the remaining downloads.
for url in urls:
    try:
        save_article_text(url)
    except (requests.RequestException, OSError) as exc:
        print(f"Failed to save {url}: {exc}")

Article saved: text_articles/Trump Ally Could Face Perjury Charge if He Doesn’t Cooperate With D.A. - The New York Times.txt
Article saved: text_articles/Opinion | The Four Freedoms, According to Republicans - The New York Times.txt
Article saved: text_articles/Andrew Yang Advises Vivek Ramaswamy to ‘Lean Into Memes’.txt
