In [16]:
import os
import requests
from bs4 import BeautifulSoup
import re

def save_article_text(url):
    # Set headers to mimic a web browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    # Send a GET request to the URL with headers
    response = requests.get(url, headers=headers)
    response.raise_for_status()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title, subtitle, description, and main text
    title_element = soup.find('title')
    title = title_element.text.strip() if title_element else ""

    subtitle_element = soup.find('meta', attrs={'name': 'description'})
    subtitle = subtitle_element['content'].strip() if subtitle_element and 'content' in subtitle_element.attrs else ""

    description_element = soup.find('meta', attrs={'name': 'og:description'})
    description = description_element['content'].strip() if description_element and 'content' in description_element.attrs else ""

    # Find and exclude unwanted elements by class names or content patterns
    unwanted_elements = soup.find_all(['script', 'style', 'a', 'div', 'span'], class_=['follow-us', 'newsletter', 'advertisement'])
    patterns_to_exclude = ['next article', 'read next', 'correlated']
    for element in unwanted_elements:
        if any(pattern in str(element).lower() for pattern in patterns_to_exclude):
            element.extract()

    # Find and exclude footer container and "All rights reserved" text
    footer_elements = soup.find_all(['footer', 'div'], class_=['footer', 'bottom-footer'])
    for element in footer_elements:
        element.extract()
    all_rights_reserved_elements = soup.find_all(text=re.compile(r'\bAll rights reserved\b', re.IGNORECASE))
    for element in all_rights_reserved_elements:
        element.extract()

    # Find the main text element(s) based on the HTML structure of the page
    main_text_elements = soup.find_all('p')
    main_text = "\n\n".join([element.text.strip() for element in main_text_elements if element.text.strip()])

    # Set the subtitle to the description if it is empty
    if not subtitle:
        subtitle = description.strip()

    # Concatenate the extracted strings
    article_text = f"{title}\n\n{subtitle}\n\n{main_text}"

    # Create "text_articles" directory if it doesn't exist
    directory = "text_articles"
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Save article text to a file within the "text_articles" directory
    filename = f"{directory}/{title}.txt"
    with open(filename, "w", encoding="utf-8") as file:
        file.write(article_text)

    print(f"Article saved: {filename}")


In [17]:
# List of URLs
urls = [
    "https://www.reuters.com/world/us/us-debt-ceiling-talks-paused-punchbowl-reporter-2023-05-19/",
    "https://www.aljazeera.com/news/2023/5/20/zelenskky-hails-bidens-decision-on-f-16-jet-training-for-ukraine"
]

# Save articles from the list of URLs
for url in urls:
    save_article_text(url)

  all_rights_reserved_elements = soup.find_all(text=re.compile(r'\bAll rights reserved\b', re.IGNORECASE))


Article saved: text_articles/White House, Republican team say no progress in debt ceiling talks | Reuters.txt
Article saved: text_articles/Zelenskky hails Biden’s decision on F-16 jet training for Ukraine | Russia-Ukraine war News | Al Jazeera.txt
