In [13]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from bs4 import BeautifulSoup
import requests

In [14]:
def save_article_text(url):
    # Set headers to mimic a web browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    # Send a GET request to the URL with headers
    response = requests.get(url, headers=headers)
    response.raise_for_status()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title, subtitle, description, and main text
    title_element = soup.find('title')
    title = title_element.text.strip() if title_element else ""

    subtitle_element = soup.find('meta', attrs={'name': 'description'})
    subtitle = subtitle_element['content'].strip() if subtitle_element and 'content' in subtitle_element.attrs else ""

    description_element = soup.find('meta', attrs={'name': 'og:description'})
    description = description_element['content'].strip() if description_element and 'content' in description_element.attrs else ""

    # Find and exclude unwanted elements by class names or content patterns
    unwanted_elements = soup.find_all(['script', 'style', 'a', 'div', 'span'], class_=['follow-us', 'newsletter', 'advertisement'])
    patterns_to_exclude = ['next article', 'read next', 'correlated']
    for element in unwanted_elements:
        if any(pattern in str(element).lower() for pattern in patterns_to_exclude):
            element.extract()

    # Find and exclude footer container and "All rights reserved" text
    footer_elements = soup.find_all(['footer', 'div'], class_=['footer', 'bottom-footer'])
    for element in footer_elements:
        element.extract()
    all_rights_reserved_elements = soup.find_all(text=re.compile(r'\bAll rights reserved\b', re.IGNORECASE))
    for element in all_rights_reserved_elements:
        element.extract()

    # Find the main text element(s) based on the HTML structure of the page
    main_text_elements = soup.find_all('p')
    main_text = "\n\n".join([element.text.strip() for element in main_text_elements if element.text.strip()])

    # Set the subtitle to the description if it is empty
    if not subtitle:
        subtitle = description.strip()

    # Concatenate the extracted strings
    article_text = f"{title}\n\n{subtitle}\n\n{main_text}"

    return article_text

In [15]:
def analyze_article(article):
    stop_words = set(stopwords.words('english'))
    sentences = sent_tokenize(article)

    total_adjectives = 0
    total_words = 0
    total_proper_names = 0

    for sent in sentences:
        words = word_tokenize(sent)
        words = [word for word in words if word.lower() not in stop_words]
        tagged_words = pos_tag(words)
        num_adjectives = len([word for word, tag in tagged_words if tag.startswith('JJ')])
        total_adjectives += num_adjectives
        total_words += len(words)
        total_proper_names += len([word for word, tag in tagged_words if tag == 'NNP'])

    print(f"Total words: {total_words}")
    print(f"Total adjectives: {total_adjectives}")
    print(f"Total proper names: {total_proper_names}")

In [16]:
url = "https://www.foxnews.com/politics/biden-vetoes-bill-cancelling-student-loan-handout"

article = save_article_text(url)
analyze_article(article)

Total words: 565
Total adjectives: 37
Total proper names: 159
