In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('sentiwordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import sentiwordnet as swn

from nltk.sentiment.vader import SentimentIntensityAnalyzer

import stanza
stanza.download('en')  # Download the English model

import statistics
import numpy as np
import pandas as pd

import re

import requests
from bs4 import BeautifulSoup

import newspaper

import torch
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)

[nltk_data] Downloading package punkt to /home/pierluigi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-21 12:06:27 INFO: Downloading default packages for language: en (English) ...
2023-04-21 12:06:29 INFO: File exists: /home/pierluigi/stanza_resources/en/default.zip
2023-04-21 12:06:34 INFO: Finished downloading models and saved to /home/pierluigi/stanza_resources.




In [2]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment', tokenize_no_ssplit=False, max_split_size_mb=15)

2023-04-21 12:06:34 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-21 12:06:35 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| sentiment | sstplus  |

2023-04-21 12:06:35 INFO: Using device: cuda
2023-04-21 12:06:35 INFO: Loading: tokenize
2023-04-21 12:06:38 INFO: Loading: sentiment
2023-04-21 12:06:38 INFO: Done loading processors!


In [3]:
# Load the MPQA lexicon
lexicon = pd.read_csv("subjclueslen1-HLTEMNLP05.tff", sep=" ", header=None, 
                      names=["type", "len", "word", "pos", "stemmed", "polarity", "strength"])

lexicon["type"] = lexicon["type"].str[5:]
lexicon["word"] = lexicon["word"].str[len("word1="):]
lexicon["polarity"] = lexicon["polarity"].str[len("priorpolarity="):]
cols_to_remove = ["len", "pos", "stemmed", "strength"]
lexicon = lexicon.drop(columns=cols_to_remove)
lexicon["type"] = lexicon["type"].replace("weaksubj", 1)
lexicon["type"] = lexicon["type"].replace("strongsubj", 2)
lexicon["polarity"] = lexicon["polarity"].replace("negative", -1)
lexicon["polarity"] = lexicon["polarity"].replace("positive", 1)
lexicon["polarity"] = lexicon["polarity"].replace("both", 0)
lexicon["polarity"] = lexicon["polarity"].replace("neutral", 0)

In [4]:
url = "https://www.foxnews.com/politics/republicans-respond-after-irs-whistleblower-says-hunter-biden-investigation-being-mishandled"

In [7]:
def preprocess_article(url):
    # Create a newspaper Article object
    article = newspaper.Article(url)

    # Download and parse the article
    article.download()
    article.parse()

    # Extract the title, subtitle, description, and main text
    title = article.title.strip()
    subtitle = article.meta_data.get("description", "").strip()
    description = article.meta_description.strip()
    text = article.text.strip()

    # Set the subtitle to the description if it is empty
    if not subtitle:
        subtitle = description.strip()

    # Concatenate the extracted strings
    article_text = f"{title}\n\n{subtitle}\n\n{text}"
    # Tokenize the text into sentences
    sentences = sent_tokenize(article_text)
    
    # Identify the stop words for each sentence
    num_stop_words_per_sentence = []
    stop_words_per_sentence = []
    filtered_sentences = []
    num_words_per_sentence = []
    avg_stop_words_per_sentence = []
    total_words = 0

    # Create a Porter stemmer object
    stemmer = PorterStemmer()
    stemmed_sentences = []

    # Process the text with the pipeline and extract the sentiment for each sentence
    doc = nlp(text)
    sentiment_scores = []
    
    for i, sentence in enumerate(sentences):
        # Tokenize the sentence into words
        words = word_tokenize(sentence)
        all_words = len(words)
        total_words += all_words
        
        # Identify the stop words in the sentence
        stop_words = set(stopwords.words('english'))
        stop_words_found = [word for word in words if word.lower() in stop_words]
        all_stop_words = len(stop_words_found)
        filtered_words = [word for word in words if word.lower() not in stop_words]
        
        # Add the number of stop words and filtered sentence to the output
        num_stop_words = all_words - len(filtered_words)
        num_stop_words_per_sentence.append(num_stop_words)
        stop_words_per_sentence.append(stop_words_found)
        filtered_sentences.append(" ".join(filtered_words))
        num_words_per_sentence.append(all_words)
        
        # Calculate the average number of stop words per sentence
        avg_stop_words_per_sentence.append(num_stop_words / all_words)

        # Perform stemming on each word using the Porter stemmer
        stemmed_words = [stemmer.stem(word) for word in words]

        # Combine the stemmed words back into a single string
        stemmed_sentence = ' '.join(stemmed_words)
        stemmed_sentences.append(stemmed_sentence)
        output_text = '\n'.join(stemmed_sentences)

    for sentence in doc.sentences:
        sentiment_scores.append(sentence.sentiment)
    
    # Calculate summary statistics
    num_stop_words = sum(num_stop_words_per_sentence)
    num_sentences = len(sentences)
    avg_stop_words_per_sentence_all = num_stop_words / num_sentences
    max_stop_words_per_sentence = max(num_stop_words_per_sentence)
    min_stop_words_per_sentence = min(num_stop_words_per_sentence)
    avg_stop_words_per_word = num_stop_words / total_words
    
    # Calculate the average number of stop words per article
    avg_stop_words_per_sentence_avg = sum(avg_stop_words_per_sentence) / len(avg_stop_words_per_sentence)

    # Return the output
    return {
        'num_stop_words': num_stop_words,
        'total_words': total_words,
        'stop_words_found': stop_words_found,
        'all_stop_words': all_stop_words,
        'avg_stop_words_per_sentence_all': avg_stop_words_per_sentence_all,
        'max_stop_words_per_sentence': max_stop_words_per_sentence,
        'min_stop_words_per_sentence': min_stop_words_per_sentence,
        'avg_stop_words_per_word': avg_stop_words_per_word,
        'avg_stop_words_per_sentence': avg_stop_words_per_sentence,
        'avg_stop_words_per_sentence_avg': avg_stop_words_per_sentence_avg,
        'filtered_sentences': filtered_sentences,
        'stop_words_per_sentence': stop_words_per_sentence,
        'num_words_per_sentence': num_words_per_sentence,
        'num_stop_words_per_sentence': num_stop_words_per_sentence,
        'output_text': output_text,
        'sentiment_scores': sentiment_scores
    }


In [13]:
# Call the function to preprocess the article
results = preprocess_article(url)

# Print the information for each sentence
for i, sentence in enumerate(results['filtered_sentences']):
    print(f"Sentence {i+1}: {sentence}")
    print(f"Total words: {results['num_words_per_sentence'][i]}")
    print(f"Filtered words: {sentence.split()}")
    print(f"Number of filtered words: {len(sentence.split())}")
    print(f"Stop words: {results['stop_words_per_sentence'][i]}")
    print(f"Number of stop words: {results['num_stop_words_per_sentence'][i]}")
    print(f"Average number of stop words per sentence: {round(results['avg_stop_words_per_sentence'][i], 2)}")
    print(f"Sentiment score: {results['sentiment_scores'][i]}")
    print("\n")

# Print the general statistics on stop words
print(f"Total number of words: {results['total_words']}")
print(f"Total number of stop words: {results['num_stop_words']}")
print(f"Maximum number of stop words per sentence: {results['max_stop_words_per_sentence']}")
print(f"Minimum number of stop words per sentence: {results['min_stop_words_per_sentence']}")
print(f"Average number of stop words per article: {round(results['avg_stop_words_per_sentence_avg'], 2)}")
print(f"Average of sentiment score for all sentences: {sum(results['sentiment_scores']) / len(results['sentiment_scores'])}")
print(f"Maximum sentiment score: {max(results['sentiment_scores'])}")
print(f"Minimum sentiment score: {min(results['sentiment_scores'])}")
print(f"Standard deviation: {statistics.stdev(results['sentiment_scores'])}")


print()
print("Stemmed text:")
print(f"{results['output_text']}")


Sentence 1: Republicans respond IRS whistleblower says Hunter Biden investigation mishandled Members Congress calling transparency Biden administration IRS whistleblower said investigation Hunter Biden mishandled .
Total words: 38
Filtered words: ['Republicans', 'respond', 'IRS', 'whistleblower', 'says', 'Hunter', 'Biden', 'investigation', 'mishandled', 'Members', 'Congress', 'calling', 'transparency', 'Biden', 'administration', 'IRS', 'whistleblower', 'said', 'investigation', 'Hunter', 'Biden', 'mishandled', '.']
Number of filtered words: 23
Stop words: ['after', 'is', 'being', 'of', 'are', 'for', 'more', 'from', 'the', 'after', 'an', 'an', 'into', 'is', 'being']
Number of stop words: 15
Average number of stop words per sentence: 0.39
Sentiment score: 0


Sentence 2: Lawmakers Capitol Hill calling Biden administration held accountable `` blocking '' Congress public learning Biden family members ’ business deals China .
Total words: 35
Filtered words: ['Lawmakers', 'Capitol', 'Hill', '