In [1]:
# Standard library
import re
import statistics

# Third-party
import newspaper
import nltk
import numpy as np
import pandas as pd
import requests
import stanza
from bs4 import BeautifulSoup
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the NLTK data packages; packages already present are reported as up-to-date
for resource in ('punkt', 'stopwords', 'wordnet', 'vader_lexicon', 'sentiwordnet'):
    nltk.download(resource)

stanza.download('en')  # Download the English model

[nltk_data] Downloading package punkt to /home/pierluigi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-27 16:53:27 INFO: Downloading default packages for language: en (English) ...
2023-04-27 16:53:29 INFO: File exists: /home/pierluigi/stanza_resources/en/default.zip
2023-04-27 16:53:34 INFO: Finished downloading models and saved to /home/pierluigi/stanza_resources.


# Converting a link into a string

We use the following function to download an article and convert it into a single string, which we then use for sentence and word tokenization. 
This avoids having to save the article to a .txt file every time.

In [2]:
# Article to analyse; this address is passed to get_article_info() further down
url = "https://www.foxnews.com/politics/republicans-respond-after-irs-whistleblower-says-hunter-biden-investigation-being-mishandled"

In [3]:
def get_article_info(url):
    """Download a news article and return its title, subtitle and body as one string.

    Parameters
    ----------
    url : str
        Address of the article; handed to ``newspaper.Article``.

    Returns
    -------
    str
        The non-empty parts among title, subtitle and body text, in that
        order, separated by one blank line. The subtitle is taken from the
        "description" entry of ``article.meta_data`` and falls back to
        ``article.meta_description`` when that entry is missing or empty.
    """
    def _clean(value):
        # Anything that is not a string (None, nested metadata, ...) counts as missing
        return value.strip() if isinstance(value, str) else ""

    # Create a newspaper Article object, then download and parse it
    article = newspaper.Article(url)
    article.download()
    article.parse()

    title = _clean(article.title)
    subtitle = _clean(article.meta_data.get("description")) or _clean(article.meta_description)
    text = _clean(article.text)

    # Join only the parts that exist so a missing subtitle leaves no extra blank lines
    return "\n\n".join(part for part in (title, subtitle, text) if part)

In [4]:
# Download the article once; `article` holds its title, subtitle and body as a single string
article = get_article_info(url)
print(article)

Republicans respond after IRS whistleblower says Hunter Biden investigation is being mishandled

Members of Congress are calling for more transparency from the Biden administration after an IRS whistleblower said an investigation into Hunter Biden is being mishandled.

Lawmakers on Capitol Hill are calling for the Biden administration to be held accountable for "blocking" Congress and the public from learning more about Biden family members’ business deals with China.

The congressional outcries come as a whistleblower within the Internal Revenue Service alleges an investigation into Hunter Biden is being mishandled by the Biden administration. The whistleblower also alleges "clear conflicts of interest" in the investigation.

"It’s deeply concerning that the Biden Administration may be obstructing justice by blocking efforts to charge Hunter Biden for tax violations," House Committee on Oversight and Accountability Chairman James Comer told Fox News on Wednesday.

Comer, R-Ky., also s

# Reading the .txt file and converting it into text (optional)

In [5]:
# Optional alternative to downloading: read the article text from a local .txt file.
# To use it, uncomment the lines below and point `filepath` at your own file.
#filepath = "/home/pierluigi/Documents/echo_chambers_intership/newsArticle.txt"
#with open(filepath, 'r') as file:
#    # Assign the contents of the file to the article variable
#    article = file.read()
#
# Print the contents of the file
#print(article)


# Tokenization of Sentences

Tokenization is a fundamental pre-processing step in natural language processing (NLP) that involves breaking down a text into smaller units, typically words, phrases, or symbols. In the context of sentence tokenization, the process involves segmenting a text document into individual sentences based on certain rules or patterns.

Sentence tokenization is important in NLP because many NLP tasks, such as sentiment analysis, machine translation, and text summarization, require input text to be split into sentences so that the task can be performed on a sentence-by-sentence basis.

There are several ways to perform sentence tokenization, ranging from simple rule-based approaches to more sophisticated machine learning-based methods. Rule-based approaches use a set of hand-crafted rules to segment text into sentences based on common sentence-ending punctuation marks such as periods, question marks, and exclamation marks. However, such approaches can be prone to errors when faced with complex sentence structures or non-standard punctuation marks.

Machine learning-based approaches use statistical models to learn patterns from large amounts of annotated text data and apply these patterns to new, unseen text. These methods typically involve training a model on a corpus of text documents, and then using the trained model to segment new text into sentences based on learned patterns.

In [6]:
# Split the article text into a list of sentence strings.
# Note: the headline has no closing punctuation, so it stays attached to the
# first sentence that follows it (see the first item of the printed list).
sentences = sent_tokenize(article)
print(sentences)

['Republicans respond after IRS whistleblower says Hunter Biden investigation is being mishandled\n\nMembers of Congress are calling for more transparency from the Biden administration after an IRS whistleblower said an investigation into Hunter Biden is being mishandled.', 'Lawmakers on Capitol Hill are calling for the Biden administration to be held accountable for "blocking" Congress and the public from learning more about Biden family members’ business deals with China.', 'The congressional outcries come as a whistleblower within the Internal Revenue Service alleges an investigation into Hunter Biden is being mishandled by the Biden administration.', 'The whistleblower also alleges "clear conflicts of interest" in the investigation.', '"It’s deeply concerning that the Biden Administration may be obstructing justice by blocking efforts to charge Hunter Biden for tax violations," House Committee on Oversight and Accountability Chairman James Comer told Fox News on Wednesday.', 'Comer

Another way to print the sentences: one sentence per line.

In [7]:
# Print each sentence on its own line, which is easier to read than the list repr
for sentence in sentences:
    print(sentence)

Republicans respond after IRS whistleblower says Hunter Biden investigation is being mishandled

Members of Congress are calling for more transparency from the Biden administration after an IRS whistleblower said an investigation into Hunter Biden is being mishandled.
Lawmakers on Capitol Hill are calling for the Biden administration to be held accountable for "blocking" Congress and the public from learning more about Biden family members’ business deals with China.
The congressional outcries come as a whistleblower within the Internal Revenue Service alleges an investigation into Hunter Biden is being mishandled by the Biden administration.
The whistleblower also alleges "clear conflicts of interest" in the investigation.
"It’s deeply concerning that the Biden Administration may be obstructing justice by blocking efforts to charge Hunter Biden for tax violations," House Committee on Oversight and Accountability Chairman James Comer told Fox News on Wednesday.
Comer, R-Ky., also said 

# Tokenization of words

In [8]:
# Tokenize every sentence into word and punctuation tokens and show each token list
for sentence in sentences:
    words = word_tokenize(sentence)
    print(words)

['Republicans', 'respond', 'after', 'IRS', 'whistleblower', 'says', 'Hunter', 'Biden', 'investigation', 'is', 'being', 'mishandled', 'Members', 'of', 'Congress', 'are', 'calling', 'for', 'more', 'transparency', 'from', 'the', 'Biden', 'administration', 'after', 'an', 'IRS', 'whistleblower', 'said', 'an', 'investigation', 'into', 'Hunter', 'Biden', 'is', 'being', 'mishandled', '.']
['Lawmakers', 'on', 'Capitol', 'Hill', 'are', 'calling', 'for', 'the', 'Biden', 'administration', 'to', 'be', 'held', 'accountable', 'for', '``', 'blocking', "''", 'Congress', 'and', 'the', 'public', 'from', 'learning', 'more', 'about', 'Biden', 'family', 'members', '’', 'business', 'deals', 'with', 'China', '.']
['The', 'congressional', 'outcries', 'come', 'as', 'a', 'whistleblower', 'within', 'the', 'Internal', 'Revenue', 'Service', 'alleges', 'an', 'investigation', 'into', 'Hunter', 'Biden', 'is', 'being', 'mishandled', 'by', 'the', 'Biden', 'administration', '.']
['The', 'whistleblower', 'also', 'alleges'

# Stop word removal

In [9]:
# Build the English stop-word set once; it is the same for every sentence
stop_words = set(stopwords.words('english'))

total_words = 0

for sentence_number, sentence in enumerate(sentences, start=1):
    # Tokenize the sentence into words (punctuation marks come back as tokens too)
    words = word_tokenize(sentence)

    # Split the tokens into kept words and stop words (case-insensitive match)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    stop_words_found = [word for word in words if word.lower() in stop_words]

    # Count all tokens in the sentence and add them to the running total
    all_words = len(words)
    total_words += all_words

    # Count all the stop words in the sentence
    all_stop_words = len(stop_words_found)

    # Print out the results for each sentence
    print("Sentence ", sentence_number)
    print("Total words:", all_words)
    print("Filtered words:", filtered_words)
    print("Number of filtered words:", len(filtered_words))
    print("Stop words identified:", stop_words_found)
    print("Number of stop words identified:", all_stop_words)
    print()

print("Total number of words:", total_words)  # print the total sum of all words

Sentence  1
Total words: 38
Filtered words: ['Republicans', 'respond', 'IRS', 'whistleblower', 'says', 'Hunter', 'Biden', 'investigation', 'mishandled', 'Members', 'Congress', 'calling', 'transparency', 'Biden', 'administration', 'IRS', 'whistleblower', 'said', 'investigation', 'Hunter', 'Biden', 'mishandled', '.']
Number of filtered words: 23
Stop words identified: ['after', 'is', 'being', 'of', 'are', 'for', 'more', 'from', 'the', 'after', 'an', 'an', 'into', 'is', 'being']
Number of stop words identified: 15

Sentence  2
Total words: 35
Filtered words: ['Lawmakers', 'Capitol', 'Hill', 'calling', 'Biden', 'administration', 'held', 'accountable', '``', 'blocking', "''", 'Congress', 'public', 'learning', 'Biden', 'family', 'members', '’', 'business', 'deals', 'China', '.']
Number of filtered words: 22
Stop words identified: ['on', 'are', 'for', 'the', 'to', 'be', 'for', 'and', 'the', 'from', 'more', 'about', 'with']
Number of stop words identified: 13

Sentence  3
Total words: 26
Filte

# Pre-process text

In [10]:
def remove_stop_words(text):
    """Remove English stop words from ``text`` and summarise how many were removed.

    The text is split into sentences with ``sent_tokenize`` and every sentence
    into tokens with ``word_tokenize``. A token counts as a stop word when its
    lower-cased form is in NLTK's English stop-word list. All counts are token
    counts, so whatever the tokenizer returns (including punctuation tokens)
    is counted as a "word".

    Parameters
    ----------
    text : str
        The text to process. Empty text yields zero counts and empty lists
        instead of raising.

    Returns
    -------
    dict
        'num_stop_words'                  total stop words in the text
        'total_words'                     total tokens in the text
        'avg_stop_words_per_sentence_all' stop words divided by number of sentences
        'max_stop_words_per_sentence'     largest per-sentence stop-word count
        'min_stop_words_per_sentence'     smallest per-sentence stop-word count
        'avg_stop_words_per_word'         stop words divided by total tokens
        'avg_stop_words_per_sentence'     list: per-sentence stop-word / token ratio
        'avg_stop_words_per_sentence_avg' mean of the per-sentence ratios
        'filtered_sentences'              list: each sentence with stop words
                                          removed, tokens joined by single spaces
        'stop_words_per_sentence'         list: the stop words found in each sentence
        'num_words_per_sentence'          list: token count of each sentence
    """
    # The stop-word set is identical for every sentence, so build it only once
    stop_words = set(stopwords.words('english'))

    num_stop_words_per_sentence = []
    stop_words_per_sentence = []
    filtered_sentences = []
    num_words_per_sentence = []
    avg_stop_words_per_sentence = []

    for sentence in sent_tokenize(text):
        words = word_tokenize(sentence)
        removed_words = [w for w in words if w.lower() in stop_words]
        kept_words = [w for w in words if w.lower() not in stop_words]

        num_words_per_sentence.append(len(words))
        num_stop_words_per_sentence.append(len(removed_words))
        # Store the stop words themselves here; the kept words go to filtered_sentences
        stop_words_per_sentence.append(removed_words)
        filtered_sentences.append(" ".join(kept_words))

        # Share of this sentence's tokens that are stop words (0.0 for a token-less sentence)
        avg_stop_words_per_sentence.append(len(removed_words) / len(words) if words else 0.0)

    # Summary statistics; every division is guarded so empty text does not raise
    num_sentences = len(num_words_per_sentence)
    total_words = sum(num_words_per_sentence)
    num_stop_words = sum(num_stop_words_per_sentence)

    avg_stop_words_per_sentence_all = num_stop_words / num_sentences if num_sentences else 0.0
    avg_stop_words_per_word = num_stop_words / total_words if total_words else 0.0
    avg_stop_words_per_sentence_avg = (
        sum(avg_stop_words_per_sentence) / num_sentences if num_sentences else 0.0
    )

    return {
        'num_stop_words': num_stop_words,
        "total_words": total_words,
        'avg_stop_words_per_sentence_all': avg_stop_words_per_sentence_all,
        'max_stop_words_per_sentence': max(num_stop_words_per_sentence, default=0),
        'min_stop_words_per_sentence': min(num_stop_words_per_sentence, default=0),
        'avg_stop_words_per_word': avg_stop_words_per_word,
        'avg_stop_words_per_sentence': avg_stop_words_per_sentence,
        'avg_stop_words_per_sentence_avg': avg_stop_words_per_sentence_avg,
        'filtered_sentences': filtered_sentences,
        'stop_words_per_sentence': stop_words_per_sentence,
        'num_words_per_sentence': num_words_per_sentence,
    }


In [11]:
results = remove_stop_words(article)

print("Filtered sentences:")
# Pair every filtered sentence with its own ratio by position. Looking the sentence
# up with list.index() returns the first match, which mislabels repeated sentences.
for filtered_sentence, stop_word_ratio in zip(
    results["filtered_sentences"], results["avg_stop_words_per_sentence"]
):
    print(filtered_sentence)
    # The printed value is the share of the sentence's tokens that are stop words
    print("Average number of stop words per sentence:", round(stop_word_ratio, 2))
    print()

print("Statistics on stop words:")
print("Total number of words:", results["total_words"])
print("Number of stop words:", results["num_stop_words"])
print("Maximum number of stop words per sentence:", results["max_stop_words_per_sentence"])
print("Minimum number of stop words per sentence:", results["min_stop_words_per_sentence"])
# The printed value is the total stop words divided by the total number of tokens
print("Average number of stop words per article:", round(results["avg_stop_words_per_word"], 2))


Filtered sentences:
Republicans respond IRS whistleblower says Hunter Biden investigation mishandled Members Congress calling transparency Biden administration IRS whistleblower said investigation Hunter Biden mishandled .
Average number of stop words per sentence: 0.39

Lawmakers Capitol Hill calling Biden administration held accountable `` blocking '' Congress public learning Biden family members ’ business deals China .
Average number of stop words per sentence: 0.37

congressional outcries come whistleblower within Internal Revenue Service alleges investigation Hunter Biden mishandled Biden administration .
Average number of stop words per sentence: 0.38

whistleblower also alleges `` clear conflicts interest '' investigation .
Average number of stop words per sentence: 0.29

`` ’ deeply concerning Biden Administration may obstructing justice blocking efforts charge Hunter Biden tax violations , '' House Committee Oversight Accountability Chairman James Comer told Fox News Wednesda