In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('sentiwordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import sentiwordnet as swn

from nltk.sentiment.vader import SentimentIntensityAnalyzer

import stanza
stanza.download('en')  # Download the English model

import statistics
import numpy as np
import pandas as pd

import re

import requests
from bs4 import BeautifulSoup

import newspaper

[nltk_data] Downloading package punkt to /home/pierluigi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/pierluigi/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-20 15:06:48 INFO: Downloading default packages for language: en (English) ...
2023-04-20 15:06:49 INFO: File exists: /home/pierluigi/stanza_resources/en/default.zip
2023-04-20 15:06:56 INFO: Finished downloading models and saved to /home/pierluigi/stanza_resources.


# Converting a link into a string

We use the following script to convert an article into a string, which we will use for the tokenization of sentences and words, accordingly. 
This avoids saving the article in a .txt file every time.

In [4]:
url = "https://www.foxnews.com/politics/republicans-respond-after-irs-whistleblower-says-hunter-biden-investigation-being-mishandled"

In [5]:
def get_article_info(url):
    # Create a newspaper Article object
    article = newspaper.Article(url)

    # Download and parse the article
    article.download()
    article.parse()

    # Extract the title, subtitle, description, and main text
    title = article.title.strip()
    subtitle = article.meta_data.get("description", "").strip()
    description = article.meta_description.strip()
    text = article.text.strip()

    # Set the subtitle to the description if it is empty
    if not subtitle:
        subtitle = description.strip()

    # Concatenate the extracted strings
    article_text = f"{title}\n\n{subtitle}\n\n{text}"

    # Return the concatenated string
    return article_text

In [6]:
article = get_article_info(url)
print(article)

Republicans respond after IRS whistleblower says Hunter Biden investigation is being mishandled

Members of Congress are calling for more transparency from the Biden administration after an IRS whistleblower said an investigation into Hunter Biden is being mishandled.

Lawmakers on Capitol Hill are calling for the Biden administration to be held accountable for "blocking" Congress and the public from learning more about Biden family members’ business deals with China.

The congressional outcries come as a whistleblower within the Internal Revenue Service alleges an investigation into Hunter Biden is being mishandled by the Biden administration. The whistleblower also alleges "clear conflicts of interest" in the investigation.

"It’s deeply concerning that the Biden Administration may be obstructing justice by blocking efforts to charge Hunter Biden for tax violations," House Committee on Oversight and Accountability Chairman James Comer told Fox News on Wednesday.

Comer, R-Ky., also s

# Reading the .txt file and converting it into text (optional)

In [15]:
# read input file
filepath = input("/home/pierluigi/Documents/echo_chambers_intership/newsArticle.txt")
with open(filepath, 'r') as file:
    # assign the contents of the file to the article variable
    article = file.read()

# print the contents of the file
print(article)


Republican presidential candidate Nikki Haley stepped into the hallway after speaking at the Conservative Political Action Conference on Friday to supporters asking for selfies and autographs — and, from others, a less friendly greeting.

“We love Trump, we love Trump!” a crowd around her started chanting. Some Haley supporters shouted her name back as the former U.N. ambassador escaped with staff to an elevator.

The dust-up showed the risks of taking the primary fight to what has clearly become Trump’s home turf. Though CPAC has long been seen as a big-tent forum for the conservative movement and a mandatory cattle call for presidential hopefuls, the annual conference has increasingly grown into a stomping ground for the 45th president and his “Make America Great Again” wing of the GOP. Trump will speak at the event Saturday.

“Remember, you’re not at CPAC, you’re at TPAC,” John Fredericks, a pro-Trump talk radio host broadcasting from the sidelines here, said in an interview Wednesd

# Tokenization of Sentences

Tokenization is a fundamental pre-processing step in natural language processing (NLP) that involves breaking down a text into smaller units, typically words, phrases, or symbols. In the context of sentence tokenization, the process involves segmenting a text document into individual sentences based on certain rules or patterns.

Sentence tokenization is important in NLP because many NLP tasks, such as sentiment analysis, machine translation, and text summarization, require input text to be split into sentences so that the task can be performed on a sentence-by-sentence basis.

There are several ways to perform sentence tokenization, ranging from simple rules-based approaches to more sophisticated machine learning-based methods. Rules-based approaches use a set of hand-crafted rules to segment text into sentences based on common sentence-ending punctuation marks such as periods, question marks, and exclamation marks. However, such approaches can be prone to errors when faced with complex sentence structures or non-standard punctuation marks.

Machine learning-based approaches use statistical models to learn patterns from large amounts of annotated text data and apply these patterns to new, unseen text. These methods typically involve training a model on a corpus of text documents, and then using the trained model to segment new text into sentences based on learned patterns.

In [7]:
# Tokenize the text into sentences
sentences = sent_tokenize(article)
print(sentences)

['Republicans respond after IRS whistleblower says Hunter Biden investigation is being mishandled\n\nMembers of Congress are calling for more transparency from the Biden administration after an IRS whistleblower said an investigation into Hunter Biden is being mishandled.', 'Lawmakers on Capitol Hill are calling for the Biden administration to be held accountable for "blocking" Congress and the public from learning more about Biden family members’ business deals with China.', 'The congressional outcries come as a whistleblower within the Internal Revenue Service alleges an investigation into Hunter Biden is being mishandled by the Biden administration.', 'The whistleblower also alleges "clear conflicts of interest" in the investigation.', '"It’s deeply concerning that the Biden Administration may be obstructing justice by blocking efforts to charge Hunter Biden for tax violations," House Committee on Oversight and Accountability Chairman James Comer told Fox News on Wednesday.', 'Comer

Another way to print sentences

In [9]:
# Print out each sentence
for sentence in sentences:
    print(sentence)

Republicans respond after IRS whistleblower says Hunter Biden investigation is being mishandled

Members of Congress are calling for more transparency from the Biden administration after an IRS whistleblower said an investigation into Hunter Biden is being mishandled.
Lawmakers on Capitol Hill are calling for the Biden administration to be held accountable for "blocking" Congress and the public from learning more about Biden family members’ business deals with China.
The congressional outcries come as a whistleblower within the Internal Revenue Service alleges an investigation into Hunter Biden is being mishandled by the Biden administration.
The whistleblower also alleges "clear conflicts of interest" in the investigation.
"It’s deeply concerning that the Biden Administration may be obstructing justice by blocking efforts to charge Hunter Biden for tax violations," House Committee on Oversight and Accountability Chairman James Comer told Fox News on Wednesday.
Comer, R-Ky., also said 

# Tokenization of words

In [17]:
for i, sentence in enumerate(sentences):

    # Tokenize the sentence into words
    words = word_tokenize(sentence)

    print(words)

['Republicans', 'respond', 'after', 'IRS', 'whistleblower', 'says', 'Hunter', 'Biden', 'investigation', 'is', 'being', 'mishandled', 'Members', 'of', 'Congress', 'are', 'calling', 'for', 'more', 'transparency', 'from', 'the', 'Biden', 'administration', 'after', 'an', 'IRS', 'whistleblower', 'said', 'an', 'investigation', 'into', 'Hunter', 'Biden', 'is', 'being', 'mishandled', '.']
['Lawmakers', 'on', 'Capitol', 'Hill', 'are', 'calling', 'for', 'the', 'Biden', 'administration', 'to', 'be', 'held', 'accountable', 'for', '``', 'blocking', "''", 'Congress', 'and', 'the', 'public', 'from', 'learning', 'more', 'about', 'Biden', 'family', 'members', '’', 'business', 'deals', 'with', 'China', '.']
['The', 'congressional', 'outcries', 'come', 'as', 'a', 'whistleblower', 'within', 'the', 'Internal', 'Revenue', 'Service', 'alleges', 'an', 'investigation', 'into', 'Hunter', 'Biden', 'is', 'being', 'mishandled', 'by', 'the', 'Biden', 'administration', '.']
['The', 'whistleblower', 'also', 'alleges'

# Stop word removal

In [18]:
total_words = 0

for i, sentence in enumerate(sentences):
    # Tokenize the sentence into words
    words = word_tokenize(sentence)
    
    # Identify the stop words in the sentence
    stop_words = set(nltk.corpus.stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    stop_words_found = [word for word in words if word.lower() in stop_words]
    
    # Count all the words in the sentence
    all_words = len(words)
    total_words += all_words  # add the count of all_words to the total_words variable

    # Count all the stop words in the sentence
    all_stop_words = len(stop_words_found)

    # Print out the results for each sentence
    print("Sentence ", i+1)
    print("Total words:", all_words)
    print("Filtered words:", filtered_words)
    print("Number of filtered words:", len(filtered_words))
    print("Stop words identified:", stop_words_found)
    print("Number of stop words identified:", all_stop_words)
    print()

print("Total number of words:", total_words)  # print the total sum of all words

Sentence  1
Total words: 38
Filtered words: ['Republicans', 'respond', 'IRS', 'whistleblower', 'says', 'Hunter', 'Biden', 'investigation', 'mishandled', 'Members', 'Congress', 'calling', 'transparency', 'Biden', 'administration', 'IRS', 'whistleblower', 'said', 'investigation', 'Hunter', 'Biden', 'mishandled', '.']
Number of filtered words: 23
Stop words identified: ['after', 'is', 'being', 'of', 'are', 'for', 'more', 'from', 'the', 'after', 'an', 'an', 'into', 'is', 'being']
Number of stop words identified: 15

Sentence  2
Total words: 35
Filtered words: ['Lawmakers', 'Capitol', 'Hill', 'calling', 'Biden', 'administration', 'held', 'accountable', '``', 'blocking', "''", 'Congress', 'public', 'learning', 'Biden', 'family', 'members', '’', 'business', 'deals', 'China', '.']
Number of filtered words: 22
Stop words identified: ['on', 'are', 'for', 'the', 'to', 'be', 'for', 'and', 'the', 'from', 'more', 'about', 'with']
Number of stop words identified: 13

Sentence  3
Total words: 26
Filte

# Pre process text

In [68]:
def remove_stop_words(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Identify the stop words for each sentence
    num_stop_words_per_sentence = []
    stop_words_per_sentence = []
    filtered_sentences = []
    num_words_per_sentence = []
    avg_stop_words_per_sentence = []
    total_words = 0
    
    for sentence in sentences:
        # Tokenize the sentence into words
        words = word_tokenize(sentence)
        num_words = len(words)
        total_words += num_words
        
        # Identify the stop words in the sentence
        stop_words = set(stopwords.words('english'))
        filtered_words = [w for w in words if not w.lower() in stop_words]
        
        # Add the number of stop words and filtered sentence to the output
        num_stop_words = num_words - len(filtered_words)
        num_stop_words_per_sentence.append(num_stop_words)
        stop_words_per_sentence.append(filtered_words)
        filtered_sentences.append(" ".join(filtered_words))
        num_words_per_sentence.append(num_words)
        
        # Calculate the average number of stop words per sentence
        avg_stop_words_per_sentence.append(num_stop_words / num_words)
    
    # Calculate summary statistics
    num_stop_words = sum(num_stop_words_per_sentence)
    num_sentences = len(sentences)
    avg_stop_words_per_sentence_all = num_stop_words / num_sentences
    max_stop_words_per_sentence = max(num_stop_words_per_sentence)
    min_stop_words_per_sentence = min(num_stop_words_per_sentence)
    avg_stop_words_per_word = num_stop_words / sum(num_words_per_sentence)
    
    # Calculate the average number of stop words per article
    avg_stop_words_per_sentence_avg = sum(avg_stop_words_per_sentence) / len(avg_stop_words_per_sentence)
    
    # Return the output
    return {
        'num_stop_words': num_stop_words,
        "total_words": total_words,
        'avg_stop_words_per_sentence_all': avg_stop_words_per_sentence_all,
        'max_stop_words_per_sentence': max_stop_words_per_sentence,
        'min_stop_words_per_sentence': min_stop_words_per_sentence,
        'avg_stop_words_per_word': avg_stop_words_per_word,
        'avg_stop_words_per_sentence': avg_stop_words_per_sentence,
        'avg_stop_words_per_sentence_avg': avg_stop_words_per_sentence_avg,
        'filtered_sentences': filtered_sentences,
        'stop_words_per_sentence': stop_words_per_sentence,
        'num_words_per_sentence': num_words_per_sentence,
    }


In [69]:
results = remove_stop_words(article)

print("Filtered sentences:")
for sentence in results["filtered_sentences"]:
    print(sentence)
    print("Average number of stop words per sentence:", round(results["avg_stop_words_per_sentence"][results["filtered_sentences"].index(sentence)], 2))
    print()

print("Statistics on stop words:")
print("Total number of words:", results["total_words"])
print("Number of stop words:", results["num_stop_words"])
print("Maximum number of stop words per sentence:", results["max_stop_words_per_sentence"])
print("Minimum number of stop words per sentence:", results["min_stop_words_per_sentence"])
print("Average number of stop words per article:", round(results["avg_stop_words_per_word"], 2))


Filtered sentences:
Republicans respond IRS whistleblower says Hunter Biden investigation mishandled Members Congress calling transparency Biden administration IRS whistleblower said investigation Hunter Biden mishandled .
Average number of stop words per sentence: 0.39

Lawmakers Capitol Hill calling Biden administration held accountable `` blocking '' Congress public learning Biden family members ’ business deals China .
Average number of stop words per sentence: 0.37

congressional outcries come whistleblower within Internal Revenue Service alleges investigation Hunter Biden mishandled Biden administration .
Average number of stop words per sentence: 0.38

whistleblower also alleges `` clear conflicts interest '' investigation .
Average number of stop words per sentence: 0.29

`` ’ deeply concerning Biden Administration may obstructing justice blocking efforts charge Hunter Biden tax violations , '' House Committee Oversight Accountability Chairman James Comer told Fox News Wednesda

# Stemming analysis

Stemming analysis is a natural language processing technique that involves reducing words to their base or root form. The goal of stemming is to normalize variations of words by mapping all forms of a word to a common base or root word, which can help improve text analysis and information retrieval.

For example, the words "jumping," "jumps," and "jumped" can be stemmed to the root word "jump." This allows us to treat all variations of the word "jump" as the same word, which can simplify text analysis and improve the accuracy of search results.

There are several popular stemming algorithms, such as the Porter stemming algorithm and the Snowball stemming algorithm, that use various rules and heuristics to determine the base form of a word. Stemming is often used as a preprocessing step in natural language processing tasks such as text classification, sentiment analysis, and information retrieval.

In [70]:
# Create a Porter stemmer object
stemmer = PorterStemmer()

words = word_tokenize(article)

# Perform stemming on each word using the Porter stemmer
stemmed_words = [stemmer.stem(word) for word in words]

# Combine the stemmed words back into a single string
output_text = ' '.join(stemmed_words)

# Write the output text to a new file
# with open('output.txt', 'w') as f:
#    f.write(output_text)

print(output_text)


republican respond after ir whistleblow say hunter biden investig is be mishandl member of congress are call for more transpar from the biden administr after an ir whistleblow said an investig into hunter biden is be mishandl . lawmak on capitol hill are call for the biden administr to be held account for `` block '' congress and the public from learn more about biden famili member ’ busi deal with china . the congression outcri come as a whistleblow within the intern revenu servic alleg an investig into hunter biden is be mishandl by the biden administr . the whistleblow also alleg `` clear conflict of interest '' in the investig . `` it ’ s deepli concern that the biden administr may be obstruct justic by block effort to charg hunter biden for tax violat , '' hous committe on oversight and account chairman jame comer told fox new on wednesday . comer , r-ky. , also said `` decept , shadi busi scheme '' have allow the biden to make `` million from foreign adversari like china . '' hun

# Sentiment and Subjecitivity Markup

# Sentiment Analysis using the Stanza Library

Stanza is an open-source natural language processing (NLP) library for Python that provides a range of tools and models for tasks such as tokenization, part-of-speech tagging, dependency parsing, named entity recognition, and sentiment analysis.

Stanza is useful for a variety of NLP applications, including but not limited to:

- Sentiment Analysis: Stanza provides pre-trained models for sentiment analysis that can be used to classify the sentiment of a given text as positive, negative or neutral.

- Named Entity Recognition (NER): Stanza can be used to identify and extract named entities such as people, organizations, and locations from text.

- Part-of-Speech Tagging (POS): Stanza can be used to identify the part of speech of each word in a sentence, such as noun, verb, adjective, etc. This information can be used in a variety of downstream NLP tasks.

- Dependency Parsing: Stanza can be used to parse the grammatical structure of a sentence and identify the relationships between words. This can be useful for tasks such as information extraction and text summarization.

- Text Classification: Stanza can be used to classify text into predefined categories, such as spam vs. non-spam, positive vs. negative, etc.

- Machine Translation: Stanza can be used to translate text from one language to another.

In [19]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment', tokenize_no_ssplit=False, max_split_size_mb=128)

2023-04-20 15:16:50 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-20 15:16:51 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| sentiment | sstplus  |

2023-04-20 15:16:51 INFO: Using device: cuda
2023-04-20 15:16:51 INFO: Loading: tokenize
2023-04-20 15:16:53 INFO: Loading: sentiment
2023-04-20 15:16:54 INFO: Done loading processors!


In [20]:
# Define a function to analyze the sentiment of a given text
def analyze_sentiment(sentences):    
    # Process each sentence and extract sentiment
    results = []
    for sentence in sentences:
        doc = nlp(sentence)
        sentiment = doc.sentences[0].sentiment
        score = sentiment.score if hasattr(sentiment, 'score') else 0.0
        
        # Map the sentiment score to a label
        if score < 0.0:
            label = 'negative'
        elif score > 0.0:
            label = 'positive'
        else:
            label = 'neutral'
        
        # Add the sentence sentiment to the results list
        results.append((sentence, label, score))
    
    # Return the list of sentence sentiments
    return results

In [21]:
results = analyze_sentiment(sentences)
for sentence, label, score in results:
    print(f"Sentence: {sentence}")
    print(f"Sentiment: {label} with a score of {score}\n")

Sentence: Republicans respond after IRS whistleblower says Hunter Biden investigation is being mishandled

Members of Congress are calling for more transparency from the Biden administration after an IRS whistleblower said an investigation into Hunter Biden is being mishandled.
Sentiment: neutral with a score of 0.0

Sentence: Lawmakers on Capitol Hill are calling for the Biden administration to be held accountable for "blocking" Congress and the public from learning more about Biden family members’ business deals with China.
Sentiment: neutral with a score of 0.0

Sentence: The congressional outcries come as a whistleblower within the Internal Revenue Service alleges an investigation into Hunter Biden is being mishandled by the Biden administration.
Sentiment: neutral with a score of 0.0

Sentence: The whistleblower also alleges "clear conflicts of interest" in the investigation.
Sentiment: neutral with a score of 0.0

Sentence: "It’s deeply concerning that the Biden Administration ma

In [22]:
def get_sentiment_scores(text, nlp):
    doc = nlp(text)
    sentiment_scores = []
    for sentence in doc.sentences:
        sentiment_scores.append(sentence.sentiment)
    if len(sentiment_scores) == 0:
        return None
    else:
        return {
            'average': sum(sentiment_scores) / len(sentiment_scores),
            'maximum': max(sentiment_scores),
            'sd': statistics.stdev(sentiment_scores),
            'minimum': min(sentiment_scores)
        }


In [23]:
sentiment_scores = get_sentiment_scores(article, nlp)
print(sentiment_scores)


{'average': 0.7804878048780488, 'maximum': 1, 'sd': 0.4190581774617469, 'minimum': 0}


# Sentiment Analysis using Vader library

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically designed to handle sentiment analysis for social media text. It is part of the Natural Language Toolkit (nltk) library in Python.

The VADER library uses a combination of sentiment lexicon (a list of words and their valence scores) and rule-based approach to analyze the sentiment of a piece of text. Unlike traditional sentiment analysis tools that use machine learning techniques, VADER doesn't require any training data to analyze sentiment. Instead, it uses a set of pre-defined rules and patterns to determine the sentiment of a piece of text.

VADER is particularly useful for analyzing the sentiment of short and informal texts, such as tweets, online reviews, and chat messages. It takes into account both the polarity and intensity of the sentiment in the text, which makes it more accurate than traditional sentiment analysis tools in some cases.

In [24]:
# initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# analyze the sentiment of the text
scores = analyzer.polarity_scores(article)

# print the sentiment scores
print(scores)


{'neg': 0.083, 'neu': 0.863, 'pos': 0.054, 'compound': -0.9753}


In [25]:
def get_vader_stats(text):

    analyzer = SentimentIntensityAnalyzer()
    scores_list = []

    sentences = sent_tokenize(text)

    for sentence in sentences:
        scores = analyzer.polarity_scores(sentence)
        score_list = [scores['neg'], scores['neu'], scores['pos']]
        scores_list.append(score_list)

    if not scores_list:
        return None

    scores_array = np.array(scores_list)
    avg_scores = np.mean(scores_array, axis=0)
    max_scores = np.max(scores_array, axis=0)
    min_scores = np.min(scores_array, axis=0)
    std_scores = np.std(scores_array, axis=0)

    return avg_scores, max_scores, min_scores, std_scores


In [26]:
avg_scores, max_scores, min_scores, std_scores = get_vader_stats(article)

print("Average scores:", avg_scores)
print("Maximum scores:", max_scores)
print("Minimum scores:", min_scores)
print("Standard deviation scores:", std_scores)

Average scores: [0.07775758 0.86757576 0.05457576]
Maximum scores: [0.294 1.    0.346]
Minimum scores: [0.    0.494 0.   ]
Standard deviation scores: [0.08812698 0.1209152  0.07819983]


# Subjectivity analysis using MPQA

The scores we calculate in this program are subjectivity scores, which indicate the degree to which a word expresses a subjective or emotional meaning, as opposed to an objective or factual meaning.

Specifically, we are using the MPQA (Multi-Perspective Question Answering) subjectivity lexicon, which assigns each word in the lexicon a polarity score of either positive, negative, or neutral (i.e., a score of 1, -1, or 0, respectively) based on the word's emotional connotation.

In the program, we first load the MPQA lexicon (the file .tff should be in the same directory) and preprocess it so that we can easily look up the polarity score of each word in the lexicon. Then, we define a function subjectivity_analysis that takes a file path as input, reads in the text data from the file, and calculates the subjectivity score for each word in the text by looking up the polarity score of the word in the MPQA lexicon.

The average subjectivity score that we calculate is simply the average of all the subjectivity scores of the words in the text. The maximum and minimum subjectivity scores represent the most and least subjective words in the text, respectively. Finally, the standard deviation of the subjectivity scores measures how much the subjectivity scores vary from the average subjectivity score, and thus gives us an indication of the overall degree of subjectivity in the text.

In [27]:
# Load the MPQA lexicon
lexicon = pd.read_csv("subjclueslen1-HLTEMNLP05.tff", sep=" ", header=None, 
                      names=["type", "len", "word", "pos", "stemmed", "polarity", "strength"])

lexicon["type"] = lexicon["type"].str[5:]
lexicon["word"] = lexicon["word"].str[len("word1="):]
lexicon["polarity"] = lexicon["polarity"].str[len("priorpolarity="):]
cols_to_remove = ["len", "pos", "stemmed", "strength"]
lexicon = lexicon.drop(columns=cols_to_remove)
lexicon["type"] = lexicon["type"].replace("weaksubj", 1)
lexicon["type"] = lexicon["type"].replace("strongsubj", 2)
lexicon["polarity"] = lexicon["polarity"].replace("negative", -1)
lexicon["polarity"] = lexicon["polarity"].replace("positive", 1)
lexicon["polarity"] = lexicon["polarity"].replace("both", 0)
lexicon["polarity"] = lexicon["polarity"].replace("neutral", 0)


In [88]:
lexicon

Unnamed: 0,type,word,polarity
0,1,abandoned,-1
1,1,abandonment,-1
2,1,abandon,-1
3,2,abase,-1
4,2,abasement,-1
...,...,...,...
8217,2,zealot,-1
8218,2,zealous,-1
8219,2,zealously,-1
8220,2,zenith,1


In [28]:
def subjectivity_analysis(text):
    # Perform subjectivity analysis
    scores = []
    for word in text.split():
        word = word.strip().lower()
        if word in lexicon.word.tolist():
            polarity = lexicon[lexicon.word == word].polarity.values[0]
            scores.append(polarity)
    
    # Calculate statistics
    avg_score = np.mean(scores)
    max_score = np.max(scores)
    min_score = np.min(scores)
    sd_score = np.std(scores)
    
    return avg_score, max_score, min_score, sd_score

In [90]:
avg_score, max_score, min_score, sd_score = subjectivity_analysis(article)

print("Average subjectivity score:", avg_score)
print("Maximum subjectivity score:", max_score)
print("Minimum subjectivity score:", min_score)
print("Standard deviation of subjectivity scores:", sd_score)

Average subjectivity score: -0.1864406779661017
Maximum subjectivity score: 1
Minimum subjectivity score: -1
Standard deviation of subjectivity scores: 0.8532008778747944


# Sentiment analysis using SentiWordNet

Sentiment analysis using SentiWordNet involves using a lexical resource called SentiWordNet to perform sentiment analysis on text data. SentiWordNet is a publicly available lexical resource that assigns a sentiment score to each synset (set of synonyms) in WordNet, a large English lexical database.

To perform sentiment analysis using SentiWordNet, the text data is first preprocessed to remove any noise and convert it into a format that can be analyzed. Then, each word in the text data is assigned a synset based on its meaning. The sentiment score of each synset is then retrieved from SentiWordNet, and a sentiment score for the entire text is calculated by aggregating the scores of all the synsets in the text.

The sentiment score can be used to determine the overall sentiment of the text, such as whether it is positive, negative, or neutral. This can be useful for a wide range of applications, including social media monitoring, market research, and customer feedback analysis. However, it is important to note that SentiWordNet is based on WordNet, which is a English-centric lexical database, and may not be suitable for sentiment analysis of other languages.

In [29]:
def get_sentiwordnet_vector(text):
  text = text.lower()
  tokens = word_tokenize(text)
  tokens = [token for token in tokens if token.isalnum()]
  tokens = [token for token in tokens if not token in nltk.corpus.stopwords.words('english')]
  sentiwordnet_scores = []
  for token in tokens:
    pos_score = 0
    neg_score = 0
    synsets = swn.senti_synsets(token)
    for synset in synsets:
      pos_score += synset.pos_score()
      neg_score += synset.neg_score()
    if pos_score > neg_score:
      sentiment_score = 1
    elif neg_score > pos_score:
      sentiment_score = -1
    else:
      sentiment_score = 0
    sentiwordnet_scores.append(sentiment_score)
  assert(len(sentiwordnet_scores) == len(tokens))
  return {'tokens': tokens, 'sentiwordnet_scores': sentiwordnet_scores}


In [30]:
result = get_sentiwordnet_vector(article)
for i, token in enumerate(result['tokens']):
  print(f"{i+1}. Token: {token}, Sentiment Score: {result['sentiwordnet_scores'][i]}")

1. Token: republicans, Sentiment Score: 0
2. Token: respond, Sentiment Score: 1
3. Token: irs, Sentiment Score: 0
4. Token: whistleblower, Sentiment Score: 0
5. Token: says, Sentiment Score: 1
6. Token: hunter, Sentiment Score: -1
7. Token: biden, Sentiment Score: 0
8. Token: investigation, Sentiment Score: 1
9. Token: mishandled, Sentiment Score: -1
10. Token: members, Sentiment Score: 0
11. Token: congress, Sentiment Score: 0
12. Token: calling, Sentiment Score: 1
13. Token: transparency, Sentiment Score: 1
14. Token: biden, Sentiment Score: 0
15. Token: administration, Sentiment Score: 1
16. Token: irs, Sentiment Score: 0
17. Token: whistleblower, Sentiment Score: 0
18. Token: said, Sentiment Score: 1
19. Token: investigation, Sentiment Score: 1
20. Token: hunter, Sentiment Score: -1
21. Token: biden, Sentiment Score: 0
22. Token: mishandled, Sentiment Score: -1
23. Token: lawmakers, Sentiment Score: 0
24. Token: capitol, Sentiment Score: 0
25. Token: hill, Sentiment Score: 0
26. To