# Optimized Jupyter Notebook for NLTK VADER Analysis

In [None]:
This notebook is self-contained: running its cells from top to bottom installs the dependencies, downloads the data, and runs the entire tutorial.

In [13]:
!pip install pandas
!pip install nltk
!wget https://raw.githubusercontent.com/favstats/demdebates2020/refs/heads/master/data/debates.csv

--2025-05-07 01:45:43--  https://raw.githubusercontent.com/favstats/demdebates2020/refs/heads/master/data/debates.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8003::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2081989 (2.0M) [text/plain]
Saving to: ‘debates.csv’


2025-05-07 01:45:44 (5.61 MB/s) - ‘debates.csv’ saved [2081989/2081989]



In [14]:
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# NLTK data used by the cells below. word_tokenize needs the Punkt tokenizer
# models, which were previously not downloaded: 'punkt' on older NLTK releases,
# 'punkt_tab' on newer ones. Requesting both is safe; nltk.download() only
# reports (does not raise) when a package name is unknown to the installed index.
for resource in ('vader_lexicon', 'stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/kathleenalvescosta/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kathleenalvescosta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/kathleenalvescosta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/kathleenalvescosta/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [15]:
# Load the raw transcript table downloaded above.
df = pd.read_csv('debates.csv')

# Speaker labels whose rows are excluded from the analysis.
excluded_speakers = ['Speaker 1:', 'Speaker 2:', 'speaker 6:', 'speaker 7:',
                     'speaker 8:', 'Unknown', 'Protester']
# Columns dropped from the cleaned table.
unused_columns = ['background', 'gender', 'debate', 'day', 'type', 'order']

# read_csv parses the literal text "NA" as a missing value (NaN) by default, so
# comparing against the string 'NA' alone never removes a row. notna() drops the
# missing speeches; the string comparison is kept in case the file is ever read
# with NA parsing disabled.
has_speech = df['speech'].notna() & (df['speech'] != 'NA')
not_moderator = df['type'] != 'Moderator'
known_speaker = ~df['speaker'].isin(excluded_speakers)

csvcleaned = df[has_speech & not_moderator & known_speaker].drop(columns=unused_columns)

# Persist the cleaned table next to the raw download (note: 'debate.csv', singular).
csvcleaned.to_csv('debate.csv', index=False)

In [16]:
# Collect every speech under its speaker: {speaker: [speech, speech, ...]}.
# Speakers and their speeches keep the row order of csvcleaned.
speakerdictionary = {}
for speaker, speech in zip(csvcleaned['speaker'], csvcleaned['speech']):
    speakerdictionary.setdefault(speaker, []).append(speech)

In [17]:
def preprocessing(speeches_by_speaker=None):
    """Tokenize, filter and lemmatize every speech, grouped by speaker.

    Each speech is converted with ``str()``, lower-cased and split with
    ``word_tokenize``. Punctuation tokens, the literal token ``'nan'``
    (presumably the ``str()`` of a missing speech), empty or whitespace-only
    tokens and English stop words are discarded; the remaining tokens are
    lemmatized with ``WordNetLemmatizer``.

    Args:
        speeches_by_speaker: Mapping of speaker -> iterable of speeches. When
            omitted, the notebook-level ``speakerdictionary`` is used, so
            existing ``preprocessing()`` calls behave as before.

    Returns:
        dict mapping each speaker to a list of token lists, one per speech
        that still has tokens after filtering. A speaker whose speeches all
        filter down to nothing maps to an empty list.
    """
    if speeches_by_speaker is None:
        # Backward-compatible default: fall back to the notebook-level mapping.
        speeches_by_speaker = speakerdictionary

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # Multi-character punctuation tokens that are not covered by string.punctuation.
    extra_punctuation = {'...', '--', '``', "''"}
    blank = re.compile(r"\s*")

    def is_content_token(token):
        """Return True when the token survives every filter."""
        return (
            # Substring test against the punctuation string: drops single
            # punctuation marks, the empty string, and runs such as '()'.
            token not in string.punctuation
            and token not in extra_punctuation
            and token != 'nan'
            and not blank.fullmatch(token)
            and token not in stop_words
        )

    lemmatizeddictionary = {}
    for speaker, speeches in speeches_by_speaker.items():
        lemmatized_speeches = []
        for sentence in speeches:
            tokens = word_tokenize(str(sentence).lower())
            lemmatized = [lemmatizer.lemmatize(token) for token in tokens
                          if is_content_token(token)]
            # Speeches with no surviving tokens are skipped entirely.
            if lemmatized:
                lemmatized_speeches.append(lemmatized)
        lemmatizeddictionary[speaker] = lemmatized_speeches
    return lemmatizeddictionary

In [18]:
def sentiment(speakerdictionary):
    """Print one VADER polarity-score dict per speaker.

    Args:
        speakerdictionary: Mapping of speaker -> list of token lists (one
            token list per speech), as returned by ``preprocessing``.

    All tokens of all of a speaker's speeches are joined with single spaces
    into one text, which is scored once with ``polarity_scores``. Speakers
    whose joined text is empty are skipped. Nothing is returned.

    NOTE(review): the recorded run shows ``compound`` at or near 1.0 for every
    speaker shown, presumably because each speaker is scored as one very long
    text. Consider scoring per speech and aggregating; confirm against the
    VADER documentation.
    """
    vader = SentimentIntensityAnalyzer()
    for name, token_lists in speakerdictionary.items():
        # Flatten: tokens -> speech string, speech strings -> one text.
        combined = " ".join(" ".join(tokens) for tokens in token_lists)
        if combined:
            scores = vader.polarity_scores(combined)
            print(f"{name}: {scores}")

In [19]:
# Lemmatize every speaker's speeches (read from the notebook-level
# speakerdictionary), then print one VADER score dict per speaker.
data = preprocessing()
sentiment(data)

Elizabeth Warren: {'neg': 0.121, 'neu': 0.687, 'pos': 0.192, 'compound': 1.0}
Amy Klobuchar: {'neg': 0.087, 'neu': 0.717, 'pos': 0.196, 'compound': 1.0}
Beto O'Rourke: {'neg': 0.126, 'neu': 0.663, 'pos': 0.211, 'compound': 0.9998}
Cory Booker: {'neg': 0.153, 'neu': 0.631, 'pos': 0.216, 'compound': 0.9999}
Julian Castro: {'neg': 0.083, 'neu': 0.709, 'pos': 0.207, 'compound': 0.9999}
Tulsi Gabbard: {'neg': 0.17, 'neu': 0.621, 'pos': 0.209, 'compound': 0.9974}
Bill de Blasio: {'neg': 0.123, 'neu': 0.672, 'pos': 0.205, 'compound': 0.999}
John Delaney: {'neg': 0.064, 'neu': 0.7, 'pos': 0.236, 'compound': 0.9998}
Jay Inslee: {'neg': 0.098, 'neu': 0.639, 'pos': 0.263, 'compound': 0.9997}
Tim Ryan: {'neg': 0.091, 'neu': 0.711, 'pos': 0.198, 'compound': 0.9995}
Bernie Sanders: {'neg': 0.131, 'neu': 0.677, 'pos': 0.192, 'compound': 1.0}
Michael Bennet: {'neg': 0.104, 'neu': 0.701, 'pos': 0.196, 'compound': 0.9993}
Joe Biden: {'neg': 0.088, 'neu': 0.774, 'pos': 0.139, 'compound': 1.0}
Kamala Harr