# Optimized Jupyter Notebook for NLTK VADER Analysis

This notebook is self-contained: it installs its dependencies, downloads the debate data, and runs the entire NLTK VADER sentiment-analysis tutorial from top to bottom.

In [None]:
!pip install pandas
!pip install nltk
!wget https://raw.githubusercontent.com/favstats/demdebates2020/refs/heads/master/data/debates.csv

In [None]:
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the NLTK data used later in the notebook.
nltk.download('vader_lexicon')  # lexicon behind SentimentIntensityAnalyzer
nltk.download('stopwords')      # stopwords.words('english')
nltk.download('wordnet')        # WordNetLemmatizer
nltk.download('omw-1.4')
# word_tokenize needs the Punkt tokenizer models; without them it raises
# LookupError on a fresh kernel. Newer NLTK releases look for 'punkt_tab',
# older ones for 'punkt', so request both.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Load the raw debate transcripts fetched by the setup cell.
df = pd.read_csv('debates.csv')

# Speaker labels that are excluded from the analysis.
excluded_speakers = ['Speaker 1:', 'Speaker 2:', 'speaker 6:', 'speaker 7:', 'speaker 8:', 'Unknown', 'Protester']

# pd.read_csv parses the text "NA" as a missing value (NaN) by default, so
# comparing against the string 'NA' alone never removes those rows. Test for
# missing values explicitly; the string comparison is kept for the case where
# default NA parsing has been turned off.
csvcleaned = df[
    df['speech'].notna()
    & (df['speech'] != 'NA')
    & (df['type'] != 'Moderator')
    & ~df['speaker'].isin(excluded_speakers)
].drop(columns=['background', 'gender', 'debate', 'day', 'type', 'order'])

# Persist the cleaned rows (without the index column) for reuse outside the notebook.
csvcleaned.to_csv('debate.csv', index=False)

In [None]:
# Group every speech under the speaker who delivered it, keeping row order.
speakerdictionary = {}
for _, record in csvcleaned.iterrows():
    # setdefault creates the speaker's list on first sight, then returns it.
    speakerdictionary.setdefault(record['speaker'], []).append(record['speech'])

In [None]:
def preprocessing(speeches_by_speaker=None):
    """Tokenize, filter, and lemmatize every speech, grouped by speaker.

    Each speech is converted to a string, lower-cased, and split with
    ``word_tokenize``. Punctuation tokens, the literal token ``'nan'`` (the
    string form of a missing speech), empty or whitespace-only tokens, and
    English stop words are dropped. The surviving tokens are lemmatized with
    ``WordNetLemmatizer``.

    NOTE(review): the English stop-word list contains negations such as "not"
    and "no", and this step also discards case and punctuation. Confirm that
    this is the intended input for the sentiment scoring that follows.

    Args:
        speeches_by_speaker: Mapping of speaker -> iterable of raw speeches.
            When omitted, the notebook-level ``speakerdictionary`` is used, so
            existing ``preprocessing()`` calls behave as before.

    Returns:
        dict mapping each speaker to a list of token lists, one per speech that
        still has at least one token after filtering. A speaker whose speeches
        are all filtered away maps to an empty list.
    """
    if speeches_by_speaker is None:
        # Fall back to the mapping built earlier in the notebook.
        speeches_by_speaker = speakerdictionary

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # Multi-character punctuation tokens that the string.punctuation check misses.
    extra_punctuation = {'...', '--', '``', "''"}

    lemmatizeddictionary = {}
    for speaker, speeches in speeches_by_speaker.items():
        lemmatizeddictionary[speaker] = []
        for sentence in speeches:
            tokens = word_tokenize(str(sentence).lower())
            # Single pass: apply every filter, then lemmatize what is left.
            lemmatized = [
                lemmatizer.lemmatize(token)
                for token in tokens
                if token not in string.punctuation
                and token not in extra_punctuation
                and token != 'nan'
                and not re.fullmatch(r"\s*", token)
                and token not in stop_words
            ]
            # Speeches with no remaining tokens are left out entirely.
            if lemmatized:
                lemmatizeddictionary[speaker].append(lemmatized)
    return lemmatizeddictionary

In [None]:
def sentiment(speakerdictionary):
    """Print the VADER polarity scores of each speaker's combined speeches.

    Args:
        speakerdictionary: Mapping of speaker -> list of token lists, as
            returned by ``preprocessing``.

    Returns:
        None. One ``"<speaker>: <scores>"`` line is printed per speaker;
        speakers whose combined text is empty are skipped.
    """
    vader = SentimentIntensityAnalyzer()
    for name in speakerdictionary:
        # Flatten the speaker's token lists into one space-separated string.
        combined = " ".join(" ".join(tokens) for tokens in speakerdictionary[name])
        if combined:
            print(f"{name}: {vader.polarity_scores(combined)}")
In [None]:
# Run the pipeline: lemmatize every speech, then print each speaker's scores.
data = preprocessing()
sentiment(speakerdictionary=data)

Created by [Kathleen Costa](https://github.com/kathleenalvescosta) | Spring 2025