In [1]:
# Uncomment to install NLTK required libraries
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('vader_lexicon')
# nltk.download('opinion_lexicon')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from bs4 import BeautifulSoup
import re
import requests

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from gensim.utils import lemmatize
import pattern

In [3]:
appos = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not"
}

In [4]:
df = pd.read_csv('cleaned_transcript.csv')
df.head()

Unnamed: 0,Speaker,Statement
0,HOLT,"Good evening everyone, I am Lester Holt and w..."
1,GUTHRIE,"Hi, I’m Savannah Guthrie and tonight it’s our..."
2,HOLT,Voters are trying to nail down where the cand...
3,GUTHRIE,"Well, now it’s time to find out."
4,UNKNOWN,Tonight round one. New Jersey Senator Cory Bo...


In [5]:
sid = SentimentIntensityAnalyzer()
sentiment = sid.polarity_scores(df.Statement[0])
sentiment_score = sentiment['compound']
print(df.Statement[0],sentiment)

 Good evening everyone, I am Lester Holt and welcome to the first democratic debate in the 2020 race for president. {'neg': 0.0, 'neu': 0.742, 'pos': 0.258, 'compound': 0.7096}


In [12]:
def process_statement(speaker, statement):
    # Calculate sentiment based on full string
    sentiment = sid.polarity_scores(statement)
    sentiment_score = sentiment['compound']
    
    # process data
    lower = statement.lower()
    words = lower.split()
    reformed = [appos[word] if word in appos else word for word in words]
    reformed = " ".join(reformed)
    
    stop_words = stopwords.words('english')
    
    # tokenize data and perform lemmatization
    tokens = word_tokenize(reformed)
    nostops = [i for i in tokens if i not in stop_words]
    words = [word for word in nostops if word.isalpha()]
    lemm_full = [x.decode('utf-8') for x in lemmatize(str(lower))]
    #lemm = [x.decode('utf-8').split('/') for x in lemmatize(str(lower))]
    #lem_words = [x[0] for x in lemm]
    #lem_tags = [x[1] for x in lemm]
    
    return [speaker, statement, sentiment_score, words, lemm_full]

In [13]:
processedTranscript = pd.DataFrame(columns=['speaker','statement', 'sentiment','words', 'lemmatization'])
for index,row in df.iterrows():
    speaker = row['Speaker']
    statement = row['Statement']
    processedTranscript.loc[len(processedTranscript)] = process_statement(speaker, statement)

In [14]:
processedTranscript.head()

Unnamed: 0,speaker,statement,sentiment,words,lemmatization
0,HOLT,"Good evening everyone, I am Lester Holt and w...",0.7096,"[good, evening, everyone, lester, holt, welcom...","[good/JJ, evening/NN, everyone/NN, be/VB, lest..."
1,GUTHRIE,"Hi, I’m Savannah Guthrie and tonight it’s our...",0.1531,"[hi, savannah, guthrie, tonight, first, chance...","[savannah/NN, guthrie/NN, tonight/NN, first/JJ..."
2,HOLT,Voters are trying to nail down where the cand...,0.0,"[voters, trying, nail, candidates, issues, set...","[voter/NN, be/VB, try/VB, nail/VB, candidate/N..."
3,GUTHRIE,"Well, now it’s time to find out.",0.2732,"[well, time, find]","[well/RB, now/RB, time/NN, find/VB]"
4,UNKNOWN,Tonight round one. New Jersey Senator Cory Bo...,0.0,"[tonight, round, one, new, jersey, senator, co...","[tonight/NN, round/NN, new/JJ, jersey/NN, sena..."


In [15]:
processedTranscript.to_csv('processed_transcript.csv')

In [16]:
# Statements by Speaker
processedTranscript[['speaker', 'statement']].groupby(['speaker']).agg(['count'])

Unnamed: 0_level_0,statement
Unnamed: 0_level_1,count
speaker,Unnamed: 1_level_2
BOOKER,25
CASTRO,32
DE BLASIO,20
DELANEY,32
DIAZ-BALART,26
GABBARD,17
GUTHRIE,37
HOLT,54
INSLEE,8
KLOBUCHAR,26


In [17]:
# Average sentiment by speaker
processedTranscript[['speaker', 'sentiment']].groupby(['speaker']).agg(['mean'])

Unnamed: 0_level_0,sentiment
Unnamed: 0_level_1,mean
speaker,Unnamed: 1_level_2
BOOKER,0.130664
CASTRO,0.165347
DE BLASIO,-0.097225
DELANEY,0.1648
DIAZ-BALART,0.1199
GABBARD,-0.118912
GUTHRIE,0.138573
HOLT,0.22185
INSLEE,0.3775
KLOBUCHAR,0.2567


In [30]:
# Statement with highest sentiment by speaker
processedTranscript[['speaker', 'sentiment']].groupby(['speaker'], as_index=False).agg(['max'])

Unnamed: 0_level_0,sentiment
Unnamed: 0_level_1,max
speaker,Unnamed: 1_level_2
BOOKER,0.9876
CASTRO,0.9601
DE BLASIO,0.9786
DELANEY,0.9774
DIAZ-BALART,0.8294
GABBARD,0.9836
GUTHRIE,0.9022
HOLT,0.8343
INSLEE,0.9812
KLOBUCHAR,0.9711


In [35]:
# Statement with lowest sentiment by speaker
processedTranscript[['speaker', 'sentiment']].groupby(['speaker'], as_index=False).(['min'])

0     -0.7691
1     -0.8273
2     -0.7691
3     -0.8273
4      0.0000
5     -0.7691
6     -0.8273
7     -0.9524
8     -0.7691
9     -0.8273
10    -0.9858
11    -0.8273
12    -0.9858
13    -0.8273
14    -0.9683
15    -0.8273
16    -0.9683
17    -0.8273
18    -0.8126
19    -0.8273
20    -0.8126
21    -0.8273
22    -0.8126
23    -0.8273
24    -0.8126
25    -0.8273
26    -0.8126
27    -0.8273
28    -0.8316
29    -0.8273
        ...  
491   -0.7691
492   -0.8273
493   -0.9566
494   -0.8273
495   -0.7783
496   -0.4939
497   -0.7783
498   -0.9524
499   -0.9062
500   -0.9524
501   -0.9062
502   -0.9062
503   -0.5244
504   -0.9901
505   -0.5244
506   -0.7691
507   -0.6674
508   -0.6674
509   -0.8273
510   -0.9683
511   -0.8273
512   -0.7783
513   -0.8316
514   -0.7783
515   -0.8126
516   -0.9524
517   -0.5244
518   -0.9858
519   -0.8273
520   -0.7691
Name: sentiment, Length: 521, dtype: float64