In [1]:
import pandas as pd
import numpy as np
import json
import nltk

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# local library
from preproc import *

In [2]:
# Load the raw speech records from disk. The encoding is passed explicitly:
# JSON is UTF-8 by specification, whereas open() would otherwise use the
# platform's locale encoding and could mis-decode non-ASCII characters
# (the video titles contain typographic apostrophes, for example).
with open('speeches.json', encoding='utf-8') as f:
    speeches = json.load(f)

In [3]:
# Build the per-speech table from the loaded JSON. create_bow is not defined
# in this notebook -- presumably it comes from the local `preproc` star import
# (TODO confirm). As displayed below, the result is a DataFrame indexed by
# `id` with the columns index, speaker, date, speech and title.
bow = create_bow(speeches)
bow

Unnamed: 0_level_0,index,speaker,date,speech,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
oWlLZZ8pcp8,13,trump,2019-04-28T02:07:41Z,[Music] [Music] [Music] [Music] [Music] [Appla...,Watch Live: President Trump's MAGA Rally in Gr...
Z6N1WdJgnLo,52,biden,2019-04-29T21:04:46Z,[Applause] [Music] my name [Applause] thank yo...,Joe Biden holds first 2020 campaign rally
WIwGrZKdsY0,14,trump,2019-05-09T03:55:17Z,[Music] from the leaves of Minnesota [Music] a...,FULL RALLY: President Trump in Panama City Bea...
u-AEAq7jzcU,53,biden,2019-05-18T19:53:18Z,so let's do this please welcome my husband Joe...,Joe Biden speaks during a campaign rally in Ph...
GgINUxecNrg,15,trump,2019-05-21T00:39:36Z,[Music] from the laser Minnesota [Music] the T...,"FULL RALLY: President Trump in Montoursville, ..."
...,...,...,...,...,...
uBigQgyIPkE,12,biden,2020-09-03T04:25:23Z,that's why i'm speaking to you today the incum...,Joe Biden's full campaign speech HD 8/31/2020
SqNtc-t0Cr0,11,trump,2020-09-03T16:59:53Z,[Music] i was given a video of donald trump du...,"FULL: Breaking down the Trump ""hot-mic"" tape w..."
QHTgRNP_1VI,51,trump,2020-09-04T00:59:45Z,[Applause] [Applause] [Music] [Applause] so th...,Trump delivers campaign remarks at Arnold Palm...
KWLoD36kLfM,87,trump,2020-09-09T01:39:01Z,[Music] god well thank you very much thank you...,Raw Video: President Trump Full Speech at Rall...


## Create TF-IDF

In [57]:
# NLTK ships its stop-word lists as a separately downloaded corpus. Fetch it on
# first use so this cell also runs in an environment where it has never been
# downloaded (stopwords.words raises LookupError in that case).
try:
    sw = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    sw = stopwords.words('english')

# add custom words
sw.append('applause')  # the transcripts carry "[Applause]" markers
# NOTE(review): "[Music]" markers appear in the transcripts as well; consider
# adding 'music' here if max_df does not already filter it out.

vectorizer = TfidfVectorizer(max_features=4000, # keep only the 4k most frequent terms
                             min_df=3,          # a term must appear in at least this many speeches to count
                             max_df=0.8,        # drop terms that appear in more than 80% of speeches
                             stop_words=sw
                            )

# Learn the vocabulary and weights from the transcripts and vectorize them
# in one step.
tfidf = vectorizer.fit_transform(bow['speech'])

In [10]:
# Dimensions of the fitted TF-IDF matrix: one row per speech, one column per
# vocabulary term.
tfidf.shape

(85, 4000)

In [64]:
# Peek at the raw transcript column that was passed to the vectorizer.
bow['speech']

id
oWlLZZ8pcp8    [Music] [Music] [Music] [Music] [Music] [Appla...
Z6N1WdJgnLo    [Applause] [Music] my name [Applause] thank yo...
WIwGrZKdsY0    [Music] from the leaves of Minnesota [Music] a...
u-AEAq7jzcU    so let's do this please welcome my husband Joe...
GgINUxecNrg    [Music] from the laser Minnesota [Music] the T...
                                     ...                        
uBigQgyIPkE    that's why i'm speaking to you today the incum...
SqNtc-t0Cr0    [Music] i was given a video of donald trump du...
QHTgRNP_1VI    [Applause] [Applause] [Music] [Applause] so th...
KWLoD36kLfM    [Music] god well thank you very much thank you...
wPK7ojxRNk4    hello wisconsin [Applause] to congressman styl...
Name: speech, Length: 85, dtype: object

In [28]:
#vocab_r = {value : key for (key, value) in vectorizer.vocabulary_.items()}

In [None]:
#df = pd.DataFrame(tfidf.toarray().T)
#df.index.name = 'myindex'
#df['term_str'] = df.apply(lambda x: vocab_r[x.name], 1)

In [58]:
# Column labels for the TF-IDF table. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# fall back to the old method only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Convert the vectorizer output to a dense, labelled DataFrame (rows = speeches
# in the order of `bow`, columns = vocabulary terms).
# NOTE: this rebinds `tfidf` from the vectorizer output to a DataFrame, so the
# cell cannot be re-run on its own -- re-run the vectorizer cell first.
tfidf = pd.DataFrame(tfidf.toarray(), columns=terms)
tfidf.index.name = 'speech'
tfidf.head(2)

Unnamed: 0_level_0,00,000,10,100,11,12,120,125,12th,13,...,york,young,younger,youngest,youth,youtube,yuma,zero,zip,zones
speech,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.047301,0.007106,0.015161,0.0,0.028566,0.0,0.0,0.0,0.0,...,0.024659,0.007482,0.0,0.0,0.013835,0.0,0.0,0.021408,0.0,0.010704
1,0.0,0.020306,0.0,0.026033,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
# Re-key the TF-IDF rows by video id. The rows were produced from
# bow['speech'] in order, so bow's index values line up with them
# positionally.
tfidf = tfidf.set_index(pd.Index(bow.index.values, name='videoId'))

## Sentiment Analysis - VADER

In [55]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Score every transcript as a whole; each cell of the new column holds the
# dict returned by polarity_scores (keys: neg, neu, pos, compound).
# Series.map is used because only the `speech` column is needed -- a row-wise
# DataFrame.apply would build a full row object per speech for no benefit.
# NOTE(review): on transcripts this long the 'compound' value comes out at
# ~1.0 for almost every speech (see the table below), so it does not
# discriminate between speeches; 'pos' and 'neg' are the more useful fields.
bow['polarity_score'] = bow['speech'].map(analyzer.polarity_scores)

In [56]:
# Display the table again to check the newly added polarity_score column.
bow

Unnamed: 0_level_0,index,speaker,date,speech,title,polarity_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
oWlLZZ8pcp8,13,trump,2019-04-28T02:07:41Z,[Music] [Music] [Music] [Music] [Music] [Appla...,Watch Live: President Trump's MAGA Rally in Gr...,"{'neg': 0.092, 'neu': 0.726, 'pos': 0.182, 'co..."
Z6N1WdJgnLo,52,biden,2019-04-29T21:04:46Z,[Applause] [Music] my name [Applause] thank yo...,Joe Biden holds first 2020 campaign rally,"{'neg': 0.068, 'neu': 0.765, 'pos': 0.166, 'co..."
WIwGrZKdsY0,14,trump,2019-05-09T03:55:17Z,[Music] from the leaves of Minnesota [Music] a...,FULL RALLY: President Trump in Panama City Bea...,"{'neg': 0.088, 'neu': 0.735, 'pos': 0.177, 'co..."
u-AEAq7jzcU,53,biden,2019-05-18T19:53:18Z,so let's do this please welcome my husband Joe...,Joe Biden speaks during a campaign rally in Ph...,"{'neg': 0.079, 'neu': 0.759, 'pos': 0.161, 'co..."
GgINUxecNrg,15,trump,2019-05-21T00:39:36Z,[Music] from the laser Minnesota [Music] the T...,"FULL RALLY: President Trump in Montoursville, ...","{'neg': 0.082, 'neu': 0.706, 'pos': 0.212, 'co..."
...,...,...,...,...,...,...
uBigQgyIPkE,12,biden,2020-09-03T04:25:23Z,that's why i'm speaking to you today the incum...,Joe Biden's full campaign speech HD 8/31/2020,"{'neg': 0.142, 'neu': 0.707, 'pos': 0.151, 'co..."
SqNtc-t0Cr0,11,trump,2020-09-03T16:59:53Z,[Music] i was given a video of donald trump du...,"FULL: Breaking down the Trump ""hot-mic"" tape w...","{'neg': 0.059, 'neu': 0.831, 'pos': 0.11, 'com..."
QHTgRNP_1VI,51,trump,2020-09-04T00:59:45Z,[Applause] [Applause] [Music] [Applause] so th...,Trump delivers campaign remarks at Arnold Palm...,"{'neg': 0.098, 'neu': 0.753, 'pos': 0.149, 'co..."
KWLoD36kLfM,87,trump,2020-09-09T01:39:01Z,[Music] god well thank you very much thank you...,Raw Video: President Trump Full Speech at Rall...,"{'neg': 0.083, 'neu': 0.744, 'pos': 0.173, 'co..."


In [57]:
# Inspect the complete score dict (neg / neu / pos / compound) of one speech.
bow.loc['Z6N1WdJgnLo','polarity_score']

{'neg': 0.068, 'neu': 0.765, 'pos': 0.166, 'compound': 0.9999}

In [58]:
# Unpack the per-speech score dicts into one numeric column per score so they
# can be plotted directly (column order: compound, pos, neg).
for score_key in ('compound', 'pos', 'neg'):
    bow[score_key] = [scores.get(score_key) for scores in bow['polarity_score']]

# Transcript length in characters.
bow['len'] = bow['speech'].str.len()

In [62]:
# Preview the first 20 rows, now including the flattened score columns and the
# transcript length.
bow.head(20)

Unnamed: 0_level_0,index,speaker,date,speech,title,polarity_score,compound,pos,neg,len
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
oWlLZZ8pcp8,13,trump,2019-04-28T02:07:41Z,[Music] [Music] [Music] [Music] [Music] [Appla...,Watch Live: President Trump's MAGA Rally in Gr...,"{'neg': 0.092, 'neu': 0.726, 'pos': 0.182, 'co...",1.0,0.182,0.092,51179
Z6N1WdJgnLo,52,biden,2019-04-29T21:04:46Z,[Applause] [Music] my name [Applause] thank yo...,Joe Biden holds first 2020 campaign rally,"{'neg': 0.068, 'neu': 0.765, 'pos': 0.166, 'co...",0.9999,0.166,0.068,21023
WIwGrZKdsY0,14,trump,2019-05-09T03:55:17Z,[Music] from the leaves of Minnesota [Music] a...,FULL RALLY: President Trump in Panama City Bea...,"{'neg': 0.088, 'neu': 0.735, 'pos': 0.177, 'co...",1.0,0.177,0.088,58321
u-AEAq7jzcU,53,biden,2019-05-18T19:53:18Z,so let's do this please welcome my husband Joe...,Joe Biden speaks during a campaign rally in Ph...,"{'neg': 0.079, 'neu': 0.759, 'pos': 0.161, 'co...",0.9999,0.161,0.079,21153
GgINUxecNrg,15,trump,2019-05-21T00:39:36Z,[Music] from the laser Minnesota [Music] the T...,"FULL RALLY: President Trump in Montoursville, ...","{'neg': 0.082, 'neu': 0.706, 'pos': 0.212, 'co...",1.0,0.212,0.082,39271
S7j1hYXD0uk,72,harris,2019-06-10T03:00:01Z,it's good to be back with you it is great to b...,Full speech: Kamala Harris | Iowa Democrats’ H...,"{'neg': 0.084, 'neu': 0.776, 'pos': 0.14, 'com...",0.9871,0.14,0.084,3766
yE2sMLutWZA,76,pence,2019-06-19T01:03:06Z,well hello Florida it is great to be back in t...,VP Mike Pence introduces President Trump at 20...,"{'neg': 0.046, 'neu': 0.716, 'pos': 0.238, 'co...",0.9998,0.238,0.046,6944
MEqINP-TuV8,16,trump,2019-06-19T01:48:35Z,[Applause] it has been my honor to serve as fi...,TRUMP 2020: President Trump Re-Election Campai...,"{'neg': 0.09, 'neu': 0.709, 'pos': 0.202, 'com...",1.0,0.202,0.09,41933
xH-iCVFTb6w,46,trump,2019-07-05T15:19:57Z,[Applause] [Music] [Applause] [Music] hello Am...,"President Trump COMPLETE REMARKS at July 4th ""...","{'neg': 0.083, 'neu': 0.711, 'pos': 0.206, 'co...",1.0,0.206,0.083,23723
Vd65smKGDVo,77,pence,2019-07-17T23:25:05Z,[Music] [Music] hello North Carolina it is gre...,AMERICA IS BACK: VP Pence amps up crowd during...,"{'neg': 0.032, 'neu': 0.754, 'pos': 0.214, 'co...",0.9998,0.214,0.032,6599


In [60]:
import plotly.express as px

# Positive vs. negative score for every speech, coloured by speaker and sized
# by transcript length. A title and readable axis/legend labels are set so the
# figure can be understood without reading the code.
fig = px.scatter(
    bow,
    x="pos",
    y="neg",
    color="speaker",
    size="len",
    hover_data=["title"],
    title="Positive vs. negative sentiment score per speech",
    labels={
        "pos": "positive score",
        "neg": "negative score",
        "len": "transcript length (characters)",
    },
)
fig.show()

In [63]:
# Negative score of every speech over time, coloured by speaker and sized by
# transcript length. A title and readable labels are set so the figure can be
# understood without reading the code.
fig = px.scatter(
    bow,
    x="date",
    y="neg",
    color="speaker",
    size="len",
    hover_data=["title"],
    title="Negative sentiment score per speech over time",
    labels={
        "neg": "negative score",
        "len": "transcript length (characters)",
    },
)
fig.show()

## Basic logistic regression classifier

In [106]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the speeches for evaluation. Features are the TF-IDF rows,
# the target is the speaker label; the fixed seed keeps the split reproducible.
split_options = dict(train_size=0.80, random_state=951)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf, bow['speaker'], **split_options
)

In [107]:
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression on the training speeches
# (fit() returns the estimator itself, so construction and fitting are
# chained), then predict the speaker of each held-out speech.
log_model = LogisticRegression().fit(X=X_train, y=y_train)
y_pred = log_model.predict(X_test)

In [108]:
from sklearn.metrics import accuracy_score

# Share of held-out speeches whose speaker was predicted correctly.
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.8823529411764706


In [109]:
# Copy of the test features with the predicted speaker attached; its index
# (the video id) is what ties each prediction back to a speech.
xt = X_test.assign(pred_speaker=y_pred)

# Select the metadata rows of the held-out speeches and join the predictions
# onto them by index.
in_test_set = bow.index.isin(X_test.index.values)
bow_test = bow.loc[in_test_set].copy().join(xt['pred_speaker'])

In [111]:
# Show the true and predicted speaker side by side for the held-out speeches.
bow_test[['speaker','pred_speaker','title']]

Unnamed: 0_level_0,speaker,pred_speaker,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
oWlLZZ8pcp8,trump,trump,Watch Live: President Trump's MAGA Rally in Gr...
xH-iCVFTb6w,trump,trump,"President Trump COMPLETE REMARKS at July 4th ""..."
zc5RMAfg9mA,trump,trump,Pres. Trump New Hampshire Rally in Manchester ...
u_aN19uB7og,pence,trump,"""HE'S THE REAL DEAL"": VP Pence speaks at Trump..."
xa7MY2_pmxU,trump,trump,Watch LIVE President Trump campaign rally in F...
anAbeaQ96Ho,trump,trump,FULL SPEECH President Donald Trump Rally - Rio...
sL1UF0KDwsE,biden,biden,"Joe Biden's Full Remarks in Reno, Nevada | Joe..."
t5C98Fnl0M4,trump,trump,President Donald Trump speaks at his Hershey r...
_5ryKmfBm9E,pence,trump,VP Pence OpenING Speech First 2020 KAG Rally |...
WXtnH_r1HC8,trump,trump,LIVE: State of the Union Address


In [118]:
# Join every Trump transcript into one space-separated string.
trump_text = bow.loc[bow['speaker'].eq('trump'), 'speech'].str.cat(sep=' ')

