In [49]:
import pandas as pd
import numpy as np
import json

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.linalg import norm

# local library
from preproc import *

In [2]:
# Load the raw speech records from disk.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on the
# platform's default text encoding (which is not UTF-8 everywhere).
with open('speeches.json', encoding='utf-8') as f:
    speeches = json.load(f)

In [3]:
# Build the per-speech table from the loaded JSON. `create_bow` is not defined in
# this notebook; presumably it comes from the local `preproc` star import.
# The rendered result is indexed by video id and carries speaker / date / speech /
# title / transcript_type columns.
# NOTE(review): the preview includes rows tagged 'harris' and 'pence' whose titles
# ("... John Harris", "... Jeff Pence") look like unrelated videos — confirm the
# upstream scrape/filter before trusting per-speaker statistics.
bow = create_bow(speeches)
bow

Unnamed: 0_level_0,index,speaker,date,speech,title,transcript_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
oWlLZZ8pcp8,8,trump,2019-04-28T02:07:41Z,[Music] [Music] [Music] [Music] [Music] [Appla...,Watch Live: President Trump's MAGA Rally in Gr...,speech
Z6N1WdJgnLo,47,biden,2019-04-29T21:04:46Z,[Applause] [Music] my name [Applause] thank yo...,Joe Biden holds first 2020 campaign rally,speech
WIwGrZKdsY0,9,trump,2019-05-09T03:55:17Z,[Music] from the leaves of Minnesota [Music] a...,FULL RALLY: President Trump in Panama City Bea...,speech
u-AEAq7jzcU,48,biden,2019-05-18T19:53:18Z,so let's do this please welcome my husband Joe...,Joe Biden speaks during a campaign rally in Ph...,speech
GgINUxecNrg,10,trump,2019-05-21T00:39:36Z,[Music] from the laser Minnesota [Music] the T...,"FULL RALLY: President Trump in Montoursville, ...",speech
...,...,...,...,...,...,...
SB9xXl8sHsE,107,trump,2020-10-01T03:28:36Z,me hello everybody hello duluth hello duluth o...,RAW VIDEO: President Donald Trump FULL SPEECH ...,speech
75QgFPgSySA,111,harris,2020-10-02T03:57:12Z,work on this yeah recording and i'm going to w...,Could life exist on other planets? John Harris,speech
WX1wwgm9GZ0,110,pence,2020-10-02T19:58:52Z,[Music] foreign [Music] hello everyone how you...,Canvas + O365 Integration with Jeff Pence,speech
y_SUMiYe-w0,112,trump,2020-10-03T15:00:13Z,new york times report alleging that the presid...,The president's taxes: A story to 'Trump' all ...,speech


In [15]:
# 

# Bag-of-n-grams settings, gathered in one dict so they are easy to scan and tweak.
# (A max_features=5000 cap on the vocabulary is intentionally left disabled.)
vectorizer_options = dict(
    min_df=3,              # keep a term only if it occurs in at least 3 speeches
    max_df=0.9,            # drop terms found in more than 90% of speeches
    stop_words='english',  # scikit-learn's built-in English stop-word list
    ngram_range=(1, 3),    # unigrams, bigrams and trigrams
)
vectorizer = CountVectorizer(**vectorizer_options)

# Learn the vocabulary and produce the sparse speech-by-term count matrix in one pass.
counts = vectorizer.fit_transform(bow['speech'])

In [16]:
# Column labels for the count matrix, in vocabulary order.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of `get_feature_names_out`; use whichever one this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    count_feature_names = vectorizer.get_feature_names_out()
else:
    count_feature_names = vectorizer.get_feature_names()

# Densify the sparse counts into a labelled frame: one row per speech (positional
# index, named 'speech'), one column per n-gram kept by the vectorizer.
counts_df = pd.DataFrame(counts.toarray(), columns=count_feature_names)
counts_df.index.name = 'speech'

In [17]:
# Preview the raw n-gram counts for the first three speeches.
counts_df.head(3)

Unnamed: 0_level_0,00,000,000 000,000 african,000 african americans,000 american,000 americans,000 arrests,000 brand,000 brand new,...,zip code,zone,zones,zones demanded,zones demanded american,zones jobs,zones jobs investment,zones right,zoning,zoom
speech,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,8,0,0,0,0,0,0,2,2,...,0,0,1,0,0,0,0,0,0,0
1,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Re-weight the raw counts as TF-IDF.
# smooth_idf=False turns off the add-one smoothing of document frequencies.
transformer = TfidfTransformer(smooth_idf=False)
# Learn the IDF weights from the count matrix, then apply them to that same matrix.
transformer.fit(counts)
tfidf = transformer.transform(counts)

In [21]:
# Column labels for the TF-IDF matrix, in vocabulary order.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of `get_feature_names_out`; use whichever one this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = vectorizer.get_feature_names()

# Dense TF-IDF frame: one row per speech, one column per n-gram.
# Rows are labelled with the video ids from `bow`, whose row order matches the
# order in which the speeches were fed to the vectorizer. Setting the index in the
# constructor avoids the temporary 'videoId' column, which could collide with a
# vocabulary term of the same name.
tfidf_df = pd.DataFrame(
    tfidf.toarray(),
    index=pd.Index(bow.index.values, name='videoId'),
    columns=tfidf_feature_names,
)

In [22]:
# Preview the TF-IDF weights for the first five speeches, now keyed by video id.
tfidf_df.head()

Unnamed: 0_level_0,00,000,000 000,000 african,000 african americans,000 american,000 americans,000 arrests,000 brand,000 brand new,...,zip code,zone,zones,zones demanded,zones demanded american,zones jobs,zones jobs investment,zones right,zoning,zoom
videoId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
oWlLZZ8pcp8,0.0,0.031687,0.0,0.0,0.0,0.0,0.0,0.0,0.026441,0.026441,...,0.0,0.0,0.007939,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Z6N1WdJgnLo,0.0,0.014372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
WIwGrZKdsY0,0.0,0.051714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u-AEAq7jzcU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GgINUxecNrg,0.0,0.018073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Extend the row index from videoId to (videoId, _speaker).
# The speaker of each speech is looked up in `bow` with one vectorised `.loc` call;
# a row-wise `apply` would materialise every (very wide) row as a Series just to
# read its label. Reading the ids via `get_level_values` works both before and
# after the speaker level exists, so re-running this cell is harmless.
video_ids = tfidf_df.index.get_level_values('videoId')
speakers = bow.loc[video_ids, 'speaker'].to_numpy()
tfidf_df = tfidf_df.set_axis(
    pd.MultiIndex.from_arrays([video_ids, speakers], names=['videoId', '_speaker']),
    axis=0,
)

In [27]:
# Preview the first five rows to check the new (videoId, _speaker) row index.
tfidf_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,00,000,000 000,000 african,000 african americans,000 american,000 americans,000 arrests,000 brand,000 brand new,...,zip code,zone,zones,zones demanded,zones demanded american,zones jobs,zones jobs investment,zones right,zoning,zoom
videoId,_speaker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
oWlLZZ8pcp8,trump,0.0,0.031687,0.0,0.0,0.0,0.0,0.0,0.0,0.026441,0.026441,...,0.0,0.0,0.007939,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Z6N1WdJgnLo,biden,0.0,0.014372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
WIwGrZKdsY0,trump,0.0,0.051714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u-AEAq7jzcU,biden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GgINUxecNrg,trump,0.0,0.018073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Average the TF-IDF vectors of each speaker's speeches into one row per speaker.
speaker_means = (
    tfidf_df
    .reset_index()
    .drop(columns="videoId")
    .groupby(['_speaker'])
    .mean()
)

# Rescale every speaker row to unit length; scipy's `norm` on a 1-D row is its 2-norm.
row_lengths = speaker_means.apply(norm, axis=1)
unit_rows = speaker_means.div(row_lengths, axis=0)

# Transpose so terms are rows and speakers are columns; `reset_index` moves the
# terms out of the index into a column named 'index', which makes filtering easier.
tfidf_speaker = unit_rows.T.reset_index()

In [51]:
# Ten highest-weighted terms for biden, shown as a (10, 1) array of term strings.
biden_ranked = tfidf_speaker.sort_values('biden', ascending=False)
biden_ranked[['index']].head(10).values

array([['donald trump'],
       ['nation'],
       ['donald'],
       ['need'],
       ['folks'],
       ['look'],
       ['gonna'],
       ['crisis'],
       ['ll'],
       ['uh']], dtype=object)

In [52]:
def top_terms(scores, speaker, n=10):
    """Return the `n` highest-scoring terms for one speaker.

    Parameters
    ----------
    scores : pandas.DataFrame
        Frame with the terms in a column named 'index' and one score column per speaker.
    speaker : str
        Name of the score column to rank by.
    n : int, default 10
        Number of terms to return.

    Returns
    -------
    numpy.ndarray
        1-D array of the top `n` terms, highest score first.
    """
    ranked = scores.sort_values(speaker, ascending=False)
    return ranked['index'].head(n).to_numpy()


# Side-by-side table of each speaker's top terms. One helper call per speaker replaces
# the copy-pasted sort/head expression, and the helper returns 1-D arrays so each
# column is built from a properly shaped sequence rather than a (10, 1) array.
top10 = pd.DataFrame(
    {speaker: top_terms(tfidf_speaker, speaker) for speaker in ('biden', 'trump', 'harris', 'pence')}
)
top10

Unnamed: 0,biden,trump,harris,pence
0,donald trump,applause,joe,president donald trump
1,nation,ll,life,president donald
2,donald,got,justice,donald trump
3,need,lot,care act,donald
4,folks,gonna,affordable care act,freedom
5,look,doing,affordable care,president trump
6,gonna,new,care,joe biden
7,crisis,look,ginsburg,faith
8,ll,china,um,biden
9,uh,music,act,applause


In [58]:
import nltk
from nltk.tokenize import word_tokenize

In [63]:
# Split each transcript on whitespace only, giving one token list per speech.
whitespace_tokenizer = nltk.WhitespaceTokenizer()
sentences = bow['speech'].apply(whitespace_tokenizer.tokenize)

In [62]:
from gensim.test.utils import datapath
from gensim.models.phrases import Phrases, Phraser

In [64]:
# Fit gensim's phrase (collocation) model on the tokenised speeches.
# Both the minimum count and the score threshold are set to 1.
phrase_settings = {'min_count': 1, 'threshold': 1}
phrases = Phrases(sentences, **phrase_settings)

In [70]:
# this doesn't seem too helpful.