In [40]:
import pandas as pd
import numpy as np
import json

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.linalg import norm

# local library
from preproc import *

In [50]:
sw = set(stopwords.words('english') + ["[applause]", "[music]"])

In [2]:
with open('speeches.json') as f:
    speeches = json.load(f)

In [3]:
bow = create_bow(speeches)
bow

Unnamed: 0_level_0,index,speaker,date,speech,title,transcript_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
oWlLZZ8pcp8,8,trump,2019-04-28T02:07:41Z,[Music] [Music] [Music] [Music] [Music] [Appla...,Watch Live: President Trump's MAGA Rally in Gr...,speech
Z6N1WdJgnLo,47,biden,2019-04-29T21:04:46Z,[Applause] [Music] my name [Applause] thank yo...,Joe Biden holds first 2020 campaign rally,speech
WIwGrZKdsY0,9,trump,2019-05-09T03:55:17Z,[Music] from the leaves of Minnesota [Music] a...,FULL RALLY: President Trump in Panama City Bea...,speech
u-AEAq7jzcU,48,biden,2019-05-18T19:53:18Z,so let's do this please welcome my husband Joe...,Joe Biden speaks during a campaign rally in Ph...,speech
GgINUxecNrg,10,trump,2019-05-21T00:39:36Z,[Music] from the laser Minnesota [Music] the T...,"FULL RALLY: President Trump in Montoursville, ...",speech
...,...,...,...,...,...,...
5iCeLlyYWf0,131,biden,2020-10-10T21:33:38Z,of donald trump and dan laughlin see joe biden...,Live: Biden Holds Campaign Event In Pennsylvan...,speech
CavJXOg8HnY,132,pence,2020-10-10T21:52:13Z,the villages had the best turnout in the entir...,#LiveStream #USElections2020 #Pence holds camp...,speech
7Ol3RSIykPc,135,harris,2020-10-12T12:00:11Z,this show is sponsored by expressvpn stop putt...,How Facebook opened Pandora’s box - Tristan Ha...,speech
qW_3MeSdkM4,133,trump,2020-10-12T13:00:09Z,the villages had the best turnout in the entir...,The Trump Administration's First Term Corrupti...,speech


In [163]:
import string
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem.snowball import SnowballStemmer
def my_tokenizer(document):
        tk = WhitespaceTokenizer()
        #stemmer = SnowballStemmer("english")
        # Break the sentence into tokens based on whitespace
        for token in tk.tokenize(document):
            # Apply preprocessing to the token
            token = token.lower()
            token = token.strip()
            token = token.strip('_')
            token = token.strip('*')

            # If punctuation, ignore token and continue
            if all(char in set(string.punctuation) for char in token):
                continue
                
            if token in sw:
                continue

            # stem the token and yield
            #stem = stemmer.stem(token)
            yield token

# Combined 1,2,3 grams

In [179]:
# 

vectorizer = CountVectorizer(#max_features=5000, # only top 5k words
                             min_df=2,          # words must appear in this many speeches to count
                             #max_df=0.9,        # words can't appear in more % of speeches than this.  
                             stop_words=stopwords.words('english'),
                             tokenizer=my_tokenizer,
                             ngram_range=(1,3)  # unigrams, bigrams, trigrams
                            )

counts = vectorizer.fit_transform(bow['speech'])

In [180]:
counts_df = pd.DataFrame(counts.toarray())
counts_df.index.name = 'speech'
#tfidf.columns = tfidf.apply(lambda x: vocab_r[x.name])
counts_df.columns = vectorizer.get_feature_names()

In [182]:
counts_df.head(3)

Unnamed: 0_level_0,"""i","""in","""the","$10,000","$10,000 family","$10,000,000,000","$10,000,000,000 cameron","$10,000,000,000 cameron lng","$2,000","$2,000 we're",...,zoom right right,zoz,zoz kid,zoz kid jean,zubik,zubik pittsburgh,zubik pittsburgh came,zucker,zuckerberg,♪
speech,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [183]:
# convert counts into tfidf
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(counts)

In [184]:
tfidf_df = pd.DataFrame(tfidf.toarray())
tfidf_df.index.name = 'speech'
tfidf_df.columns = vectorizer.get_feature_names()
tfidf_df['videoId'] = bow.index.values
tfidf_df = tfidf_df.set_index('videoId')

In [185]:
tfidf_df.head()

Unnamed: 0_level_0,"""i","""in","""the","$10,000","$10,000 family","$10,000,000,000","$10,000,000,000 cameron","$10,000,000,000 cameron lng","$2,000","$2,000 we're",...,zoom right right,zoz,zoz kid,zoz kid jean,zubik,zubik pittsburgh,zubik pittsburgh came,zucker,zuckerberg,♪
videoId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
oWlLZZ8pcp8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Z6N1WdJgnLo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
WIwGrZKdsY0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u-AEAq7jzcU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GgINUxecNrg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [186]:
tfidf_df['_speaker'] = tfidf_df.apply(lambda x: bow.loc[x.name].speaker, 1)
tfidf_df = tfidf_df.reset_index().set_index(['videoId','_speaker'])

In [187]:
tfidf_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,"""i","""in","""the","$10,000","$10,000 family","$10,000,000,000","$10,000,000,000 cameron","$10,000,000,000 cameron lng","$2,000","$2,000 we're",...,zoom right right,zoz,zoz kid,zoz kid jean,zubik,zubik pittsburgh,zubik pittsburgh came,zucker,zuckerberg,♪
videoId,_speaker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
oWlLZZ8pcp8,trump,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Z6N1WdJgnLo,biden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
WIwGrZKdsY0,trump,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u-AEAq7jzcU,biden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GgINUxecNrg,trump,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [188]:
# collapse to speaker-only
tfidf_speaker = tfidf_df.reset_index().drop(columns="videoId").groupby(['_speaker']).mean()
# Normalize doc vector lengths
tfidf_speaker = tfidf_speaker.apply(lambda x: x / norm(x), 1)
# transpose for easier filtering
tfidf_speaker = tfidf_speaker.T.reset_index()

In [189]:
tfidf_speaker.sort_values('biden', ascending=False).head(10)[['index']].values

array([['know'],
       ['president'],
       ['people'],
       ['going'],
       ["i'm"],
       ['get'],
       ["that's"],
       ['like'],
       ['trump'],
       ["we're"]], dtype=object)

In [190]:
top10 = pd.DataFrame(tfidf_speaker.sort_values('biden', ascending=False).head(10)[['index']].values)
top10.columns = ['biden']
top10['trump'] = tfidf_speaker.sort_values('trump', ascending=False).head(10)[['index']].values
top10['harris'] = tfidf_speaker.sort_values('harris', ascending=False).head(10)[['index']].values
top10['pence'] = tfidf_speaker.sort_values('pence', ascending=False).head(10)[['index']].values
top10

Unnamed: 0,biden,trump,harris,pence
0,know,know,like,president
1,president,people,know,president donald trump
2,people,said,um,president donald
3,going,great,uh,trump
4,i'm,we're,people,years
5,get,like,that's,america
6,that's,going,yeah,donald trump
7,like,want,would,know
8,trump,that's,one,donald
9,we're,right,i'm,american


In [112]:
# Collocation - uses PMI (pointwise mutual information) to figure out 
# whether a bigram/trigram is better treated as a single "word"


In [191]:
# start with unigrams - just p(x)
pmi = counts_df.loc[:,[c for c in counts_df.columns.values if ' ' not in c]]
pmi = pmi.div(pmi.sum(axis=1), axis=0)


In [196]:
# bigrams - log(p(x,y) / (p(x)*p(y)))
pmi2 = counts_df.loc[:,[c for c in counts_df.columns.values if c.count(' ')==1 ]]
#pmi = pmi.div(pmi.sum(axis=1), axis=0)
def pxpy(xy_term):
    x, y = xy_term.name.split()
    #px = pmi[x] + 1e-60
    #py = pmi[y] + 1e-60
    return (pmi[x] + 1e-60) * (pmi[y] + 1e-60) # add a small value to avoid divide by zero

pmi2 = np.log2(pmi2.div(pmi2.apply(pxpy, axis=0), axis=0) + 1e-60)

In [199]:
pmi2.index.name = 'speech'
pmi2['videoId'] = bow.index.values
pmi2 = pmi2.set_index('videoId')
pmi2['_speaker'] = pmi2.apply(lambda x: bow.loc[x.name].speaker, 1)
pmi2 = pmi2.reset_index().set_index(['videoId','_speaker'])
pmi2s = pmi2.reset_index().drop(columns="videoId").groupby(['_speaker']).mean()
pmi2s = pmi2s.T.reset_index()
pmi2s.sort_values('biden', ascending=False).head(10)[['index']].values

array([['united states'],
       ['donald trump'],
       ['god bless'],
       ["we're going"],
       ['make sure'],
       ['every day'],
       ['states america'],
       ['american people'],
       ["i'm going"],
       ['joe biden']], dtype=object)

In [200]:
top10 = pd.DataFrame(pmi2s.sort_values('biden', ascending=False).head(10)[['index']].values)
top10.columns = ['biden']
top10['trump'] = pmi2s.sort_values('trump', ascending=False).head(10)[['index']].values
top10['harris'] = pmi2s.sort_values('harris', ascending=False).head(10)[['index']].values
top10['pence'] = pmi2s.sort_values('pence', ascending=False).head(10)[['index']].values
top10

Unnamed: 0,biden,trump,harris,pence
0,united states,united states,united states,donald trump
1,donald trump,thank much,donald trump,president donald
2,god bless,we're going,little bit,men women
3,we're going,years ago,joe biden,years ago
4,make sure,law enforcement,looks like,every day
5,every day,thank thank,they're going,american history
6,states america,long time,vice president,president trump
7,american people,white house,american people,united states
8,i'm going,great job,president united,law enforcement
9,joe biden,we're gonna,barack obama,four years


## Only bigrams

In [71]:
# create document-term count matrix
vectorizer = CountVectorizer(#max_features=5000, # only top 5k words
                             min_df=3,          # words must appear in this many speeches to count
                             max_df=0.9,        # words can't appear in more % of speeches than this.  
                             stop_words=stopwords.words('english'),
                             tokenizer=my_tokenizer,
                             ngram_range=(2,2)  # only bigrams
                            )
counts = vectorizer.fit_transform(bow['speech'])
counts_df = pd.DataFrame(counts.toarray())
counts_df.index.name = 'speech'
counts_df.columns = vectorizer.get_feature_names()

# convert counts into tfidf
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(counts)
# make it a nice df
tfidf_df = pd.DataFrame(tfidf.toarray())
tfidf_df.index.name = 'speech'
tfidf_df.columns = vectorizer.get_feature_names()
tfidf_df['videoId'] = bow.index.values
tfidf_df = tfidf_df.set_index('videoId')

# group by speaker and aggregate
tfidf_df['_speaker'] = tfidf_df.apply(lambda x: bow.loc[x.name].speaker, 1)
tfidf_df = tfidf_df.reset_index().set_index(['videoId','_speaker'])
# collapse to speaker-only
tfidf_speaker = tfidf_df.reset_index().drop(columns="videoId").groupby(['_speaker']).mean()
# Normalize doc vector lengths
tfidf_speaker = tfidf_speaker.apply(lambda x: x / norm(x), 1)
# transpose for easier filtering
tfidf_speaker = tfidf_speaker.T.reset_index()

# convert df to a handy top-10 comparison matrix
top10 = pd.DataFrame(tfidf_speaker.sort_values('biden', ascending=False).head(10)[['index']].values)
top10.columns = ['biden']
top10['trump'] = tfidf_speaker.sort_values('trump', ascending=False).head(10)[['index']].values
top10['harris'] = tfidf_speaker.sort_values('harris', ascending=False).head(10)[['index']].values
top10['pence'] = tfidf_speaker.sort_values('pence', ascending=False).head(10)[['index']].values
top10

Unnamed: 0,biden,trump,harris,pence
0,donald trump,we're going,senator graham,president donald
1,joe biden,thank much,affordable care,donald trump
2,we're going,united states,south carolina,four years
3,united states,we're gonna,care act,joe biden
4,make sure,great job,criminal justice,president trump
5,middle class,make america,like know,men women
6,i'm going,joe biden,trump party,trump white
7,child care,new york,health care,years president
8,states america,north carolina,donald trump,years means
9,may god,billion dollars,president trump,united states


In [72]:
# key phrases - a score for which terms each speaker uses that the others don't.
tfidf_speaker['biden2'] = tfidf_speaker.biden - (tfidf_speaker.harris + tfidf_speaker.pence + tfidf_speaker.trump)
tfidf_speaker['trump2'] = tfidf_speaker.trump - (tfidf_speaker.harris + tfidf_speaker.pence + tfidf_speaker.biden)
tfidf_speaker['harris2'] = tfidf_speaker.harris - (tfidf_speaker.trump + tfidf_speaker.pence + tfidf_speaker.biden)
tfidf_speaker['pence2'] = tfidf_speaker.pence - (tfidf_speaker.harris + tfidf_speaker.trump + tfidf_speaker.biden)

top10 = pd.DataFrame(tfidf_speaker.sort_values('biden2', ascending=False).head(10)[['index']].values)
top10.columns = ['biden']
top10['trump'] = tfidf_speaker.sort_values('trump2', ascending=False).head(10)[['index']].values
top10['harris'] = tfidf_speaker.sort_values('harris2', ascending=False).head(10)[['index']].values
top10['pence'] = tfidf_speaker.sort_values('pence2', ascending=False).head(10)[['index']].values
top10

Unnamed: 0,biden,trump,harris,pence
0,middle class,great job,senator graham,president donald
1,child care,fake news,affordable care,trump white
2,battle soul,thank much,south carolina,years means
3,"mr president,",sleepy joe,trump party,four years
4,across world,great people,care act,stand president
5,back better,going win,criminal justice,years president
6,god protect,history country,make case,president stands
7,protect troops,make america,million years,need four
8,clean energy,sanctuary cities,bring us,red tape
9,community college,air force,death penalty,promises made


## Only trigrams

In [64]:
# create document-term count matrix
vectorizer = CountVectorizer(#max_features=5000, # only top 5k words
                             min_df=3,          # words must appear in this many speeches to count
                             max_df=0.9,        # words can't appear in more % of speeches than this.  
                             stop_words=stopwords.words('english'),
                             tokenizer=my_tokenizer,
                             ngram_range=(3,3)  # only bigrams
                            )
counts = vectorizer.fit_transform(bow['speech'])
counts_df = pd.DataFrame(counts.toarray())
counts_df.index.name = 'speech'
counts_df.columns = vectorizer.get_feature_names()

# convert counts into tfidf
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(counts)
# make it a nice df
tfidf_df = pd.DataFrame(tfidf.toarray())
tfidf_df.index.name = 'speech'
tfidf_df.columns = vectorizer.get_feature_names()
tfidf_df['videoId'] = bow.index.values
tfidf_df = tfidf_df.set_index('videoId')

# group by speaker and aggregate
tfidf_df['_speaker'] = tfidf_df.apply(lambda x: bow.loc[x.name].speaker, 1)
tfidf_df = tfidf_df.reset_index().set_index(['videoId','_speaker'])
# collapse to speaker-only
tfidf_speaker = tfidf_df.reset_index().drop(columns="videoId").groupby(['_speaker']).mean()
# Normalize doc vector lengths
tfidf_speaker = tfidf_speaker.apply(lambda x: x / norm(x), 1)
# transpose for easier filtering
tfidf_speaker = tfidf_speaker.T.reset_index()

# convert df to a handy top-10 comparison matrix
top10 = pd.DataFrame(tfidf_speaker.sort_values('biden', ascending=False).head(10)[['index']].values)
top10.columns = ['biden']
top10['trump'] = tfidf_speaker.sort_values('trump', ascending=False).head(10)[['index']].values
top10['harris'] = tfidf_speaker.sort_values('harris', ascending=False).head(10)[['index']].values
top10['pence'] = tfidf_speaker.sort_values('pence', ascending=False).head(10)[['index']].values
top10

Unnamed: 0,biden,trump,harris,pence
0,united states america,thank much thank,affordable care act,president donald trump
1,battle soul nation,make america great,black lives matter,years president donald
2,build back better,new york times,criminal justice reform,four years president
3,may god protect,late term abortion,people south carolina,donald trump white
4,god protect troops,done great job,know i'm saying,four years means
5,bless may god,we're going win,united states senate,trump white house
6,god bless may,thank thank much,bring us together,first three years
7,thank thank thank,thank thank thank,president trump party,president donald trump.
8,beat donald trump,black lives matter,like know like,need four years
9,president united states,president united states,president united states,stand president donald


In [70]:
# key phrases - a score for which terms each speaker uses that the others don't.
tfidf_speaker['biden2'] = tfidf_speaker.biden - (tfidf_speaker.harris + tfidf_speaker.pence + tfidf_speaker.trump)
tfidf_speaker['trump2'] = tfidf_speaker.trump - (tfidf_speaker.harris + tfidf_speaker.pence + tfidf_speaker.biden)
tfidf_speaker['harris2'] = tfidf_speaker.harris - (tfidf_speaker.trump + tfidf_speaker.pence + tfidf_speaker.biden)
tfidf_speaker['pence2'] = tfidf_speaker.pence - (tfidf_speaker.harris + tfidf_speaker.trump + tfidf_speaker.biden)

top10 = pd.DataFrame(tfidf_speaker.sort_values('biden2', ascending=False).head(10)[['index']].values)
top10.columns = ['biden']
top10['trump'] = tfidf_speaker.sort_values('trump2', ascending=False).head(10)[['index']].values
top10['harris'] = tfidf_speaker.sort_values('harris2', ascending=False).head(10)[['index']].values
top10['pence'] = tfidf_speaker.sort_values('pence2', ascending=False).head(10)[['index']].values
top10

Unnamed: 0,biden,trump,harris,pence
0,build back better,late term abortion,affordable care act,president donald trump
1,god protect troops,we're going win,people south carolina,years president donald
2,may god protect,thank much thank,criminal justice reform,donald trump white
3,bless may god,done great job,bring us together,trump white house
4,god bless may,thank thank much,know i'm saying,four years means
5,battle soul nation,billions billions dollars,like know like,four years president
6,beat donald trump,sleepy joe biden,president trump party,stand president donald
7,hope history rhyme,make america great,equal justice law.,need four years
8,mr. vice president,air force one,black lives matter,trump president stands
9,there's single thing,short period time,criminal justice system,i'm proud report
