In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
# Load the token table. Per its displayed output, it is indexed by
# (speech_id, speaker, para_id, sent_id, token_id) with columns pos, token_str, term_str.
token = pd.read_parquet('data/token.parquet')

In [3]:
# Load the library table.
# NOTE(review): `library` is not referenced by any other cell visible in this part of the notebook — confirm it is still needed.
library = pd.read_parquet('data/library.parquet')

## Cleanup

In [4]:
# Preview the raw token table (the rendered output is truncated to the first and last rows).
token

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,token_str,term_str
speech_id,speaker,para_id,sent_id,token_id,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,CLINTON,0,0,0,NN,Wow,wow
0,CLINTON,0,1,0,NN,Thank,thank
0,CLINTON,0,1,1,PRP,you,you
0,CLINTON,0,2,0,NNP,Thank,thank
0,CLINTON,0,2,1,PRP,you,you
...,...,...,...,...,...,...,...
162,CLINTON,273,0,4,NNP,Chris,chris
162,CLINTON,273,1,0,NN,Great,great
162,CLINTON,273,1,1,TO,to,to
162,CLINTON,273,1,2,VB,see,see


In [4]:
# Partition the token table into one frame per candidate, filtering on `speaker`.
clinton, trump = (
    token.query(speaker_filter)
    for speaker_filter in ('speaker=="CLINTON"', 'speaker=="TRUMP"')
)

In [5]:
# Remove the sentence and token identifiers (index levels 3 and 4) since we won't
# be using them; speech_id, speaker and para_id stay in the index.
# The levels are referenced by name so the call does not depend on their position.
clinton = clinton.droplevel(['sent_id', 'token_id'])
trump = trump.droplevel(['sent_id', 'token_id'])

In [6]:
# Inspect the Clinton subset after the two innermost index levels were dropped.
clinton

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pos,token_str,term_str
speech_id,speaker,para_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,CLINTON,0,NN,Wow.,wow
0,CLINTON,0,NNP,Thank,thank
0,CLINTON,0,NN,you.,you
0,CLINTON,0,NNP,Thank,thank
0,CLINTON,0,PRP,you,you
...,...,...,...,...,...
162,CLINTON,273,NNP,Chris.,chris
162,CLINTON,273,NN,Great,great
162,CLINTON,273,TO,to,to
162,CLINTON,273,VB,see,see


In [7]:
# Collapse each speaker's tokens into one row per speech: the terms of a speech
# are joined with single spaces, speech_id becomes a regular column again, and a
# lowercase speaker label is attached as the last column.
clinton = (
    clinton
    .groupby('speech_id')['term_str']
    .apply(' '.join)
    .reset_index()
    .assign(speaker='clinton')
)
trump = (
    trump
    .groupby('speech_id')['term_str']
    .apply(' '.join)
    .reset_index()
    .assign(speaker='trump')
)

In [8]:
# Inspect the per-speech Clinton frame: one row per speech_id with the joined terms and the speaker label.
clinton

Unnamed: 0,speech_id,term_str,speaker
0,0,wow thank you thank you all thank you i am rea...,clinton
1,1,thank you thank you all very much thank you th...,clinton
2,2,thanks george look we are definitely in confli...,clinton
3,3,right right well yes right no because he got a...,clinton
4,4,thank you thank you very much applause thank y...,clinton
...,...,...,...
83,158,good morning chuck absolutely look i have said...,clinton
84,159,well look i appreciate what he went through wh...,clinton
85,160,thank you alisyn yes uh hm absolutely no you k...,clinton
86,161,oh john you know these polls go up they go dow...,clinton


In [9]:
# Create the combined dataframe by stacking the Clinton rows on top of the Trump rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported equivalent and, like append, keeps each frame's original row labels.
speeches = pd.concat([clinton, trump])

In [10]:
# Key the combined frame by speech_id and order the rows by that key.
speeches = (
    speeches
    .set_index('speech_id')
    .sort_index()
)

In [11]:
# Re-display the token table; no visible cell reassigns `token` after it is loaded.
# NOTE(review): the rendered rows differ from the earlier preview of `token`
# (e.g. token_str 'Wow' vs 'Wow.'), which suggests stale output or out-of-order
# execution — confirm with Restart Kernel -> Run All.
token

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,token_str,term_str
speech_id,speaker,para_id,sent_id,token_id,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,CLINTON,0,0,0,NN,Wow.,wow
0,CLINTON,0,1,0,NNP,Thank,thank
0,CLINTON,0,1,1,NN,you.,you
0,CLINTON,0,2,0,NNP,Thank,thank
0,CLINTON,0,2,1,PRP,you,you
...,...,...,...,...,...,...,...
162,CLINTON,273,0,4,NNP,Chris.,chris
162,CLINTON,273,1,0,NN,Great,great
162,CLINTON,273,1,1,TO,to,to
162,CLINTON,273,1,2,VB,see,see


## Compare to Scikit-Learn's MultinomialNB

In [12]:
# Hold out 20% of the speeches for evaluation; the fixed seed keeps the split reproducible.
speech_train, speech_test, speaker_train, speaker_test = train_test_split(
    speeches['term_str'],
    speeches['speaker'],
    test_size=0.2,
    random_state=42,
)

In [13]:
# Text classification pipeline: raw term counts -> TF-IDF weighting -> multinomial Naive Bayes.
model = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [14]:
# Fit every pipeline step (vectorizer, TF-IDF, classifier) on the training speeches only.
model.fit(speech_train, speaker_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [15]:
# Predict the speaker of each held-out speech.
predicted = model.predict(speech_test)
# Accuracy: fraction of test speeches whose predicted speaker equals the true label.
np.mean(predicted == speaker_test)

0.9090909090909091

In [16]:
# Confusion matrix on the held-out speeches: rows are the true speakers, columns the
# predicted speakers. `labels` pins the row/column order explicitly; it matches the
# sorted-label order scikit-learn uses by default, so the values are unchanged.
confusion_matrix(speaker_test, predicted, labels=['clinton', 'trump'])

array([[19,  0],
       [ 3, 11]])

In [17]:
# Show the raw predicted labels for the test speeches.
predicted

array(['trump', 'clinton', 'clinton', 'trump', 'clinton', 'clinton',
       'clinton', 'trump', 'trump', 'clinton', 'clinton', 'clinton',
       'clinton', 'trump', 'clinton', 'clinton', 'trump', 'trump',
       'clinton', 'clinton', 'trump', 'clinton', 'clinton', 'clinton',
       'trump', 'clinton', 'clinton', 'clinton', 'trump', 'clinton',
       'clinton', 'clinton', 'trump'], dtype='<U7')