In [1]:
import pandas as pd
import numpy as np
from glob import glob
import re

In [2]:
ohco = ['speech_id','speaker','para_id','sent_id','token_id']

In [3]:
df = pd.read_json('data/ucsb_speeches_2016.json')

In [4]:
df.head()

Unnamed: 0,link,title,date,person,transcript
0,/documents/remarks-town-hall-meeting-portsmout...,"Remarks at a Town Hall Meeting in Portsmouth, ...",2015-12-29 00:00:00+00:00,Hillary Clinton,\nCLINTON: Wow. Thank you. Thank you all. Than...
1,/documents/remarks-the-university-minnesota-mi...,Remarks at the University of Minnesota in Minn...,2015-12-15 00:00:00+00:00,Hillary Clinton,\nThank you. Thank you all very much. Thank yo...
2,/documents/interview-with-george-stephanopoulo...,Interview with George Stephanopoulos of ABC Ne...,2015-12-06 00:00:00+00:00,Hillary Clinton,\nSTEPHANOPOULOS: And we'll hear more on that ...
3,/documents/interview-with-charlie-rose,Interview with Charlie Rose,2015-12-01 00:00:00+00:00,Hillary Clinton,"\nROSE: She is a former first lady, a former s..."
4,/documents/remarks-and-question-and-answer-ses...,Remarks and a Question and Answer Session at t...,2015-11-19 00:00:00+00:00,Hillary Clinton,\nCLINTON: Thank you. Thank you very much. [ap...


In [5]:
df.index.name = 'speech_id'
library = df[['link','title','date','person']]

In [6]:
# first OHCO level - split out speakers, using e.g. "CLINTON:"
# set default speaker for each speech - we'll set the specific ones later.
df['speaker'] = [x[1] for x in df['person'].str.upper().str.rsplit(' ',1)]

In [7]:
df = df.reset_index().set_index(['speech_id','speaker'])

In [8]:
# second OHCO level - split out paragraphs, using "\n" as the separator (for these transcripts). 
# Remove initial/trailing whitespace, including \n
df = df['transcript'].str.strip().str.split("\n", expand=True)\
    .stack().to_frame().rename(columns={0:'para_str'})
df.index.names = ohco[0:3]

In [9]:
df = df.reset_index().set_index(['speech_id','para_id'])

In [10]:
# whenever a paragraph starts with a caps name e.g. CLINTON:
# use that as the speaker until the next caps name
df['speaker'] = df['para_str'].str.extract(r'([A-Z]+)(:)')[0].ffill()
# remove the non-Trump/Clinton speakers
df = df[df['speaker'].isin(['TRUMP','CLINTON'])]

In [11]:
df = df.reset_index().set_index(ohco[0:3])

In [12]:
# remove the caps names
df['para_str'] = df['para_str'].str.replace(r'[A-Z]+: ','')

In [13]:
# third ohco level - sentence

# Alvarado used NLTK sentence tokenizer to split sentences. 
# but let's try it the lazy way to start
df = df['para_str'].str.split("[.!?]+", expand=True)\
    .stack().to_frame().rename(columns={0:'sent_str'})
df.index.names = ohco[0:4]

In [14]:
# drop 0-length strings (usually at end of paragraph)
df = df[df['sent_str'].str.len() > 0]

In [15]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sent_str
speech_id,speaker,para_id,sent_id,Unnamed: 4_level_1
0,CLINTON,0,0,Wow
0,CLINTON,0,1,Thank you
0,CLINTON,0,2,Thank you all
0,CLINTON,0,3,Thank you
0,CLINTON,0,4,I am really delighted to be here on the first...


In [16]:
# fourth ohco level - tokens
import nltk

In [17]:
token = df['sent_str'].apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x))))\
    .stack().to_frame()\
    .rename(columns={0:'pos_tuple'})
token['pos'] = token.pos_tuple.apply(lambda x: x[1])
token['token_str'] = token.pos_tuple.apply(lambda x: x[0])
token = token.drop('pos_tuple', 1)

  token = df['sent_str'].apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x))))\


In [18]:
token.index.names = ohco # ok, all done

In [19]:
token.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,token_str
speech_id,speaker,para_id,sent_id,token_id,Unnamed: 5_level_1,Unnamed: 6_level_1
0,CLINTON,0,0,0,NN,Wow
0,CLINTON,0,1,0,NN,Thank
0,CLINTON,0,1,1,PRP,you
0,CLINTON,0,2,0,NNP,Thank
0,CLINTON,0,2,1,PRP,you


In [20]:
# Lowercase, remove non-word characters
token['term_str'] = token['token_str'].str.lower().str.replace('[\W_]', '')
# drop words which consist entirely of non-word characters
token = token[token.term_str!=''].sort_index()

In [21]:
# Checkpoint
token.to_parquet('data/token.parquet')
library.to_parquet('data/library.parquet')

In [22]:
# Load from checkpoint
if 'token' not in locals():
    token = pd.read_parquet('data/token.parquet')
if 'library' not in locals():
    library = pd.read_parquet('data/library.parquet')

In [23]:
# create vocab table
# 
vocab = token.term_str.value_counts().to_frame().rename(columns={'index':'term_str', 'term_str':'n'})\
    .sort_index().reset_index().rename(columns={'index':'term_str'})
vocab.index.name = 'term_id'

In [24]:
# add stopwords
vocab['stop'] = 0
vocab.loc[vocab['term_str'].isin(nltk.corpus.stopwords.words('english')),'stop'] = 1

In [25]:
# (optional) add stems
#from nltk.stem.porter import PorterStemmer
#stemmer = PorterStemmer()
#vocab['p_stem'] = vocab.term_str.apply(stemmer.stem)

In [26]:
# add term rank 
if 'term_rank' not in vocab.columns:
    vocab = vocab.sort_values('n', ascending=False).reset_index()
    vocab.index.name = 'term_rank'
    vocab = vocab.reset_index().set_index('term_id')
    vocab['term_rank'] = vocab['term_rank'] + 1 # start with 1 instead of 0

In [27]:
vocab.sort_values('n', ascending=False).head()

Unnamed: 0_level_0,term_rank,term_str,n,stop
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10832,1,the,13552,1
11006,2,to,12704,1
782,3,and,12178,1
5499,4,i,7337,1
7517,5,of,6918,1


In [28]:
# add term id back to token table, for easy joining
token['term_id'] = token.term_str.map(vocab.reset_index().set_index('term_str').term_id)

In [29]:
# Checkpoint
vocab.to_parquet('data/vocab.parquet')

## Create DT matrices 
(Document-Term matrix, using Bag of Words and TF-IDF)

One each for Trump and Clinton

In [30]:
# Load from checkpoint
if 'vocab' not in locals():
    vocab = pd.read_parquet('data/vocab.parquet')

In [31]:
bag = ohco[:1] # bag size = 1 speech

In [32]:
# Split into Trump/Clinton
clinton = token.query('speaker=="CLINTON"')
trump = token.query('speaker=="TRUMP"')

In [33]:
BOW_c = clinton.groupby(bag+['term_id']).term_id.count()\
    .to_frame().rename(columns={'term_id':'n'})
BOW_t = trump.groupby(bag+['term_id']).term_id.count()\
    .to_frame().rename(columns={'term_id':'n'})

In [34]:
# Document-Term Matrices
DTCM_c = BOW_c['n'].unstack().fillna(0).astype('int')
DTCM_t = BOW_t['n'].unstack().fillna(0).astype('int')

In [35]:
# Term frequency - normalized
TF_c = (DTCM_c.T / DTCM_c.T.sum()).T
TF_t = (DTCM_t.T / DTCM_t.T.sum()).T

In [36]:
# Document frequency
DF_c = DTCM_c[DTCM_c > 0].count()
DF_t = DTCM_t[DTCM_t > 0].count()

In [37]:
# Create vocab tables for Clinton / Trump
# 
vocab_c = clinton.term_str.value_counts().to_frame().rename(columns={'index':'term_str', 'term_str':'n'})\
    .sort_index().reset_index().rename(columns={'index':'term_str'})
vocab_c.index.name = 'term_id'

vocab_t = trump.term_str.value_counts().to_frame().rename(columns={'index':'term_str', 'term_str':'n'})\
    .sort_index().reset_index().rename(columns={'index':'term_str'})
vocab_t.index.name = 'term_id'


In [38]:
# add stopwords
vocab_c['stop'] = 0
vocab_c.loc[vocab_c['term_str'].isin(nltk.corpus.stopwords.words('english')),'stop'] = 1

vocab_t['stop'] = 0
vocab_t.loc[vocab_t['term_str'].isin(nltk.corpus.stopwords.words('english')),'stop'] = 1

In [42]:
vocab_c['freq'] = TF_c.sum()
vocab_t['freq'] = TF_t.sum()

In [40]:
tf = vocab_c.freq # term frequency (normalized)
a = 1 # Laplace smoothing
N = vocab_c.freq.sum() # sum of all term frequencies in this class
V = vocab_c.shape[0] # vocab size
vocab_c['likelihood'] = (tf+a) / (N+ a*V)

In [43]:
tf = vocab_t.freq # term frequency (normalized)
a = 1 # Laplace smoothing
N = vocab_t.freq.sum() # sum of all term frequencies in this class
V = vocab_t.shape[0] # vocab size
vocab_t['likelihood'] = (tf+a) / (N+ a*V)

In [44]:
vocab_t.sort_values('likelihood',ascending=False)

Unnamed: 0_level_0,term_str,n,stop,freq,likelihood
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
782,attorney,12,0,1.934847,0.000335
7517,stupidity,8,0,1.142693,0.000245
366,activity,10,0,0.970873,0.000225
5616,pad,2,0,0.883623,0.000215
5499,opponents,8,0,0.810991,0.000207
...,...,...,...,...,...
8694,yesyes,1,0,,
8709,young,87,0,,
8710,younger,2,0,,
8716,yourself,17,1,,


In [45]:
import pymc3 as pm