In [226]:
import pandas as pd
import numpy as np
from glob import glob
import re

In [227]:
ohco = ['speech_id','speaker','para_id','sent_id','token_id']

In [228]:
df = pd.read_json('data/ucsb_speeches_2016.json')

In [229]:
df.head()

Unnamed: 0,link,title,date,person,transcript
0,/documents/remarks-town-hall-meeting-portsmout...,"Remarks at a Town Hall Meeting in Portsmouth, ...",2015-12-29 00:00:00+00:00,Hillary Clinton,\nCLINTON: Wow. Thank you. Thank you all. Than...
1,/documents/remarks-the-university-minnesota-mi...,Remarks at the University of Minnesota in Minn...,2015-12-15 00:00:00+00:00,Hillary Clinton,\nThank you. Thank you all very much. Thank yo...
2,/documents/interview-with-george-stephanopoulo...,Interview with George Stephanopoulos of ABC Ne...,2015-12-06 00:00:00+00:00,Hillary Clinton,\nSTEPHANOPOULOS: And we'll hear more on that ...
3,/documents/interview-with-charlie-rose,Interview with Charlie Rose,2015-12-01 00:00:00+00:00,Hillary Clinton,"\nROSE: She is a former first lady, a former s..."
4,/documents/remarks-and-question-and-answer-ses...,Remarks and a Question and Answer Session at t...,2015-11-19 00:00:00+00:00,Hillary Clinton,\nCLINTON: Thank you. Thank you very much. [ap...


In [230]:
df.index.name = 'speech_id'
library = df[['link','title','date','person']]

In [231]:
# first OHCO level - split out speakers, using e.g. "CLINTON:"
# set default speaker for each speech - we'll set the specific ones later.
df['speaker'] = [x[1] for x in df['person'].str.upper().str.rsplit(' ',1)]

In [232]:
df = df.reset_index().set_index(['speech_id','speaker'])

In [233]:
# second OHCO level - split out paragraphs, using "\n" as the separator (for these transcripts). 
# Remove initial/trailing whitespace, including \n
df = df['transcript'].str.strip().str.split("\n", expand=True)\
    .stack().to_frame().rename(columns={0:'para_str'})
df.index.names = ohco[0:3]

In [234]:
df = df.reset_index().set_index(['speech_id','para_id'])

In [235]:
# whenever a paragraph starts with a caps name e.g. CLINTON:
# use that as the speaker until the next caps name
df['speaker'] = df['para_str'].str.extract(r'([A-Z]+)(:)')[0].ffill()
# remove the non-Trump/Clinton speakers
df = df[df['speaker'].isin(['TRUMP','CLINTON'])]

In [236]:
df = df.reset_index().set_index(ohco[0:3])

In [237]:
# remove the caps names
df['para_str'] = df['para_str'].str.replace(r'[A-Z]+: ','')

In [238]:
# third ohco level - sentence

# Alvarado used NLTK sentence tokenizer to split sentences. 
# but let's try it the lazy way to start
df = df['para_str'].str.split("[.!?]+", expand=True)\
    .stack().to_frame().rename(columns={0:'sent_str'})
df.index.names = ohco[0:4]

In [239]:
# drop 0-length strings (usually at end of paragraph)
df = df[df['sent_str'].str.len() > 0]

In [240]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sent_str
speech_id,speaker,para_id,sent_id,Unnamed: 4_level_1
0,CLINTON,0,0,Wow
0,CLINTON,0,1,Thank you
0,CLINTON,0,2,Thank you all
0,CLINTON,0,3,Thank you
0,CLINTON,0,4,I am really delighted to be here on the first...


In [241]:
# fourth ohco level - tokens
import nltk

In [242]:
token = df['sent_str'].apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x))))\
    .stack().to_frame()\
    .rename(columns={0:'pos_tuple'})
token['pos'] = token.pos_tuple.apply(lambda x: x[1])
token['token_str'] = token.pos_tuple.apply(lambda x: x[0])
token = token.drop('pos_tuple', 1)

  """Entry point for launching an IPython kernel.


In [243]:
token.index.names = ohco # ok, all done

In [244]:
token.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,token_str
speech_id,speaker,para_id,sent_id,token_id,Unnamed: 5_level_1,Unnamed: 6_level_1
0,CLINTON,0,0,0,NN,Wow
0,CLINTON,0,1,0,NN,Thank
0,CLINTON,0,1,1,PRP,you
0,CLINTON,0,2,0,NNP,Thank
0,CLINTON,0,2,1,PRP,you


In [245]:
# Lowercase, remove non-word characters
token['term_str'] = token['token_str'].str.lower().str.replace('[\W_]', '')
# drop words which consist entirely of non-word characters
token = token[token.term_str!=''].sort_index()

In [246]:
# Checkpoint
token.to_parquet('data/token.parquet')
library.to_parquet('data/library.parquet')

In [247]:
# Load from checkpoint
if 'token' not in locals():
    token = pd.read_parquet('data/token.parquet')
if 'library' not in locals():
    library = pd.read_parquet('data/library.parquet')

In [248]:
# create vocab table
# 
vocab = token.term_str.value_counts().to_frame().rename(columns={'index':'term_str', 'term_str':'n'})\
    .sort_index().reset_index().rename(columns={'index':'term_str'})
vocab.index.name = 'term_id'

In [249]:
# add stopwords
vocab['stop'] = 0
vocab.loc[vocab['term_str'].isin(nltk.corpus.stopwords.words('english')),'stop'] = 1

In [250]:
# (optional) add stems
#from nltk.stem.porter import PorterStemmer
#stemmer = PorterStemmer()
#vocab['p_stem'] = vocab.term_str.apply(stemmer.stem)

In [251]:
# add term rank 
if 'term_rank' not in vocab.columns:
    vocab = vocab.sort_values('n', ascending=False).reset_index()
    vocab.index.name = 'term_rank'
    vocab = vocab.reset_index().set_index('term_id')
    vocab['term_rank'] = vocab['term_rank'] + 1 # start with 1 instead of 0

In [252]:
vocab.sort_values('n', ascending=False).head()

Unnamed: 0_level_0,term_rank,term_str,n,stop
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10832,1,the,13552,1
11006,2,to,12704,1
782,3,and,12178,1
5499,4,i,7337,1
7517,5,of,6918,1


In [253]:
# add term id back to token table, for easy joining
token['term_id'] = token.term_str.map(vocab.reset_index().set_index('term_str').term_id)

In [254]:
# Checkpoint
vocab.to_parquet('data/vocab.parquet')

## Create DT matrices 
(Document-Term matrix, using Bag of Words and TF-IDF)

One each for Trump and Clinton

In [267]:
# Load from checkpoint
if 'vocab' not in locals():
    vocab = pd.read_parquet('data/vocab.parquet')

In [268]:
bag = ohco[:1] # bag size = 1 speech

In [269]:
# 100 speeches, take 20% for test set
test_ids = np.random.choice(token.reset_index().speech_id.unique(),size=20,replace=False)

In [270]:
token = token.reset_index().set_index('speech_id')
test_token = token.loc[test_ids]
train_token = token.loc[~token.index.isin(test_ids)]

In [271]:
# Split into Trump/Clinton
clinton_train = train_token.query('speaker=="CLINTON"')
trump_train = train_token.query('speaker=="TRUMP"')

In [322]:
clinton_count = len(clinton_train.index.unique())
trump_count = len(trump_train.index.unique())
total_count = clinton_count + trump_count
priors = [clinton_count/total_count,trump_count/total_count]
priors

[0.45121951219512196, 0.5487804878048781]

In [350]:
def create_vocab(token, a=1):
    vocab = token.term_str.value_counts().to_frame().rename(columns={'index':'term_str', 'term_str':'n'})\
    .sort_index().reset_index().rename(columns={'index':'term_str'})
    vocab.index.name = 'term_id'
    vocab['stop'] = 0
    vocab.loc[vocab['term_str'].isin(nltk.corpus.stopwords.words('english')),'stop'] = 1
    if 'term_rank' not in vocab.columns:
        vocab = vocab.sort_values('n', ascending=False).reset_index()
        vocab.index.name = 'term_rank'
        vocab = vocab.reset_index().set_index('term_id')
        vocab['term_rank'] = vocab['term_rank'] + 1 # start with 1 instead of 0
    token['term_id'] = token.term_str.map(vocab.reset_index().set_index('term_str').term_id)
    BOW = token.groupby(bag+['term_id']).term_id.count()\
    .to_frame().rename(columns={'term_id':'n'})
    DTCM = BOW['n'].unstack().fillna(0).astype('int')
    # Term frequency - normalized
    TF = (DTCM.T / DTCM.T.sum()).T
    DF = DTCM[DTCM > 0].count()
    freq=TF.sum().reset_index()
    vocab['freq'] = freq[0]
    tf = vocab.freq # term frequency (normalized)
    N = vocab.freq.sum() # sum of all term frequencies in this class
    V = vocab.shape[0] # vocab size
    # https://stackoverflow.com/questions/3704570/in-python-small-floats-tending-to-zero
    # https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
    # using the log likelihood for these reasons
    vocab['likelihood'] = np.log((tf+a) / (N+ a*V))
    vocab.sort_values('likelihood',ascending=False)
    return vocab

In [351]:
vocab_c = create_vocab(clinton_train).reset_index().set_index('term_str')
vocab_t = create_vocab(trump_train).reset_index().set_index('term_str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [352]:
vocab_c

Unnamed: 0_level_0,term_id,term_rank,n,stop,freq,likelihood
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
to,6228,1,4342,1,1.603061,-7.880558
the,6121,2,4189,1,1.560112,-7.897194
and,449,3,3750,1,1.355956,-7.980299
i,3098,4,2722,1,1.126600,-8.082721
of,4227,5,2062,1,0.767632,-8.267605
...,...,...,...,...,...,...
piecing,4545,6881,1,0,0.000671,-8.836575
piled,4546,6882,1,0,0.000194,-8.837052
eager,2049,6883,1,0,0.000194,-8.837052
pinata,4550,6884,1,0,0.000379,-8.836867


In [353]:
def NB_classifier(token_test, class1_vocab, class2_vocab, priors=[.5,.5]):
    d = {'speech_id': token_test.index.unique(), 'true_value': "", 'prediction': ""}
    results = pd.DataFrame(d).set_index('speech_id')
    
    for i in results.index:
        results['true_value'].loc[i] = test_token.at[i,'speaker'][0]
        
        speech = token_test.loc[i]
        
        speech = speech.merge(class1_vocab['likelihood'], how='inner', on="term_str")\
        .rename(columns={'likelihood': 'class1_prob'})
        speech = speech.merge(class2_vocab['likelihood'], how='inner', on="term_str")\
        .rename(columns={'likelihood': 'class2_prob'})
        
        predict = pd.DataFrame(speech[['class1_prob','class2_prob']].sum())
        logpriors = np.log(priors)
        predict[0] = predict[0] + logpriors
        predict[0] = predict.sort_values(by=0, ascending=False)
        classification = predict.index[0]
        
        results['prediction'].loc[i] = classification
        
    return results

In [355]:
# class 1 is clinton, class 2 is trump

NB_classifier(test_token,vocab_c,vocab_t,priors=priors)

Unnamed: 0_level_0,true_value,prediction
speech_id,Unnamed: 1_level_1,Unnamed: 2_level_1
12,CLINTON,class1_prob
46,TRUMP,class1_prob
144,CLINTON,class1_prob
1,CLINTON,class1_prob
140,TRUMP,class1_prob
34,TRUMP,class1_prob
123,TRUMP,class1_prob
147,CLINTON,class1_prob
154,CLINTON,class1_prob
155,CLINTON,class1_prob
