In [83]:
import pandas as pd
import numpy as np
from glob import glob
import re

In [84]:
# Index levels of the corpus hierarchy, ordered from outermost to innermost.
# Slices of this list name the index at each stage of the pipeline.
ohco = [
    'speech_id',  # one transcript
    'speaker',    # who is talking
    'para_id',    # paragraph within the speech
    'sent_id',    # sentence within the paragraph
    'token_id',   # token within the sentence
]

In [85]:
# Load the speech transcripts from JSON into a DataFrame
# (columns seen in the preview: link, title, date, person, transcript).
df = pd.read_json('data/ucsb_speeches_2016.json')

In [86]:
# Preview the first rows of the raw data as loaded.
df.head()

Unnamed: 0,link,title,date,person,transcript
0,/documents/remarks-town-hall-meeting-portsmout...,"Remarks at a Town Hall Meeting in Portsmouth, ...",2015-12-29 00:00:00+00:00,Hillary Clinton,\nCLINTON: Wow. Thank you. Thank you all. Than...
1,/documents/remarks-the-university-minnesota-mi...,Remarks at the University of Minnesota in Minn...,2015-12-15 00:00:00+00:00,Hillary Clinton,\nThank you. Thank you all very much. Thank yo...
2,/documents/interview-with-george-stephanopoulo...,Interview with George Stephanopoulos of ABC Ne...,2015-12-06 00:00:00+00:00,Hillary Clinton,\nSTEPHANOPOULOS: And we'll hear more on that ...
3,/documents/interview-with-charlie-rose,Interview with Charlie Rose,2015-12-01 00:00:00+00:00,Hillary Clinton,"\nROSE: She is a former first lady, a former s..."
4,/documents/remarks-and-question-and-answer-ses...,Remarks and a Question and Answer Session at t...,2015-11-19 00:00:00+00:00,Hillary Clinton,\nCLINTON: Thank you. Thank you very much. [ap...


In [87]:
# Name the row index so it can serve as the outermost OHCO level, then keep the
# per-speech metadata (everything except the transcript text) in its own table.
df.index = df.index.rename('speech_id')
library = df.loc[:, ['link', 'title', 'date', 'person']]

In [88]:
# First OHCO level: speaker. Explicit labels look like "CLINTON:".
# Start by giving every speech a default speaker: the upper-cased last word of
# `person` (e.g. "Hillary Clinton" -> "CLINTON"). Paragraph-level speakers are
# assigned in a later cell.
# Splitting on whitespace and taking the last piece avoids two problems with
# `rsplit(' ', 1)` followed by `x[1]`: an IndexError for single-word names, and
# the positional `n` argument, which newer pandas releases no longer accept.
df['speaker'] = df['person'].str.upper().str.split().str[-1]

In [89]:
# Extend the existing speech_id index with the speaker column, giving a
# (speech_id, speaker) MultiIndex.
df = df.set_index('speaker', append=True)

In [90]:
# Second OHCO level: paragraphs.
# Trim leading/trailing whitespace (including "\n") from each transcript, cut it
# at every "\n" (the paragraph separator in these transcripts), and stack the
# resulting columns so that each paragraph becomes its own row.
df = (
    df['transcript']
    .str.strip()
    .str.split("\n", expand=True)
    .stack()
    .rename('para_str')
    .to_frame()
)
df.index = df.index.set_names(ohco[0:3])

In [91]:
# Move speaker out of the index and back into a column so it can be rewritten;
# the index that remains is (speech_id, para_id).
df = df.reset_index(level='speaker')

In [92]:
# Whenever a paragraph carries a caps name label, e.g. "CLINTON:",
# use that name as the speaker until the next caps name.
# The forward fill is done per speech so that a speaker never carries over from
# the end of one transcript into the start of the next. Paragraphs that come
# before the first label of a speech (or speeches with no labels at all) keep
# the default speaker already stored in the `speaker` column instead of NaN.
df['speaker'] = (
    df['para_str'].str.extract(r'([A-Z]+)(:)')[0]
    .groupby(level='speech_id').ffill()
    .fillna(df['speaker'])
)

In [93]:
# Put speaker back into the index and arrange the levels in OHCO order:
# (speech_id, speaker, para_id).
df = df.set_index('speaker', append=True).reorder_levels(ohco[0:3])

In [94]:
# Remove the caps name labels (e.g. "CLINTON: ") from the paragraph text.
# regex=True is passed explicitly: with the newer pandas default (regex=False)
# the pattern would be matched as a literal string and nothing would be removed.
df['para_str'] = df['para_str'].str.replace(r'[A-Z]+: ', '', regex=True)

In [98]:
# Third OHCO level: sentences.

# Alvarado used the NLTK sentence tokenizer to split sentences,
# but as a first, lazy approach we simply cut at runs of '.', '!' or '?'.
# Stacking the split columns yields one row per sentence fragment.
df = (
    df['para_str']
    .str.split("[.!?]+", expand=True)
    .stack()
    .rename('sent_str')
    .to_frame()
)
df.index = df.index.set_names(ohco[0:4])

In [101]:
# Drop empty sentence fragments (usually at the end of a paragraph).
# Fragments are stripped first: splitting on punctuation leaves leading spaces
# and whitespace-only pieces, which a plain length check would let through.
df = df.assign(sent_str=df['sent_str'].str.strip())
df = df[df['sent_str'].str.len() > 0]

In [102]:
# Preview the sentence-level table (index: speech_id, speaker, para_id, sent_id).
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sent_str
speech_id,speaker,para_id,sent_id,Unnamed: 4_level_1
0,CLINTON,0,0,Wow
0,CLINTON,0,1,Thank you
0,CLINTON,0,2,Thank you all
0,CLINTON,0,3,Thank you
0,CLINTON,0,4,I am really delighted to be here on the first...


In [103]:
# fourth ohco level - tokens
import nltk

In [104]:
# Fourth OHCO level: tokens.
# Each sentence is split on whitespace and POS-tagged with NLTK, giving one row
# of (token, tag) tuples per sentence; stacking turns those columns into a new
# innermost index level, i.e. one row per token.
tokenizer = nltk.WhitespaceTokenizer()  # build once instead of once per sentence
token = df['sent_str'].apply(
        # dtype=object keeps empty sentences from triggering the empty-Series dtype warning
        lambda x: pd.Series(nltk.pos_tag(tokenizer.tokenize(x)), dtype=object))\
    .stack().to_frame()\
    .rename(columns={0:'pos_tuple'})
token['pos'] = token.pos_tuple.apply(lambda x: x[1])
token['token_str'] = token.pos_tuple.apply(lambda x: x[0])
# Use the `columns=` keyword: passing the axis positionally to DataFrame.drop is
# deprecated and rejected by newer pandas releases.
token = token.drop(columns='pos_tuple')

  """Entry point for launching an IPython kernel.


In [105]:
# Label all five index levels with the OHCO names; the token table is complete.
token.index = token.index.set_names(ohco)

In [106]:
# Preview the token-level table (columns: pos, token_str).
token.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,token_str
speech_id,speaker,para_id,sent_id,token_id,Unnamed: 5_level_1,Unnamed: 6_level_1
0,CLINTON,0,0,0,NN,Wow
0,CLINTON,0,1,0,NN,Thank
0,CLINTON,0,1,1,PRP,you
0,CLINTON,0,2,0,NNP,Thank
0,CLINTON,0,2,1,PRP,you


In [107]:
# Normalise tokens into terms: lowercase, then remove non-word characters and
# underscores. The pattern is a raw string so `\W` is not an invalid escape, and
# regex=True is explicit because the newer pandas default (regex=False) would
# treat the pattern as a literal and leave punctuation in place.
token['term_str'] = token['token_str'].str.lower().str.replace(r'[\W_]', '', regex=True)

In [109]:
# Checkpoint: write the token table and the speech metadata table to Parquet
# so that later cells can reload them (see the load cell that follows).
token.to_parquet('data/token.parquet')
library.to_parquet('data/library.parquet')

In [112]:
# Load from checkpoint: read each table back from Parquet only if it is not
# already defined in this session.
try:
    token
except NameError:
    token = pd.read_parquet('data/token.parquet')

try:
    library
except NameError:
    library = pd.read_parquet('data/library.parquet')

In [113]:
# create vocab table
# 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,token_str,term_str
speech_id,speaker,para_id,sent_id,token_id,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,CLINTON,0,0,0,NN,Wow,wow
0,CLINTON,0,1,0,NN,Thank,thank
0,CLINTON,0,1,1,PRP,you,you
0,CLINTON,0,2,0,NNP,Thank,thank
0,CLINTON,0,2,1,PRP,you,you
0,CLINTON,0,2,2,DT,all,all
0,CLINTON,0,3,0,NN,Thank,thank
0,CLINTON,0,3,1,PRP,you,you
0,CLINTON,0,4,0,PRP,I,i
0,CLINTON,0,4,1,VBP,am,am
