In [1]:
import pandas as pd

# Data

In [2]:
# Read the cleaned transcripts; the first CSV column becomes the row index
transcripts = pd.read_csv(
    filepath_or_buffer='data/cln_transcripts.csv',
    index_col=0,
)
# Preview the first five rows
transcripts.head(5)

Unnamed: 0,Title,Date,President,Type,Transcript
0,"April 30, 1789: First Inaugural Address",1789-04-30,George Washington,inauguration,Fellow Citizens of the Senate and the House of...
1,"October 3, 1789: Thanksgiving Proclamation",1789-10-03,George Washington,other,Whereas it is the duty of all Nations to ackno...
2,"January 8, 1790: First Annual Message to Congress",1790-01-08,George Washington,state_union,Fellow Citizens of the Senate and House of Rep...
3,"December 8, 1790: Second Annual Message to Con...",1790-12-08,George Washington,state_union,Fellow citizens of the Senate and House of Rep...
4,"December 29, 1790: Talk to the Chiefs and Coun...",1790-12-29,George Washington,other,"I the President of the United States, by my ow..."


In [3]:
# Text data: the 'Transcript' column, one entry per row of `transcripts`
documents = transcripts.loc[:, 'Transcript']

In [4]:
# Pick one transcript (key 950) as the sample document for the examples below
doc = documents[950]

# Extract Phrases
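Extract the most distinctive three-word phrases from each address: trigrams that occur at least a minimum number of times, ranked by pointwise mutual information (PMI).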

In [5]:
import nltk

In [64]:
def top_phrase(document, n=3):
    """Return the highest-PMI trigram in ``document`` as a single string.

    The document is split into word tokens (punctuation is dropped), trigrams
    occurring fewer than ``n`` times are discarded, and the remaining trigram
    with the highest pointwise mutual information (PMI) is returned.

    Parameters
    ----------
    document : str
        Text to search.
    n : int, default 3
        Minimum number of occurrences a trigram needs to be considered.

    Returns
    -------
    str
        The three words of the best trigram joined by single spaces, or ``''``
        when no trigram occurs at least ``n`` times.
    """
    # tokenizer that keeps runs of word characters, so punctuation is ignored
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(document)
    # association measures used to score trigram co-occurrences
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    # nltk collocation finder built from the document's tokens
    finder = nltk.collocations.TrigramCollocationFinder.from_words(words)
    # keep only trigrams that appear at least n times
    finder.apply_freq_filter(n)
    best = finder.nbest(trigram_measures.pmi, 1)
    # nothing survived the frequency filter: report "no phrase" explicitly
    # instead of hiding every possible error behind a bare except
    if not best:
        return ''
    return ' '.join(best[0])

In [65]:
# Try the extractor on the sample document
top_phrase(doc)

'middle class families'

In [54]:
def phrase_finder(document, n=3, m=1):
    """Return the ``m`` highest-PMI trigrams in ``document`` as strings.

    The document is split into word tokens (punctuation is dropped), trigrams
    occurring fewer than ``n`` times are discarded, and the remaining trigrams
    are ranked by pointwise mutual information (PMI).

    Parameters
    ----------
    document : str
        Text to search.
    n : int, default 3
        Minimum number of occurrences a trigram needs to be considered.
    m : int, default 1
        Maximum number of phrases to return.

    Returns
    -------
    list of str
        Up to ``m`` phrases, best first, each being the trigram's three words
        joined by single spaces. Empty when no trigram occurs at least ``n``
        times.
    """
    # tokenizer that keeps runs of word characters, so punctuation is ignored
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(document)
    # association measures used to score trigram co-occurrences
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    # nltk collocation finder built from the document's tokens
    finder = nltk.collocations.TrigramCollocationFinder.from_words(words)
    # keep only trigrams that appear at least n times
    finder.apply_freq_filter(n)
    # nbest yields an empty list when nothing passes the filter, so no
    # catch-all exception handler is needed to cover the "no phrase" case
    top_phrases = finder.nbest(trigram_measures.pmi, m)
    return [' '.join(phrase) for phrase in top_phrases]

In [60]:
# Same sample document, this time returning a list of phrases
phrase_finder(doc)

['middle class families']

## Extraction

In [51]:
# Top phrase for every transcript (reuses the `documents` Series selected earlier)
documents.apply(top_phrase)

0                       in which I
1                    the People of
2                the United States
3         House of Representatives
4                    to treat with
5                  laid before you
6                    of the States
7                  laid before you
8                the United States
9                                 
10               the United States
11        House of Representatives
12             to their respective
13               the United States
14        House of Representatives
15              against the United
16                  be laid before
17              advice and consent
18            general or principal
19                     when we may
20            his Catholic Majesty
21                     it shall be
22        House of Representatives
23     The commissioners appointed
24                 which they have
25        House of Representatives
26     the commissioners appointed
27               the United States
28               the

# Parts of Speech

In [61]:
import spacy

In [62]:
# Load the English pipeline by its full package name: the 'en' shortcut link
# no longer exists in spaCy v3+, while the package name works in v2 as well.
# NOTE(review): assumes 'en' pointed at the default small model — confirm.
nlp = spacy.load('en_core_web_sm')

In [71]:
def phrase_parts(phrase):
    """Return the part-of-speech tag (``token.pos_``) of each token in ``phrase``.

    The phrase is parsed with the notebook-level ``nlp`` pipeline; the result
    is a list with one tag per token, in token order.
    """
    tags = []
    for token in nlp(phrase):
        tags.append(token.pos_)
    return tags

In [73]:
# Part-of-speech tags for the sample document's top phrase
phrase_parts(top_phrase(doc))

['ADJ', 'NOUN', 'NOUN']

In [70]:
# Draw the dependency parse of a phrase.
# `phrase` was never defined at notebook level, so this cell raised NameError
# on a fresh kernel; the sample document's top phrase is used instead.
# NOTE(review): presumably this is the phrase that was intended — confirm.
spacy.displacy.render(nlp(top_phrase(doc)), style='dep')

In [74]:
# For each token print: text, fine-grained tag, syntactic head, dependency label
text = 'middle class families'
text_doc = nlp(text)
for token in text_doc:
    print(' '.join((token.text, token.tag_, token.head.text, token.dep_)))

middle JJ class amod
class NN families compound
families NNS families ROOT
