### Classifying Inaugural Speeches

NLTK includes a corpus containing the inaugural speeches of U.S. presidents from 1789 to 2009.

In [1]:
from nltk.corpus import inaugural
# List the file identifier of every speech in the corpus
# (as the output shows, each identifier has the form "<year>-<president>.txt")
inaugural.fileids()

[u'1789-Washington.txt',
 u'1793-Washington.txt',
 u'1797-Adams.txt',
 u'1801-Jefferson.txt',
 u'1805-Jefferson.txt',
 u'1809-Madison.txt',
 u'1813-Madison.txt',
 u'1817-Monroe.txt',
 u'1821-Monroe.txt',
 u'1825-Adams.txt',
 u'1829-Jackson.txt',
 u'1833-Jackson.txt',
 u'1837-VanBuren.txt',
 u'1841-Harrison.txt',
 u'1845-Polk.txt',
 u'1849-Taylor.txt',
 u'1853-Pierce.txt',
 u'1857-Buchanan.txt',
 u'1861-Lincoln.txt',
 u'1865-Lincoln.txt',
 u'1869-Grant.txt',
 u'1873-Grant.txt',
 u'1877-Hayes.txt',
 u'1881-Garfield.txt',
 u'1885-Cleveland.txt',
 u'1889-Harrison.txt',
 u'1893-Cleveland.txt',
 u'1897-McKinley.txt',
 u'1901-McKinley.txt',
 u'1905-Roosevelt.txt',
 u'1909-Taft.txt',
 u'1913-Wilson.txt',
 u'1917-Wilson.txt',
 u'1921-Harding.txt',
 u'1925-Coolidge.txt',
 u'1929-Hoover.txt',
 u'1933-Roosevelt.txt',
 u'1937-Roosevelt.txt',
 u'1941-Roosevelt.txt',
 u'1945-Roosevelt.txt',
 u'1949-Truman.txt',
 u'1953-Eisenhower.txt',
 u'1957-Eisenhower.txt',
 u'1961-Kennedy.txt',
 u'1965-Johnson.tx

If we want to see the words and/or sentences of these speeches we use the following commands:

In [2]:
# The speech that the following example cells inspect
speech = '2009-Obama.txt'

# Here is the list of sentences, as split by the corpus reader.
# Each sentence is a list of tokens
inaugural.sents(speech)

[[u'My', u'fellow', u'citizens', u':'], [u'I', u'stand', u'here', u'today', u'humbled', u'by', u'the', u'task', u'before', u'us', u',', u'grateful', u'for', u'the', u'trust', u'you', u'have', u'bestowed', u',', u'mindful', u'of', u'the', u'sacrifices', u'borne', u'by', u'our', u'ancestors', u'.'], ...]

In [3]:
# Here is the first sentence (index 0), as a list of tokens
inaugural.sents(speech)[0]

[u'My', u'fellow', u'citizens', u':']

In [4]:
# Here is the second sentence (index 1), as a list of tokens
inaugural.sents(speech)[1]

[u'I',
 u'stand',
 u'here',
 u'today',
 u'humbled',
 u'by',
 u'the',
 u'task',
 u'before',
 u'us',
 u',',
 u'grateful',
 u'for',
 u'the',
 u'trust',
 u'you',
 u'have',
 u'bestowed',
 u',',
 u'mindful',
 u'of',
 u'the',
 u'sacrifices',
 u'borne',
 u'by',
 u'our',
 u'ancestors',
 u'.']

In [6]:
# And here is the flat list of all tokens of the speech
# (no sentence boundaries; list() turns the corpus view into a plain list)
list(inaugural.words(speech))

[u'My',
 u'fellow',
 u'citizens',
 u':',
 u'I',
 u'stand',
 u'here',
 u'today',
 u'humbled',
 u'by',
 u'the',
 u'task',
 u'before',
 u'us',
 u',',
 u'grateful',
 u'for',
 u'the',
 u'trust',
 u'you',
 u'have',
 u'bestowed',
 u',',
 u'mindful',
 u'of',
 u'the',
 u'sacrifices',
 u'borne',
 u'by',
 u'our',
 u'ancestors',
 u'.',
 u'I',
 u'thank',
 u'President',
 u'Bush',
 u'for',
 u'his',
 u'service',
 u'to',
 u'our',
 u'nation',
 u',',
 u'as',
 u'well',
 u'as',
 u'the',
 u'generosity',
 u'and',
 u'cooperation',
 u'he',
 u'has',
 u'shown',
 u'throughout',
 u'this',
 u'transition',
 u'.',
 u'Forty',
 u'-',
 u'four',
 u'Americans',
 u'have',
 u'now',
 u'taken',
 u'the',
 u'presidential',
 u'oath',
 u'.',
 u'The',
 u'words',
 u'have',
 u'been',
 u'spoken',
 u'during',
 u'rising',
 u'tides',
 u'of',
 u'prosperity',
 u'and',
 u'the',
 u'still',
 u'waters',
 u'of',
 u'peace',
 u'.',
 u'Yet',
 u',',
 u'every',
 u'so',
 u'often',
 u'the',
 u'oath',
 u'is',
 u'taken',
 u'amidst',
 u'gathering',
 u'c

In [10]:
import nltk

# And here is the raw text of the speech, as a single string
raw_text = inaugural.raw(speech)

# And as a reminder, here are the NLTK commands for
# splitting the text into sentences, or tokenizing it
# (See part A for more details)
sentences = nltk.sent_tokenize(raw_text)
tokens = nltk.word_tokenize(raw_text)
# Wrap the token list in an nltk.Text object
nltk_text = nltk.Text(tokens)

In [14]:
# Here is the list of (non-tokenized) sentences: each sentence is one string.
# Note that nltk.sent_tokenize keeps the salutation "My fellow citizens:" as
# part of the first sentence, unlike inaugural.sents(speech) above.
sentences

[u'My fellow citizens:\n\nI stand here today humbled by the task before us, grateful for the trust you have bestowed, mindful of the sacrifices borne by our ancestors.',
 u'I thank President Bush for his service to our nation, as well as the generosity and cooperation he has shown throughout this transition.',
 u'Forty-four Americans have now taken the presidential oath.',
 u'The words have been spoken during rising tides of prosperity and the still waters of peace.',
 u'Yet, every so often the oath is taken amidst gathering clouds and raging storms.',
 u'At these moments, America has carried on not simply because of the skill or vision of those in high office, but because We the People have remained faithful to the ideals of our forbearers, and true to our founding documents.',
 u'So it has been.',
 u'So it must be with this generation of Americans.',
 u'That we are in the midst of crisis is now well understood.',
 u'Our nation is at war, against a far-reaching network of violence and

In [13]:
# And here is an example of doing POS tagging on the second sentence.
# "Second" refers to the nltk.sent_tokenize split (sentences[1]), which is not
# the same sentence as inaugural.sents(speech)[1] shown earlier.
sent_tokens = nltk.word_tokenize(sentences[1])
# The result is a list of (token, part-of-speech tag) pairs
nltk.pos_tag(sent_tokens)

[(u'I', 'PRP'),
 (u'thank', 'VBP'),
 (u'President', 'NNP'),
 (u'Bush', 'NNP'),
 (u'for', 'IN'),
 (u'his', 'PRP$'),
 (u'service', 'NN'),
 (u'to', 'TO'),
 (u'our', 'PRP$'),
 (u'nation', 'NN'),
 (u',', ','),
 (u'as', 'RB'),
 (u'well', 'RB'),
 (u'as', 'IN'),
 (u'the', 'DT'),
 (u'generosity', 'NN'),
 (u'and', 'CC'),
 (u'cooperation', 'NN'),
 (u'he', 'PRP'),
 (u'has', 'VBZ'),
 (u'shown', 'VBN'),
 (u'throughout', 'IN'),
 (u'this', 'DT'),
 (u'transition', 'NN'),
 (u'.', '.')]

### Exercise

You are asked to identify the words that are most indicative of the Presidential inaugural speeches of a given period (a single year, or a range of years). 

For this task, you will have to do the following:
* Select the target speeches
* Treat each sentence of every speech as a document; if the sentence comes from one of the target speeches, mark it as positive, otherwise mark it as negative
* Create a dataset that contains the words that appear in each "positive" and in each "negative" sentence; filter the words so that we only see words that appear in a sufficiently large number of sentences.
* Train a classifier
* See the most informative words

In [15]:
# The "target" (positive) class: every inaugural speech delivered from 1789
# through 1901, i.e. the 18th- and 19th-century speeches, up to and including
# McKinley's 1901 address.
target_speeches = [
    u'1789-Washington.txt',
    u'1793-Washington.txt',
    u'1797-Adams.txt',
    u'1801-Jefferson.txt',
    u'1805-Jefferson.txt',
    u'1809-Madison.txt',
    u'1813-Madison.txt',
    u'1817-Monroe.txt',
    u'1821-Monroe.txt',
    u'1825-Adams.txt',
    u'1829-Jackson.txt',
    u'1833-Jackson.txt',
    u'1837-VanBuren.txt',
    u'1841-Harrison.txt',
    u'1845-Polk.txt',
    u'1849-Taylor.txt',
    u'1853-Pierce.txt',
    u'1857-Buchanan.txt',
    u'1861-Lincoln.txt',
    u'1865-Lincoln.txt',
    u'1869-Grant.txt',
    u'1873-Grant.txt',
    u'1877-Hayes.txt',
    u'1881-Garfield.txt',
    u'1885-Cleveland.txt',
    u'1889-Harrison.txt',
    u'1893-Cleveland.txt',
    u'1897-McKinley.txt',
    u'1901-McKinley.txt',
]

# Every other speech in the corpus belongs to the "non-target" (negative) class
non_target_speeches = [
    fileid
    for fileid in inaugural.fileids()
    if fileid not in target_speeches
]
non_target_speeches

[u'1905-Roosevelt.txt',
 u'1909-Taft.txt',
 u'1913-Wilson.txt',
 u'1917-Wilson.txt',
 u'1921-Harding.txt',
 u'1925-Coolidge.txt',
 u'1929-Hoover.txt',
 u'1933-Roosevelt.txt',
 u'1937-Roosevelt.txt',
 u'1941-Roosevelt.txt',
 u'1945-Roosevelt.txt',
 u'1949-Truman.txt',
 u'1953-Eisenhower.txt',
 u'1957-Eisenhower.txt',
 u'1961-Kennedy.txt',
 u'1965-Johnson.txt',
 u'1969-Nixon.txt',
 u'1973-Nixon.txt',
 u'1977-Carter.txt',
 u'1981-Reagan.txt',
 u'1985-Reagan.txt',
 u'1989-Bush.txt',
 u'1993-Clinton.txt',
 u'1997-Clinton.txt',
 u'2001-Bush.txt',
 u'2005-Bush.txt',
 u'2009-Obama.txt']

In [19]:
# Build the labeled dataset, with one entry per sentence.
# Every sentence of a target speech is labeled "pos"; every sentence of any
# other speech is labeled "neg".
#
# Each entry of `data` is a tuple (label, sentence). With nltk.sent_tokenize the
# sentence is a raw (non-tokenized) string; with the commented-out alternative
# below it would be a list of tokens instead.
#
# NOTE: this loop rebinds `speech`, `raw_text` and `sentences`, which the
# example cells earlier in the notebook also use.
data = []

# One loop serves both classes, so the extraction logic is written only once
for label, speeches in (("pos", target_speeches), ("neg", non_target_speeches)):
    for speech in speeches:
        # If we want to operate with the raw text
        raw_text = inaugural.raw(speech)
        sentences = nltk.sent_tokenize(raw_text)
        # Or, alternatively, to add the already tokenized sentences
        # sentences = list(inaugural.sents(speech))
        for sent in sentences:
            data.append((label, sent))

In [20]:
# This is the total number of labeled sentences (positive and negative)
len(data)

4839

In [21]:
# Count the sentences that carry the positive label
sum(1 for label, _ in data if label == 'pos')

2159

In [23]:
# Count the sentences that carry the negative label
sum(1 for label, _ in data if label == 'neg')

2680