Analyze State of the Union addresses.
Data source: https://en.wikisource.org/wiki/Portal:State_of_the_Union_Speeches_by_United_States_Presidents
Scrape the text of all speeches, then try to find patterns in each president's speech.

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib.request
import re


# Data source we are going to scrape for results
data_url = 'https://en.wikisource.org/wiki/Portal:State_of_the_Union_Speeches_by_United_States_Presidents'

link_list = []

# Fetch the portal page. BeautifulSoup reads the whole response in its
# constructor, so the connection can be closed as soon as parsing is done.
with urllib.request.urlopen(data_url) as resp:
    soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'))

# Get all links to state of the union addresses from the portal page.
# The two spellings are parenthesised so that the "portal" and "#" exclusions
# apply to both of them; without the parentheses `and` binds tighter than
# `or` and the exclusions only guarded the "union_speech" case.
for link in soup.find_all('a', href=True):
    href = link['href'].lower()
    if ("union_address" in href or "union_speech" in href) \
            and "portal" not in href and "#" not in href:
        # Keep the original (non-lowercased) href for later requests
        link_list.append(link['href'])

# extract the text of a speech from a URL
# text is extracted in a list of paragraphs (strings) for each speech
def get_speech(url):
    """Download one speech page and return its paragraphs.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A list of strings, one per ``<p>`` element on the page, each with
        surrounding whitespace stripped. Paragraphs containing the
        public-domain notice are left out.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    license_notice = 'This work is in the public domain in the United States because it is a work of the United States federal government'
    # Close the HTTP response as soon as the page has been parsed
    with urllib.request.urlopen(url) as resp:
        page = BeautifulSoup(resp)
    # Strip each paragraph once, then filter on the stripped text
    paragraphs = (p.text.strip() for p in page.find_all("p"))
    return [text for text in paragraphs if license_notice not in text]

# Fetch every speech in link_list order; the wikisource host is prefixed to
# each collected href to form the full URL.
speeches = list(map(get_speech,
                    ('https://en.wikisource.org' + link for link in link_list)))

In [2]:
# Extract presidents names from link text.
# Turning '%' into '/' means the third '/'-separated piece ends at the first
# percent-escape (presumably the "%27s" of a possessive - verify against the
# hrefs); underscores in that piece are then turned into spaces.
presidents = []
for link in link_list:
    title_start = link.replace('%', '/').split('/')[2]
    presidents.append(title_start.replace('_', ' '))

# Extract state of the union text entries so we can extract the date.
# An entry is any list item whose text mentions "union" (case-insensitive)
# and contains an opening parenthesis.
sou_entries = []
for item in soup.find_all('li'):
    entry = item.text.strip()
    if 'union' not in entry.lower() or '(' not in entry:
        continue
    sou_entries.append(entry)

# Assemble one row per speech. The year is taken as the SECOND run of digits
# in each list entry (index 1 of the matches) - presumably the first run is
# the day of the month in the parenthesised date; TODO confirm against the
# portal page.
speeches_pd = pd.DataFrame({
    'president': presidents,
    'speech': speeches,
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    'year': [int(re.findall(r'\d+', item)[1]) for item in sou_entries]})
        

In [63]:
# Display ten randomly sampled rows as a quick look at the assembled frame
speeches_pd.sample(n=10)

Unnamed: 0,president,speech,year
230,Donald Trump,"[Mr. Speaker, Mr. Vice President, Members of C...",2018
184,Richard Nixon,[Twenty-five years ago I sat here as a freshma...,1972
51,Martin Van Buren,[Fellow-Citizens of the Senate and House of Re...,1840
131,Woodrow Wilson,"[GENTLEMEN OF THE CONGRESS:, When I addressed ...",1920
45,Andrew Jackson,[Fellow Citizens of the Senate and of the Hous...,1834
62,Millard Fillmore,[Fellow-Citizens of the Senate and of the Hous...,1851
42,Andrew Jackson,[Fellow Citizens of the Senate and of the Hous...,1831
147,Franklin Delano Roosevelt,"[Mr. President, Mr. Speaker, Members of the Co...",1937
53,John Tyler,[To the Senate and House of Representatives of...,1842
7,George Washington,[Fellow Citizens of the Senate and of the Hous...,1796


## Pre-process and TfIdf Vectorize Speeches

In [59]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Module-level WordNet lemmatizer instance, used by lemmatize_with_pos
lmtzr = WordNetLemmatizer()

# Convert nltk POS to POS that can be used by lemmatizer
def get_wordnet_pos(treebank_tag):
    """Map a POS tag to the matching WordNet POS constant.

    Tags starting with J, V, N or R map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively. Any other tag yields ''.
    """
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            # Looked up only on a match, as in a plain if/elif chain
            return getattr(wordnet, attr_name)
    return ''

# Lemmatize a word with its appropriate POS tag
def lemmatize_with_pos(word, testing=False):
    """Return the lemma of ``word`` using a part of speech guessed for it.

    The word is tagged on its own (as a one-element token list), so the
    tagger sees no surrounding sentence context.

    Args:
        word: Token to lemmatize.
        testing: When truthy, print the WordNet POS chosen for the word.

    Returns:
        The lemma returned by the module-level lemmatizer ``lmtzr``.
    """
    pos = get_wordnet_pos(nltk.pos_tag([word])[0][1])
    if testing:
        print('POS: ' + pos)
    # An empty POS means the tag had no WordNet equivalent; in that case let
    # the lemmatizer use its default part of speech.
    if pos:
        return lmtzr.lemmatize(word, pos)
    return lmtzr.lemmatize(word)
    
# English stop words from the stop_words package, plus a few extra words to
# exclude from this analysis. `+=` extends the returned list in place.
MyStopWords = get_stop_words('en')
MyStopWords += ['make', 'will', 'can', 'applause']

In [5]:
# Smoke test with a made-up token: prints the POS chosen and shows the lemma
lemmatize_with_pos('dsfs',testing=True)

POS: n


'dsfs'

Clean, Tokenize, Lemmatize, and Vectorize all speeches 

In [61]:
# Every cell of the 'speech' column holds one speech as a list of paragraph
# strings; collapse each list into a single space-separated string.
speech_list = speeches_pd['speech'].map(" ".join).tolist()

In [62]:
# WARNING - this step may take a few minutes
# For each speech: tokenize, keep purely alphabetic tokens, lowercase them,
# drop stop words, and lemmatize what is left.
# The stop words are copied into a set once so each membership test is a hash
# lookup instead of a scan of the whole list.
stop_word_set = set(MyStopWords)
tokens = []
for speech in speech_list:
    # Lowercase once per token; isalpha() is checked on the original token
    lowered = (word.lower() for word in word_tokenize(speech) if word.isalpha())
    tokens.append([lemmatize_with_pos(word) for word in lowered
                   if word not in stop_word_set])

In [65]:
# Tfidf Vectorize
# Each speech's lemmas are re-joined into one string so the vectorizer can
# tokenize them itself; only single words (1-grams) are used as features.
tvec = TfidfVectorizer(ngram_range=(1,1))
doc_term = tvec.fit_transform([" ".join(tok) for tok in tokens])

# get_feature_names() was removed from scikit-learn in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# Dense frame: one row per speech, one column per vocabulary term
tfidf = pd.DataFrame(doc_term.toarray(), columns=feature_names)

In [66]:
# (number of speeches, number of vocabulary terms)
print(tfidf.shape)

(231, 16934)


In [69]:
## LDA Topic Modeling
# Build an 8-component model using the online learning method, capped at
# 60 iterations, and fit it on the TF-IDF frame.
lda_settings = {
    'n_components': 8,
    'max_iter': 60,
    'learning_method': 'online',
}
speech_lda = LatentDirichletAllocation(**lda_settings)
speech_lda.fit(tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=60, mean_change_tol=0.001,
             n_components=8, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [70]:
# Print the top words for topics in a topic model
def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted features.

    Args:
        model: Object exposing ``components_``; each row is treated as one
            topic's weights over the features.
        feature_names: Sequence indexable by feature position.
        n_top_words: How many features to list for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in ranked]
        print("Topic #{}: ".format(topic_idx) + " | ".join(top_words))
    # Single blank line after the full listing
    print()

# List the nine highest-weighted terms of every topic in the fitted model
print_top_words(speech_lda, tfidf.columns.values, 9)

Topic #0: state | government | may | united | upon | country | law | interest | last
Topic #1: state | country | desideratum | upon | without | occasion | congress | united | important
Topic #2: state | supplementation | multiply | dailiness | greek | sly | shall | june | domain
Topic #3: state | upon | government | may | great | law | congress | public | united
Topic #4: state | government | upon | congress | united | may | year | present | one
Topic #5: state | government | year | congress | people | nation | united | great | country
Topic #6: government | law | united | sight | invasion | withstand | bookkeeping | vacillation | state
Topic #7: state | may | government | public | great | year | interest | war | make

