Analyze state of the union addresses. 
Data source: https://en.wikisource.org/wiki/Portal:State_of_the_Union_Speeches_by_United_States_Presidents
scrape the text of all speeches and then maybe try to find patterns of speech of each president?

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib.request
import re

# Data source we are going to scrape for results
data_url = 'https://en.wikisource.org/wiki/Portal:State_of_the_Union_Speeches_by_United_States_Presidents'

link_list = []

resp = urllib.request.urlopen(data_url)
soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'))

# Get all links to state of the union addresses from 
for link in soup.find_all('a', href=True):
    if "union_address" in link['href'].lower() or "union_speech" in link['href'].lower() \
        and "portal" not in link['href'].lower() and "#" not in link['href'].lower():
        link_list.append(link['href'])

# extract the text of a speech from a URL
# text is extracted in a list of paragraphs (strings) for each speech
def get_speech(url):
    return([ p.text.strip() for p in BeautifulSoup(urllib.request.urlopen(url)).find_all("p") if \
             'This work is in the public domain in the United States because it is a work of the United States federal government' \
             not in p.text.strip()])

speeches = [get_speech('https://en.wikisource.org' + link) for link in link_list]

In [2]:
# Extract presidents names from link text
presidents = [ link.replace('%','/').split('/')[2].replace('_',' ') for link in link_list ]

# Extract state of the union text entries so we can extract the date
sou_entries = []
for item in soup.find_all('li'):
    if 'union' in item.text.strip().lower() and '(' in  item.text.strip().lower():
        sou_entries.append(item.text.strip())

speeches_pd = pd.DataFrame({
                'president' : presidents,
                'speech' : speeches,
                'year' : [int(re.findall('\d+',item)[1]) for item in sou_entries ]} )
        

In [3]:
speeches_pd.head(10)

Unnamed: 0,president,speech,year
0,George Washington,[I embrace with great satisfaction the opportu...,1790
1,George Washington,[Fellow-Citizens of the Senate and the House o...,1790
2,George Washington,[Fellow-Citizens of the Senate and the House o...,1791
3,George Washington,[Fellow-Citizens of the Senate and of the Hous...,1792
4,George Washington,[Fellow Citizens of the Senate and of the Hous...,1793
5,George Washington,[Fellow Citizens of the Senate and of the Hous...,1794
6,George Washington,[Fellow Citizens of the Senate and of the Hous...,1795
7,George Washington,[Fellow Citizens of the Senate and of the Hous...,1796
8,John Adams,[I was for some time apprehensive that it woul...,1797
9,John Adams,[Gentlemen of the Senate and Gentlemen of the ...,1798


## Pre-process and TfIdf Vectorize Speeches

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()

# Convert nltk POS to POS that can be used by lemmatizer
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

# Lemmatize a word with its appropriate POS tag
def lemmatize_with_pos(word,testing=False):
    pos = get_wordnet_pos(nltk.pos_tag([word])[0][1])
    if testing==True:
        print('POS: ' + pos)
    if pos != '':
        return(lmtzr.lemmatize(word,pos))
    else:
        return(lmtzr.lemmatize(word))
    
MyStopWords = get_stop_words('en')

In [None]:
lemmatize_with_pos('dsfs',testing=True)

POS: n


'dsfs'

In [None]:
# Clean, Tokenize, Lemmatize, and Vectorize all speeches 
speech_list = ["".join(speech) for speech in speeches_pd['speech'].tolist() ]

tokens = [[lemmatize_with_pos(word.lower()) for word in word_tokenize(speech) if \
          word.isalpha() and word.lower() not in MyStopWords ] for \
          speech in speech_list ]


In [None]:
speech_list[:1000]

In [None]:
tvec = TfidfVectorizer(ngram_range=(1,2))

In [None]:
tfidf = pd.DataFrame(tvec.fit_transform([" ".join(tok) for tok in tokens ]).toarray(),
                    columns  = tvec.get_feature_names())

In [55]:
print(tfidf.shape)

(231, 23773)


In [None]:
## LDA Topic Modeling
speech_lda = LatentDirichletAllocation(n_components=5,
                                    #   max_iter=50,
                                learning_method='online')
speech_lda.fit(tfidf)

In [None]:
# Print the top words for topics in a topic model
def print_top_words(model, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " | ".join([model.columns.values[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()    

print_top_words(speech_lda, 10)