# An analysis of the State of the Union speeches - Part 3
# Word analysis

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter
import shelve

# Global matplotlib styling, applied to any figure drawn after this cell.
# NOTE(review): Matplotlib 3.6 deprecated the bare 'seaborn-*' style names in favour of
# 'seaborn-v0_8-*' — confirm the installed version still accepts this name.
plt.style.use('seaborn-dark')
# Default figure size in inches, as (width, height).
plt.rcParams['figure.figsize'] = (10, 6)

Load data we need from previous runs

In [28]:
# Load the speech metadata table stored under the 'addresses_df' key by an earlier run.
# The word-matrix loop further down reads its 'President' and 'Date' columns.
addresses_df = pd.read_hdf('results/addresses_df_2.h5', 'addresses_df')

From notebook 2, we already have a single set of unique words across all speeches. Let's fetch that. 

In [29]:
# Fetch the collection of unique stemmed words saved in the 'vars2' shelve store
# (presumably a set written by notebook 2 — verify against that notebook).
with shelve.open('results/vars2') as db:    
    all_uq_words = db['all_uq_stemmed_words']
    
# Freeze the vocabulary into a list so it has one fixed order that can be reused
# for the word columns of the speech matrix.
vocab_list = list(all_uq_words)

Now we create a word matrix, whose columns are the word vectors of the individual speeches. The word vector of a speech contains, for every word in the vocabulary collected across the entire document set, the number of times that word occurs in that speech.

In [45]:
def word_vector(doc, vocab):
    """Return a word vector for the input document in the context of a given vocabulary.

    Parameters
    ----------
    doc : iterable of words
        The already tokenized document whose words are counted.

    vocab : iterable of words
        The vocabulary that defines the positions of the returned vector.
        It is iterated exactly once, in its own iteration order.

    Returns
    -------
    list of int
        A list with one entry per item of `vocab`, holding the number of times
        the word at that position occurs in `doc`. Words of `doc` that are not
        part of `vocab` are ignored.

    Example
    -------
    >>> doc = "b c b c e".split()
    >>> vocab = "a b c d e f".split()
    >>> word_vector(doc, vocab)
    [0, 2, 2, 0, 1, 0]
    """
    # Tally the document once, then read the tallies back in vocabulary order.
    # A Counter yields 0 for words it never saw, so vocabulary words absent from
    # the document need no special case, and document words absent from the
    # vocabulary are simply never looked up (instead of raising KeyError).
    occurrences = Counter(doc)
    # A plain list is returned (not an ndarray) so callers can keep concatenating
    # it with other lists using `+`.
    return [occurrences[word] for word in vocab]

Let's write a simple unit test for this:

In [47]:
def test_word_vector():
    """Check word_vector against a small, hand-counted example."""
    vocabulary = "a b c d e f".split()
    document = "b c b c e".split()
    expected_counts = np.array([0, 2, 2, 0, 1, 0])
    observed_counts = word_vector(document, vocabulary)
    np.testing.assert_equal(observed_counts, expected_counts)

# Run the check right away so that a broken word_vector fails in this cell.
test_word_vector()

Now let's make the word matrix for our entire set of documents. First, we will go through the dataset and read each speech. As each speech is ingested, we lowercase it, tokenize it, remove stop words and punctuation, and stem the remaining tokens.

In [48]:
#WARNING: This cell takes a few minutes to run. 

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from string import punctuation 
import nltk.tokenize

def stem_word_tokenize(doc):
    """Tokenize a document, drop stop words and punctuation, and stem what is left.

    Parameters
    ----------
    doc : string
        A document to be tokenized. It is lowercased before tokenization.

    Returns
    -------
    list of str
        The Porter-stemmed versions of the remaining tokens, in document order.
    """
    doc = doc.lower()  # turn all words to lowercase for standardization

    # From lecture 11
    # https://berkeley-stat159-f17.github.io/stat159-f17/lectures/11-strings/11-nltk.html
    # Tokens to discard: English stop words, single punctuation characters and '--'.
    # They are kept in a frozenset so that the membership test performed for every
    # token is a hash lookup rather than a scan through a list.
    empty_content = frozenset(stopwords.words('english')) | frozenset(punctuation) | {'--'}

    stemmer = PorterStemmer()
    # Filtering happens on the raw tokens, i.e. before stemming.
    return [
        stemmer.stem(token)
        for token in nltk.tokenize.word_tokenize(doc)
        if token not in empty_content
    ]


In [51]:
# Column layout of the speech matrix: two metadata columns, then one column per
# vocabulary word, in the fixed order of vocab_list.
matrix_columns = ['President', 'Date'] + vocab_list

# Read the raw corpus once and release the file right away; the text is cut at
# every '***' marker.
with open('data/stateoftheunion1790-2017.txt', 'r') as speeches_txt:
    txt_file_chunks = speeches_txt.read().split('***')

# We pass through all of the speeches, creating a word vector for each one.
# The rows are collected in plain lists and turned into a DataFrame in one go:
# writing cell by cell with DataFrame.set_value is very slow, and that method
# was deprecated in pandas 0.21 and removed in pandas 1.0.
speech_indices = []
speech_rows = []
for index, row in addresses_df.iterrows():
    # Chunk 0 is whatever precedes the first '***', hence the offset of one.
    # NOTE(review): assumes addresses_df is labelled 0..n-1 in file order — confirm.
    # No .lower() here: stem_word_tokenize lowercases its input itself.
    raw_speech = txt_file_chunks[index + 1]
    stemmed_tokens = stem_word_tokenize(raw_speech)
    # Use vocab_list (not the original collection) so the counts are guaranteed
    # to line up with the column order defined above.
    word_vec = word_vector(stemmed_tokens, vocab_list)
    speech_indices.append(index)
    speech_rows.append([row['President'], row['Date']] + list(word_vec))
    if index % 20 == 0:
        print('Done with {} speeches'.format(index))

# One row per speech, labelled like addresses_df. The word columns hold integer
# counts; the two metadata columns keep whatever type addresses_df provides.
speech_matrix = pd.DataFrame(speech_rows, index=speech_indices, columns=matrix_columns)

Done with 0 speeches
Done with 20 speeches
Done with 40 speeches
Done with 60 speeches
Done with 80 speeches
Done with 100 speeches
Done with 120 speeches
Done with 140 speeches
Done with 160 speeches
Done with 180 speeches
Done with 200 speeches
Done with 220 speeches


In [54]:
# Create a matrix of just the word-count vectors (one row per word, one column per
# speech) to do sparsity analysis. The two metadata columns are dropped by name
# before transposing, instead of being sliced off by position afterwards: this does
# not depend on where those columns sit, and it keeps the string metadata out of the
# transpose, which would otherwise force the whole result to object dtype.
wmat = speech_matrix.drop(['President', 'Date'], axis=1).transpose()
wmat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,217,218,219,220,221,222,223,224,225,226
ox,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sole,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
puriti,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
shower,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cleland,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


How sparse is this matrix?

In [55]:
def sparsity_calculator(text):
    """Return the fraction of entries of a matrix that are exactly zero.

    Parameters
    ----------
    text : pandas.DataFrame
        The matrix to inspect, e.g. a word-count matrix.

    Returns
    -------
    float
        The number of zero entries divided by the total number of entries,
        a value between 0 and 1.

    Raises
    ------
    ValueError
        If `text` has no entries at all, for which the fraction is undefined.
    """
    total_cells = len(text) * len(text.columns)
    if total_cells == 0:
        # Fail with a clear message instead of a bare division-by-zero error.
        raise ValueError("cannot compute the sparsity of an empty matrix")
    # Comparing against 0 gives a boolean matrix; summing it counts the zeros.
    total_zeros = int((text == 0).values.sum())
    return total_zeros / total_cells
# Fraction of zero entries in the word matrix, reported below as a percentage.
sparsity = sparsity_calculator(wmat)
print(f"wmat is comprised of {100*sparsity:.2f}% zeros.")

wmat is comprised of 93.50% zeros.


## Intermediate results storage

We'll need a few results for the next step, so let's store them in a new set of HDF5/shelve stores for this notebook:

In [59]:
import warnings

# Saving such massive frames triggers a PerformanceWarning (object data has to be
# pickled into the HDF5 file). Silence only that category, and only while writing,
# rather than switching off every warning for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)
    # The HDF key is passed by keyword; positional use is deprecated in recent pandas.
    wmat.to_hdf('results/df3.h5', key='wmat')
    addresses_df.to_hdf('results/df3.addresses_df_3.h5', key='addresses_df')
    speech_matrix.to_hdf('results/df3.speech_matrix_3.h5', key='speech_matrix')

# Plain Python objects go into a shelve store: the vocabulary as originally loaded
# and its ordered list form that defines the word columns.
with shelve.open('results/vars3') as db:
    db['unique_words'] = all_uq_words
    db['vocab_list'] = vocab_list