# An analysis of the State of the Union speeches - Part 3
# Word analysis

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter
import shelve

# Matplotlib 3.6 renamed its bundled seaborn styles with a 'seaborn-v0_8-'
# prefix and later releases dropped the old names, so use whichever name
# this installation actually provides.
dark_style = 'seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark'
plt.style.use(dark_style)
plt.rcParams['figure.figsize'] = (10, 6)

Load data we need from previous runs

In [2]:
# Per-speech summary table written by the previous notebook.
addresses = pd.read_hdf('results/df2.h5', key='addresses')
addresses.head()

Unnamed: 0,president,title,date,n_sent,n_words_all,n_words,n_uwords,n_swords,n_chars
0,George Washington,State of the Union Address,1790-01-08,24,1178,538,398,356,6753
1,George Washington,State of the Union Address,1790-12-08,40,1515,683,516,461,8455
2,George Washington,State of the Union Address,1791-10-25,60,2487,1136,740,622,14203
3,George Washington,State of the Union Address,1792-11-06,61,2298,1042,693,578,12764
4,George Washington,State of the Union Address,1793-12-03,56,2132,972,720,652,11696


In [3]:
# Restore the per-speech word data shelved by the previous notebook.
with shelve.open('results/vars2') as store:
    speech_words, speeches_cleaned = store['speech_words'], store['speeches_cleaned']

Let's make a single list of all the unique words across all speeches:

In [4]:
# Collect every distinct word across all speeches, keeping the order of first
# appearance (dict keys are unique and preserve insertion order).
unique_words = list(dict.fromkeys(
    word for speech in speeches_cleaned for word in speech
))
n_words = len(unique_words)
n_words
# number of unique words across all speeches

18797

Now we create a word matrix whose columns are the word vectors of the individual speeches. A word vector holds, for every word in the vocabulary of the entire document set, the number of times that word occurs in the given speech.

In [5]:
def word_vector(doc, vocab):
    """Return a word vector for the input document in the context of a given vocabulary.

    Parameters
    ----------
    doc : iterable of words
        The document to count, given as its (hashable) words.

    vocab : iterable of words
        The vocabulary across all documents; it fixes both the length and the
        ordering of the returned vector.

    Return
    ------
    array
        An integer array, of length equal to `len(vocab)`, containing the count for each
        word in `doc` at its corresponding position in `vocab`. Words of `doc` that do
        not appear in `vocab` are ignored.

    Example
    -------

    >>> doc = "b c b c e".split()
    >>> vocab = "a b c d e f".split()
    >>> word_vector(doc, vocab)
    array([0, 2, 2, 0, 1, 0])
    """
    # Tally the document once, then look each vocabulary word up in the tally,
    # instead of rescanning the whole document for every vocabulary word.
    tally = Counter(doc)
    # A Counter yields 0 for missing keys, so absent words need no special case.
    return np.array([tally[word] for word in vocab], dtype=int)

Let's write a simple unit test for this:

In [None]:
def test_word_vector():
    """Check `word_vector` against a small hand-counted example."""
    vocabulary = "a b c d e f".split()
    document = "b c b c e".split()
    expected = np.array([0, 2, 2, 0, 1, 0])
    np.testing.assert_equal(word_vector(document, vocabulary), expected)

test_word_vector()

Now let's make the word matrix for our entire set of documents

In [None]:
# One word vector per speech; transposing the stack puts the vocabulary words
# on the rows and the speeches on the columns.
speech_vectors = [word_vector(speech, unique_words) for speech in speeches_cleaned]
wmat = pd.DataFrame(np.array(speech_vectors).T, index=unique_words)
wmat.iloc[500:510]

How sparse is this matrix?

In [None]:
# Sparsity = fraction of entries in the word matrix that are exactly zero,
# i.e. (word, speech) pairs where the word never occurs in that speech.
sparsity = (wmat.values == 0).mean()
print(f"wmat is comprised of {100*sparsity:.2f}% zeros.")

## Intermediate results storage

We'll need a few results for the next step, so let's store them in a new set of HDF5/shelve stores for this notebook:

In [None]:
# Persist the word matrix and the vocabulary for the next notebook.
# `key` is passed by keyword: recent pandas releases deprecate passing it
# positionally to `to_hdf`.
wmat.to_hdf('results/df3.h5', key='wmat')
with shelve.open('results/vars3') as db:
    db['unique_words'] = unique_words