# An analysis of the State of the Union speeches - Part 3
# Word analysis

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter
import shelve

plt.style.use('seaborn-dark')
plt.rcParams['figure.figsize'] = (10, 6)

Load data we need from previous runs

In [20]:
all_speeches_df = pd.read_csv('data/speech_word_counts.csv')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,217,218,219,220,221,222,223,224,225,226
President,george washington,george washington,george washington,george washington,george washington,george washington,george washington,george washington,john adams,john adams,...,george w. bush,barack obama,barack obama,barack obama,barack obama,barack obama,barack obama,barack obama,barack obama,donald j. trump
Speech,state of the union address,state of the union address,state of the union address,state of the union address,state of the union address,state of the union address,state of the union address,state of the union address,state of the union address,state of the union address,...,state of the union address,address before a joint session of congress,state of the union address,state of the union address,state of the union address,state of the union address,state of the union address,state of the union address,state of the union address,state of the union address
Date,1790-01-08 00:00:00,1790-12-08 00:00:00,1791-10-25 00:00:00,1792-11-06 00:00:00,1793-12-03 00:00:00,1794-11-19 00:00:00,1795-12-08 00:00:00,1796-12-07 00:00:00,1797-11-22 00:00:00,1798-12-08 00:00:00,...,2008-01-29 00:00:00,2009-02-24 00:00:00,2010-01-27 00:00:00,2011-01-25 00:00:00,2012-01-24 00:00:00,2013-02-12 00:00:00,2014-01-28 00:00:00,2015-01-20 00:00:00,2016-01-12 00:00:00,2017-02-27 00:00:00
Date (String),"january 8, 1790","december 8, 1790","october 25, 1791","november 6, 1792","december 3, 1793","november 19, 1794","december 8, 1795","december 7, 1796","november 22, 1797","december 8, 1798",...,"january 29, 2008","february 24, 2009","january 27, 2010","january 25, 2011","january 24, 2012","february 12, 2013","january 28, 2014","january 20, 2015","january 12, 2016","february 27, 2017"
guadalupe,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
858,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
acquittal,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
review,0,0,0,0,0,0,3,0,0,0,...,0,0,0,1,0,0,0,0,0,0
extent,0,0,1,1,0,1,0,3,0,2,...,0,0,0,0,0,0,0,0,0,0
angular,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
with shelve.open('results/vars2') as db:
    speech_words = db['speech_words']
    speeches_cleaned = db['speeches_cleaned']

Let's make a single set of all unique words across all speeches

In [11]:
# YOUR CODE HERE
unique = all_speeches_df.columns.tolist()
unique_set = set(unique)
n_words = len(unique_set)
n_words  # number of unique words across all speeches

24849

Now we create a word matrix, whose columns are word vectors for each speech. A word vector contains the word counts for each word across the entire document set. 

In [17]:
def word_vector(doc, vocab):
    """Return a word vector for the input document in the context of a given vocabulary.
    
    Parameters
    ----------
    
    doc: iterable of words
       
    vocab : iterable of words
    integer, size of the entire vocabulary across documents.
    
    Return
    ------
    array
        An integer array, of length equal to `len(vocab)`, containing the count for each
        word in `doc` at its corresponding position in `vocab`.
        
    Example
    -------
    
    >>> doc = "b c b c e".split()
    ... vocab = "a b c d e f".split()
    ... word_vector(doc, vocab)
    ... 
    array([0, 2, 2, 0, 1, 0])
    """
    words_dict = dict()
    #vocabset = set(vocab)
    doclist = list(doc)
    for word in vocab:
        words_dict[word] = doclist.count(word)
    frequency = list(words_dict.values())
    return frequency
    
#     def word_freq(text):
#     """Return a dictionary of word frequencies for the given text."""
#     # YOUR CODE HERE
#     # Regarding dictionary: key is word, value is count
#     words_dict = dict()
#     wordlist = text.split()
#     wordset = set(wordlist)
#     for w in wordset:
#         words_dict[w] = wordlist.count(w)
#     return(words_dict)

Let's write a simple unit test for this:

In [18]:
def test_word_vector():
    doc = "b c b c e".split()
    vocab = "a b c d e f".split()
    wv = word_vector(doc, vocab)
    np.testing.assert_equal(wv, np.array([0, 2, 2, 0, 1, 0]) )

test_word_vector()

Now let's make the word matrix for our entire set of documents

In [24]:
wmat = all_speeches_df.transpose()
wmat = wmat[4:]
wmat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,217,218,219,220,221,222,223,224,225,226
guadalupe,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
858,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
acquittal,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
review,0,0,0,0,0,0,3,0,0,0,...,0,0,0,1,0,0,0,0,0,0
extent,0,0,1,1,0,1,0,3,0,2,...,0,0,0,0,0,0,0,0,0,0
angular,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
new,1,0,4,2,2,2,1,4,0,1,...,25,21,20,36,27,25,29,41,19,18
cessation,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
unchangeable,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yarns,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


How sparse is this matrix?

In [61]:
# YOUR CODE HERE
columns = len(all_speeches_df) #columns of word matrix
rows = len(wmat) #rows of word matrix
zeros = list((wmat == 0).sum(axis=1))
total_zeros = sum(zeros)
sparsity = (total_zeros)/(rows*columns)
print(f"wmat is comprised of {100*sparsity:.2f}% zeros.")

wmat is comprised of 93.13% zeros.


## Intermediate results storage

We'll need a few results for the next step, so let's store them in a new set of HDF5/shelve stores for this notebook:

In [60]:
wmat.to_hdf('results/df3.h5', 'wmat')
with shelve.open('results/vars3') as db:
    db['unique_words'] = unique_words

OSError: ``results`` does not exist