In [None]:
# TODO
# finish word responses

In [8]:
from pyspark.sql import SparkSession
from pyspark.mllib.clustering import KMeans
from IPython.display import clear_output
from collections import Counter
from operator import add
import json
import numpy as np
import string
import matplotlib.pyplot as plt

spark  = SparkSession.builder.appName("SOU").getOrCreate()
speeches = spark.read.json("/project/cmsc25025/sou/speeches.json")
speeches = speeches.sort("year",ascending=True) # sort by year for easier processing

# counts all words across all speeches
word_counts = speeches.rdd.map(lambda x: Counter(x['text'].lower().encode('utf-8').translate(string.maketrans("",""), string.punctuation).split())).reduce(lambda a, b: a+b)
speech = speeches.rdd.map(lambda x: [[str(x['president']), int(x['year'])]]).reduce(lambda a, b: a+b)

In [9]:
# remove the most/least common words from the counter
# throw out 20 most common words and words that appear less than 50 times
common = word_counts.most_common() # orders words in the counter by most common
for w,c in common[:20] + common[3170:]:
    del word_counts[w] # be careful with this, will continue to delete elements if don't rerun
# extract the remaining words to build a vocabulary
vocab = [wc[0] for wc in word_counts.items()]

ndocuments = speeches.rdd.count() # total number of documents

# count number of times each word appears in each document
vocab_counts = speeches.rdd.map(lambda x: ([[x['text'].lower().encode('utf-8').translate(string.maketrans("",""), string.punctuation).count(i) for i in vocab]])).reduce(lambda a, b: a+b)

# count number of documents each word appears in
doc_counts = speeches.rdd.map(lambda x: ([int(i in x['text'].lower().encode('utf-8').translate(string.maketrans("",""), string.punctuation)) for i in vocab])).reduce(lambda a, b: map(add, a, b))

# compute TF-IDF weights for each SOU address
def computeTF_IDF(counts):
    return np.multiply(counts,log_factor)

log_factor = np.log(np.divide(np.repeat(ndocuments,len(vocab)),np.array(doc_counts)))
TF_IDF = np.apply_along_axis(computeTF_IDF, 1, np.array(vocab_counts))


In [19]:
# define and fill a lower triangular similarity matrix
sim_matrix = np.zeros((ndocuments,ndocuments))
norms = np.linalg.norm(TF_IDF,ord=None,axis=1) # the norms for each document's TF-IDF vector
for i in range(ndocuments):
    for j in range(i+1,ndocuments):
        # compute the similarity
        sim_matrix[j,i] = np.dot(TF_IDF[i,:],TF_IDF[j,:])/(norms[i]*norms[j])

# find 50 most similar pairs by different presidents
tmp_sim_matrix = sim_matrix
npairs = 0 # number of pairs found
diff_pairs = []
while npairs < 50:
    # find the indices of current maximum similarity
    i,j = np.unravel_index(tmp_sim_matrix.argmax(), tmp_sim_matrix.shape)
    if speech[i][0] != speech[j][0]: # else - ignore if same president
        # add this pair (presidents and years)
        diff_pairs += [(speech[i][0],speech[i][1],speech[j][0],speech[j][1])]
        npairs += 1    
    # set to 0 so this pair is no longer considered
    tmp_sim_matrix[i,j] = 0
print('The 50 most similar pairs of SOUs by different presidents are')
for pair in diff_pairs:
    print pair
print "\n"
    
# find 50 most similar pairs by same president
tmp_sim_matrix = sim_matrix
npairs = 0 # number of pairs found
same_pairs = []
while npairs < 50:
    # find the indices of current maximum similarity
    i,j = np.unravel_index(tmp_sim_matrix.argmax(), tmp_sim_matrix.shape)
    if speech[i][0] == speech[j][0]: # else - ignore if different presidents
        # add this pair (president and years)
        same_pairs += [(speech[i][0],speech[i][1],speech[j][1])]
        npairs += 1    
    # set to 0 so this pair is no longer considered
    tmp_sim_matrix[i,j] = 0
print('The 50 most similar pairs of SOUs by the same president are')
for pair in same_pairs:
    print pair
print "\n"
    
# find 25 most similar pairs of presidents
names = list(set([row[0] for row in speech])) # unique president names
ii = [np.where(np.array([row[0] for row in speech]) == name)[0] for name in names] # index list for each president

# compute average similarity for all pairs of presidents
pres_pairs = []
for i in range(len(names)):
    for j in range(i+1,len(names)):
        total_sim = 0 # total similarity
        for a in ii[i]:
            for b in ii[j]:
                if a > b:
                    total_sim += sim_matrix[a][b]
                else:
                    total_sim += sim_matrix[b][a]
        avg_sim = total_sim/float(len(ii[i])*len(ii[j])) # average similarity
        pres_pairs += [(avg_sim,names[i],names[j])]
# sort by average similarity
pres_pairs = sorted(pres_pairs, key=lambda x: x[0],reverse=True)[:25]
print('The 25 most similar pairs of presidents are')
for pair in pres_pairs:
    print (pair[1],pair[2])
print "\n"

The 50 most similar pairs of SOUs by different presidents are
('Barack Obama', 2010, 'William J. Clinton', 1995)
('Barack Obama', 2009, 'William J. Clinton', 1993)
('William J. Clinton', 2000, 'George Bush', 1989)
('Grover Cleveland', 1885, 'Rutherford B. Hayes', 1877)
('James K. Polk', 1846, 'John Tyler', 1844)
('Barack Obama', 2011, 'William J. Clinton', 1995)
('Barack Obama', 2012, 'William J. Clinton', 1993)
('Barack Obama', 2011, 'William J. Clinton', 1993)
('Barack Obama', 2011, 'William J. Clinton', 2000)
('Barack Obama', 2010, 'William J. Clinton', 1993)
('George Bush', 1989, 'Ronald Reagan', 1988)
('William J. Clinton', 1997, 'George Bush', 1989)
('Barack Obama', 2012, 'William J. Clinton', 1995)
('Benjamin Harrison', 1889, 'Grover Cleveland', 1885)
('Barack Obama', 2010, 'William J. Clinton', 1994)
('Ronald Reagan', 1981, 'Gerald R. Ford', 1975)
('Barack Obama', 2009, 'William J. Clinton', 2000)
('Barack Obama', 2012, 'William J. Clinton', 1994)
('Barack Obama', 2011, 'Willia

The speeches are somewhat similar in their word content but still vary in overall theme/topics. A better similarity measure might be constructed by considering phrases rather than single words. Latent semantic analysis may also be applicable, finding a set of concepts for each documents and comparing these.

In [32]:
# cluster the TDF-IDF representations using k-means
# experimenting with the number of clusters
TF_IDF_rdd = spark.sparkContext.parallelize(TF_IDF)
years = [row[1] for row in speech]
names = [row[0] for row in speech]
k = [6,8,10,12,14] # number of clusters
for num in k:
    # compute the clusters
    clusters = KMeans.train(TF_IDF_rdd, num, maxIterations=50, initializationMode="random")
    # TODO - display the clusters
    predictions = clusters.predict(TF_IDF_rdd).collect()
    year_cluster = [(names[i],years[i],predictions[i]) for i in range(len(predictions))]
    sorted_clusters = sorted(year_cluster, key=lambda x: x[1])
    print "%d clusters" % num
    for i in range(1,num+1):
        print "cluster %d" % i
        print [(val[0],val[1]) for val in sorted_clusters if val[2] == i]

# TODO - Comment on the clustering results, and whether or not the results are interpretable

6 clusters
cluster 1
[('Harry S Truman', 1946), ('Lyndon B. Johnson', 1966), ('Lyndon B. Johnson', 1967), ('Lyndon B. Johnson', 1968), ('Gerald R. Ford', 1976), ('Ronald Reagan', 1981), ('Ronald Reagan', 1982), ('Ronald Reagan', 1983), ('Ronald Reagan', 1984), ('Ronald Reagan', 1985), ('Ronald Reagan', 1988), ('George Bush', 1989), ('George Bush', 1990), ('George Bush', 1991), ('George Bush', 1992), ('William J. Clinton', 1993), ('William J. Clinton', 1994), ('William J. Clinton', 1995), ('William J. Clinton', 1996), ('William J. Clinton', 1997), ('William J. Clinton', 1998), ('William J. Clinton', 1999), ('William J. Clinton', 2000), ('George W. Bush', 2001), ('George W. Bush', 2002), ('George W. Bush', 2003), ('George W. Bush', 2004), ('George W. Bush', 2005), ('George W. Bush', 2006), ('George W. Bush', 2007), ('George W. Bush', 2008), ('Barack Obama', 2009), ('Barack Obama', 2010), ('Barack Obama', 2011), ('Barack Obama', 2012), ('Barack Obama', 2013)]
cluster 2
[('James K. Polk', 

Generally speaking, a president's speeches tend to be in the same cluster, and clusters generally contain speeches from similar time periods. However, the number of speeches per cluster is often unbalanced. Overall, these results are difficult to interpret.