In [1]:
# Mount Google Drive at /content/drive so files stored there can be read
# through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Raw text of every address and the matching file names, kept index-aligned:
# corpus[k] holds the contents of the file named filenames[k].
corpus = []
filenames = []
basepath = '/content/drive/Shared drives/Unstructured Group 5/Data/'

# os.scandir yields entries in arbitrary (filesystem) order, so nothing here
# may assume the files arrive sorted.
with os.scandir(basepath) as entries:
    for entry in entries:
        if not entry.is_file():
            continue  # skip sub-directories and other non-file entries
        print(entry.name)
        file_location = os.path.join(basepath, entry.name)
        # A context manager closes each handle immediately instead of leaving
        # it open until garbage collection.
        with open(file_location, 'r') as handle:
            text = handle.read()
        # Append both values only after a successful read so the two lists
        # can never get out of step.
        filenames.append(entry.name)
        corpus.append(text)


Adams_1797.txt
Adams_1828.txt
Buchanan_1858.txt
Adams_1798.txt
Adams_1799.txt
Buchanan_1859.txt
Adams_1827.txt
Arthur_1881.txt
Arthur_1884.txt
Arthur_1883.txt
Adams_1826.txt
Adams_1800.txt
Adams_1825.txt
Arthur_1882.txt
Buchanan_1860.txt
Buchanan_1857.txt
Hayes_1877.txt
Madison_1810.txt
Eisenhower_1961.txt
Buren_1839.txt
Madison_1814.txt
Johnson_1968.txt
Johnson_1865.txt
Lincoln_1863.txt
Buren_1838.txt
Bush_1990.txt
Johnson_1868.txt
Carter_1979.txt
Bush_2004.txt
Madison_1812.txt
Grant_1872.txt
Coolidge_1927.txt


In [None]:
# Sort the addresses chronologically.
import numpy as np

# The four characters before the '.txt' extension are read as the year
# (e.g. 'Adams_1797.txt' -> 1797). int() is used instead of eval(): eval would
# execute whatever text happens to be in a file name.
years = [int(fname[-8:-4]) for fname in filenames]
year_idx = np.argsort(years)  # positions that put `years` in ascending order

# Apply the same permutation to both lists so they stay index-aligned.
SOTUcorpus = [corpus[i] for i in year_idx]
SOTUnames = [filenames[i] for i in year_idx]

In [None]:
# Shell command: install the listed third-party packages with pip.
!pip install nltk scipy numpy matplotlib scikit-learn

In [None]:
# Number of documents in the year-sorted corpus.
print(len(SOTUcorpus))


## Topic Analysis




Using topic modeling (Latent Dirichlet Allocation), we will fit 7 topics to the State of the Union addresses and list the 10 highest-weighted words for each topic.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# Bag-of-words counts: drop English stop words and keep only the 1000 most
# frequent tokens across the corpus.
cvect_corpus = CountVectorizer(stop_words='english', max_features=1000)
X_corpus = cvect_corpus.fit_transform(SOTUcorpus)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back only on releases that predate it.
if hasattr(cvect_corpus, 'get_feature_names_out'):
    vocab_corpus = cvect_corpus.get_feature_names_out().tolist()
else:
    vocab_corpus = cvect_corpus.get_feature_names()

from sklearn.decomposition import LatentDirichletAllocation
import random

# Fix the seed so the topic ids are the same every time the code is run.
# LatentDirichletAllocation takes its randomness from `random_state`, not from
# Python's `random` module, so the seed must be passed to the estimator.
# random.seed is kept for anything else that uses the stdlib generator.
SEED = 10
random.seed(SEED)

NUM_TOPICS = 7  # can change to get more topics
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, random_state=SEED)

lda.fit(X_corpus)

import numpy as np

TOP_N = 10  # change this to see the top N words per topic

# Normalise each topic's word weights so every row sums to 1.
topic_norm = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]

for idx, topic in enumerate(topic_norm):
    print("Topic id: {}".format(idx))
    top_tokens = np.argsort(topic)[::-1]  # token indices, highest weight first
    for i in range(TOP_N):
        print('{}: {}'.format(vocab_corpus[top_tokens[i]], topic[top_tokens[i]]))
    print()

Next, we will apply the fitted model to the first three State of the Union addresses, all of which were delivered by George Washington, to see which topic each of them falls under.

In [None]:
# Topic weights for the first three addresses. SOTUcorpus is already ordered by
# year and X_corpus was built from it, so rows 0-2 are the three earliest speeches.
docs_sample = lda.transform(X_corpus[0:3])

for doc_pos in range(3):
    excerpt = SOTUcorpus[doc_pos][0:300]  # opening 300 characters of the speech
    row = docs_sample[doc_pos]            # weights over the topics for this speech
    best_topic = np.argsort(row)[-1]      # index of the largest weight
    print('Document: {}'.format(excerpt))
    print(row)
    print('top topic: {}'.format(best_topic))
    print("\n")



Now we do the same for the last three addresses: one by Obama and two by Trump.

In [None]:
# Topic weights for the last three addresses in the year-sorted corpus.
docs_sample = lda.transform(X_corpus[-3:])

# Negative offsets address both the full corpus and the 3-row sample from the
# end, so offset -3 refers to the same speech in each.
for offset in range(-3, 0):
    excerpt = SOTUcorpus[offset][0:300]  # opening 300 characters of the speech
    row = docs_sample[offset]            # weights over the topics for this speech
    best_topic = np.argsort(row)[-1]     # index of the largest weight
    print('Document: {}'.format(excerpt))
    print(row)
    print('top topic: {}'.format(best_topic))
    print("\n")

Seeing that the topics were consistent across this generational shift, we want to determine how the topics have changed over the generations. To do this, we will store the top topic and the year of every address in two arrays and build a dictionary for looking up the topic of a given year. We will then create a DataFrame to show the most common topic of each decade.

In [None]:
# Track the dominant topic of every address, year by year.
docs_sample = lda.transform(X_corpus)
n_docs = len(docs_sample)

# Parallel float arrays: Topics[k] is the top topic id and Years[k] the year of
# the k-th address.
Topics = np.zeros(n_docs)
Years = np.zeros(n_docs)

for doc_pos, row in enumerate(docs_sample):
    Topics[doc_pos] = np.argsort(row)[-1]         # index of the largest topic weight
    Years[doc_pos] = SOTUnames[doc_pos][-8:-4]    # year digits from the file name, stored as float

# Year -> top topic lookup table.
TopicsByYear = dict(zip(Years, Topics))

In [None]:
# Look up the dominant topic for a single year.
year = 1945
topic_for_year = TopicsByYear[year]  # KeyError if the year is not a key of TopicsByYear
print('The topic of {} is: {}'.format(year, topic_for_year))

In [None]:
# Which topic is the most common one in each decade?
import pandas as pd

# One row per address; Decade is the year rounded down to a multiple of 10.
DecadeData = pd.DataFrame(data={'Year': Years, 'Topic': Topics}).assign(
    Decade=lambda frame: frame['Year'] // 10 * 10
)
# Mode of the topic ids within every decade.
DecadeData.groupby('Decade').agg({'Topic': pd.Series.mode})

## Party Affiliation Data Frame

Now we will look at whether the topics differ across parties.

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenizer data for nltk.word_tokenize. Newer NLTK releases look for the
# 'punkt_tab' resource instead of 'punkt', so both are requested to keep the
# notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Build one record per address and create the frame in a single step, rather
# than enlarging an empty DataFrame one cell at a time with df.loc.
# File names are split as '<president>_<year>.<ext>'.
records = []
for file_name, text in zip(SOTUnames, SOTUcorpus):
    components = file_name.split('_')
    records.append({
        'file_name': file_name,
        'year': components[1].split('.')[0],
        'president': components[0],
        'party': None,  # placeholder column, filled in once names are disambiguated
        'text': text,
    })

df = pd.DataFrame(records, columns=['file_name', 'year', 'president', 'party', 'text'])
df['year'] = df['year'].astype(int)

# Fix entries where presidents have the same last name.
# Each row: (surname, last year attributed to the earlier president,
#            earlier president's full name, later president's full name).
SHARED_SURNAMES = [
    ('Roosevelt', 1909, 'Theodore Roosevelt', 'Franklin D. Roosevelt'),
    ('Bush', 1992, 'George H. W. Bush', 'George W. Bush'),
    ('Johnson', 1869, 'Andrew Johnson', 'Lyndon B. Johnson'),
    ('Adams', 1801, 'John Adams', 'John Quincy Adams'),
    ('Harrison', 1841, 'William Henry Harrison', 'Benjamin Harrison'),
]
for surname, cutoff_year, earlier_name, later_name in SHARED_SURNAMES:
    # Both masks are computed before any rename so the second assignment still
    # sees the original surname.
    has_surname = df['president'] == surname
    is_earlier = df['year'] <= cutoff_year
    df.loc[has_surname & is_earlier, 'president'] = earlier_name
    df.loc[has_surname & ~is_earlier, 'president'] = later_name

# Party rosters, consulted in the order listed. Each entry pairs a party label
# with the president names that map to it.
_PARTY_ROSTERS = (
    ('Republican', (
        'Lincoln', 'Grant', 'Hayes', 'Garfield', 'Arthur',
        'Benjamin Harrison', 'McKinley', 'Theodore Roosevelt',
        'Taft', 'Harding', 'Coolidge', 'Hoover', 'Eisenhower',
        'Nixon', 'Ford', 'Reagan', 'George H. W. Bush',
        'George W. Bush', 'Trump',
    )),
    ('Democratic', (
        'Jackson', 'Buren', 'Polk', 'Pierce',
        'Buchanan', 'Cleveland', 'Wilson', 'Franklin D. Roosevelt',
        'Truman', 'Kennedy', 'Lyndon B. Johnson', 'Carter', 'Clinton', 'Obama',
    )),
    ('Whig', ('William Henry Harrison', 'Taylor', 'Fillmore')),
    ('National Union', ('Andrew Johnson',)),
    ('Unaffiliated', ('Washington', 'Tyler')),
    ('Federalist', ('John Adams',)),
    ('Democratic-Republican', ('Jefferson', 'Madison', 'Monroe', 'John Quincy Adams')),
)


def pres_to_party(name):
    """Return the party label for a president name.

    The name must match a roster entry exactly (e.g. 'John Adams', not
    'Adams'). Returns None when the name appears in no roster.
    """
    for party, members in _PARTY_ROSTERS:
        if name in members:
            return party
    return None
    
# Fill the party column from the president names, then order rows by year.
df['party'] = df['president'].map(pres_to_party)
df = df.sort_values(by=['year'])

df

In [None]:
# Per-address token statistics.
df['tokens'] = df['text'].map(nltk.word_tokenize)              # tokens of the full text
df['set_len'] = df['tokens'].map(lambda toks: len(set(toks)))  # number of distinct tokens
df['len'] = df['tokens'].map(len)                              # total number of tokens
# Lexical diversity = distinct tokens / total tokens, computed row by row.
df['lex_div'] = [distinct / total for distinct, total in zip(df['set_len'], df['len'])]

df.head()

## Lexical Diversity

This section measures the lexical diversity of each speech: the number of distinct tokens divided by the total number of tokens. It shows how varied and unique the vocabulary of each speech is.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


color_dict = {'Unaffiliated': 'gray', 'Federalist': 'yellow', 'Democratic-Republican':'purple',
       'Democratic':'blue', 'Whig':'orange', 'Republican':'red', 'National Union':'green'}

g = sns.scatterplot(x=df['year'], y=df['lex_div'], hue=df['party'],
              data=df, palette=color_dict, 
                   legend='full')
g.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xlabel('Year')
plt.ylabel('Lexical Diversity')
plt.title('Lexical Diversity Over Time')

## Most Important Terms Based on TF-IDF Scores

After seeing the lexical diversity results, we want to know the top 5 words of each speech, ranked by TF-IDF score, in order to get a sense of patterns across presidents and over time.

In [None]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer 
import numpy as np 
import pandas as pd 

def tfidf_vectorizer(corpus):
  """Compute TF-IDF scores as raw term count times log inverse document frequency.

  Parameters
  ----------
  corpus : sequence of str
      One string per document.

  Returns
  -------
  tokens : list of str
      Vocabulary learned by CountVectorizer; tokens[j] labels column j.
  tfidf : numpy.ndarray of shape (len(corpus), len(tokens))
      Entry [d, j] equals count(d, j) * log(len(corpus) / doc_freq(j)), where
      doc_freq(j) is the number of documents that contain token j.
  """
  cvect = CountVectorizer()
  counts = cvect.fit_transform(corpus).toarray()

  # get_feature_names() was removed in scikit-learn 1.2; use its replacement
  # and fall back only on releases that predate it.
  if hasattr(cvect, 'get_feature_names_out'):
    tokens = cvect.get_feature_names_out().tolist()
  else:
    tokens = cvect.get_feature_names()

  # Every vocabulary token occurs in at least one document, so doc_freq >= 1
  # and the division below is safe.
  doc_freq = (counts > 0).sum(axis=0)
  idf = np.log(len(corpus) / doc_freq)

  # Broadcasting scales column j of the count matrix by idf[j].
  return tokens, counts * idf

tokens, tfidf_matrix = tfidf_vectorizer(SOTUcorpus)
print(tfidf_matrix.shape)

# Two-way lookup between a vocabulary token and its position in `tokens`.
tokens_to_idx = {token: position for position, token in enumerate(tokens)}
idx_to_tokens = dict(enumerate(tokens))

In [None]:
# Print the five highest-scoring TF-IDF terms for every address.
for doc_idx, scores in enumerate(tfidf_matrix):
  print("\n", df.file_name[doc_idx])
  # Column indices ordered by descending score, truncated to the top five.
  top_terms = np.argsort(scores)[::-1][0:5]
  for term_idx in top_terms:
    print("{}: {}".format(tokens[term_idx], scores[term_idx]))