In [1]:
from glob import glob
from pathlib import Path
import os
import pandas as pd
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
from collections import Counter
# Fetch every NLTK resource this notebook reads, so it also runs on a machine
# with an empty nltk_data directory:
#   punkt     -> word_tokenize
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer
# nltk.download reports packages that are already up to date and moves on.
# NOTE(review): some NLTK releases need extra packages (e.g. 'punkt_tab' or
# 'omw-1.4') -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package punkt to /home/alexzabbey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# zd = {("Roosevelt", 1909): "Theodore Roosevelt", ("Bush", 1992): "George H. W. Bush", ("Johnson", 1869): "Andrew Johnson", ("Adams", 1801): "John Adams", ("Harrison", 1841): "William Henry Harrison"}
# for p in glob("data/sotu/raw/*"):
#     for k, v in zd.items():
#         if k[0] in p and int(k[1]) >= int(Path(p).stem.split("_")[1]):
#             os.rename(p, p.replace(k[0], v))


In [None]:
# One row per raw file: the file stem is split on "_" into the two columns
# (president, year).
stem_parts = [Path(raw_file).stem.split("_") for raw_file in glob("data/sotu/raw/*")]
presidents = pd.DataFrame(stem_parts, columns=["president", "year"])
# Order by year (still a string at this point) and renumber the rows.
presidents = presidents.sort_values(by="year").reset_index(drop=True)
presidents["year"] = presidents["year"].astype(int)

In [2]:
path = 'data/sotu/raw'
# os.listdir returns entries in arbitrary order; sorting makes the row order
# reproducible across machines and runs.
dirs = sorted(os.listdir(path))

# File names look like "<president>_<year>.<extension>". Collect one record per
# file and build the frame once instead of enlarging it row by row via df.loc.
records = []
for entry in dirs:
    components = entry.split('_')
    name = components[0]
    year = components[1].split('.')[0]

    # The context manager closes the handle even if reading fails (the handle
    # was previously left open for every file).
    with open(os.path.join(path, entry), "r") as text_file:
        lines = text_file.read()

    # Newlines are flattened to spaces so each document is a single line.
    records.append({'year': year, 'president': name, 'text': lines.replace('\n', ' ')})

# 'party' has no values yet; the column is declared here so that it exists
# (empty) until a later cell fills it in.
df = pd.DataFrame(records, columns=['year', 'president', 'text', 'party'])

df.year = df.year.astype(int)
df.president = df.president.astype(str)
df.text = df.text.astype(str)

In [3]:
# Some surnames are shared by two presidents (e.g. Theodore Roosevelt and
# Franklin D. Roosevelt); tell them apart by the year of the row.
# thank you to kaggle user mjmurphy28 for this code
shared_surnames = [
    # (surname, inclusive cutoff year, name up to the cutoff, name afterwards)
    ('Roosevelt', 1909, 'Theodore Roosevelt', 'Franklin D. Roosevelt'),
    ('Bush', 1992, 'George H. W. Bush', 'George W. Bush'),
    ('Johnson', 1869, 'Andrew Johnson', 'Lyndon B. Johnson'),
    ('Adams', 1801, 'John Adams', 'John Quincy Adams'),
    ('Harrison', 1841, 'William Henry Harrison', 'Benjamin Harrison'),
]

for surname, cutoff_year, earlier_name, later_name in shared_surnames:
    # Rows with the bare surname up to and including the cutoff year.
    indices = df.index[(df['president'] == surname) & (df['year'] <= cutoff_year)]
    df.loc[indices, 'president'] = earlier_name

    # Every row still carrying the bare surname gets the second name.
    indices = df.index[df['president'] == surname]
    df.loc[indices, 'president'] = later_name


In [4]:
# add the party of each president to every row
#thank you to kaggle user mjmurphy28 for this code
def pres_to_party(name):
    """Return the party label for a president name.

    Parameters
    ----------
    name : str
        President name exactly as stored in the ``president`` column
        (bare surname, or full name for the disambiguated presidents).

    Returns
    -------
    str or None
        The party label, or ``None`` when the name is in none of the rosters.
    """
    # Rosters are checked in order; the first one containing the name wins.
    party_rosters = (
        ('Republican', (
            'Lincoln', 'Grant', 'Hayes', 'Garfield', 'Arthur',
            'Benjamin Harrison', 'McKinley', 'Theodore Roosevelt',
            'Taft', 'Harding', 'Coolidge', 'Hoover', 'Eisenhower',
            'Nixon', 'Ford', 'Reagan', 'George H. W. Bush',
            'George W. Bush', 'Trump',
        )),
        ('Democratic', (
            'Jackson', 'Buren', 'Polk', 'Pierce',
            'Buchanan', 'Cleveland', 'Wilson', 'Franklin D. Roosevelt',
            'Truman', 'Kennedy', 'Lyndon B. Johnson', 'Carter', 'Clinton',
            'Obama', "Biden",
        )),
        ('Whig', ('William Henry Harrison', 'Taylor', 'Fillmore')),
        ('National Union', ('Andrew Johnson',)),
        ('Unaffiliated', ('Washington', 'Tyler')),
        ('Federalist', ('John Adams',)),
        ('Democratic-Republican', (
            'Jefferson', 'Madison', 'Monroe', 'John Quincy Adams',
        )),
    )
    for party, members in party_rosters:
        if name in members:
            return party
    return None
    
# Fill the existing 'party' column from each row's president name; names that
# pres_to_party does not know end up as None.
df["party"] = df["president"].apply(pres_to_party)

In [5]:
# Punctuation-only tokens. NOTE(review): not referenced by the cells visible
# here -- confirm whether anything else still uses it.
ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`', '--', '-','...']
wnl = WordNetLemmatizer()
# NLTK's English stopword list extended with additional filler words.
full_stopwords = set(stopwords.words('english')) | {
    "would", "also", "upon", "must", "u", "every", "shall",
    "many", "without", "much", "within", "could", "should", "since",
}
def lemmatize(s):
    """Count the lemmatized, non-stopword alphabetic tokens of a text.

    Parameters
    ----------
    s : str
        Raw text of one document.

    Returns
    -------
    collections.Counter
        Maps each lemma to its number of occurrences in ``s``.
    """
    # Keep purely alphabetic tokens only and lowercase each one once.
    lowered = (token.lower() for token in word_tokenize(s) if token.isalpha())
    lemmas = (
        wnl.lemmatize(token) for token in lowered if token not in full_stopwords
    )
    # Filter again after lemmatizing: a lemma can itself be a stopword even
    # when the surface form was not (the "u" entry in full_stopwords is
    # presumably there for the lemma of "us"), so the pre-filter alone lets
    # such lemmas through.
    return Counter(lemma for lemma in lemmas if lemma not in full_stopwords)

# One Counter of lemma frequencies per document.
df["counter"] = df["text"].apply(lemmatize)
# Guarded so the cell can be re-run: once "year" has become the index it is no
# longer a column, and an unconditional set_index("year") raises KeyError.
if "year" in df.columns:
    df = df.set_index("year")


In [6]:
def _document_frequencies(document, counter):
    """Turn one document's Counter into a long-format frame.

    The result has one row per counted element, with the columns ``element``,
    ``frequency_in_document`` and ``document``.
    """
    counts = pd.DataFrame.from_dict(dict(counter), orient="index")
    counts = counts.reset_index()
    counts = counts.rename(columns={"index": "element", 0: "frequency_in_document"})
    return counts.assign(document=document)


# Stack the per-document tables; the index of df (the year) is the document id.
freq = pd.concat(
    [_document_frequencies(year, row["counter"]) for year, row in df.iterrows()]
)
# Drop elements that are themselves stopwords, then renumber the rows.
is_stopword = freq["element"].isin(full_stopwords)
freq = freq[~is_stopword].reset_index(drop=True)


In [7]:

# Write the per-document frequencies to CSV, ordered by document.
freq_csv = Path("data/sotu/np_freq/0.csv")
# to_csv does not create missing directories, so make sure the target exists.
freq_csv.parent.mkdir(parents=True, exist_ok=True)
# A stable sort keeps the rows of each document in their current relative
# order, so the written file is the same on every run.
freq.sort_values("document", kind="stable").to_csv(freq_csv, index=False)

In [8]:
# Total frequency of every element across all documents, most frequent first;
# show ranks 51-100 (positions 50..99).
(
    freq
    .groupby("element", as_index=False)["frequency_in_document"]
    .sum()
    .sort_values("frequency_in_document", ascending=False)
    .iloc[50:100]
)

Unnamed: 0,element,frequency_in_document
6894,federal,1419
19746,well,1392
9016,increase,1379
2398,business,1364
10357,legislation,1283
11089,mean,1282
15000,report,1277
11169,men,1269
15195,result,1244
13786,program,1229
