In [1]:
import requests
stop_url = "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"
stopwords_list = requests.get(stop_url).content
stopwords = set(stopwords_list.decode().splitlines())
stopwords = list(stopwords)


def remove_stopwords(low): # low = list of words
    list_ = re.sub(r"[^a-zA-Z0-9]", " ", low.lower()).split()
    return [itm for itm in list_ if itm not in stopwords]

In [2]:
from collections import Counter
import glob
import re

In [3]:
import numpy as np
def calculate_tf_idf(speech_dir):
    """Calculate TF-IDF for speeches in directory"""
    
    # Read all txt files
    speeches = []
    for filepath in glob.glob(f"{speech_dir}/*.txt"):
        with open(filepath, 'r', encoding='utf-8') as f:
            speeches.append(f.read())

    # Process each speech
    processed_speeches = []
    for speech in speeches:
        filtered_bow = remove_stopwords(speech)
        processed_speeches.append(filtered_bow)
    
    # Calculate document frequencies
    doc_freq = Counter()
    for speech in processed_speeches:
        unique_words = set(speech) # unique for each speech
        for word in unique_words:
            doc_freq[word] += 1
    
    # Calculate IDFs
    N = len(speeches) #total number of documents
    idfs = {}
    for word, df in doc_freq.items(): # df : document frequency
        idfs[word] = np.log((N + 1)/(df + 1)) + 1
    
    tf_idf_scores = []
    # Calculate TF for all speeches combined
    for i, speech in enumerate(processed_speeches):
        # Calculate TF for this document
        doc_length = len(speech)
        word_counts = Counter(speech)
        
        # Calculate TF-IDF for each word in this document
        for word, count in word_counts.items():
            tf = count/doc_length
            tf_idf = tf * idfs[word]
            tf_idf_scores.append((f"{word}_doc{i}", tf_idf))
    
    # Get top 15 words
    top_words = sorted(tf_idf_scores, 
                      key=lambda x: x[1], 
                      reverse=True)[:15]
    
    return top_words

# Example usage
directory = "/Users/yi.hs/cs119/cs-119-bigdata/week3/prez_speeches/kennedy"  # directory with extracted txt files
top_words = calculate_tf_idf(directory)
print("Top 15 most important words:")
for word, score in top_words:
    print(f"{word}: {score:.4f}")

Top 15 most important words:
corps_doc30: 0.1177
catholic_doc41: 0.1051
tax_doc26: 0.0977
religious_doc41: 0.0927
mississippi_doc7: 0.0905
berlin_doc3: 0.0814
peace_doc30: 0.0786
university_doc35: 0.0777
united_doc8: 0.0760
uh_doc44: 0.0633
vanderbilt_doc23: 0.0619
frost_doc0: 0.0618
church_doc41: 0.0618
city_doc3: 0.0605
nuclear_doc4: 0.0586


In [5]:
import os
os.chdir('/Users/yi.hs/cs119/cs-119-bigdata/week3/prez_speeches')

def list_president_files():
    # Assuming we're in the directory containing president folders
    for president_dir in glob.glob("*/"):
        print(f"\nFiles for {president_dir}:")
        for txt_file in glob.glob(f"{president_dir}/*.txt"):
            print(txt_file)

# Example usage
list_president_files()


Files for coolidge/:
coolidge/coolidge_speeches_009.txt
coolidge/coolidge_speeches_008.txt
coolidge/coolidge_speeches_000.txt
coolidge/coolidge_speeches_001.txt
coolidge/coolidge_speeches_003.txt
coolidge/coolidge_speeches_002.txt
coolidge/coolidge_speeches_006.txt
coolidge/coolidge_speeches_007.txt
coolidge/coolidge_speeches_011.txt
coolidge/coolidge_speeches_005.txt
coolidge/coolidge_speeches_004.txt
coolidge/coolidge_speeches_010.txt

Files for tyler/:
tyler/tyler_speeches_017.txt
tyler/tyler_speeches_003.txt
tyler/tyler_speeches_002.txt
tyler/tyler_speeches_016.txt
tyler/tyler_speeches_000.txt
tyler/tyler_speeches_014.txt
tyler/tyler_speeches_015.txt
tyler/tyler_speeches_001.txt
tyler/tyler_speeches_005.txt
tyler/tyler_speeches_011.txt
tyler/tyler_speeches_010.txt
tyler/tyler_speeches_004.txt
tyler/tyler_speeches_012.txt
tyler/tyler_speeches_006.txt
tyler/tyler_speeches_007.txt
tyler/tyler_speeches_013.txt
tyler/tyler_speeches_009.txt
tyler/tyler_speeches_008.txt

Files for wilson

In [6]:
import numpy as np
from collections import Counter
import glob
import re
import requests
import os

def calculate_president_tfidf(target_president):
    # Dictionary to store all speeches for each president
    president_speeches = {}
    
    # Read all speeches for each president
    for president_dir in os.listdir('.'):
        if os.path.isdir(president_dir):
            all_speeches = []
            for filepath in glob.glob(f"{president_dir}/*.txt"):
                with open(filepath, 'r', encoding='utf-8') as f:
                    speech = f.read()
                    filtered_speech = remove_stopwords(speech)
                    all_speeches.extend(filtered_speech)
            president_speeches[president_dir] = all_speeches
    
    # Check if target president exists
    if target_president not in president_speeches:
        raise ValueError(f"President {target_president} not found in directory")
    
    # Calculate document frequencies
    doc_freq = Counter()
    for president, speeches in president_speeches.items():
        unique_words = set(speeches)
        for word in unique_words:
            doc_freq[word] += 1
    
    # Calculate IDFs
    N = len(president_speeches)
    idfs = {}
    for word, df in doc_freq.items():
        idfs[word] = np.log((N + 1)/(df + 1)) + 1
    
    # Calculate TF-IDF for target president
    speeches = president_speeches[target_president]
    total_words = len(speeches)
    word_counts = Counter(speeches)
    
    # Calculate TF-IDF scores
    tfidf_scores = {}
    for word, count in word_counts.items():
        tf = count/total_words
        tfidf_scores[word] = tf * idfs[word]
    
    # Get top 15 words
    top_words = sorted(tfidf_scores.items(), 
                      key=lambda x: x[1], 
                      reverse=True)[:15]
    
    return top_words

In [7]:
top_words_kennedy = calculate_president_tfidf("kennedy")

In [8]:
print(f"\nTop 15 words for Kennedy:")
for word, score in top_words_kennedy:
    print(f"{word}: {score:.4f}")


Top 15 words for Kennedy:
uh: 0.0115
kennedy: 0.0093
united: 0.0080
states: 0.0069
people: 0.0062
president: 0.0062
nuclear: 0.0061
nixon: 0.0060
country: 0.0059
senator: 0.0054
today: 0.0053
economic: 0.0051
freedom: 0.0050
soviet: 0.0049
years: 0.0049


In [9]:
top_words_hoover = calculate_president_tfidf("hoover")

print(f"\nTop 15 words for hoover:")
for word, score in top_words_hoover:
    print(f"{word}: {score:.4f}")


Top 15 words for hoover:
government: 0.0120
000: 0.0103
people: 0.0080
economic: 0.0076
federal: 0.0066
american: 0.0064
states: 0.0059
business: 0.0055
country: 0.0052
public: 0.0052
tariff: 0.0050
congress: 0.0049
democratic: 0.0048
great: 0.0047
national: 0.0044
