In [39]:
import os
import re
import string
import requests
from collections import defaultdict

# Load AFINN-165 lexicon
AFINN_URL = "https://raw.githubusercontent.com/fnielsen/afinn/master/afinn/data/AFINN-en-165.txt"
afinn_data = requests.get(AFINN_URL).text
valence_dict = dict(line.split('\t') for line in afinn_data.splitlines())
valence_dict = {word: int(score) for word, score in valence_dict.items()}

# Load stopwords
STOPWORDS_URL = "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"
stopwords_list = requests.get(STOPWORDS_URL).content
stopwords = set(stopwords_list.decode().splitlines())

def remove_stopwords(words):
    list_ = re.sub(r'[^a-zA-Z0-9]', " ", words.lower()).split()
    return [itm for itm in list_ if itm not in stopwords]

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'[\d]+', ' ', text)
    return ' '.join(remove_stopwords(text))

def calc_valence(text):
    words = text.split()
    if not words:
        return 0
    return sum(valence_dict.get(word, 0) for word in words) / len(words)

def valence(text):
    return calc_valence(clean_text(text))

In [44]:
def mapper(directory):
    mapped_data = []
    for president_folder in os.listdir(directory):
        president_path = os.path.join(directory, president_folder)
        if os.path.isdir(president_path):  
            for filename in os.listdir(president_path):
                if filename.endswith(".txt"):  
                    file_path = os.path.join(president_path, filename)
                    with open(file_path, "r", encoding="utf-8") as file:
                        for line in file:
                            line = line.strip()
                            if line:
                                mapped_data.append((president_folder, valence(line)))
    return mapped_data

In [45]:
def reducer(mapped_data):
    president_scores = defaultdict(list)
    for president, score in mapped_data:
        president_scores[president].append(score)
    
    president_avg = {pres: sum(scores)/len(scores) for pres, scores in president_scores.items() if scores}
    return president_avg

In [47]:
speech_directory = os.getcwd()+"/week3/prez_speeches"  
mapped_data = mapper(speech_directory)
sentiments = reducer(mapped_data)

df = pd.DataFrame(sentiments.items(), columns=["President", "Average Sentiment"])

In [48]:
df

Unnamed: 0,President,Average Sentiment
0,coolidge,0.112565
1,tyler,0.081399
2,wilson,0.081781
3,ford,0.128761
4,pierce,0.069799
5,lincoln,0.049569
6,washington,0.137754
7,reagan,0.090472
8,hoover,0.040317
9,jefferson,0.038722
