In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Define the URL of the archive
url = "https://www.presidency.ucsb.edu/documents/app-categories/inaugural-addresses"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Extract links to speeches
base_url = "https://www.presidency.ucsb.edu"
speech_links = [base_url + a["href"] for a in soup.find_all("a", href=True) if "documents/inaugural-address" in a["href"]]

# Function to scrape speech text and year
def fetch_speech_text(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    paragraphs = soup.find_all("p")
    text = " ".join(p.text for p in paragraphs)
    
    # Extract the year from the title
    title = soup.find("title").text
    year_match = re.search(r"(\d{4})", title)
    year = int(year_match.group(1)) if year_match else None
    
    return year, text

# Scrape all speeches
data = []
for link in speech_links[:10]:  # Limit for testing
    year, text = fetch_speech_text(link)
    if year and text:
        data.append({"year": year, "speech_text": text})

# Create DataFrame
df = pd.DataFrame(data)

# Define text cleaning function
def clean_text(text):
    text = re.sub(r'\n+', ' ', text)  # Remove line breaks
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

# Apply text cleaning
df['clean_text'] = df['speech_text'].apply(clean_text)

# Define keywords to track
keywords = ["freedom", "economy", "war", "rights", "government"]

# Count occurrences per speech
for word in keywords:
    df[word] = df['clean_text'].apply(lambda x: x.count(word))

# Aggregate by year
df_grouped = df.groupby('year')[keywords].sum()

# Plot trends
plt.figure(figsize=(12, 6))
for word in keywords:
    plt.plot(df_grouped.index, df_grouped[word], label=word)
plt.xlabel("Year")
plt.ylabel("Word Frequency")
plt.title("Political Themes in U.S. Inaugural Addresses (1789-Present)")
plt.legend()
plt.savefig("political_speech_trends.png")
plt.show()

# Convert text data into word frequency matrix
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['clean_text'])

# Apply LDA topic modeling
lda = LatentDirichletAllocation(n_components=5, random_state=42)
topics = lda.fit_transform(X)

# Print top words per topic
words = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {topic_idx}: ", [words[i] for i in topic.argsort()[:-10 - 1:-1]])

# Assign dominant topic
df['dominant_topic'] = topics.argmax(axis=1)

# Aggregate topic distributions by year
topic_trends = df.groupby('year')['dominant_topic'].value_counts().unstack()

# Plot the evolution of themes
plt.figure(figsize=(12, 6))
topic_trends.plot(kind="area", stacked=True)
plt.xlabel("Year")
plt.ylabel("Proportion of Speeches")
plt.title("Thematic Evolution in Inaugural Addresses")
plt.savefig("speech_topic_trends.png")
plt.show()


KeyError: 'speech_text'

In [7]:
print(df.head())  # See if 'speech_text' exists
print(df.columns)  # Check available columns


Empty DataFrame
Columns: []
Index: []
RangeIndex(start=0, stop=0, step=1)


In [9]:
def fetch_speech_text(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    paragraphs = soup.find_all("p")
    text = " ".join(p.text.strip() for p in paragraphs if p.text.strip())  # Ensure non-empty text
    
    title = soup.find("title").text
    year_match = re.search(r"(\d{4})", title)
    year = int(year_match.group(1)) if year_match else None

    return year, text if text else None  # Return None if empty


In [11]:
speech_links = [a["href"] for a in soup.find_all("a", href=True) if "documents/inaugural-address" in a["href"]]
speech_links = list(set(speech_links))  # Remove duplicates
speech_links = [base_url + link if not link.startswith("http") else link for link in speech_links]


In [13]:
def fetch_speech_text(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    
    # Extract speech paragraphs
    paragraphs = soup.select("div.field-docs-content p")  # Adjust based on the webpage structure
    text = " ".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
    
    # Extract the year from the title
    title = soup.find("title").get_text()
    year_match = re.search(r"(\d{4})", title)
    year = int(year_match.group(1)) if year_match else None

    return year, text if text else None  # Return None if empty


In [15]:
data = []
for link in speech_links[:10]:  
    year, text = fetch_speech_text(link)
    if year and text:
        data.append({"year": year, "speech_text": text})

df = pd.DataFrame(data)
print(df.head())  # Should show actual data


Empty DataFrame
Columns: []
Index: []


In [19]:
import requests
from bs4 import BeautifulSoup

# Define the URL
url = "https://www.presidency.ucsb.edu/documents/app-categories/inaugural-addresses"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Extract links
speech_links = [a["href"] for a in soup.find_all("a", href=True) if "documents/inaugural-address" in a["href"]]
speech_links = list(set(speech_links))  # Remove duplicates
speech_links = ["https://www.presidency.ucsb.edu" + link if not link.startswith("http") else link for link in speech_links]

print("Number of speech links found:", len(speech_links))
print("First 5 links:", speech_links[:5])


Number of speech links found: 0
First 5 links: []


In [21]:
import requests
from bs4 import BeautifulSoup

# Define the URL
url = "https://www.presidency.ucsb.edu/documents/app-categories/inaugural-addresses"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# View raw HTML
print(soup.prettify()[:2000])  # Print first 2000 characters to inspect the page structure


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN"
  "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/ dc: http://purl.org/dc/terms/ foaf: http://xmlns.com/foaf/0.1/ og: http://ogp.me/ns# rdfs: http://www.w3.org/2000/01/rdf-schema# sioc: http://rdfs.org/sioc/ns# sioct: http://rdfs.org/sioc/types# skos: http://www.w3.org/2004/02/skos/core# xsd: http://www.w3.org/2001/XMLSchema#">
 <head profile="http://www.w3.org/1999/xhtml/vocab">
  <meta charset="utf-8"/>
  <script type="text/javascript">
   (window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"dee899de70",applicationID:"80106271"};;/*! For license information please see nr-loader-rum-1.280.0.min.js.LICENSE.txt */
(()=>{var e,t,r={122:(e,t,r)=>{"use strict";r.d(t,{a:()=>i});var n=r(944);function i(e,t){try{if(!e||"object"!=typeof e)return(0,n.R)(3);if(!t||"object"!=typeof t)retur

In [23]:
import requests

url = "https://www.presidency.ucsb.edu/documents/app-categories/inaugural-addresses"
response = requests.get(url)

print("Status Code:", response.status_code)
print("Response Headers:", response.headers)
print("First 500 characters of response:", response.text[:500])


Status Code: 404
Response Headers: {'Connection': 'keep-alive', 'Cache-Control': 'no-cache, must-revalidate', 'Content-Encoding': 'gzip', 'Content-Language': 'en', 'Content-Type': 'text/html; charset=utf-8', 'Expires': 'Sun, 19 Nov 1978 05:00:00 GMT', 'Link': '</oops>; rel="canonical",</node/324522>; rel="shortlink"', 'Server': 'nginx', 'Strict-Transport-Security': 'max-age=300', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'SAMEORIGIN', 'X-Generator': 'Drupal 7 (http://drupal.org)', 'X-Pantheon-Styx-Hostname': 'styx-fe1-b-6f847bcb88-s5tjv', 'X-Styx-Req-Id': '411bab02-e18b-11ef-a8e8-5ee6e89791af', 'Date': 'Sun, 02 Feb 2025 17:29:28 GMT', 'X-Served-By': 'cache-chi-kigq8000094-CHI, cache-mci680044-MCI', 'X-Cache': 'MISS, MISS', 'X-Cache-Hits': '0, 0', 'X-Timer': 'S1738517368.054851,VS0,VE155', 'Vary': 'Accept-Encoding, Cookie, Cookie', 'Age': '0', 'Accept-Ranges': 'bytes', 'Via': '1.1 varnish, 1.1 varnish', 'transfer-encoding': 'chunked'}
First 500 characters of response: <!DO

In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Updated URL
base_url = "https://www.presidency.ucsb.edu"
url = "https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/presidential/inaugural-addresses"

# Request the page
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Extract links to speeches
speech_links = []
for a in soup.select("td.views-field-title a"):  # Based on observed structure
    href = a.get("href")
    if href and "documents" in href:
        speech_links.append(base_url + href)

print(f"Found {len(speech_links)} inaugural addresses.")

# Function to extract speech text and year
def fetch_speech_text(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")

    # Extract speech text (adjust based on actual structure)
    paragraphs = soup.select("div.field-docs-content p")
    text = " ".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))

    # Extract the year from the title
    title = soup.find("title").get_text()
    year_match = re.search(r"(\d{4})", title)
    year = int(year_match.group(1)) if year_match else None

    return year, text

# Scrape all speeches
data = []
for link in speech_links[:10]:  # Limiting to 10 for testing
    year, text = fetch_speech_text(link)
    if year and text:
        data.append({"year": year, "speech_text": text})

# Create DataFrame
df = pd.DataFrame(data)
print(df.head())  # Verify content


Found 0 inaugural addresses.
Empty DataFrame
Columns: []
Index: []
