### DATA 620 Web Analytics
### Sentiment Analysis of Presidential Inauguration Speeches
Lin Li and Tony Mai

### Part 1 - Collect inuagural speeches of presidents

In [51]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

We adapted the code for scraping the speech data from https://medium.com/@med.taha.elahmar/an-nlp-case-study-with-us-presidents-inaugural-speeches-part-1-2-data-collection-and-f1e2c5df5e50

We added comments to the code to demonstrate our understanding of the script

In [48]:
# Get list of urls of individual speech from website

def get_urls(url):
    '''Returns list of transcript urls'''
    
    page = requests.get(url).text 
    soup=BeautifulSoup(page, 'lxml')
    url_table = soup.find("table", class_='table').find_all("a")
    return [u["href"] for u in url_table]

urls = get_urls("https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/inaugural-addresses")

In [49]:
# Scrape speech transcripts from each url

transcripts = pd.DataFrame()
def get_transcripts(urls, transcripts):
    for u in urls:
        page = requests.get(u).text
        soup = BeautifulSoup(page, 'lxml')
        president = soup.find("h3", class_="diet-title").text
        year = soup.find("span", class_="date-display-single").text.split(',')[1].strip()
        content = soup.find("div", class_="field-docs-content").text
        record = {
            'president' : president,
            'year' : year,
            'content' : content
        }
        transcripts = transcripts.append(record, ignore_index=True)

    return transcripts
data = get_transcripts(urls,transcripts)


In [50]:
data.head()

Unnamed: 0,content,president,year
0,\nFellow-Citizens of the Senate and of the Hou...,George Washington,1789
1,\nFellow Citizens:\nI AM again called upon by ...,George Washington,1793
2,"\nWHEN it was first perceived, in early times,...",John Adams,1797
3,\nFriends and Fellow-Citizens:\nCALLED upon to...,Thomas Jefferson,1801
4,"\nPROCEEDING, fellow-citizens, to that qualifi...",Thomas Jefferson,1805


### Part 2 - Text processing


In [114]:
import re
import string
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

In [115]:
# Use regex to remove noise in the content 
def clean_content(content):
    content = content.replace('\n', '')
    content = content.replace(':', '')
    content = content.replace('-', ' ')
    content = content.lower()
    return content


In [116]:
# Create new dataframe with clean content
clean_data = data
clean_data['content'] = data.content.head().apply(clean_content)
clean_data.head()

Unnamed: 0,content,president,year
0,fellow citizens of the senate and of the house...,George Washington,1789
1,fellow citizensi am again called upon by the v...,George Washington,1793
2,"when it was first perceived, in early times, t...",John Adams,1797
3,friends and fellow citizenscalled upon to unde...,Thomas Jefferson,1801
4,"proceeding, fellow citizens, to that qualifica...",Thomas Jefferson,1805


In [118]:
# Use NLP to further process the content

def process_content (content):
    # remove punctuation
    nopunc = [char for char in content if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    # remove stopwords
    clean_content = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    return clean_content

clean_data['content'].head().apply(process_content)

0    [fellow, citizens, senate, house, representati...
1    [fellow, citizensi, called, upon, voice, count...
2    [first, perceived, early, times, middle, cours...
3    [friends, fellow, citizenscalled, upon, undert...
4    [proceeding, fellow, citizens, qualification, ...
Name: content, dtype: object