In [1]:
# Numpy, Pandas imports
import pandas as pd
import numpy as np

# Web scraping, pickle, selenium imports
import requests
from bs4 import BeautifulSoup
import pickle
from selenium import webdriver

# Progress Bar, time, system inputs
from tqdm import tqdm
import time
import sys

# Set recursion limit for pickling
# NOTE(review): presumably raised so that deeply nested objects can be pickled
# without hitting RecursionError -- confirm it is still required. A limit this
# high effectively removes the interpreter's guard against runaway recursion, so
# very deep recursion may crash the kernel instead of raising RecursionError.
sys.setrecursionlimit(1000000)

ModuleNotFoundError: No module named 'bs4'

# Helper Functions

In [2]:
def fetch_transcript(url):
    """Fetch the speech transcript text from a speech page.

    The page is first downloaded with ``requests``. If it contains no
    'View Transcript' button, the transcript is read straight from that
    markup. Otherwise a Chrome driver opens the page, clicks the button and
    the transcript is read from the browser-rendered markup.

    Parameters
    ----------
    url : str
        Address of the speech page.

    Returns
    -------
    str
        Text of the first matching transcript ``div``.

    Raises
    ------
    IndexError
        If no transcript container is found on the page.
    """

    # Soup object for the statically served page
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'lxml')

    # Checks if 'View Transcript' button exists
    has_transcript_button = bool(soup.find_all("div", {"class": "transcript-btn-inner"}))

    if not has_transcript_button:
        # Two markup variants are handled: prefer "view-transcript", then fall
        # back to "transcript-inner".
        for css_class in ("view-transcript", "transcript-inner"):
            matches = soup.find_all("div", {"class": css_class})
            if matches:
                return matches[0].text
        raise IndexError("no transcript container found at {}".format(url))

    # Driver clicks 'View Transcript' allowing transcript to be fetched
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # find_element(by, value) works on both Selenium 3 and 4, whereas
        # find_element_by_link_text was removed in Selenium 4.
        driver.find_element("link text", 'View Transcript').click()
        # Parse what the browser shows after the click rather than the
        # pre-click markup downloaded above.
        rendered_soup = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # Always release the browser; otherwise each page with a button
        # leaves a Chrome process running.
        driver.quit()

    # Fall back to the static markup if the rendered page lacks the container.
    matches = (rendered_soup.find_all("div", {"class": "transcript-inner"})
               or soup.find_all("div", {"class": "transcript-inner"}))
    if not matches:
        raise IndexError("no transcript container found at {}".format(url))
    return matches[0].text

In [3]:
def parse_url(url):
    """Fetch date, president's name, speech title, summary and transcript from a speech url.

    Parameters
    ----------
    url : str
        Address of the speech page.

    Returns
    -------
    tuple
        ``(date, president_name, speech_title, summary, transcript)``.
        ``summary`` is an empty string when the page has no
        "about-sidebar--intro" element.

    Raises
    ------
    IndexError
        If the date, president label or title element is missing from the page.
    """

    # Soup object for url
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'lxml')

    # Extract date, president's name, and speech title
    date = soup.find_all("p", {"class": "episode-date"})[0].text
    president_name = soup.find_all("label", {"class": "presidential-speeches--label"})[0].text.strip()
    speech_title = soup.find_all("h2", {"class": "presidential-speeches--title"})[0].text.strip()

    # Extract the summary; pages without a summary element yield ''.
    # This always produces a string: the previous fallback branch discarded
    # its result and would have left a list of tags in `summary`.
    summary_nodes = soup.find_all("div", {"class": "about-sidebar--intro"})
    summary = summary_nodes[0].text if summary_nodes else ''

    transcript = fetch_transcript(url)
    return (date, president_name, speech_title, summary, transcript)

# Webscrape Presidential Speeches

In [4]:
# Read the saved HTML file into memory
with open('html/presidential_speeches.html') as file:
    html = file.read()

# Parse the markup
soup = BeautifulSoup(html, 'lxml')

# Absolute link for the anchor inside every "views-row" div
pages = [
    'https://millercenter.org' + item.a['href']
    for item in soup.find_all("div", {"class": "views-row"})
]

# Empty accumulators, one per scraped field
dates, president_names, speech_titles = [], [], []
summaries, transcripts, urls = [], [], []

In [5]:
# Parses urls for the specified data and updates their lists respectively
# Note: There are 992 pages ~ 100%|██████████| 992/992 [56:52<00:00,  3.44s/it]  ~ 1 hour
MAX_ATTEMPTS = 5         # give up on a url after this many failed tries
RETRY_DELAY_SECONDS = 2  # pause between tries

# URLs that still failed after MAX_ATTEMPTS; inspect or re-run these afterwards
failed_urls = []

for url in tqdm(pages):
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            date, president_name, speech_title, summary, transcript = parse_url(url)
        except Exception as error:
            # `except Exception` (not a bare except) keeps Ctrl-C / kernel
            # interrupt working. Retries are bounded so a permanently broken
            # page cannot stall the whole run.
            if attempt == MAX_ATTEMPTS:
                failed_urls.append(url)
                tqdm.write("Giving up on {} after {} attempts: {!r}".format(url, MAX_ATTEMPTS, error))
            else:
                time.sleep(RETRY_DELAY_SECONDS)
            continue

        # Append all fields together so the six lists stay index-aligned
        dates.append(date)
        president_names.append(president_name)
        speech_titles.append(speech_title)
        summaries.append(summary)
        transcripts.append(transcript)
        urls.append(url)
        break

100%|██████████| 992/992 [56:52<00:00,  3.44s/it]  


In [6]:
# Pickle raw data
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes.
raw_outputs = [
    ("pickles/webscrape/webscrape_dates.p", dates),
    ("pickles/webscrape/webscrape_president_names.p", president_names),
    ("pickles/webscrape/webscrape_speech_titles.p", speech_titles),
    ("pickles/webscrape/webscrape_summaries.p", summaries),
    ("pickles/webscrape/webscrape_transcripts.p", transcripts),
    ("pickles/webscrape/webscrape_urls.p", urls),
]
for pickle_path, values in raw_outputs:
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(values, pickle_file)

# Column name -> scraped values; all lists are filled together per url
data = {
    'Date': dates,
    'President': president_names,
    'Speech Title': speech_titles,
    'Summary': summaries,
    'Transcript': transcripts,
    'URL': urls
}

# Pickle dataframe
with open("pickles/webscrape/raw_speech_data.p", "wb") as pickle_file:
    pickle.dump(pd.DataFrame(data), pickle_file)