In [1]:
import pandas as pd
import json
from selenium import webdriver
import time

In [2]:
# chromedriver path
chromedriver = '/bin/chromedriver'
# initialize driver
driver = webdriver.Chrome(chromedriver)

### Website

In [3]:
test_url = 'https://millercenter.org/the-presidency/presidential-speeches'
driver.get(test_url)

### Select all Presidents

In [4]:
# click all president checkboxes
x_path = '//input[@class="form-checkbox"]'
driver.find_element_by_xpath(x_path).click

<bound method WebElement.click of <selenium.webdriver.remote.webelement.WebElement (session="3110dd8c28f889b532d4c00c3b7b4875", element="e5ec28cd-d2d1-4e87-a4b4-3a5a662300b8")>>

### Scroll Down to Load Speeches

In [6]:
SCROLL_PAUSE_TIME = 3

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

### Retrieve List of Speech Links

In [10]:
# retrieve all speech link elements
x_path = '//div[@class="views-field views-field-title"]/span/a'
speeches = driver.find_elements_by_xpath(x_path)

In [14]:
# create list of speech hyperlinks
speech_list = []
for speech in speeches:
    speech_list.append(speech.get_attribute('href'))

In [63]:
len(speech_list)

989

In [101]:
# save link list
with open('data/speech_links.json', 'w') as outfile:
    json.dump(speech_list, outfile)

In [3]:
# load link list
with open('data/speech_links.json', 'r') as infile:
    speech_list = json.load(infile)

### Retrieve Speech Data

In [4]:
# create empty dataframe to store scraped data
speech_df = pd.DataFrame(columns=['Title', 'Date', 'President', 'Transcript'])

In [6]:
# list to log error pages
speech_err = []

In [12]:
# iterate through each speech
for speech in speech_list[477:]:

    try:
    
        # load speech page
        driver.get(speech)
        # maximize window
        driver.maximize_window()

        # title
        title = driver.find_element_by_xpath('//h2[@class="presidential-speeches--title"]').text
        # president
        pres = driver.find_element_by_xpath('//p[@class="president-name"]').text
        # date
        date = driver.find_element_by_xpath('//p[@class="episode-date"]').text
        # transcript
        try:
            # click button if exists
            driver.find_element_by_class_name('expandable-text-trigger').click()
            # wait to load
            time.sleep(1)
            # retrieve transcript text
            speech_text = driver.find_element_by_xpath('//div[@class="transcript-inner"]').text
        except:
            # retrieve transcript text
            speech_text = driver.find_element_by_xpath('//div[@class="view-transcript"]').text

        # add data to dataframe
        speech_df = speech_df.append({
            'Title': title, 
            'Date': date, 
            'President': pres, 
            'Transcript': speech_text
        }, ignore_index=True)
    
    except:
        
        # log page for follow-up
        speech_err.append(speech)

In [16]:
speech_df.head()

Unnamed: 0,Title,Date,President,Transcript
0,"February 5, 2019: State of the Union Address","February 05, 2019",Donald Trump,"Transcript\nMadam Speaker, Mr. Vice President,..."
1,"January 19, 2019: Remarks about the US Souther...","January 19, 2019",Donald Trump,Transcript\nTHE PRESIDENT: Just a short time a...
2,"September 25, 2018: Address at the 73rd Sessio...","September 25, 2018",Donald Trump,"Transcript\nTHE PRESIDENT: Madam President, Mr..."
3,"July 24, 2018: Speech at the Veterans of Forei...","July 24, 2018",Donald Trump,"Transcript\nTHE PRESIDENT: Thank you, Lee. Tha..."
4,"March 19, 2018: Remarks on Combating the Opioi...","March 19, 2018",Donald Trump,Transcript\nTHE PRESIDENT: Thank you to our Fi...


### Errors

In [18]:
# list of pages with errors
speech_err

['https://millercenter.org/the-presidency/presidential-speeches/october-28-1932-campaign-speech-indianapolis-indiana']

In [22]:
# index of links with errors
for err in speech_err:
    print(speech_list.index(err))

477


In [32]:
# load error page
driver.get(speech_err[0])
driver.maximize_window()

In [33]:
# grab data
# title
title = driver.find_element_by_xpath('//h2[@class="presidential-speeches--title"]').text
# president
pres = driver.find_element_by_xpath('//p[@class="president-name"]').text
# date
date = driver.find_element_by_xpath('//p[@class="episode-date"]').text
# transcript
speech_text = driver.find_element_by_xpath('//div[@class="about-sidebar--intro"]/p').text

In [65]:
speech_df.loc[476:479, :]

Unnamed: 0,Title,Date,President,Transcript
476,"November 5, 1932: Campaign speech in St Paul M...","November 05, 1932",Herbert Hoover,Transcript\nIn these closing hours of the camp...
477,"October 21, 1932: Campaign Speech in Madison S...","October 21, 1932",Herbert Hoover,Transcript\nThis campaign is more than a conte...
478,"August 11, 1932: Speech Accepting the Republic...","August 11, 1932",Herbert Hoover,Transcript\nMr. Chairman and my fellow citizen...
479,"May 31, 1932: Statement on the National Economy","May 31, 1932",Herbert Hoover,Transcript\n\nAN EMERGENCY has developed in th...


In [56]:
# Function to insert row in the dataframe 
def Insert_row_(row_number, df, row_value): 
    # Slice the upper half of the dataframe 
    df1 = df[0:row_number] 
   
    # Store the result of lower half of the dataframe 
    df2 = df[row_number:] 
   
    # Inser the row in the upper half dataframe 
    df1.loc[row_number]=row_value 
   
    # Concat the two dataframes 
    df_result = pd.concat([df1, df2]) 
   
    # Reassign the index labels 
    df_result.index = [*range(df_result.shape[0])] 
   
    # Return the updated dataframe 
    return df_result 

In [66]:
# insert into dataframe
speech_df = Insert_row_(477, speech_df, [title, date, pres, speech_text])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [67]:
speech_df.loc[476:479, :]

Unnamed: 0,Title,Date,President,Transcript
476,"November 5, 1932: Campaign speech in St Paul M...","November 05, 1932",Herbert Hoover,Transcript\nIn these closing hours of the camp...
477,"October 28, 1932: Campaign speech in Indianapo...","October 28, 1932",Herbert Hoover,"My fellow citizens, my friends in Indianapolis..."
478,"October 21, 1932: Campaign Speech in Madison S...","October 21, 1932",Herbert Hoover,Transcript\nThis campaign is more than a conte...
479,"August 11, 1932: Speech Accepting the Republic...","August 11, 1932",Herbert Hoover,Transcript\nMr. Chairman and my fellow citizen...


In [68]:
# close driver object
driver.close()

### Save Dataframe

In [70]:
# save speech dataframe to CSV file
speech_df.to_csv('data/transcripts.csv')

In [64]:
# load speech dataframe from CSV file
speech_df = pd.read_csv('data/transcripts.csv', index_col=0)