In [None]:
"""
Collect US commencement speech transcripts

Source:
1. https://www.graduationwisdom.com/
2. https://www.floydhub.com/whatrocks/datasets/commencement

Output to pickled df:
speech_df (412 speeches with speaker, year, transcript)

"""

In [1]:
from bs4 import BeautifulSoup
import requests
import time, os

import pandas as pd
import numpy as np

import sys
sys.path.append('/Users/katiehuang/Documents/metis/projects/onl_ds5_project_4/py')

### 1. Web scrape from graduationwisdom.com

In [2]:
# Scrape the archive page of Graduate Wisdom
# For transcript url using BeautifulSoup 
url = "https://www.graduationwisdom.com/archive/archive000.htm"
page = requests.get(url).text
soup = BeautifulSoup(page, "lxml")
table = soup.find('table',id="box-table-a")
rows = [row for row in table.find_all('tr')]

speakers = [row.find_all('td')[0].find('b').text for row in rows[1:110]]+\
           [row.find_all('td')[0].find('strong').text for row in rows[110:111]]+\
           [row.find_all('td')[0].find('b').text for row in rows[111:]]
years = [row.find_all('td')[1].text for row in rows[1:]]
urls = [row.find_all('a')[0].get('href') for row in rows[1:]]

In [3]:
len(speakers),len(years),len(urls)

(243, 243, 243)

In [4]:
# Create a Speech DataFrame
speech_df_1 = pd.DataFrame([speakers,years]).transpose()
speech_df_1.columns = ['speaker','year']
speech_df_1['year'] = speech_df_1['year'].replace('',0).astype('int')
speech_df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   speaker  243 non-null    object
 1   year     243 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 3.9+ KB


In [5]:
# Scrape transcript data from each url
def url_to_transcript(url):
    '''Returns transcript data specifically from graduationwisdom.com.'''
#     print(url)
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    text = ''
    try:
        text = [p.text for p in soup.find('div',id="main-text").find_all('p')[2:]]
        text = ' '.join(text)
    except AttributeError: # page not found
        pass
    return text

In [6]:
transcripts = [url_to_transcript(u) for u in urls]

In [7]:
len(transcripts)

243

In [8]:
# Add transcript to speech_df
speech_df_1['transcript'] = transcripts

### 2. Download from floydhub.com/whatrocks/datasets/commencement

In [9]:
# Test open downloaded transcripts from FloydHub Charlie Harrington
f = open("../data/1838-ralph-waldo-emerson-harvard-university.txt", "r")
test = f.read()
print(test)

In this refulgent summer, it has been a luxury to draw the breath of life. The grass grows, the buds burst, the meadow is spotted with fire and gold in the tint of flowers. The air is full of birds, and sweet with the breath of the pine, the balm-of-Gilead, and the new hay. Night brings no gloom to the heart with its welcome shade. Through the transparent darkness the stars pour their almost spiritual rays. Man under them seems a young child, and his huge globe a toy. The cool night bathes the world as with a river, and prepares his eyes again for the crimson dawn. The mystery of nature was never displayed more happily. The corn and the wine have been freely dealt to all creatures, and the never-broken silence with which the old bounty goes forward, has not yielded yet one word of explanation. One is constrained to respect the perfection of this world, in which our senses converse. How wide; how rich; what invitation from every property it gives to every faculty of man! In its fruitful

In [27]:
import glob
import os

# Read all the data file names
os.chdir(r'../data')
myFiles = glob.glob('*.txt')

In [28]:
myFiles

['2010-susan-rice-stanford-university.txt',
 '2012-melissa-harris-perry-wellesley-college.txt',
 '2011-steve-blank-philadelphia-university.txt',
 '2013-joss-whedon-wesleyan-university.txt',
 '2012-wesley-chan-university-of-california,-san-diego.txt',
 '2011-terry-teachout-hamilton-holt-school.txt',
 '1918-oprah-winfrey-usc.txt',
 '2011-robert-krulwich-university-of-california,-berkeley.txt',
 '2013-stephen-colbert-university-of-virginia.txt',
 '2000-carly-fiorina-massachusetts-institute-of-technology.txt',
 '1997-madeleine-albright-harvard-university.txt',
 '1982-mother-teresa-niagara-university.txt',
 '1975-gerald-ford-chicago-state-university.txt',
 '2014-jim-carrey-maharishi-university-of-management.txt',
 '2010-jeffrey-sachs-connecticut-college.txt',
 '1990-barbara-bush-wellesley-college.txt',
 '2014-mindy-kaling-harvard-law-school.txt',
 '1955-dwight-eisenhower-penn-state.txt',
 '2012-andy-samberg-harvard-university.txt',
 '2004-richard-russo-colby-college.txt',
 '1998-charles-w-c

In [29]:
len(myFiles)

303

In [30]:
year_2 = [file.split('-')[0] for file in myFiles]

speakr_2 = [" ".join(file.split('-')[1:3]) \
            if len(file.split('-')[2]) != 1 else " ".join(file.split('-')[1:4]) \
            for file in myFiles]

In [31]:
speech_df_2 = pd.DataFrame([speakr_2,year_2]).transpose()
speech_df_2.columns = ['speaker','year']
speech_df_2['speaker'] = speech_df_2['speaker'].apply(lambda x: x.upper())
speech_df_2['year'] = speech_df_2['year'].replace('',0).astype('int')
speech_df_2

Unnamed: 0,speaker,year
0,SUSAN RICE,2010
1,MELISSA HARRIS,2012
2,STEVE BLANK,2011
3,JOSS WHEDON,2013
4,WESLEY CHAN,2012
...,...,...
298,CARRIE CHAPMAN,1936
299,DANIEL S GOLDIN,2001
300,FRED ROGERS,2002
301,JIMMY TINGLE,2010


In [32]:
transcripts_2 = []
for file in myFiles:
    f = open("../data/" + file, "r")
    transcript = f.read()
    transcripts_2.append(transcript)

In [33]:
len(transcripts_2)

303

In [34]:
speech_df_2['transcript'] = transcripts_2

In [35]:
# Update incorrect values
speech_df_2.loc[165,'speaker'] = 'RALPH WALDO EMERSON'
speech_df_2.loc[6,'year'] = 2018

### Combine the two data sets
Examine the data and update incorrect values

In [36]:
speech_df = pd.concat([speech_df_1,speech_df_2],axis=0)

In [37]:
speech_df[speech_df.year==2018]

Unnamed: 0,speaker,year,transcript
0,SIDDHARTHA MUKHERJEE,2018,I wish someone had told me at my own commenc...
2,JON B. FISHER,2018,\r\n Commencement Speech Transcript ...
3,MINDY KALING,2018,"Good morning to the Class of 2018, the facul..."
4,DAVID SEDARIS,2018,Full transcript available Commencement Addre...
5,ABBY WAMBACK,2018,"Failure is not something to be ashamed of, ..."
6,JESMYN WARD,2018,Persist. Be patient. Be well. Good morning....
7,NIGHT SHYAMALAN,2018,\n\n A person who concentrates on what they ...
6,OPRAH WINFREY,2018,Thank you Wallis Annenberg and a special thank...
67,NIGHT SHYAMALAN,2018,\nwhat's up dragons alright let's start\n\nwit...
108,DAVID SEDARIS,2018,"Thank you so much for having me, and for prese..."


In [38]:
# Count duplicates
speech_df[speech_df.duplicated(subset=['speaker','year'])==True].shape

(75, 3)

In [39]:
# Drop duplicates (keep last)
speech_df = speech_df.drop_duplicates(subset=['speaker','year'],keep='last')

In [40]:
speech_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 471 entries, 0 to 302
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   speaker     471 non-null    object
 1   year        471 non-null    int64 
 2   transcript  471 non-null    object
dtypes: int64(1), object(2)
memory usage: 14.7+ KB


In [41]:
speech_df = speech_df.sort_values('year',ascending=False).\
            reset_index(drop=True)

In [42]:
speech_df

Unnamed: 0,speaker,year,transcript
0,SIDDHARTHA MUKHERJEE,2018,I wish someone had told me at my own commenc...
1,JESMYN WARD,2018,Persist. Be patient. Be well. Good morning....
2,DAVID SEDARIS,2018,"Thank you so much for having me, and for prese..."
3,OPRAH WINFREY,2018,Thank you Wallis Annenberg and a special thank...
4,NIGHT SHYAMALAN,2018,\nwhat's up dragons alright let's start\n\nwit...
...,...,...,...
466,WILLIAM ALLEN,1936,About all that a commencement orator can do fo...
467,CARRIE CHAPMAN,1936,I bring a message to Sweet Briar College and e...
468,FRANKLIN D ROOSEVELT,1932,"For me, as for you, this is a day of honorable..."
469,RALPH WALDO,1838,"In this refulgent summer, it has been a luxury..."


In [43]:
# Pickle the DataFrame
speech_df.to_pickle('../dump/speech_df')