In [None]:
"""
Collect US commencement speech transcripts

Source:
1. https://www.graduationwisdom.com/
2. https://www.floydhub.com/whatrocks/datasets/commencement

Output to pickled df:
speech_df (412 speeches with speaker, year, transcript)

"""

In [1]:
from bs4 import BeautifulSoup
import requests
import time, os

import pandas as pd
import numpy as np

import sys
sys.path.append('/Users/katiehuang/Documents/metis/projects/onl_ds5_project_4/py')

### 1. Web scrape from graduationwisdom.com

In [133]:
# Scrape the archive page of Graduate Wisdom
# For transcript url using BeautifulSoup 
url = "https://www.graduationwisdom.com/archive/archive000.htm"
page = requests.get(url).text
soup = BeautifulSoup(page, "lxml")
table = soup.find('table',id="box-table-a")
rows = [row for row in table.find_all('tr')]

speakers = [row.find_all('td')[0].find('b').text.upper().rstrip(' ') for row in rows[1:110]]+\
           [row.find_all('td')[0].find('strong').text.upper().rstrip(' ') for row in rows[110:111]]+\
           [row.find_all('td')[0].find('b').text.upper().rstrip(' ') for row in rows[111:]]
years = [row.find_all('td')[1].text for row in rows[1:]]
urls = [row.find_all('a')[0].get('href') for row in rows[1:]]

schools = [row.find_all('td')[2].text for row in rows[1:]]

In [134]:
len(speakers),len(years),len(urls),len(schools)

(243, 243, 243, 243)

In [135]:
# Create a Speech DataFrame
speech_df_1 = pd.DataFrame([speakers,years,schools]).transpose()
speech_df_1.columns = ['speaker','year','school']
speech_df_1['year'] = speech_df_1['year'].replace('',0).astype('int')
speech_df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   speaker  243 non-null    object
 1   year     243 non-null    int64 
 2   school   243 non-null    object
dtypes: int64(1), object(2)
memory usage: 5.8+ KB


In [5]:
# Scrape transcript data from each url
def url_to_transcript(url):
    '''Returns transcript data specifically from graduationwisdom.com.'''
#     print(url)
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    text = ''
    try:
        text = [p.text for p in soup.find('div',id="main-text").find_all('p')[2:]]
        text = ' '.join(text)
    except AttributeError: # page not found
        pass
    return text

In [6]:
# transcripts = [url_to_transcript(u) for u in urls]

In [7]:
len(transcripts)

243

In [137]:
# Add transcript to speech_df_1
speech_df_1['transcript'] = transcripts

In [138]:
# Add length of transcript to speech_df_1
speech_df_1['length'] = [len(transcript) for transcript in speech_df_1['transcript']]

In [139]:
speech_df_1

Unnamed: 0,speaker,year,school,transcript,length
0,SIDDHARTHA MUKHERJEE,2018,Univ. of Southern California,I wish someone had told me at my own commenc...,14487
1,ANDY ROONEY,1996,Colgate University,"It's strange for me, being here at this gradua...",17447
2,JON B. FISHER,2018,University of San Francisco,\r\n Commencement Speech Transcript ...,8544
3,MINDY KALING,2018,Darmouth,"Good morning to the Class of 2018, the facul...",15391
4,DAVID SEDARIS,2018,Oberlin College,Full transcript available Commencement Addre...,2510
...,...,...,...,...,...
238,SUZAN-LORI PARKS,2001,Mount Holyoke,Suggestions and Advice are funny things. In 1...,3709
239,EARL BAKKEN,2004,University of Hawaii,\nI would like to recognize three friends of m...,2194
240,DAVID WOODLE,2001,Penn State University,\nThe first word is PREPARED. In my 20-plus ye...,10376
241,JERRY ZUCKER,2003,University of Wisconsin,"\n Before I start my remarks, I'd like ever...",9916


#### Examine the transcripts
If the length of transcript is too short, will add manually along with the second data sets.

In [140]:
speech_df_1[speech_df_1.length < 1000]

Unnamed: 0,speaker,year,school,transcript,length
7,NIGHT SHYAMALAN,2018,Drexel University,\n\n A person who concentrates on what they ...,138
9,PICO IYER,2015,Univ. Southern California,\n\n It matters not how far you go. It matte...,74
13,KELLY MCMASTERS,2017,Univ. of Louisville,,0
23,THEO EPSTEIN,2017,Yale,,0
66,DENZEL WASHINGTON,2015,Dillard University,GRADUATION SPEECH QUOTES\n\n In this te...,914
75,MAIRA KALMAN,2013,Rhode Island School of Design,Go forth with knowing and having no idea - a...,576
78,ALEXIS OHANIAN,2014,Carthage College,Failure is an option. Commencement Speech Ex...,868
82,TERRY GROSS,2014,Bryn Mawr College,"I believe we’re shaped by our failures, by o...",364
87,DANIEL PINK,2014,Weinberg College,"Sometimes, the only way to discover who you ...",405
93,MARY CARILLO,2014,Elon University,There is a distinct thrill to committing to ...,965


### 2. Download from floydhub.com/whatrocks/datasets/commencement

In [9]:
# Test open downloaded transcripts from FloydHub Charlie Harrington
f = open("../data/1838-ralph-waldo-emerson-harvard-university.txt", "r")
test = f.read()
print(test)

In this refulgent summer, it has been a luxury to draw the breath of life. The grass grows, the buds burst, the meadow is spotted with fire and gold in the tint of flowers. The air is full of birds, and sweet with the breath of the pine, the balm-of-Gilead, and the new hay. Night brings no gloom to the heart with its welcome shade. Through the transparent darkness the stars pour their almost spiritual rays. Man under them seems a young child, and his huge globe a toy. The cool night bathes the world as with a river, and prepares his eyes again for the crimson dawn. The mystery of nature was never displayed more happily. The corn and the wine have been freely dealt to all creatures, and the never-broken silence with which the old bounty goes forward, has not yielded yet one word of explanation. One is constrained to respect the perfection of this world, in which our senses converse. How wide; how rich; what invitation from every property it gives to every faculty of man! In its fruitful

In [44]:
import glob
import os

# Read all the data file names
os.chdir(r'../data')
myFiles = glob.glob('*.txt')

In [45]:
# myFiles

In [46]:
len(myFiles)

322

In [91]:
year_2 = [file.split('-')[0] for file in myFiles]

speaker_2 = [" ".join(file.split('-')[1:3]) \
            if len(file.split('-')[2]) != 1 else " ".join(file.split('-')[1:4]) \
            for file in myFiles]

school_2 = [" ".join(file.split('-')[3:]).rstrip('.txt') \
            if len(file.split('-')[2]) != 1
            else " ".join(file.split('-')[4:]).rstrip('.txt') \
            for file in myFiles]

In [92]:
speech_df_2 = pd.DataFrame([speaker_2,year_2,school_2]).transpose()
speech_df_2.columns = ['speaker','year','school']
speech_df_2['speaker'] = speech_df_2['speaker'].apply(lambda x: x.upper())
speech_df_2['year'] = speech_df_2['year'].replace('',0).astype('int')
speech_df_2['school'] = speech_df_2['school'].apply(lambda x: x.upper())
speech_df_2

Unnamed: 0,speaker,year,school
0,SUSAN RICE,2010,STANFORD UNIVERSITY
1,MELISSA HARRIS,2012,PERRY WELLESLEY COLLEGE
2,STEVE BLANK,2011,PHILADELPHIA UNIVERSITY
3,JOSS WHEDON,2013,WESLEYAN UNIVERSITY
4,WESLEY CHAN,2012,"UNIVERSITY OF CALIFORNIA, SAN DIEGO"
...,...,...,...
317,CARRIE CHAPMAN,1936,CATT SWEET BRIAR COLLEGE
318,DANIEL S GOLDIN,2001,MASSACHUSETTS INSTITUTE OF TECHNOLOGY
319,FRED ROGERS,2002,DARTMOUTH COLLEGE
320,JIMMY TINGLE,2010,HARVARD UNIVERSITY


In [93]:
transcripts_2 = []
for file in myFiles:
    f = open("../data/" + file, "r")
    transcript = f.read()
    transcripts_2.append(transcript)

In [94]:
len(transcripts_2)

322

In [95]:
speech_df_2['transcript'] = transcripts_2

In [96]:
# Update incorrect values
speech_df_2.loc[165,'speaker'] = 'RALPH WALDO EMERSON'
speech_df_2.loc[6,'year'] = 2018

In [97]:
speech_df_2['length'] = [len(transcript) for transcript in speech_df_2['transcript']]
speech_df_2

Unnamed: 0,speaker,year,school,transcript,length
0,SUSAN RICE,2010,STANFORD UNIVERSITY,"Good morning, Stanford! Thank you very much, P...",11906
1,MELISSA HARRIS,2012,PERRY WELLESLEY COLLEGE,I often say that the very best day of my life ...,16642
2,STEVE BLANK,2011,PHILADELPHIA UNIVERSITY,I am honored to be with you as we gather to ce...,12481
3,JOSS WHEDON,2013,WESLEYAN UNIVERSITY,"sCommencement address—it’s going well, it’s go...",8204
4,WESLEY CHAN,2012,"UNIVERSITY OF CALIFORNIA, SAN DIEGO","Graduates, faculty, family and friends, I'm ve...",8987
...,...,...,...,...,...
317,CARRIE CHAPMAN,1936,CATT SWEET BRIAR COLLEGE,I bring a message to Sweet Briar College and e...,22942
318,DANIEL S GOLDIN,2001,MASSACHUSETTS INSTITUTE OF TECHNOLOGY,"President Vest, members of the Board of Truste...",13026
319,FRED ROGERS,2002,DARTMOUTH COLLEGE,Wow. What a privilege to be with you all. Sinc...,7919
320,JIMMY TINGLE,2010,HARVARD UNIVERSITY,Thank you.\n\nI am truly honored to be here th...,5576


### Combine the two data sets
Examine the data and update incorrect values

In [141]:
speech_df = pd.concat([speech_df_1,speech_df_2],axis=0)

In [142]:
speech_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 565 entries, 0 to 321
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   speaker     565 non-null    object
 1   year        565 non-null    int64 
 2   school      565 non-null    object
 3   transcript  565 non-null    object
 4   length      565 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 26.5+ KB


In [143]:
speech_df[speech_df.year==2018]

Unnamed: 0,speaker,year,school,transcript,length
0,SIDDHARTHA MUKHERJEE,2018,Univ. of Southern California,I wish someone had told me at my own commenc...,14487
2,JON B. FISHER,2018,University of San Francisco,\r\n Commencement Speech Transcript ...,8544
3,MINDY KALING,2018,Darmouth,"Good morning to the Class of 2018, the facul...",15391
4,DAVID SEDARIS,2018,Oberlin College,Full transcript available Commencement Addre...,2510
5,ABBY WAMBACK,2018,Barnard College,"Failure is not something to be ashamed of, ...",15866
6,JESMYN WARD,2018,Tulane University,Persist. Be patient. Be well. Good morning....,14063
7,NIGHT SHYAMALAN,2018,Drexel University,\n\n A person who concentrates on what they ...,138
6,TERRY TEACHOUT,2018,HAMILTON HOLT SCHOOL,"I’m supposed to keep it short, and I approve o...",6288
73,NIGHT SHYAMALAN,2018,DREXEL UNIVERSITY,\nwhat's up dragons alright let's start\n\nwit...,26406
118,DAVID SEDARIS,2018,OBERLIN COLLEGE,"Thank you so much for having me, and for prese...",10587


In [144]:
speech_df[speech_df.speaker=='PAUL GLASER']

Unnamed: 0,speaker,year,school,transcript,length
220,PAUL GLASER,2004,Stanford School of Medicine,(- Robert Frost) \n Read the full commenc...,151
274,PAUL GLASER,2004,STANFORD UNIVERSITY,"When I direct actors, I often try to put them ...",8651
290,PAUL GLASER,2004,STANFORD SCHOOL OF MEDICINE,"When I direct actors, I often try to put them ...",8682


In [145]:
# Count duplicates
speech_df[speech_df.duplicated(subset=['speaker','year'])==True]

Unnamed: 0,speaker,year,school,transcript,length
2,STEVE BLANK,2011,PHILADELPHIA UNIVERSITY,I am honored to be with you as we gather to ce...,12481
3,JOSS WHEDON,2013,WESLEYAN UNIVERSITY,"sCommencement address—it’s going well, it’s go...",8204
5,ALEXIS OHANIAN,2014,CARRTHAGE COLLEGE,Hello.\n\nI am very fond of the\nwaffles at Wa...,19842
9,STEPHEN COLBERT,2013,UNIVERSITY OF VIRGINIA,"Good morning, good morning. I am Stephen Colbe...",12229
14,JIM CARREY,2014,MAHARISHI UNIVERSITY OF MANAGEMEN,"Thank you Bevan, thank you all!\n\nI brought o...",14523
...,...,...,...,...,...
309,MURIEL SIEBERT,1998,CASE WESTERN RESERVE UNIVERSITY,"I am touched. I look at you 2,045 graduates wh...",16022
310,JEFF BEZOS,2010,PRINCETON UNIVERSITY,"As a kid, I spent my summers with my grandpare...",7260
313,RAY BRADBURY,2000,CALTECH,Thank you. Thank you. Thank you. This is fanta...,23693
314,DAVID WOODLE,2001,PENN STATE,Thank you very much for your kind introduction...,10734


In [146]:
# Drop duplicates (keep last)
speech_df = speech_df.drop_duplicates(subset=['speaker','year'],keep='last')

In [147]:
speech_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 443 entries, 0 to 321
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   speaker     443 non-null    object
 1   year        443 non-null    int64 
 2   school      443 non-null    object
 3   transcript  443 non-null    object
 4   length      443 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 20.8+ KB


In [148]:
speech_df = speech_df[speech_df.length >100].sort_values('year',ascending=False).\
            reset_index(drop=True)

In [149]:
speech_df

Unnamed: 0,speaker,year,school,transcript,length
0,SIDDHARTHA MUKHERJEE,2018,Univ. of Southern California,I wish someone had told me at my own commenc...,14487
1,JESMYN WARD,2018,Tulane University,Persist. Be patient. Be well. Good morning....,14063
2,DAVID SEDARIS,2018,OBERLIN COLLEGE,"Thank you so much for having me, and for prese...",10587
3,NIGHT SHYAMALAN,2018,DREXEL UNIVERSITY,\nwhat's up dragons alright let's start\n\nwit...,26406
4,TERRY TEACHOUT,2018,HAMILTON HOLT SCHOOL,"I’m supposed to keep it short, and I approve o...",6288
...,...,...,...,...,...
436,WILLIAM ALLEN,1936,WHITE NORTHWESTERN UNIVERSITY,About all that a commencement orator can do fo...,14953
437,FRANKLIN D ROOSEVELT,1932,OGLETHORPE UNIVERSITY,"For me, as for you, this is a day of honorable...",16017
438,OPRAH WINFREY,1918,USC,Thank you Wallis Annenberg and a special thank...,15301
439,RALPH WALDO,1838,EMERSON HARVARD UNIVERSITY,"In this refulgent summer, it has been a luxury...",40403


In [150]:
# Pickle the DataFrame
speech_df.to_pickle('../dump/speech_df')