In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import re

def generate_all_pages(start, end):
    """Return the Fed's yearly speech-index URLs for ``start`` through ``end``.

    The site exposes one index page per year. The page name is
    ``<year>speech.htm`` for years before 2011 and ``<year>-speeches.htm``
    from 2011 onward. Both bounds are inclusive and the URLs are returned in
    ascending year order.
    """
    base = 'https://www.federalreserve.gov/newsevents/speech/'
    return [
        base + str(year) + ('speech.htm' if year < 2011 else '-speeches.htm')
        for year in range(start, end + 1)
    ]

def get_speech_links(url):
    """Return the absolute URLs of the speeches listed on a yearly index page.

    Used for the index pages of 2006 and later, where the speeches are listed
    inside a ``div`` with the class ``row eventlist``.

    Args:
        url: Address of one yearly speech-index page.

    Returns:
        A list of absolute speech URLs in the order they appear on the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no ``row eventlist`` container.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    page = requests.get(url, timeout=30)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    page.raise_for_status()
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
    # installed; pin one explicitly once the choice has been confirmed.
    soup = BeautifulSoup(page.content)

    event_list = soup.find('div', class_='row eventlist')
    if event_list is None:
        raise ValueError('No speech list found on ' + url)

    prefix = '/newsevents/'
    # Keep only site-relative hrefs that extend beyond the bare prefix.
    return [
        'https://www.federalreserve.gov' + anchor['href']
        for anchor in event_list.find_all('a', href=True)
        if anchor['href'].startswith(prefix) and len(anchor['href']) > len(prefix)
    ]

def get_soup(url):
    """Fetch ``url`` and return its HTML parsed into a ``BeautifulSoup`` tree.

    Args:
        url: Address of the page to download.

    Returns:
        The parsed document.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never parsed as if it were a speech.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
    # installed; pin one explicitly once the choice has been confirmed.
    return BeautifulSoup(page.content)

def get_text(soup, url):
    """Return the speech text of a page as a single string.

    The extraction path depends on which generation of the website served the
    page, and that is decided from the URL:

    * legacy pages (URL path starts with ``/boarddocs/``): every ``<p>``
      inside ``<body>``;
    * all other pages: every ``<p>`` inside the element with the class
      ``col-xs-12 col-sm-8 col-md-8``.

    In both layouts each paragraph is preceded by one space, so the last word
    of a paragraph never fuses with the first word of the next one.

    Args:
        soup: Parsed page, as returned by ``get_soup``.
        url: Address the page was fetched from.

    Returns:
        The concatenated paragraph text; a single space if the container holds
        no paragraphs.

    Raises:
        ValueError: If the expected container element is missing.
    """
    from urllib.parse import urlparse

    # Look at the URL path rather than a fixed character offset, so the check
    # does not depend on the exact scheme/host length.
    if urlparse(url).path.startswith('/boarddocs/'):
        container = soup.body
    else:
        container = soup.find(class_='col-xs-12 col-sm-8 col-md-8')
    if container is None:
        raise ValueError('No speech text container found on ' + url)

    pieces = [" " + paragraph.get_text(" ") for paragraph in container.find_all('p')]
    return " " + "".join(pieces)
    
def get_author_and_date(soup, url):
    """Return ``(author, date)`` read from the heading of a speech page.

    Only used for speeches linked from the 2006-and-later index pages. In the
    heading block the first ``<p>`` holds the date and the second the author.
    ``url`` is accepted but not used.
    """
    heading = soup.find(class_='heading col-xs-12 col-sm-8 col-md-8')
    paragraphs = heading.find_all('p')
    author_text = paragraphs[1].get_text(" ")
    date_text = paragraphs[0].get_text(" ")
    return author_text, date_text

def get_early_links_dates_authors(url):
    """Scrape speech links, dates, and speakers from a pre-2006 index page.

    The legacy yearly index is well structured while many of the speech pages
    it links to are not, so the date and speaker are taken from the index
    rather than from the individual speeches.

    Args:
        url: Address of a yearly index page ending in ``<year>speech.htm``.

    Returns:
        A ``(links, dates, authors)`` tuple of lists. All three lists are
        empty when ``url`` is not a pre-2006 index page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the ``speechIndex`` list is missing, or if the three
            lists end up with different lengths and therefore can no longer
            be matched up position by position.
    """
    # Decide from the URL alone, before spending a network request on it.
    if not (url[-10:] == 'speech.htm' and int(url[-14:][:4]) < 2006):
        return [], [], []

    # Without a timeout, requests waits indefinitely on a stalled connection.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
    # installed; pin one explicitly once the choice has been confirmed.
    soup = BeautifulSoup(page.content)

    index = soup.find('ul', id='speechIndex')
    if index is None:
        raise ValueError('No speechIndex list found on ' + url)

    links = [
        'https://www.federalreserve.gov' + anchor['href']
        for anchor in index.find_all('a', href=True)
        if anchor['href'].startswith('/boarddocs/')
    ]
    authors = [speaker.get_text().strip() for speaker in index.find_all(class_='speaker')]

    # Dates look like "December 8, 2000"; compile once for the whole page.
    date_pattern = re.compile(r'\b([A-Za-z]+ \d{1,2}, \d{4})\b')
    dates = []
    for item in index.find_all('li'):
        date_match = date_pattern.search(item.get_text())
        if date_match:
            dates.append(date_match.group(1))
        else:
            print('ERROR: no date found in a list item on ' + url)

    # The three lists are collected independently and joined by position later,
    # so a length mismatch would silently pair speeches with the wrong metadata.
    if not len(links) == len(dates) == len(authors):
        raise ValueError(
            'Mismatched index data on %s: %d links, %d dates, %d authors'
            % (url, len(links), len(dates), len(authors))
        )
    return links, dates, authors


def scrape(start, end):
    """Scrape every speech published from year ``start`` through year ``end``.

    Yearly index pages from before 2006 already provide each speech's date and
    author, so only the text is fetched from those speech pages. For later
    years the index yields links only, and text, author, and date are all read
    from the individual speech page.

    Returns:
        A DataFrame with the columns ``Date``, ``Author`` and ``Text``.
    """
    legacy_links, modern_links = [], []
    text, authors, dates = [], [], []

    # Pass 1: walk the yearly index pages and collect the speech links.
    for page in generate_all_pages(start, end):
        is_legacy_index = page[-10:] == 'speech.htm' and int(page[-14:][:4]) < 2006
        if is_legacy_index:
            page_links, page_dates, page_authors = get_early_links_dates_authors(page)
            legacy_links.extend(page_links)
            dates.extend(page_dates)
            authors.extend(page_authors)
        else:
            modern_links.extend(get_speech_links(page))

    # Pass 2: legacy speeches only need their text; date and author are known.
    for link in legacy_links:
        text.append(get_text(get_soup(link), link))

    # Pass 3: modern speeches supply text, author, and date from the same page.
    for link in modern_links:
        soup = get_soup(link)
        text.append(get_text(soup, link))
        speaker, spoken_on = get_author_and_date(soup, link)
        authors.append(speaker)
        dates.append(spoken_on)

    # Dates are stored as pandas timestamps.
    times = list(map(pd.Timestamp, dates))
    df = pd.DataFrame(columns=["Date", "Author", "Text"])
    df['Date'] = times
    df['Author'] = list(authors)
    df['Text'] = list(text)
    return df

# Scrape all speeches from 2000 through 2023 into one DataFrame
df = scrape(2000, 2023)
# Display the DataFrame to check how the results look
# (`display` is presumably the notebook's built-in helper; it is not imported here)
display(df)
# Saving to CSV is deferred: a few more columns are added first, which makes
# later merges between datasets easier. The export below is left commented out.
# df_2000_to_2203.to_csv('df_2000_to_2203.csv')

Unnamed: 0,Date,Author,Text
0,2000-12-08,Chairman Alan Greenspan,\n \n\r\nBuildings such as this new Birmingha...
1,2000-12-06,"Vice Chairman Roger W. Ferguson, Jr.",\n \n\n\r\n\tThank you for inviting me to the...
2,2000-12-06,Governor Edward M. Gramlich,"\n \n\n Subprime Lending, Predatory Lending \..."
3,2000-12-05,Chairman Alan Greenspan,"\n \n\r\nTechnological innovation, and in par..."
4,2000-11-21,Governor Edward M. Gramlich,\n \n\n Financial Literacy \n \r\nPartnership...
...,...,...,...
1513,2023-01-20,Governor Christopher J. Waller,"Thank you, Ben, and thank you to the Council..."
1514,2023-01-19,Vice Chair Lael Brainard,"Inflation has declined in recent months, whi..."
1515,2023-01-10,Governor Michelle W. Bowman,Governor Bowman presented identical remarks ...
1516,2023-01-10,Chair Jerome H. Powell,"I will address three main points. First, the..."


In [2]:
# Keep an independent backup of the scraped data in case an edit goes wrong.
# A plain `df2 = df` would only create a second name for the same object, so
# every column added to `df` afterwards would also appear in the "backup".
df2 = df.copy()

In [3]:
# Give each author an authorId
author_dict = {}

# Walk the authors in row order and hand out the ids 0, 1, 2, ... the first
# time each author is seen. setdefault() leaves an author that already has an
# id untouched, and len(author_dict) is always the next unused id.
for author in df['Author']:
    author_dict.setdefault(author, len(author_dict))
# Populate the DF with the id of each row's author
df['authorId'] = [author_dict[author] for author in df['Author']]

# NOTE: The id is keyed on the full author string, including the title.
# We chose not to strip the title, as there may be importance in the value of a
# given person's speeches before vs. after appointment to a certain position
# (i.e. Vice Chairman --> Chairman).

In [4]:
# Add a column holding the calendar year of each speech
df['year'] = df['Date'].map(lambda stamp: stamp.year)

In [9]:
# Add a column holding the calendar quarter of each speech
# Lookup table: month (1-12) -> quarter (1-4)
quarter = {month: (month - 1) // 3 + 1 for month in range(1, 13)}
df['quarter'] = df['Date'].map(lambda stamp: quarter[stamp.month])

In [10]:
# Preview the first 25 rows to check the newly added authorId, year and quarter columns
df.head(25)

Unnamed: 0,Date,Author,Text,authorId,year,quarter
0,2000-12-08,Chairman Alan Greenspan,\n \n\r\nBuildings such as this new Birmingha...,0,2000,4
1,2000-12-06,"Vice Chairman Roger W. Ferguson, Jr.",\n \n\n\r\n\tThank you for inviting me to the...,1,2000,4
2,2000-12-06,Governor Edward M. Gramlich,"\n \n\n Subprime Lending, Predatory Lending \...",2,2000,4
3,2000-12-05,Chairman Alan Greenspan,"\n \n\r\nTechnological innovation, and in par...",0,2000,4
4,2000-11-21,Governor Edward M. Gramlich,\n \n\n Financial Literacy \n \r\nPartnership...,2,2000,4
5,2000-11-20,Chairman Alan Greenspan,\n \n\n\n\r\nI am pleased to join you this ev...,0,2000,4
6,2000-11-14,Chairman Alan Greenspan,\n \n\n\n\r\n\tI am honored to be speaking be...,0,2000,4
7,2000-10-31,"Vice Chairman Roger W. Ferguson, Jr.",\n \n\n\n\r\nIt is a pleasure to join America...,1,2000,4
8,2000-10-24,Governor Laurence H. Meyer,"\n \n\n\n\n\r\nIt is widely believed, at leas...",3,2000,4
9,2000-10-20,"Vice Chairman Roger W. Ferguson, Jr.",\n \n\n\n\r\nI am pleased to speak with you t...,1,2000,4


In [13]:
# Add a constant 'Type' column marking every row as a speech
# (presumably so the rows stay identifiable once merged with other datasets)
df['Type'] = 'Speech'

In [14]:
 # Write the finished DataFrame (including its index column) to a CSV file.
 # NOTE(review): "2203" in the file name looks like a typo for "2023" - confirm
 # before renaming, since other notebooks may load this exact path.
 df.to_csv('df_2000_to_2203.csv')