In [1]:
import re
from datetime import datetime as dt
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Single FOMC calendar page that hosts the links for the most recent years
# (2018-2023 when this notebook was written). Earlier years each have their
# own "fomchistorical<year>.htm" page and are generated separately.
recent_url = 'https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm'

In [2]:
# Build the list of yearly "historical materials" pages, one per year from 2000 to 2017.
def generate_2000_2017_pages():
    """Return the URLs of the yearly FOMC historical pages for 2000 through 2017.

    Returns:
        list[str]: 18 URLs in ascending year order, each of the form
        ``.../monetarypolicy/fomchistorical<year>.htm``.
    """
    prefix = 'https://www.federalreserve.gov/monetarypolicy/fomchistorical'
    suffix = '.htm'
    # range end is exclusive, so 2018 yields the years 2000..2017 inclusive.
    return [prefix + str(year) + suffix for year in range(2000, 2018)]

In [3]:
# Link scraper for the oldest yearly pages. Despite the name, get_all_FOMC only
# routes the 2000-2006 pages here; 2007 is handled by get_2007_links.
def get_2000_to_2007_links(url):
    """Collect the minutes and statement links from one yearly FOMC page.

    The page is downloaded and every anchor whose text is exactly 'Minutes'
    or 'Statement' (and that carries an href) is turned into an absolute URL.

    Args:
        url: Address of a yearly page listing that year's FOMC materials.

    Returns:
        tuple[list[str], list[str]]: ``(minutes_links, statements_links)``,
        each in document order.
    """
    base_url = 'https://www.federalreserve.gov/'

    page = requests.get(url)
    # NOTE(review): no parser is named, so bs4 picks whichever one is installed;
    # pin the parser once it is confirmed which one the saved results came from.
    soup = BeautifulSoup(page.content)

    # href=True skips anchors without an href instead of raising KeyError below.
    statement_anchors = soup.find_all('a', string='Statement', href=True)
    minutes_anchors = soup.find_all('a', string='Minutes', href=True)

    # urljoin avoids the "gov//path" double slash that plain concatenation
    # produces for root-relative hrefs, and leaves absolute hrefs intact.
    statements_links = [urljoin(base_url, anchor['href']) for anchor in statement_anchors]
    minutes_links = [urljoin(base_url, anchor['href']) for anchor in minutes_anchors]

    return minutes_links, statements_links

In [4]:
def get_2007_links(url):
    """Collect the minutes and statement links from the 2007 FOMC page.

    Statements are found by anchor text ('Statement'); minutes are found by
    href pattern (``/fomc/minutes/<8 digits>.htm``) rather than by anchor text.

    Args:
        url: Address of the 2007 yearly page.

    Returns:
        tuple[list[str], list[str]]: ``(minutes_links, statements_links)``,
        each in document order.
    """
    base_url = 'https://www.federalreserve.gov/'

    page = requests.get(url)
    # NOTE(review): no parser is named, so bs4 picks whichever one is installed;
    # pin the parser once it is confirmed which one the saved results came from.
    soup = BeautifulSoup(page.content)

    # href=True skips anchors without an href instead of raising KeyError below.
    statement_anchors = soup.find_all('a', string='Statement', href=True)

    # Raw string: '\d' in a normal literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). The dot is escaped so it only matches '.'.
    minutes_pattern = re.compile(r'^/fomc/minutes/\d{8}\.htm')
    minutes_anchors = soup.find_all('a', href=minutes_pattern)

    # urljoin avoids the "gov//path" double slash that plain concatenation
    # produces for these root-relative hrefs.
    statements_links = [urljoin(base_url, anchor['href']) for anchor in statement_anchors]
    minutes_links = [urljoin(base_url, anchor['href']) for anchor in minutes_anchors]

    return minutes_links, statements_links
    
    

In [5]:
def get_2008_to_2017_links(url):
    """Collect the minutes and statement links from a 2008-2017 yearly FOMC page.

    Statements are found by anchor text ('Statement'); minutes are found by
    href pattern (``/monetarypolicy/fomcminutes<8 digits>.htm``).

    Args:
        url: Address of a yearly page listing that year's FOMC materials.

    Returns:
        tuple[list[str], list[str]]: ``(minutes_links, statements_links)``,
        each in document order.
    """
    base_url = 'https://www.federalreserve.gov/'

    page = requests.get(url)
    # NOTE(review): no parser is named, so bs4 picks whichever one is installed;
    # pin the parser once it is confirmed which one the saved results came from.
    soup = BeautifulSoup(page.content)

    # href=True skips anchors without an href instead of raising KeyError below.
    statement_anchors = soup.find_all('a', string='Statement', href=True)

    # Raw string: '\d' in a normal literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). The dot is escaped so it only matches '.'.
    minutes_pattern = re.compile(r'^/monetarypolicy/fomcminutes\d{8}\.htm')
    minutes_anchors = soup.find_all('a', href=minutes_pattern)

    # urljoin avoids the "gov//path" double slash that plain concatenation
    # produces for these root-relative hrefs.
    statements_links = [urljoin(base_url, anchor['href']) for anchor in statement_anchors]
    minutes_links = [urljoin(base_url, anchor['href']) for anchor in minutes_anchors]

    return minutes_links, statements_links

In [6]:
def get_2018_to_2023_links(url):
    """Collect the minutes and statement links from the recent-years calendar page.

    Both kinds of document are located by href pattern:
    statements as ``/newsevents/pressreleases/monetary<8 digits>a.htm`` and
    minutes as ``/monetarypolicy/fomcminutes<8 digits>.htm``.

    Args:
        url: Address of the calendar page that lists the recent meetings.

    Returns:
        tuple[list[str], list[str]]: ``(minutes_links, statements_links)``,
        each in document order.
    """
    base_url = 'https://www.federalreserve.gov/'

    page = requests.get(url)
    # NOTE(review): no parser is named, so bs4 picks whichever one is installed;
    # pin the parser once it is confirmed which one the saved results came from.
    soup = BeautifulSoup(page.content)

    # Raw strings: '\d' in a normal literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). Dots are escaped so they only match '.'.
    statement_pattern = re.compile(r'^/newsevents/pressreleases/monetary\d{8}a\.htm')
    minutes_pattern = re.compile(r'^/monetarypolicy/fomcminutes\d{8}\.htm')

    statement_anchors = soup.find_all('a', href=statement_pattern)
    minutes_anchors = soup.find_all('a', href=minutes_pattern)

    # urljoin avoids the "gov//path" double slash that plain concatenation
    # produces for these root-relative hrefs.
    statements_links = [urljoin(base_url, anchor['href']) for anchor in statement_anchors]
    minutes_links = [urljoin(base_url, anchor['href']) for anchor in minutes_anchors]

    return minutes_links, statements_links

In [7]:
def get_all_dates_and_text(url):
    """Download one FOMC document and return its paragraph text and its date.

    The date is not read from the page: it is taken from the first run of
    eight digits in ``url`` (YYYYMMDD).

    Args:
        url: Address of a minutes or statement page; must contain an
            eight-digit date, otherwise the lookup below raises IndexError.

    Returns:
        tuple[str, str]: ``(text, date)`` where ``text`` is the text of every
        <p> element concatenated with no separator, and ``date`` is
        'YYYY/M/DD' when the month has a leading zero, else 'YYYY/MM/DD'.
    """
    response = requests.get(url)
    # NOTE(review): no parser is named, so bs4 picks whichever one is installed.
    paragraphs = BeautifulSoup(response.content).find_all('p')

    # First eight consecutive digits in the URL are treated as YYYYMMDD.
    stamp = re.findall('[0-9]{8}', url)[0]
    year, month, day = stamp[:4], stamp[4:6], stamp[6:]

    # Only the month loses its leading zero; the day keeps both digits.
    if month[0] == '0':
        month = month[1:]
    date = "{}/{}/{}".format(year, month, day)

    text = ''.join(paragraph.get_text() for paragraph in paragraphs)

    return text, date

In [8]:
# This is the main scraping function; it takes no inputs. It scrapes every FOMC
# statement and set of minutes linked from the 2000-2017 yearly pages and from
# the page at recent_url.
def get_all_FOMC():
    """Scrape all FOMC minutes and statements into a single DataFrame.

    Link collection is dispatched per year because the layout of the yearly
    pages differs: 2000-2006, 2007 and 2008-2017 each have their own scraper,
    and everything after 2017 comes from the single page at ``recent_url``.

    Returns:
        pandas.DataFrame: columns "Date" (pd.Timestamp built from the date in
        the document URL), "Type" ('Minutes' or 'Statement') and "Text".
        All minutes rows come first, then all statement rows; inside each
        group the yearly-page documents precede those from ``recent_url``.
    """
    minutes_links = []
    statements_links = []

    # generate_2000_2017_pages() starts at 2000, so the position in the list
    # gives the year of each page.
    for offset, page in enumerate(generate_2000_2017_pages()):
        year = 2000 + offset
        if year == 2007:
            year_minutes, year_statements = get_2007_links(page)
        elif year > 2007:
            year_minutes, year_statements = get_2008_to_2017_links(page)
        else:
            year_minutes, year_statements = get_2000_to_2007_links(page)

        minutes_links.extend(year_minutes)
        statements_links.extend(year_statements)

    # Everything after 2017 lives on one page.
    recent_minutes, recent_statements = get_2018_to_2023_links(recent_url)
    minutes_links.extend(recent_minutes)
    statements_links.extend(recent_statements)

    # Label each row from the list it came from. The previous version used the
    # fixed counts 200 and 205, which raised a length-mismatch error as soon as
    # the number of published documents changed.
    records = []
    for doc_type, links in (('Minutes', minutes_links), ('Statement', statements_links)):
        for link in links:
            text, date = get_all_dates_and_text(link)
            records.append((pd.Timestamp(date), doc_type, text))

    return pd.DataFrame(records, columns=["Date", "Type", "Text"])

In [9]:
# Run the full scrape. get_all_FOMC() requests every yearly index page and then
# every linked document, so this cell needs network access; expect it to take a while.
df = get_all_FOMC()

In [10]:
# Preview the first ten rows (minutes come first in the frame).
# NOTE(review): the saved output lists 2000-12-19 twice (rows 7 and 8) --
# check whether the 2000 page links the same minutes twice.
display(df.head(10))

Unnamed: 0,Date,Type,Text
0,2000-02-02,Minutes,\n\r\nA meeting of the Federal Open Market Com...
1,2000-03-21,Minutes,\n\r\nA meeting of the Federal Open Market Com...
2,2000-05-16,Minutes,\n\r\nA meeting of the Federal Open Market Com...
3,2000-06-28,Minutes,\n\r\nA meeting of the Federal Open Market Com...
4,2000-08-22,Minutes,\n\r\nA meeting of the Federal Open Market Com...
5,2000-10-03,Minutes,\n\r\nA meeting of the Federal Open Market Com...
6,2000-11-15,Minutes,\n\r\nA meeting of the Federal Open Market Com...
7,2000-12-19,Minutes,\n\r\nA meeting of the Federal Open Market Com...
8,2000-12-19,Minutes,\n\r\nA meeting of the Federal Open Market Com...
9,2001-01-31,Minutes,\n\r\n\tA meeting of the Federal Open Market C...


In [11]:
# Month number (1-12) -> calendar quarter (1-4).
quarter = {month: (month + 2) // 3 for month in range(1, 13)}

# Add the calendar year and quarter of each document as separate columns.
df['year'] = [stamp.year for stamp in df['Date']]
df['quarter'] = [quarter[stamp.month] for stamp in df['Date']]

In [12]:
# Save the scraped corpus next to the notebook. to_csv is called with its
# defaults, so the row index is written as an unnamed first column.
df.to_csv('FOMC_Statements_and_Minutes.csv')