# Scraping a wider range of dates

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import requests

import time

In [3]:
# Scheme and host of the New York Fed search site; the search query issued
# later in this notebook targets the same host.
fedurl_base = "http://search.newyorkfed.org"

## Using Selenium WebDriver

We first used Selenium to obtain all the links of the FOMC minutes on the FOMC website.

Selenium offers an advantage over basic requests--it doesn't run into PHP/JavaScript tag selector issues because it simulates an actual human browsing the website. (The search engine from which the links are obtained uses a PHP/JavaScript backend.)

In [5]:
# Initialize browser: start a Firefox instance controlled through Selenium
# WebDriver. The following cells drive this same `browser` object.
# Use Ctrl+Enter (don't use Shift+Enter)
browser = webdriver.Firefox()

Selenium allows us to go directly to the search page with text already in the input query.

In [6]:
# Advanced-search query for FOMC minutes, written one parameter group per
# line. Adjacent string literals are joined by Python into a single URL.
minutes_search_url = (
    "http://search.newyorkfed.org/fomc-docs/search"
    "?advanced_search=true"
    "&fomc_document_type=minutes"
    "&text=.htm"
    "&search_precision=All+Words"
    "&from_month=3&from_year=1936"
    "&to_month=12&to_year=2015"
    "&sort=Most+Recent+First"
    "&Search=Search"
)
browser.get(minutes_search_url)

We iterate through each page of the search results and collect all the links by storing them in a list.

In [7]:
links = []

while True:
    # Parse whichever results page the browser is currently displaying.
    src = browser.page_source
    soup = BeautifulSoup(src, "html.parser")

    # Look inside every <strong> element for an anchor that carries an href;
    # anchors without an href are ignored instead of raising KeyError.
    for tag in soup.find_all('strong'):
        linkbox = tag.find('a', href=True)
        if linkbox:
            links.append(linkbox['href'])
    try:
        nextresults = browser.find_element_by_link_text('Next Page')
        nextresults.click()
        # crude fixed pause before page_source is read again
        time.sleep(1)
    except Exception as e:
        # Any failure to find or click "Next Page" is treated as the end of
        # the results. The exception is printed so the reader can confirm it
        # really was the missing link and not some other problem.
        print("End of results")
        print("=====================================")
        print(e)
        break

# remove duplicates while keeping the order in which links were first seen
seen_links = set()
unique_links = []
for link in links:
    if link not in seen_links:
        seen_links.add(link)
        unique_links.append(link)
links = unique_links
print(len(links))

End of results
Message: Unable to locate element: {"method":"link text","selector":"Next Page"}
Stacktrace:
    at FirefoxDriver.prototype.findElementInternal_ (file:///c:/users/george/appdata/local/temp/tmpvkqyi5/extensions/fxdriver@googlecode.com/components/driver-component.js:10659)
    at FirefoxDriver.prototype.findElement (file:///c:/users/george/appdata/local/temp/tmpvkqyi5/extensions/fxdriver@googlecode.com/components/driver-component.js:10668)
    at DelayedCommand.prototype.executeInternal_/h (file:///c:/users/george/appdata/local/temp/tmpvkqyi5/extensions/fxdriver@googlecode.com/components/command-processor.js:12534)
    at DelayedCommand.prototype.executeInternal_ (file:///c:/users/george/appdata/local/temp/tmpvkqyi5/extensions/fxdriver@googlecode.com/components/command-processor.js:12539)
    at DelayedCommand.prototype.execute/< (file:///c:/users/george/appdata/local/temp/tmpvkqyi5/extensions/fxdriver@googlecode.com/components/command-processor.js:12481)
187


We save the links that we found.

In [10]:
import pickle

# Save the scraped links so the Selenium crawl does not have to be repeated.
# The with-block guarantees the file is flushed and closed, even on error.
with open("mins_links.p", "wb") as links_file:
    pickle.dump(links, links_file)

In [11]:
import pickle

# Reload the links written by the previous cell.
# NOTE: unpickling can execute arbitrary code -- only load a file that this
# notebook (or someone you trust) produced.
with open("mins_links.p", "rb") as links_file:
    links = pickle.load(links_file)

In [12]:
# Sanity check: number of links available after reloading from the pickle file
len(links)

187

## Using requests

This approach for obtaining the links cannot be used as easily since the JavaScript used on the FOMC site makes tag selection difficult. 
Attempting to find `strong` tags results in no URLs being found. 


However, given the links that we found above, we can still use requests to obtain the page contents.

In [8]:
from requests.exceptions import ConnectionError

# Page sources keyed by the URL they were downloaded from; filled in by the
# download loop in the next cell.
fomc_mins_all = {}

In [9]:
# this code block can be run multiple times--URLs that are already stored in
# fomc_mins_all are skipped, so only missing pages are downloaded
for url in links:
    # only fetch links ending in "htm" that have not been downloaded yet
    if url in fomc_mins_all or not url.endswith("htm"):
        continue

    try:
        page = requests.get(url)
        fomc_mins_all[url] = page.text
    except ConnectionError as e:
        # report which URL failed (the original referenced an undefined
        # name here) and carry on with the remaining links
        print("Error ==> {0} for {1}".format(e, url))

    # pause between requests
    time.sleep(1)

print("Finished getting page sources")

Finished getting page sources


We store our page sources in a dictionary indexed by the FOMC minutes URL. We'll clean up by standardizing the format of the keys, which encode the date information.

In [61]:
import json

# Write the downloaded page sources to disk. json.dump emits text, so the
# file is opened in text mode; the with-block closes it even if dumping fails.
with open("fomc_mins_all.json", "w") as mins_html:
    json.dump(fomc_mins_all, mins_html)

In [62]:
import json

# Read the saved page sources back into fomc_mins_all.
with open("fomc_mins_all.json", "rb") as json_file:
    fomc_mins_all = json.load(json_file)

In [63]:
# Number of page sources held after reloading the JSON file
len(fomc_mins_all)

182

## Cleaning keys

We now clean the keys of the dictionary so that they correspond to the actual dates in standardized form.

In [57]:
dates = []
# Iterate over a snapshot of the keys: the dictionary itself is modified
# inside the loop, which is not allowed while iterating it directly.
for old_k in list(fomc_mins_all.keys()):
    if len(old_k) == 8 and old_k.isdigit():
        # Key is already an 8-digit date (this cell was run before); slicing
        # it again would mangle it, so keep it unchanged.
        dates.append(old_k)
        continue

    if old_k[-5].isdigit():
        # a digit sits directly before the 4-character extension: take the
        # 8 characters in front of the extension
        new_k = old_k[-12:-4]
    else:
        # otherwise three extra characters precede the extension; skip them
        new_k = old_k[-15:-7]

    dates.append(new_k)
    # pop-then-assign moves the value and stays correct even if new_k == old_k
    fomc_mins_all[new_k] = fomc_mins_all.pop(old_k)

dates = list(set(dates))

In [64]:
# Number of entries after the keys were rewritten
len(fomc_mins_all)

182

In [65]:
import json

# Overwrite the saved file with the cleaned-key dictionary. json.dump emits
# text, so the file is opened in text mode; the with-block closes it even if
# dumping fails.
with open("fomc_mins_all.json", "w") as mins_html:
    json.dump(fomc_mins_all, mins_html)

In [66]:
import json

# Read the cleaned dictionary back from disk into fomc_mins_all.
with open("fomc_mins_all.json", "rb") as json_file:
    fomc_mins_all = json.load(json_file)

## Obtaining historical rate changes from Wikipedia

Wikipedia has a list of FOMC actions along with the associated rate change and other summary information here: https://en.wikipedia.org/wiki/History_of_Federal_Open_Market_Committee_actions. We scrape the table towards the bottom of the page.

In [4]:
# Download the Wikipedia page listing the history of FOMC actions.
# NOTE(review): the response status is not checked before the page is parsed.
resultpage = requests.get("https://en.wikipedia.org/wiki/History_of_Federal_Open_Market_Committee_actions")

In [7]:
# Parse the downloaded HTML using the "html.parser" backend
ressoup = BeautifulSoup(resultpage.text, "html.parser")

Wikipedia has color codes to indicate the change (up or down) from the previous rate.

In [None]:
# Colour codes that col2mov compares against. The table-parsing cell passes
# in the last 7 characters of each cell's inline style, so these must match
# that text exactly (including letter case).
IGREEN = '#66F500'   # col2mov -> 0
IBLUE = '#CCEEFF'    # col2mov -> 1
IRED = '#FFB6B6'     # col2mov -> True
IYELLOW = '#FFE153'  # col2mov -> -1

We use some helper functions to extract the movements and the date.

In [46]:
def col2mov(colcode):
    """Translate a colour code into the movement value used in `actions`.

    Parameters
    ----------
    colcode : str or None
        Colour code to look up; compared for equality against the
        module-level constants IGREEN, IBLUE, IRED and IYELLOW.

    Returns
    -------
    0 for IGREEN, 1 for IBLUE, True for IRED, -1 for IYELLOW, and None for
    anything else (including None).

    NOTE(review): IRED yields the boolean True while the other colours yield
    integers. True == 1 in Python, so it compares equal to the IBLUE result
    but serialises to JSON as `true` -- confirm this mix is intended.
    """
    if colcode == IGREEN:
        return 0
    if colcode == IBLUE:
        return 1
    if colcode == IRED:
        return True
    if colcode == IYELLOW:
        return -1
    # no colour match
    return None

def month2num(date):
    """Return the month number (1-12) for an English month name.

    Parameters
    ----------
    date : str
        Either the full month name ('January' ... 'December') or its
        three-letter abbreviation ('Jan' ... 'Dec'). Matching is exact and
        case-sensitive.

    Returns
    -------
    int
        1 for January through 12 for December.

    Raises
    ------
    KeyError
        If `date` is not one of the recognised names.
    """
    full_names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )
    month_numbers = {}
    for number, name in enumerate(full_names, 1):
        month_numbers[name] = number
        # first three letters give the abbreviation ('May' maps onto itself)
        month_numbers[name[:3]] = number
    return month_numbers[date]

In [96]:
def _style_colour(cell):
    """Return the last 7 characters of a cell's inline style, or None if the
    cell has no (or an empty) style attribute."""
    style = cell.get('style')
    return style[-7:] if style else None


restable = ressoup.find('table', {'class': 'wikitable'}).find_all('tr')

# date string "YYYYMMDD" -> [special, ffr, ffrmov, discr, discrmov]
actions = {}

for row in restable:
    entries = row.find_all('td')

    # skip rows without <td> cells, and rows too short to hold the date,
    # federal funds rate and discount rate columns
    if len(entries) < 3:
        continue

    # extract date information; the text of a nested <span> wins if present
    date = entries[0].contents[0]
    datespan = entries[0].find('span')
    if datespan:
        date = datespan.contents[0]
    datecol = _style_colour(entries[0])

    # rebuild "<month> <day>, <year>" as zero-padded "YYYYMMDD"
    datels = date.split()
    date = datels[2] + \
        str(month2num(datels[0])).zfill(2) + \
        datels[1].strip(',').zfill(2)

    # extract federal funds rate info: the last character is dropped and the
    # value is kept as text (it is not converted to float)
    ffr = entries[1].contents[0][:-1]
    ffrcol = _style_colour(entries[1])

    # extract discount rate info: last character dropped, then parsed as float
    discr = float(entries[2].contents[0][:-1])
    discrcol = _style_colour(entries[2])

    # match color codes for rate movement info (None when a cell is unstyled)
    special = col2mov(datecol)
    ffrmov = col2mov(ffrcol)
    discrmov = col2mov(discrcol)

    # store information in dictionary with dates as keys
    actions[date] = [special, ffr, ffrmov, discr, discrmov]

We save our data in a json file.

In [100]:
import json

# Save the parsed actions. json.dump emits text, so the file is opened in
# text mode; the with-block closes it even if dumping fails.
with open("actions_05-15.json", "w") as fomc_actions:
    json.dump(actions, fomc_actions)

In [101]:
import json

# Read the saved actions back from disk.
# NOTE(review): the result is bound to `action` (singular), so the `actions`
# dictionary built earlier is left untouched -- confirm which name later
# cells are meant to use.
with open("actions_05-15.json", "rb") as infile:
    action = json.load(infile)