## Import Packages

In [1]:
import re
import json
import pandas as pd
import datetime
import numpy as np
import os
from bs4 import BeautifulSoup
from urllib.request import urlopen
from google.cloud.storage import Client


## Step 1: Scrape the events list from the Fed website

In [2]:
# Global variables

# Root of the Federal Reserve site; relative links are appended to it.
FED_BASE_URL = 'https://www.federalreserve.gov'
# JSON endpoint the events list is read from.
FED_EVENTS_SOURCE = 'https://www.federalreserve.gov/json/ne-press.json'
# Maps the feed's terse field names to descriptive column names.
RENAME_COLUMNS = {
    'd': 'date',
    'l': 'link',
    'pt': 'categ',
    'pt2': 'othercateg',
    'stub': 'stub',
    't': 'title',
    'updateDate': 'updateddate',
}


# Other variables, fed by Apache Airflow environment/trigger variables.
# FOMC minutes release for the 12/2019 meeting.
target_event = datetime.datetime(year=2020, month=1, day=3, hour=14)

In [3]:
# Fetch the events feed and decode its JSON payload.

with urlopen(FED_EVENTS_SOURCE) as response:
    events_data = json.load(response)

In [4]:
# Load the feed into a DataFrame, then give its columns descriptive names.

df_events = pd.DataFrame(events_data)
df_events = df_events.rename(columns=RENAME_COLUMNS)

In [5]:
# Discard rows that have no date; missing values cause issues with the element-wise date parsing.

df_events.dropna(subset=['date'], axis='index', inplace=True)

In [6]:
def format_date(x):
    """Parse a raw date string into a ``datetime.datetime``.

    Two layouts are tried in order: ``'%m/%d/%Y %I:%M:%S %p'`` (date plus
    12-hour clock time) and ``'%m/%d/%Y'`` (date only, since the time element
    may be absent).

    Parameters
    ----------
    x : str
        Raw date text.

    Returns
    -------
    datetime.datetime or float
        The parsed timestamp, or ``np.nan`` when ``x`` matches neither layout
        or is not a string (for example ``None`` or a missing-value float).
    """
    for layout in ('%m/%d/%Y %I:%M:%S %p', '%m/%d/%Y'):
        try:
            return datetime.datetime.strptime(x, layout)
        except (ValueError, TypeError):
            # ValueError: the text does not fit this layout.
            # TypeError: x is not a string at all.
            continue
    return np.nan

In [7]:
# Parse every raw date string with format_date.
# Series.map is used because DataFrame.applymap is deprecated in recent pandas;
# the function is passed directly, so no lambda wrapper is needed.
df_events['date'] = df_events['date'].map(format_date)

### Step 2: Extract details of the relevant event. Get the URL and download the document

In [138]:
# Extract the relevant event link: rows whose date equals the target event time.

event_link = df_events.loc[df_events['date'] == pd.Timestamp(target_event), 'link']
if event_link.empty:
    # Fail with a descriptive message rather than an opaque out-of-bounds error on values[0].
    raise IndexError('No event found in the Fed events list for {}'.format(target_event))
# Constructed event URL (the first matching row is used).
event_url = FED_BASE_URL + event_link.values[0]

In [143]:
# Open the event page and parse it with BeautifulSoup so the minutes link can be found.
# The context manager closes the HTTP response once the page has been parsed.
with urlopen(event_url) as event_page:
    bs_event_contents = BeautifulSoup(event_page, 'html.parser')

In [151]:
# Find the first anchor whose href points at a minutes page.
# The pattern is a raw string so \d is not treated as an (invalid) string escape,
# and the dot is escaped so it matches only a literal '.'.
minutes_link = bs_event_contents.find('a', href=re.compile(r'^/monetarypolicy/fomcminutes\d{8}\.htm'))

In [183]:
# Join the site root and the anchor's href into the full minutes URL.
minutes_url = ''.join((FED_BASE_URL, minutes_link.attrs['href']))

In [186]:
# Extract the meeting date from the minutes URL: eight digits (YYYYMMDD)
# immediately before a literal '.htm' / '.html' extension.
re_date_url = re.compile(r'(\d{8})\.html?')
mt_dt = re_date_url.search(minutes_url)
if mt_dt:
    datestr = mt_dt.group(1)
    meeting_date = datetime.datetime.strptime(datestr, '%Y%m%d')
else:
    # No date stamp found in the URL: fall back to the current time.
    meeting_date = datetime.datetime.now()

### Step 3: Upload the Document to Google Cloud Storage

In [14]:


# GCS API setup
# Set the credentials environment variable read by the Google client libraries.
# setdefault keeps a value already present in the environment and only falls back
# to the local development key file below when none is set.
# NOTE(review): this absolute, machine-specific path should move to configuration.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    r'C:\Users\ashay\Documents\Projects\GCP - FedStatements\AuthenticationJSON\fedminutesanalysis-4141ba8145c7.json',
)
GCP_BUCKET = 'khandeas-fedminutesanalysis'

# Initialize the Google Cloud Storage client and look up the target bucket.
client = Client()
bucket = client.get_bucket(GCP_BUCKET)

In [196]:
def download_minutes(minutes_url, meeting_dt, target_bucket=None):
    """Fetch the minutes document and upload it to a storage bucket.

    The object is written to ``raw/minutes/<year>/<YYYYMMDD>.html``. Month and
    day are zero-padded so that distinct dates can never map to the same name
    (unpadded, 2020-01-13 and 2020-11-03 would both become ``2020113``).

    Parameters
    ----------
    minutes_url : str
        URL of the minutes page to fetch.
    meeting_dt : datetime.datetime
        Meeting date used to build the object name.
    target_bucket : optional
        Object exposing ``blob(name)``; when omitted, the notebook-level
        ``bucket`` is used.
    """
    destination = bucket if target_bucket is None else target_bucket
    fname = '{}.html'.format(meeting_dt.strftime('%Y%m%d'))
    gcs_filename = 'raw/minutes/' + str(meeting_dt.year) + '/' + fname
    # Close the HTTP response as soon as the body has been read.
    with urlopen(minutes_url) as minuteslink:
        contents = minuteslink.read()
    destination.blob(gcs_filename).upload_from_string(contents)

In [None]:
# Scratch call removed: `blob` is only defined inside download_minutes, and
# upload_from_string() needs the data to upload, so this cell raised when run.

In [197]:
# Fetch the minutes page and upload it under the meeting date.
download_minutes(minutes_url=minutes_url, meeting_dt=meeting_date)