In [1]:
# Prerequisites
# !pip install selenium
# !pip install webdriver_manager

In [2]:
# Standard library
from io import StringIO
from pathlib import Path
from time import sleep

# NumPy & pandas
import numpy as np
import pandas as pd

# BeautifulSoup (HTML parsing)
from bs4 import BeautifulSoup

# Selenium and the chromedriver manager
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# Chrome options: open the browser window maximized
options = Options()
options.add_argument("start-maximized")

# The value returned by ChromeDriverManager().install() is handed to the Service
# that webdriver.Chrome uses to start the browser
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# Open the page that is going to be scraped
driver.get("https://fred.stlouisfed.org/releases/calendar")


In [4]:
def get_table():
    """
    Scrape the release table of the page currently shown by `driver`.

    The first <table> with class "table table-condensed table-standard-theme"
    is read into a two-column frame ('Hour', 'Variable'). Its first row supplies
    the date text, which is split on spaces into weekday / Month / Day / Year
    (plus an optional fifth part kept under the column label 4). Empty 'Hour'
    cells are forward-filled from the row above.

    Side effects:
        - sets the global `last_day_of_the_week` to the date text of the first row
        - stores the frame that still contains the weekday column (label 0)
          on `get_table.df`
        - stores the final frame in the global `df_<weekday>`
        - prints a confirmation line, or 'N/A' when the page has no such table

    Returns:
        None

    NOTE(review): a later cell of this notebook defines `get_table` again; once
    both cells have run, that later definition is the one in effect.
    """
    html = driver.page_source
    # presumably done so that pandas does not read "N/A" cells as missing values,
    # which the forward-fill below would then overwrite - TODO confirm
    html = html.replace("N/A", "Unspecified")

    soup = BeautifulSoup(html, 'html.parser')

    # locating the table(s) on the page
    table_tags = soup.find_all('table', attrs={'class': 'table table-condensed table-standard-theme'})
    if not table_tags:
        # read_html raises ValueError when there is nothing to parse; report the
        # empty day the same way the later get_table definition does
        print('N/A')
        return

    # literal HTML strings are deprecated as read_html input, hence the StringIO wrapper
    tables = pd.read_html(StringIO(str(table_tags)))

    # rebuilding the frame from the raw values of the first table discards the
    # column labels assigned by read_html and keeps working if several tables match
    df = pd.DataFrame(tables[0].to_numpy(), columns=['Hour', 'Variable'])

    # saving new variable
    globals()['last_day_of_the_week'] = df['Variable'].iloc[0]

    # editing the dataframe
    # assign the result back: an in-place fillna on a selected column is chained
    # assignment and does not update df under copy-on-write
    df['Hour'] = df['Hour'].ffill()
    df['Date'] = df['Variable'].iloc[0]
    df = df.iloc[1:, :]  # the first row only carries the date text
    dfx = df['Date'].str.split(' ', expand=True)
    df = df.drop('Date', axis=1)
    df = pd.concat([dfx, df.reindex(dfx.index)], axis=1)
    df[2] = df[2].replace(',', '', regex=True)  # strip the comma from the day part
    df = df.rename(columns={1: 'Month', 2: 'Day', 3: 'Year'})
    get_table.df = df

    # saving the weekday of the table so its column can be dropped in the next step
    weekday = df[0].iloc[0]
    df = df.drop(0, axis=1)

    # saving results as a global df
    globals()[f'df_{weekday}'] = df

    # printed to confirm the execution of the function
    print(weekday + ", " + df['Day'].iloc[0] + " " + df['Month'].iloc[0] + " downloaded")

<div id="release-dates-pager">    No release dates are available for the selected options.
</div>

In [5]:
# TESTS

def get_table():
    """
    Scrape the release table of the page currently shown by `driver`.

    The first <table> with class "table table-condensed table-standard-theme"
    is read into a two-column frame ('Hour', 'Variable'). Its first row supplies
    the date text, which is split on spaces into weekday / Month / Day / Year
    (plus an optional fifth part kept under the column label 4). Empty 'Hour'
    cells are forward-filled from the row above.

    Side effects:
        - sets the global `last_day_of_the_week` to the date text of the first row
        - stores the frame that still contains the weekday column (label 0)
          on `get_table.df`
        - stores the final frame in the global `df_<weekday>`
        - prints a confirmation line, or 'N/A' when the page has no such table
          (in that case no global is created or updated)

    Returns:
        None
    """
    html = driver.page_source
    # presumably done so that pandas does not read "N/A" cells as missing values,
    # which the forward-fill below would then overwrite - TODO confirm
    html = html.replace("N/A", "Unspecified")

    soup = BeautifulSoup(html, 'html.parser')

    # locating the table(s) on the page (searched once, result reused below)
    table_tags = soup.find_all('table', attrs={'class': 'table table-condensed table-standard-theme'})
    if not table_tags:
        # without this guard read_html raises, because there is no table to parse
        print('N/A')
        return

    # literal HTML strings are deprecated as read_html input, hence the StringIO wrapper
    tables = pd.read_html(StringIO(str(table_tags)))

    # rebuilding the frame from the raw values of the first table discards the
    # column labels assigned by read_html and keeps working if several tables match
    df = pd.DataFrame(tables[0].to_numpy(), columns=['Hour', 'Variable'])

    # saving new variable
    globals()['last_day_of_the_week'] = df['Variable'].iloc[0]

    # editing the dataframe
    # assign the result back: an in-place fillna on a selected column is chained
    # assignment and does not update df under copy-on-write
    df['Hour'] = df['Hour'].ffill()
    df['Date'] = df['Variable'].iloc[0]
    df = df.iloc[1:, :]  # the first row only carries the date text
    dfx = df['Date'].str.split(' ', expand=True)
    df = df.drop('Date', axis=1)
    df = pd.concat([dfx, df.reindex(dfx.index)], axis=1)
    df[2] = df[2].replace(',', '', regex=True)  # strip the comma from the day part
    df = df.rename(columns={1: 'Month', 2: 'Day', 3: 'Year'})
    get_table.df = df

    # saving the weekday of the table so its column can be dropped in the next step
    weekday = df[0].iloc[0]
    df = df.drop(0, axis=1)

    # saving results as a global df
    globals()[f'df_{weekday}'] = df

    # printed to confirm the execution of the function
    print("    " + weekday + ", " + df['Day'].iloc[0] + " " + df['Month'].iloc[0] + " downloaded")

In [6]:
def download_data(name):
    """
    Combine the per-day frames of one week into a single DataFrame.

    Looks up the globals `df_Sunday` ... `df_Saturday` (the names get_table
    writes), concatenates the ones that exist in that order, renames the column
    labelled 4 to 'Updated', resets the index (the previous index is kept as an
    'index' column) and stores the result in the global variable called `name`.

    Weekdays for which no frame exists are skipped and reported instead of
    aborting with a NameError; get_table creates no frame for a day on which it
    prints 'N/A'.

    Args:
        name: name of the global variable that receives the weekly frame.

    Raises:
        NameError: if none of the `df_<weekday>` globals exists.

    NOTE(review): the per-day globals are never cleared between weeks, so a day
    that get_table skipped still contributes the frame left over from an earlier
    week - confirm whether that can happen in practice.
    """
    weekdays = ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday')
    namespace = globals()

    missing_days = [day for day in weekdays if f'df_{day}' not in namespace]
    day_frames = [namespace[f'df_{day}'] for day in weekdays if f'df_{day}' in namespace]

    if not day_frames:
        raise NameError("no df_<weekday> frame is defined; run week() before download_data()")
    if missing_days:
        print(f"  No frame for: {', '.join(missing_days)} (skipped)")

    # merging all per-day frames into week_df
    week_df = pd.concat(day_frames)
    week_df = week_df.rename(columns={4: 'Updated'}).reset_index()

    namespace[name] = week_df

In [7]:
def wait(timeout=100, settle=3):
    """
    Pause until the release table of the selected day is present on the page.

    Waits for the element at row 2, cell 2 of the table inside the
    "release-dates-pager" element, then sleeps a little longer before returning.

    Args:
        timeout: maximum number of seconds to wait for the table cell
            (default 100, the value previously hard-coded).
        settle: extra seconds to sleep once the cell is present
            (default 3, the value previously hard-coded).

    Raises:
        TimeoutException: if the cell does not appear within `timeout` seconds.
    """
    release_cell = (By.XPATH, '//*[@id="release-dates-pager"]/div/table/tbody/tr[2]/td[2]')
    WebDriverWait(driver, timeout).until(EC.presence_of_element_located(release_cell))

    # NOTE(review): presumably gives the page time to finish swapping in the new
    # table after the cell is first detected - confirm
    sleep(settle)

In [8]:
def _wait(delay=100):
    """
    Block until the element carrying the calendar's "fc-next-button" classes is
    present on the page, waiting at most `delay` seconds.
    """
    next_button_locator = (
        By.XPATH,
        "//*[@class='fc-next-button fc-button fc-state-default fc-corner-right']",
    )
    button_present = EC.presence_of_element_located(next_button_locator)
    WebDriverWait(driver, delay).until(button_present)

In [9]:
def week():
    """
    Scrape every day of the week currently shown in the calendar.

    Waits until the next-week button is clickable, collects the seven day
    entries of the calendar, clicks each day from Sunday to Saturday (waiting
    for the page and calling get_table after each click) and finally clicks the
    next-week button so the following call starts on the next week.

    Raises:
        ValueError: if the calendar does not expose exactly seven day entries.
    """
    next_button_xpath = "//*[@class='fc-next-button fc-button fc-state-default fc-corner-right']"
    day_entry_xpath = "//*[@class='fc-day-grid-event fc-h-event fc-event fc-start fc-end']"

    # locating the Selenium elements used below
    WebDriverWait(driver, 100).until(EC.element_to_be_clickable((By.XPATH, next_button_xpath)))
    next_week = driver.find_element(By.XPATH, next_button_xpath)
    sunday, monday, tuesday, wednesday, thursday, friday, saturday = driver.find_elements(
        By.XPATH, day_entry_xpath
    )

    # Saturday is opened once up front without scraping it
    # NOTE(review): the reason for this initial click is not evident from the code - confirm
    saturday.click()
    wait()

    # one click / wait / scrape round per day, Sunday first
    for day_entry in (sunday, monday, tuesday, wednesday, thursday, friday, saturday):
        day_entry.click()
        wait()
        get_table()

    next_week.click()

In [10]:
# Scrape the week currently shown in the calendar; week() finishes by clicking the next-week button
week()

    Sunday, 21 July downloaded
    Monday, 22 July downloaded
    Tuesday, 23 July downloaded
    Wednesday, 24 July downloaded
    Thursday, 25 July downloaded
    Friday, 26 July downloaded
    Saturday, 27 July downloaded


In [11]:
# Merge the per-day frames created above into the global `week_1`
download_data('week_1')

In [12]:
# Reload the page before the bulk download below
# NOTE(review): presumably this returns the calendar to its starting week, since the loop below stores `week_1` again - confirm
driver.refresh()

In [13]:
# Maximum number of times week() is attempted for a single week
max_retries = 30

for week_i in [f'week_{i}' for i in range(1, 24)]:
    print(f"Starting downloads for {week_i}")

    for attempt in range(max_retries):
        print(f"  Attempt {attempt + 1} for {week_i}")

        try:
            week()
        except (NoSuchElementException, TimeoutException):
            print(f"  Attempt {attempt + 1} failed for {week_i}, retrying...")
            # click the first option of the "rc-rid" element before the next attempt
            # NOTE(review): presumably this makes the page reload its calendar - confirm
            driver.find_element(By.XPATH, '//*[@id="rc-rid"]/option[1]').click()
        else:
            print(f"  Successfully completed download for {week_i} on attempt {attempt + 1}")
            break
    else:
        # Every attempt failed. week() only moves the calendar forward at its very
        # end, so continuing would store the frames left over from an earlier week
        # under this name and mislabel every following week as well.
        raise RuntimeError(
            f"{week_i}: all {max_retries} attempts failed; "
            "stopping so that stale data is not saved under this name"
        )

    print(f"  Downloading data for {week_i}")
    download_data(week_i)
    print(f"  Completed downloads for {week_i}")


Starting downloads for week_1
  Attempt 1 for week_1
    Sunday, 21 July downloaded
    Monday, 22 July downloaded
    Tuesday, 23 July downloaded
    Wednesday, 24 July downloaded
    Thursday, 25 July downloaded
    Friday, 26 July downloaded
    Saturday, 27 July downloaded
  Successfully completed download for week_1 on attempt 1
  Downloading data for week_1
  Completed downloads for week_1
Starting downloads for week_2
  Attempt 1 for week_2
  Attempt 1 failed for week_2, retrying...
  Attempt 2 for week_2
    Sunday, 28 July downloaded
    Monday, 29 July downloaded
    Tuesday, 30 July downloaded
    Wednesday, 31 July downloaded
    Thursday, 01 August downloaded
    Friday, 02 August downloaded
    Saturday, 03 August downloaded
  Successfully completed download for week_2 on attempt 2
  Downloading data for week_2
  Completed downloads for week_2
Starting downloads for week_3
  Attempt 1 for week_3
  Attempt 1 failed for week_3, retrying...
  Attempt 2 for week_3
    Sunday,

  Attempt 1 failed for week_18, retrying...
  Attempt 2 for week_18
    Sunday, 17 November downloaded
    Monday, 18 November downloaded
    Tuesday, 19 November downloaded
    Wednesday, 20 November downloaded
    Thursday, 21 November downloaded
    Friday, 22 November downloaded
    Saturday, 23 November downloaded
  Successfully completed download for week_18 on attempt 2
  Downloading data for week_18
  Completed downloads for week_18
Starting downloads for week_19
  Attempt 1 for week_19
  Attempt 1 failed for week_19, retrying...
  Attempt 2 for week_19
    Sunday, 24 November downloaded
    Monday, 25 November downloaded
    Tuesday, 26 November downloaded
    Wednesday, 27 November downloaded
    Thursday, 28 November downloaded
    Friday, 29 November downloaded
    Saturday, 30 November downloaded
  Successfully completed download for week_19 on attempt 2
  Downloading data for week_19
  Completed downloads for week_19
Starting downloads for week_20
  Attempt 1 for week_20


In [14]:
# Stack the weekly frames into one frame covering the whole period
# NOTE(review): the download loop also creates `week_23`, which is not included here - confirm this is intended
weekly_frames = [
    week_1, week_2, week_3, week_4, week_5, week_6,
    week_7, week_8, week_9, week_10, week_11, week_12,
    week_13, week_14, week_15, week_16, week_17, week_18,
    week_19, week_20, week_21, week_22,
]
all_weeks_df = pd.concat(weekly_frames)

print(all_weeks_df)

     index     Month Day  Year  Updated         Hour  \
0        1      July  21  2024  Updated      7:00 pm   
1        2      July  21  2024  Updated  Unspecified   
2        3      July  21  2024  Updated  Unspecified   
3        1      July  22  2024  Updated      1:00 am   
4        2      July  22  2024  Updated      2:00 am   
..     ...       ...  ..   ...      ...          ...   
175     39  December  20  2024      NaN  Unspecified   
176     40  December  20  2024      NaN  Unspecified   
177     41  December  20  2024      NaN  Unspecified   
178      1  December  21  2024      NaN      7:00 pm   
179      2  December  21  2024      NaN  Unspecified   

                                              Variable  
0                            Coinbase Cryptocurrencies  
1                               Equifax Credit Quality  
2                                   FOMC Press Release  
3                                 Euro Short Term Rate  
4     Swiss National Bank Monthly Statisti

In [16]:
# Save the combined frame as a CSV file
output_path = Path("./FRED/all_weeks_dataframe.csv")
# to_csv does not create missing folders, so make sure the target directory exists
output_path.parent.mkdir(parents=True, exist_ok=True)
all_weeks_df.to_csv(output_path)