### Importing Relevant Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd

### Function for filtering countries in the table 

In [2]:
def filter_countries(driver, countries):

    # Wait for the filter button to be clickable
    filter_button = WebDriverWait(driver, 5).until(
    EC.element_to_be_clickable((By.ID, "filterStateAnchor")))

    # Click the filter button
    filter_button.click()
    
    # Locate all country labels
    country_labels = driver.find_elements(By.XPATH, "//label[contains(@for, 'country')]")
    
    # Check if the default country filter is selected and deselect it if necessary
    default_country_checkbox = None
    for label in country_labels:
        country_name = label.text.strip()
        if country_name == "United States":
            default_country_checkbox = label.find_element(By.XPATH, "./preceding-sibling::input[@type='checkbox']")
            break

    if default_country_checkbox and default_country_checkbox.is_selected():
        default_country_checkbox.click()

    # Iterate over each label and check if it matches the user-provided country names
    for label in country_labels:
        country_name = label.text.strip()
        if country_name in countries:
            # If there's a match, find the associated checkbox and click it
            checkbox_id = label.get_attribute('for')
            checkbox = driver.find_element(By.ID, checkbox_id)
            if not checkbox.is_selected():
                driver.execute_script("arguments[0].click();", checkbox)
    

    apply_filter_button = driver.find_element(By.ID, "ecSubmitButton")
    apply_filter_button.click()

### Function for selecting date range

In [3]:
def select_date(driver, date_option, start_date=None, end_date=None, max_attempts=10):
    date_option_map = {
        "Yesterday": "timeFrame_yesterday",
        "Today": "timeFrame_today",
        "Tomorrow": "timeFrame_tomorrow",
        "This Week": "timeFrame_thisWeek",
        "Next Week": "timeFrame_nextWeek",
        "Custom Date": "datePickerToggleBtn"
    }

    attempt = 0
    while attempt < max_attempts:
        try:
            attempt += 1
            print(f"Attempt {attempt} to select dates...")

            # Click on the selected date option
            date_option_id = date_option_map.get(date_option, None)
            if date_option_id is not None:
                date_option_element = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.ID, date_option_id))
                )
                date_option_element.click()
                
                # If "Custom Date" is selected, fill in start and end dates
                if date_option == "Custom Date" and start_date and end_date:
                    # Wait for the date picker to be visible
                    WebDriverWait(driver, 10).until(
                        EC.visibility_of_element_located((By.ID, "ui-datepicker-div"))
                    )

                    start_date_input = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.ID, "startDate"))
                    )
                    start_date_input.clear()
                    start_date_input.send_keys(start_date)

                    end_date_input = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.ID, "endDate"))
                    )
                    end_date_input.clear()
                    end_date_input.send_keys(end_date)

                    apply_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "applyBtn")))
                    apply_button.click()

                    # Check if the startDate element is found
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.ID, "startDate"))
                    )
                    print("Dates selected successfully.")
                    return
                else:
                    print(f"Date option '{date_option}' found.")
                    return
            else:
                print(f"Date option '{date_option}' not found.")
                return

        except Exception as e:
            print(f"Attempt {attempt} failed with error: {e}")
        
        print("Retrying...")

    print(f"Failed to select dates after {max_attempts} attempts.")
    raise Exception("Failed to select dates after multiple attempts.")


### Function for Extracting table from the webpage

In [4]:
def fetch_earnings_calendar_table(driver):
    try:
        get_source = driver.page_source
        soup = BeautifulSoup(get_source, 'html.parser')
        table_id = 'earningsCalendarData'
        
        table = soup.find('table', {'id': table_id})
        # Check if the table was found
        if table:
            # Extract the table data into a pandas DataFrame
            df = pd.read_html(str(table))[0]
            return df
        else:
            print(f"Table with id '{table_id}' not found.")
            return None
    
    finally:
        # Ensure the WebDriver is closed regardless of success or failure
        pass

### Function for preprocessing the extracted raw table

In [5]:
def preprocess_dataframe_for_aggregation(temp):
    # Create new columns based on the given logic
    temp["Company_Name"] = temp.iloc[:, 1]
    temp["EPS_Forecast"] = temp.iloc[:, 2].astype(str).str.cat(temp.iloc[:, 3].astype(str), sep=' ', na_rep='')
    temp["Revenue_Forecast_1"] = temp.iloc[:, 4].astype(str).str.cat(temp.iloc[:, 5].astype(str), sep=' ', na_rep='')
    temp["Market_Cap"] = temp.iloc[:, 6].astype(str)

    # Filter the DataFrame to keep only relevant columns and rows
    filter_df = temp[['Company_Name', 'EPS_Forecast', 'Revenue_Forecast_1', 'Market_Cap']]

    # Identify rows with dates
    date_rows = filter_df['Company_Name'].str.contains('day, ')

    # Extract dates and convert to datetime format
    filter_df.loc[date_rows, 'Date'] = pd.to_datetime(filter_df.loc[date_rows, 'Company_Name'])

    # Forward fill the dates
    filter_df['Date'] = filter_df['Date'].fillna(method='ffill')

    # Remove the date rows from the DataFrame
    filter_df = filter_df[~date_rows]

    # Reset the index
    filter_df.reset_index(drop=True, inplace=True)

    # Set the multi-index with Date and original index
    filter_df.set_index(['Date', filter_df.index], inplace=True)

    return filter_df


### Initialize the FirefoxOptions

In [6]:
firefox_options = Options()
firefox_options.add_argument("--start-fullscreen")  # Open in full screen mode
firefox_options.add_argument("--ignore-certificate-errors")  # This option is not typically used in Firefox; might need to handle differently
firefox_options.add_argument("--disable-popup-blocking")
firefox_options.add_argument("--disable-notifications")
firefox_options.add_argument("--disable-extensions")
firefox_options.add_argument("--disable-infobars")  # This option is not available in Firefox; it's specific to Chrome

### Initializing the Selenium WebDriver with options

In [7]:
driver = webdriver.Firefox(options=firefox_options)

### Navigating to the website

In [8]:
driver.get("https://www.investing.com/earnings-calendar/")

### User Input

##### Country list for filter function

In [9]:
Country_filter_list = ["Taiwan,China,Argentina,Chile"]

##### For Fixed Interval

In [10]:
# Date_option = "This Week"
# filter_countries(driver,Country_filter_list)
# select_date(driver, Date_option)

##### For Custom Date Interval

In [16]:
Date_option = "Custom Date"
start_date = "01/01/2024"
end_date = "01/31/2024"
filter_countries(driver,Country_filter_list)
select_date(driver, Date_option,start_date,end_date)

Attempt 1 to select dates...
Attempt 1 failed with error: Message: 

Retrying...
Attempt 2 to select dates...
Dates selected successfully.


In [23]:
df = fetch_earnings_calendar_table(driver)

In [24]:
final_df = preprocess_dataframe_for_aggregation(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df.loc[date_rows, 'Date'] = pd.to_datetime(filter_df.loc[date_rows, 'Company_Name'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df['Date'] = filter_df['Date'].fillna(method='ffill')


In [25]:
final_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Company_Name,EPS_Forecast,Revenue_Forecast_1,Market_Cap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-01-01,0,Snipp Interactive Inc (SPN),0.00 / --,7.45M / --,31.47M
2024-01-01,1,Clean Air Metals (AIR),-- / --,-- / --,11.2M
2024-01-02,2,Taiwan Cogeneration (8926),-- / --,-- / --,33.74B
2024-01-02,3,Virtus Dividend Interest Premium Strategy Fund...,-- / --,-- / --,1.17B
2024-01-02,4,ETV Limited Duration (EVV),-- / --,-- / --,1.14B
...,...,...,...,...,...
2024-01-19,958,World Acceptance (WRLD),2.84 / 1.73,137.75M / 131.48M,685.59M
2024-01-19,959,Hingham Institution (HIFS),-- / --,-- / --,373.39M
2024-01-19,960,Citizens&Northern (CZNC),0.28 / 0.39,28.31M / 26.37M,272M
2024-01-19,961,Private Bancorp of America (PBAM),1.36 / 1.36,23.43M / 24.15M,196.13M
