In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

# Base URL for company job listings
base_url = 'https://careers.citigroup.com/students-and-graduates/find-events.html'
# url_start = 'https://careers.citigroup.com/students-and-graduates/'

# Link extraction: collect event-detail URLs from the anchors on the current page
def pull_company_name(base_url, all_links, job_links):
    """
    Collects event-detail links found on the company careers page.

    Parameters:
    base_url (str): URL of the listing page; relative hrefs are resolved against it.
    all_links (list): Anchor elements from the current page (any object exposing
        ``.get('href')``, e.g. BeautifulSoup tags).
    job_links (list): Links collected so far. Extended in place.

    Returns:
    list: The same ``job_links`` list, with newly found absolute URLs appended
    in page order and without duplicates.
    """
    # Local import keeps the function self-contained (stdlib only).
    from urllib.parse import urljoin

    # Set mirror of job_links so the duplicate check does not rescan the list.
    seen = set(job_links)

    for link in all_links:
        href = link.get('href')
        # Anchors without an href (or with an empty one) cannot be event links.
        if not href:
            continue
        # Keep only relative event-detail links; absolute and mailto links are skipped.
        if 'event-details' not in href or 'http' in href or 'mailto' in href:
            continue

        # urljoin resolves both 'event-details.html?id=1' and
        # '/section/event-details.html?id=1' correctly, and does not depend on
        # base_url containing a particular file name.
        res_url = urljoin(base_url, href)
        if res_url not in seen:
            seen.add(res_url)
            job_links.append(res_url)

    return job_links

# Page navigation: switch to the on-site events view and decide when scraping should stop
def button_company_name(driver, page, outer_loop_break, all_links):
    """
    Switches the listing to the element with class 'inperson' and decides
    whether the caller's scraping loop should stop.

    The page currently has no "next page"/"load more" control, so this
    function only activates the 'inperson' element (presumably the on-site
    events filter -- confirm against the live page) and reports when there is
    nothing further to load.

    Parameters:
    driver (WebDriver): The Selenium WebDriver instance.
    page (int): The current page number. (optional)
    outer_loop_break (bool): Flag to indicate when to stop scraping.
    all_links (list): List of all links on the current page. Unused; kept for
        interface compatibility.

    Returns:
    tuple: ``(page + 1, outer_loop_break)``. The flag is set to True when
    a 'no-events' element is present, when the 'inperson' element was already
    active on entry (its content has been shown already, so another pass would
    only repeat the same page), or when any error occurs.
    """
    try:
        # Leaving the structure here in case more links are added and real
        # pagination becomes necessary for both virtual + onsite events.
        onsite_events = WebDriverWait(driver, 3).until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'inperson'))
        )
        already_active = '--active' in onsite_events.get_attribute('class')
        if not already_active:
            onsite_events.click()

        if driver.find_elements(By.CLASS_NAME, 'no-events'):
            print('No on-site events found')
            outer_loop_break = True
        elif already_active:
            # Nothing was clicked, so the page cannot change any further.
            # Without this the caller's `while True` loop would never end
            # whenever on-site events are listed.
            outer_loop_break = True

    except Exception as e:
        # Best-effort boundary: any failure ends the scraping loop instead of
        # crashing the run.
        print(f"Error occurred while trying to click the 'next page' button: {e}")
        outer_loop_break = True
    return page + 1, outer_loop_break

# WebDriver setup
driver = webdriver.Chrome()

# Everything after the browser is launched runs under try/finally so the
# Chrome process is always shut down, even if scraping raises.
try:
    driver.implicitly_wait(10)
    driver.get(base_url)

    # Initialize variables
    job_links = []
    prev_page = 0
    page = 1

    # Dismiss the cookie banner if one is shown. This is best-effort: the
    # banner may be absent, so a failure here must not abort the run.
    try:
        cookie_reject_all = WebDriverWait(driver, 3).until(
            EC.element_to_be_clickable((By.XPATH, "//a[contains(@class, 'ocean-button') and contains(@class, 'cookies-button') and contains(@class, 'reject-all')]"))
        )
        cookie_reject_all.click()
    except Exception as e:
        print(f"Cookie banner not dismissed ({type(e).__name__}); continuing")

    # Main scraping loop
    while True:
        # Fixed pause to let dynamically loaded content render before parsing.
        time.sleep(5)
        outer_loop_break = False
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        all_links = soup.find_all("a", href=True)

        # Scrape job links from the current view
        job_links = pull_company_name(base_url, all_links, job_links)
        # Switch view / handle pagination and learn whether to stop
        page, outer_loop_break = button_company_name(driver, page, outer_loop_break, all_links)

        # No filters are applied for this site.

        if outer_loop_break:
            break

    # Output the collected job links
    print("Length of job links:", len(job_links))
finally:
    # Close the WebDriver
    driver.quit()

['https://careers.citigroup.com/students-and-graduates/event-details.html?id=24749211', 'https://careers.citigroup.com/students-and-graduates/event-details.html?id=24762835']
No on-site events found
Length of job links: 2
