# In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

# Base URL for company job listings
# base_url = 'https://careers.db.com/professionals/search-roles/#/professional/results/'
base_url = 'https://careers.db.com/professionals/search-roles/#/professional/results/?title=1'

# TODO: Define the pull_company_name function
def pull_company_name(base_url, all_links, job_links):
    """
    Collect job-posting URLs from the anchors found on the results page.

    An anchor is kept when its href contains 'job', its visible text mentions
    one of the accepted locations, and the resulting URL has not been
    collected yet.

    Parameters:
    base_url (str): The base URL of the company careers page. The part before
        its first '#' (or the whole URL when it has no '#') is prepended to
        each matching href.
    all_links (list): Anchor elements exposing get('href') and get_text(),
        e.g. the tags returned by BeautifulSoup's find_all("a", href=True).
    job_links (list): Job URLs collected so far; extended in place.

    Returns:
    list: The same job_links list, with newly found URLs appended in the
        order they were encountered.
    """
    # Location filtering relies on a hard-coded list of cities gathered from
    # the currently available locations. Known limitation: new locations are
    # not picked up automatically. The only location indicator on the page is
    # the text of the anchor that carries the href. Driving the site's own
    # filters was not done because the site filter cannot handle multiple
    # cities at once, which would force a
    # filter -> scrape -> paginate -> scrape -> ... cycle per city
    # (difficult with 2k+ results).
    acceptable_locations = (
        'Sydney',
        'Boston', 'Cary', 'Chicago', 'Jacksonville', 'Los Angeles', 'Miami',
        'New York', 'San Francisco', 'Santa Ana',
        'Birmingham', 'London',
        'Beijing', 'Shanghai',
        'Hong Kong',
        'Singapore',
    )

    # partition() keeps the whole URL when there is no '#'; slicing with
    # find() would return -1 in that case and drop the last character.
    url_start = base_url.partition('#')[0]

    # Set of already collected URLs for O(1) duplicate checks; the list is
    # still what preserves insertion order for the caller.
    seen = set(job_links)

    for link in all_links:
        href = link.get('href')
        if not href or 'job' not in href:
            continue

        job_title = link.get_text()
        if not any(location in job_title for location in acceptable_locations):
            continue

        job_url = url_start + href
        if job_url not in seen:
            seen.add(job_url)
            job_links.append(job_url)

    return job_links

# TODO: Define the button_company_name function
def button_company_name(driver, page, outer_loop_break, all_links):
    """
    Handles pagination by clicking the "Load more" button.

    Waits up to 5 seconds for the button to become clickable and clicks it.
    When the button does not become clickable in time, or anything else goes
    wrong while locating or clicking it, pagination is stopped.

    Parameters:
    driver (WebDriver): The Selenium WebDriver instance.
    page (int): The current page number.
    outer_loop_break (bool): Incoming stop flag; returned unchanged after a
        successful click.
    all_links (list): Not used here; kept so the call signature is unchanged.

    Returns:
    tuple: (page, outer_loop_break). The page number is incremented only when
        the button was actually clicked; the flag is True when scraping
        should stop.
    """
    # Local import so this function brings its own dependency into scope.
    from selenium.common.exceptions import TimeoutException

    try:
        # until() either returns the clickable element or raises
        # TimeoutException, so no truthiness check on the result is needed.
        load_more = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((
                By.XPATH, "//button[@class='button type-primary' and text()=' Load more ']"
            ))
        )
        load_more.click()
    except TimeoutException:
        # Expected way for pagination to end: no button left to click.
        print("No clickable 'Load more' button found within 5 seconds; stopping pagination.")
        return page, True
    except Exception as e:
        # Deliberate best-effort boundary: any other failure stops the
        # scraping loop instead of crashing, so collected links are kept.
        print(f"Error occurred while trying to click the 'next page' button: {e}")
        return page, True

    return page + 1, outer_loop_break

# WebDriver setup
driver = webdriver.Chrome()

# Everything after the browser is launched runs inside try/finally so the
# Chrome session is closed even when the loop raises or is interrupted
# (e.g. KeyboardInterrupt while scraping).
try:
    # NOTE(review): an implicit wait is combined with the explicit
    # WebDriverWait used in button_company_name; Selenium's documentation
    # advises against mixing the two - confirm this is intended.
    driver.implicitly_wait(1)
    driver.get(base_url)

    # Initialize variables
    job_links = []
    prev_page = 0  # not read anywhere in this cell; kept in case other cells use it
    page = 1

    # Cookie banner handling: might not be needed - the site is still
    # accessible, so this should be OK as long as the modal is not in the way
    # of the "Load more" button. Kept for reference:
    # try:
    #     cookie_deny = WebDriverWait(driver, 5).until(
    #         EC.element_to_be_clickable((By.XPATH, "//button[@role='button' and @data-testid='uc-deny-all-button' and text()='Deny and continue']")))
    #     print(cookie_deny)
    #     cookie_deny.click()
    # except Exception as e:
    #     pass

    # Main scraping loop: each pass re-parses the whole page, collects any
    # new job links, then tries to load the next batch of results.
    while True:
        # Fixed pause to give the page time to render before it is parsed.
        time.sleep(5)
        outer_loop_break = False
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        all_links = soup.find_all("a", href=True)

        # Scrape job links
        job_links = pull_company_name(base_url, all_links, job_links)
        # Handle pagination
        page, outer_loop_break = button_company_name(driver, page, outer_loop_break, all_links)
        # Following if block to be repeated several times
        # if block to be filled with a distinctive string that appears in base_url
        # if '' in base_url:
            # Apply filter here if necessary

        if outer_loop_break:
            break

    # Output the collected job links
    print("Length of job links:", len(job_links))
finally:
    # Close the WebDriver
    driver.quit()

# Captured notebook output (not part of the script):
# 2 ['https://careers.db.com/professionals/search-roles/#/professional/job/55169', 'https://careers.db.com/professionals/search-roles/#/professional/job/54754']
#
# KeyboardInterrupt: