In [1]:
# | default_exp crawler.crawler

In [2]:
#| exporti
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup

# import time


In [3]:
#| export
def driversetup(is_headless: bool = True) -> webdriver:
    """Create a Chrome WebDriver.

    Args:
        is_headless: when True (the default) Chrome is started with
            ``--headless``.

    Returns:
        A new ``webdriver.Chrome`` instance. ``--no-sandbox`` is always
        passed to the browser.
    """
    # Flags are applied in the same order as before: headless, then no-sandbox.
    chrome_args = ["--headless"] if is_headless else []
    chrome_args.append("--no-sandbox")

    chrome_options = webdriver.ChromeOptions()
    for arg in chrome_args:
        chrome_options.add_argument(arg)

    return webdriver.Chrome(options=chrome_options)

In [4]:
#| export

class WaitAndReturn_ElementError(Exception):
    """Raised when wait_and_return is called with neither element_id nor element_ls."""

    def __init__(self):
        # `super()` must be *called* to obtain the bound proxy. The previous
        # `super.__init__(...)` addressed the builtin `super` type itself and
        # raised TypeError instead of building this exception.
        super().__init__("must include element_id or element_ls in WaitAndReturn")

def wait_and_return(
    driver: webdriver,
    element_id: str = None,
    element_ls: [str] = None,
    element_type=By.ID,
    max_sleep_time=15,
    is_return_soup: bool = False,
):
    """
    Wait for an element to be present and return it.

    Args:
        driver: Selenium driver that has already navigated to the page.
        element_id: value to look up using `element_type`. Ignored when
            `element_ls` is given.
        element_ls: CSS class names; the wait succeeds as soon as an element
            matching any one of them is present.
        element_type: Selenium `By` strategy applied to `element_id`.
        max_sleep_time: maximum number of seconds to wait.
        is_return_soup: when True, return the element's innerHTML parsed
            with BeautifulSoup ("lxml") instead of the WebElement.

    Returns:
        The located WebElement (or its BeautifulSoup), or None when the wait
        fails; in that case the error is printed rather than raised.

    Raises:
        WaitAndReturn_ElementError: neither `element_id` nor `element_ls`
            was supplied.
    """

    if not element_id and not element_ls:
        raise WaitAndReturn_ElementError()

    try:
        if element_ls:
            # ["a", "b"] -> ".a, .b": a CSS selector group matching any class.
            locator = (By.CSS_SELECTOR, f".{', .'.join(element_ls)}")
        else:
            locator = (element_type, element_id)

        ele = WebDriverWait(driver, timeout=max_sleep_time).until(
            EC.presence_of_element_located(locator))

        if is_return_soup:
            # Was `res.get_attribute(...)`: `res` is not defined in this
            # function, so the soup branch could never succeed.
            return BeautifulSoup(ele.get_attribute("innerHTML"), "lxml")
        return ele

    except Exception as e:
        print(e)
        # The message used an undefined `url`; report the page the driver is
        # actually on. Reading it can fail too if the session is gone, so
        # guard it to keep this handler from raising.
        try:
            current_url = driver.current_url
        except Exception:
            current_url = "<unknown url>"
        print(
            f"Timeout Exception: {current_url} did not load within {max_sleep_time} seconds.")
        return None


#### sample implementation of wait_and_return

In [5]:
# url = 'https://domo-support.domo.com/s/topic/0TO5w000000ZanDGAS/card-and-dashboard-management?language=en_US'

# driver = driversetup()
# driver.get(url)

# wait_and_return(
#     driver = driver, 
#     element_ls=["section-list-item", 'article-list-item'] ,
# )

<selenium.webdriver.remote.webelement.WebElement (session="ac56a1c509f6bc5733820a0b7cf236ee", element="5c2a5d0c-826e-4477-a3b6-ee1784dbadf4")>

In [6]:
#| export
def authenticate_domo_driver(driver, domo_instance, domo_username, domo_password):
    """Sign `driver` in to a Domo instance through its login form.

    Opens ``https://{domo_instance}.domo.com/auth/index``, waits for the
    sign-in button and the username/password inputs, fills in the
    credentials and clicks the button.

    Returns:
        The same `driver` that was passed in.
    """
    driver.get(f"https://{domo_instance}.domo.com/auth/index")

    # Lookup order is kept: the button first, then the two inputs.
    submit_btn = wait_and_return(
        driver,
        element_id="sign-in",
        element_type=By.CLASS_NAME,
        max_sleep_time=10,
    )
    username_input = wait_and_return(driver, "username", element_type=By.NAME)
    password_input = wait_and_return(driver, "password", element_type=By.NAME)

    # Clear each field before typing so pre-filled values are not appended to.
    credentials = (
        (username_input, domo_username),
        (password_input, domo_password),
    )
    for field, value in credentials:
        field.clear()
        field.send_keys(value)

    submit_btn.click()
    return driver

#### sample implementation of authenticate_domo_driver

In [7]:
# import os
# from dotenv import load_dotenv

# load_dotenv('../../.env')

# driver = driversetup(is_headless=False)

# domo_username = os.environ['DOMO_DOJO_USERNAME']
# domo_password = os.environ['DOMO_DOJO_PASSWORD']
# domo_instance = 'domo-community'

# driver = authenticate_domo_driver(driver=driver, domo_instance=domo_instance,
#                          domo_username=domo_username,
#                          domo_password=domo_password)

# # Test can navigate to a site that requires authentication    
# driver.get(url=f'https://{domo_instance}.domo.com/datacenter/datawarehouse')


In [8]:
# | export
def pagesource(
    url: str,
    driver: webdriver = None,
    element_id: str = None,
    element_ls: [str] = None,
    element_type=By.ID,
    max_sleep_time = 15,
    is_return_soup: bool = False,
):
    """
    Load `url` and return the page source.

    Args:
        url: address to load.
        driver: an existing Selenium driver to reuse. When omitted, a driver
            is created with `driversetup()` and shut down before returning.
        element_id: optional element to wait for (looked up with
            `element_type`) before reading the page source.
        element_ls: optional CSS class names to wait for instead of
            `element_id`.
        element_type: Selenium `By` strategy applied to `element_id`.
        max_sleep_time: maximum number of seconds to wait for the element.
        is_return_soup: when True, return the page parsed with
            BeautifulSoup ("lxml") instead of the raw HTML string.

    Returns:
        The page source (str or BeautifulSoup), or None if waiting for the
        element or reading the page raised; that error is printed.
    """
    # Only a driver created here is ours to shut down.
    is_driver_close = not driver
    driver = driver or driversetup()

    try:
        print(f"💤 retrieving {url} 💤")
        # Navigation errors still propagate to the caller, but the outer
        # `finally` now guarantees a self-created driver is not leaked.
        driver.get(url)

        try:
            if element_id or element_ls:
                wait_and_return(
                    driver = driver,
                    element_id = element_id,
                    element_ls = element_ls,
                    element_type = element_type,
                    max_sleep_time = max_sleep_time,
                )

            if is_return_soup:
                return BeautifulSoup(driver.page_source, "lxml")

            return driver.page_source

        except Exception as e:
            print(e)
            return None

    finally:
        if is_driver_close:
            # quit() ends the whole browser/driver session; close() only
            # closes the current window and leaves the session running.
            driver.quit()


#### sample implementation of pagesource

In [12]:
# url = 'https://domo-support.domo.com/s/topic/0TO5w000000ZanDGAS/card-and-dashboard-management?language=en_US'

# driver = driversetup()

# res = pagesource(
#     url = url,
#     driver = driver, 
#     element_ls=["section-list-item", 'article-list-item'] ,
#     is_return_soup = True
# )


💤 retrieving https://domo-support.domo.com/s/topic/0TO5w000000ZanDGAS/card-and-dashboard-management?language=en_US 💤


In [15]:
# url = 'https://domo-support.domo.com/s/article/360043429913'

# driver = driversetup()

# res = pagesource(
#     url=url,
#     driver=driver,
#     element_type=By.CLASS_NAME, element_id="slds-form-element",
#     is_return_soup=True
# )

💤 retrieving https://domo-support.domo.com/s/article/360043429913 💤
