In [1]:
# | default_exp scraper.driver

In [2]:
# | exporti
import re
import shutil
import subprocess

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

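Install Chrome via the instructions in the provided setup file. The commented-out commands in the next cell show a manual installation of Chrome and ChromeDriver on Debian/Ubuntu.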

In [3]:
# !sudo apt-get install -y curl unzip xvfb libxi6 libgconf-2-4
# !wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
# !sudo apt install ./google-chrome-stable_current_amd64.deb

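# NOTE: the ChromeDriver version must match the major version of the installed Chrome; adjust the version in the URL below accordingly
# !wget https://chromedriver.storage.googleapis.com/86.0.4240.22/chromedriver_linux64.zip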
# !unzip chromedriver_linux64.zip
# !sudo mv chromedriver /usr/bin/chromedriver
# !sudo chown root:root /usr/bin/chromedriver
# !sudo chmod +x /usr/bin/chromedriver

# !which google-chrome
# !google-chrome --version
# !chromedriver --version

In [4]:
# !pip install webdriver_manager selenium

In [5]:
# Sample Domo support-site URLs, one per page layout (see the trailing comments).
# Not exported: this cell carries no `# | export` directive.
test_urls = [
    "https://domo-support.domo.com/s/article/36004740075?language=en_US",  # single article
    "https://domo-support.domo.com/s/topic/0TO5w000000ZlOmGAK/20202023?language=en_US",  # list of articles
    "https://domo-support.domo.com/s/topic/0TO5w000000Zan7GAC/archived-feature-release-notes?language=en_US",  # list of topics
]

# DriverGenerator

In [6]:
# | export


class DriverGenerator_NotInPath(Exception):
    """Signals that the `google-chrome` executable could not be found on PATH."""

    def __init__(self):
        # Fixed, argument-free message: the only failure this exception reports
        # is a missing Chrome binary.
        message = "google-chrome not found in PATH and therefore cannot be accessed by a driver.  install it or add to PATH"
        super().__init__(message)


class DriverGenerator:
    """Factory for Selenium Chrome webdrivers; can be extended for other drivers.

    Creating an instance checks that `google-chrome` is reachable through PATH
    and raises `DriverGenerator_NotInPath` when it is not.
    """

    # Location of the chromedriver executable. When left unset it is filled in
    # by `get_webdriver` from that method's `driver_path` argument.
    driver_path: str = None

    def __init__(self, driver_path=None, debug_prn: bool = False):
        """Store the chromedriver location and verify that Chrome is installed.

        Args:
            driver_path: optional path to the chromedriver executable.
            debug_prn: when True, print the output of `chromedriver --version`.

        Raises:
            DriverGenerator_NotInPath: if `google-chrome` is not on PATH.
        """
        self.driver_path = driver_path

        # Call the check directly instead of through `assert`: assert statements
        # are removed under `python -O`, which would silently skip the check.
        self.is_chrome_in_path()

        if debug_prn:
            print(self.get_chromedriver_version())

    def is_chrome_in_path(self):
        """Return True when `google-chrome` is on PATH, otherwise raise.

        Raises:
            DriverGenerator_NotInPath: if the executable cannot be found.
        """
        # shutil.which avoids depending on an external `which` binary.
        if shutil.which("google-chrome") is None:
            raise DriverGenerator_NotInPath()

        return True

    def get_chromedriver_version(self):
        """Return the decoded stdout of `chromedriver --version`."""
        result = subprocess.run(["chromedriver", "--version"], stdout=subprocess.PIPE)
        return result.stdout.decode("utf-8")

    def get_webdriver(self, driver_path="/usr//bin/chromedriver") -> "webdriver.Chrome":
        """Create a Chrome webdriver.

        Args:
            driver_path: chromedriver location used when the instance has no
                `driver_path` of its own; it is then stored on the instance.

        Returns:
            A new `webdriver.Chrome` session started with `--no-sandbox`.
        """

        if not self.driver_path:
            self.driver_path = driver_path

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--no-sandbox")  # needed for LINUX environments

        # The chromedriver location belongs to the driver *service*; it must not
        # be handed to Chrome as a command-line argument.
        resolved_driver = shutil.which(self.driver_path) if self.driver_path else None

        # If the configured path is not an executable, fall back to Selenium's
        # own driver discovery so environments that relied on it keep working.
        service = (
            ChromeService(executable_path=resolved_driver)
            if resolved_driver
            else ChromeService()
        )

        return webdriver.Chrome(service=service, options=chrome_options)

In [7]:
drivergenerator = DriverGenerator(debug_prn=True)

# Smoke test: start a browser session, keep its repr for display, then quit it
# so the cell does not leave a Chrome/chromedriver process running.
smoke_driver = drivergenerator.get_webdriver()
smoke_driver_repr = repr(smoke_driver)
smoke_driver.quit()

smoke_driver_repr

ChromeDriver 120.0.6099.109 (3419140ab665596f21b385ce136419fde0924272-refs/branch-heads/6099@{#1483})



<selenium.webdriver.chrome.webdriver.WebDriver (session="458a9cbe98d68bdd4717bddd781d25dd")>

In [8]:
# | export
def get_pagesource(
    url: str,  # page to load
    driver: "webdriver.Remote",  # Selenium webdriver instance used to load the page
    search_criteria_tuple=None,  # for dynamically rendered pages, pass a WebDriverWait search tuple (search_element, element name)
    max_sleep_time=15,  # seconds to wait for `search_criteria_tuple` to appear
    return_soup: bool = True,  # True -> BeautifulSoup object, False -> raw HTML string
    debug_prn: bool = False,  # print the url before loading it
):
    """Load `url` with `driver` and return the rendered page source.

    When `search_criteria_tuple` is given, waits up to `max_sleep_time` seconds
    for a matching element to be present before reading the page.

    Returns a `BeautifulSoup` (lxml parser) when `return_soup` is True, otherwise
    the raw page source string; in the latter case the driver window is closed
    before returning. Any exception is printed and `None` is returned instead.
    """
    try:
        if debug_prn:
            print(f"💤 pagesource: retrieving {url} 💤")

        driver.get(url)

        if search_criteria_tuple:
            WebDriverWait(driver, timeout=max_sleep_time).until(
                EC.presence_of_element_located(search_criteria_tuple)
            )

        # Read the markup while the window is still open: page_source cannot
        # be read once the window has been closed.
        page_source = driver.page_source

        if return_soup:
            return BeautifulSoup(page_source, "lxml")

        driver.close()

        return page_source

    except Exception as e:
        # Best-effort by design: report the failure and hand back None.
        print(
            f"ERROR: {e} -  {url} failed to load page within {max_sleep_time} seconds.  is the element represented in the element list?"
        )
        return None

In [9]:
drivergenerator = DriverGenerator(debug_prn=True)

driver = drivergenerator.get_webdriver()

try:
    soup = get_pagesource(
        driver=driver,
        url="https://domo-support.domo.com/s/article/36004740075?language=en_US",
        search_criteria_tuple=(By.CLASS_NAME, "slds-form-element"),
        max_sleep_time=15,
        return_soup=True,
    )
finally:
    # Always shut the browser down so re-running the cell does not pile up
    # Chrome/chromedriver processes.
    driver.quit()

str(soup)[0:100]

ChromeDriver 120.0.6099.109 (3419140ab665596f21b385ce136419fde0924272-refs/branch-heads/6099@{#1483})



'<html dir="ltr" lang="en-US"><head><title>Jupyter Workspaces | User Guide</title><meta content="defa'

In [10]:
# | hide
# Run nbdev's export step for this project (presumably writes the cells marked
# `# | export` / `# | exporti` to the module named by `default_exp` — see nbdev docs).
import nbdev

nbdev.nbdev_export()