In [10]:
# | default_exp scraper.Scraper

In [11]:
# | exporti
import re

import os
from dataclasses import dataclass
from typing import List, Callable, Any
from bs4 import BeautifulSoup


from urllib.parse import urljoin, urlparse

from concurrent.futures import ThreadPoolExecutor


from urllib.parse import urlparse
from gdoc_sync.utils import upsert_folder, convert_str_file_name

import gdoc_sync.scraper.driver as dg

In [12]:
# |exporti
from nbdev.showdoc import patch_to

In [13]:
# | hide
from selenium.webdriver.common.by import By

In [14]:
# from selenium.webdriver.common.by import By

test_urls = [
    "https://domo-support.domo.com/s/article/36004740075?language=en_US",
    "https://domo-support.domo.com/s/topic/0TO5w000000ZlOmGAK/20202023?language=en_US",  # list of articles
    "https://domo-support.domo.com/s/topic/0TO5w000000Zan7GAC/archived-feature-release-notes?language=en_US",  # list of topics
]

drivergenerator = dg.DriverGenerator(debug_prn=True)

driver = drivergenerator.get_webdriver()

test_soup = dg.get_pagesource(
    driver=driver,
    url="https://domo-support.domo.com/s/article/36004740075?language=en_US",
    search_criteria_tuple=(By.CLASS_NAME, "slds-form-element"),
    max_sleep_time=15,
    return_soup=True,
)

ChromeDriver 120.0.6099.109 (3419140ab665596f21b385ce136419fde0924272-refs/branch-heads/6099@{#1483})



# Scrape_Config

In [15]:
# | export
def extract_links(
    soup: BeautifulSoup, base_url: str = None, custom_link_extractor_fn: Callable = None
) -> [str]:
    """returns a list of urls"""

    links_ls = [
        link["href"]
        for link in soup.findAll("a")
        if (not base_url and link.has_attr("href"))
        or (
            base_url
            and link.has_attr("href")
            and link["href"].lower().startswith(base_url.lower())
        )
    ]

    if custom_link_extractor_fn:
        return custom_link_extractor_fn(links_ls, base_url)

    return links_ls

In [16]:
extract_links(test_soup)[0:5]

['36004740075?nocache=https%3A%2F%2Fdomo-support.domo.com%2Fs%2Farticle%2F36004740075%3Flanguage%3Den_US',
 'https://www.domo.com/domo-central',
 'https://www.domo.com/domo-central/community',
 'https://community-forums.domo.com/main',
 'https://community-forums.domo.com/main/categories/welcome']

In [17]:
# | export
def generate_filename_from_url(url):
    parsed_url = urlparse(url)

    return "_".join([str for str in parsed_url[2].split("/") if str])

In [18]:
[generate_filename_from_url(url) for url in test_urls]

['s_article_36004740075',
 's_topic_0TO5w000000ZlOmGAK_20202023',
 's_topic_0TO5w000000Zan7GAC_archived-feature-release-notes']

In [19]:
# |export


@dataclass
class Scrape_Config:
    """class for collating data about how to scrape a page"""

    pattern: re.Pattern  # url pattern
    link_extractor_fn: Callable = extract_links

    generate_filename_fn: Callable = generate_filename_from_url
    content_extractor_fn: Callable = None

    search_element_type: Any = None
    search_element_text: str = None

    max_sleep_time: int = 10

    def __id__(self, other):
        return self.pattern == other.pattern

    def get_search_tuple(self) -> Any:
        if not self.search_element_text and self.search_element_type:
            return None

        return (self.search_element_type, self.search_element_text)

    def is_text_match_pattern(self, text, debug_prn: bool = False):
        pattern = re.compile(self.pattern)

        match_pattern = pattern.match(text)

        if debug_prn:
            print({"text": text, "pattern": self.pattern})

        if not match_pattern:
            return False

        if debug_prn:
            print(match_pattern)

        return True

### ScrapeConfig manages content download

In [None]:
@patch_to(Scrape_Config)
def _download_content(self: Scrape_Config, folder_path, file_name, content):
    upsert_folder(folder_path)

    with open(os.path.join(folder_path, file_name), "w", encoding="utf-8") as f:
        f.write(str(content))

    return True


@patch_to(Scrape_Config)
def scrape_page(self: Scrape_Config, url, debug_prn: bool = True):
    if debug_prn:
        print(f"scraping_page {url}")

    soup = None
    try:
        soup = self.get_pagesource(url=url)

        scrape_config = self.scrape_factory.get_factory_config(url)

        upsert_folder(self.download_folder)

        folder_path = (
            os.path.join(
                self.download_folder, scrape_config.convert_url_to_filename(url)
            )
            + "/"
        )

        self._download_content(
            folder_path=folder_path, file_name="index.html", content=soup
        )

        content_scraper = scrape_config.content_scraper

        # Extract the article content
        if content_scraper:
            content = content_scraper(soup)

            self._download_content(
                folder_path=folder_path, file_name="content.html", content=content
            )
        if self.is_test:
            print("this is a test")

        if not self.is_test:
            links = get_links(soup, self.base_url)
            [self._add_url_to_visit(link) for link in links]
        return f"🎉 successfully scraped {url}"

    except Exception as e:
        return f"💀 failed to download {url} received errror{e}"

In [20]:
# | export


class Scrape_Factory_NoConfigMatch(Exception):
    def __init__(self, text):
        super().__init__(
            f"{text} has no pattern match in factory_configs, add an appropriate config or check pattern matches"
        )


@dataclass
class Scrape_Factory:
    """class handles a list of Scrape_Configs and will return the 'correct one' given a URL"""

    factory_configs: List[Scrape_Config]

    def get_factory_config(self, url, debug_prn: bool = False):
        config = next(
            (
                config
                for config in self.factory_configs
                if config.is_text_match_pattern(url, debug_prn=debug_prn)
            ),
            None,
        )

        if not config:
            raise Scrape_Factory_NoConfigMatch(text=url)

        return config

# DomoKB_ScrapeConfig

In [21]:
# | exporti
def process_link(link, base_url):
    url = link.get("href")

    if not url:
        return None

    # for relative addresses concat base_url
    if url.startswith("/s/"):
        url = urljoin(base_url, url)

    # ignore urls not orginating from base_url
    if not url.startswith(base_url):
        return None

    # remove query params
    url = urljoin(url, urlparse(url).path)

    # only keep the first 6 pieces of the URL
    url = "/".join(url.split("/")[:6])

    if url.endswith("/"):
        url = url[:-1]

    return url


def domokb_link_extractor_fn(soup, base_url):
    links = []

    for link in soup.findAll("a"):
        url = process_link(link, base_url)

        if url and url not in links:
            links.append(url)
    return links

## DomoKB_ScrapeConfig_Article

In [22]:
# | exporti


def domokb_article_content_extractor_fn(soup) -> BeautifulSoup:
    return soup.find(class_=["article-column"])


DomoKB_ScrapeConfig_Article = Scrape_Config(
    pattern=r".*/s/article/.*",
    link_extractor_fn=domokb_link_extractor_fn,
    content_extractor_fn=domokb_article_content_extractor_fn,
    search_element_type=By.CLASS_NAME,
    search_element_text="slds-form-element",
)

## DomoKB_ScrapeConfig_Topic

In [23]:
# | exporti
def domokb_topic_content_extractor_fn(soup) -> BeautifulSoup:
    return soup.find(class_=["knowledge-base"])


DomoKB_ScrapeConfig_Topic = Scrape_Config(
    pattern=r".*/s/topic/.*",
    link_extractor_fn=domokb_link_extractor_fn,
    content_extractor_fn=domokb_topic_content_extractor_fn,
    search_element_type=By.CSS_SELECTOR,
    search_element_text=f".{', .'.join(['section-list-item', 'article-list-item'] )}",
)

## DomoKB_ScrapeFactory

In [24]:
# | export
DomoKB_ScrapeFactory = Scrape_Factory(
    [DomoKB_ScrapeConfig_Article, DomoKB_ScrapeConfig_Topic]
)

In [25]:
[
    {
        "url": url,
        "config": DomoKB_ScrapeFactory.get_factory_config(url, debug_prn=False),
    }
    for url in test_urls
]

[{'url': 'https://domo-support.domo.com/s/article/36004740075?language=en_US',
  'config': Scrape_Config(pattern='.*/s/article/.*', link_extractor_fn=<function domokb_link_extractor_fn at 0x7f97038200e0>, generate_filename_fn=<function generate_filename_from_url at 0x7f970411d580>, content_extractor_fn=<function domokb_article_content_extractor_fn at 0x7f970411ff60>, search_element_type='class name', search_element_text='slds-form-element', max_sleep_time=10)},
 {'url': 'https://domo-support.domo.com/s/topic/0TO5w000000ZlOmGAK/20202023?language=en_US',
  'config': Scrape_Config(pattern='.*/s/topic/.*', link_extractor_fn=<function domokb_link_extractor_fn at 0x7f97038200e0>, generate_filename_fn=<function generate_filename_from_url at 0x7f970411d580>, content_extractor_fn=<function domokb_topic_content_extractor_fn at 0x7f9703820220>, search_element_type='css selector', search_element_text='.section-list-item, .article-list-item', max_sleep_time=10)},
 {'url': 'https://domo-support.domo

# Scrape_Crawler

In [26]:
# | export


class Scrape_Crawler:
    """threadpool manager for crawling through a list of urls"""

    executor: ThreadPoolExecutor
    scrape_factory: Scrape_Factory
    base_url: str

    driver_generator: dg.DriverGenerator = None

    download_folder: str

    is_test: bool
    visited_urls: set = set()
    urls_to_visit: set = set()

    def __init__(
        self,
        driver_path,
        scrape_factory: Scrape_Factory,
        base_url: str,
        is_test: bool = False,
        max_workers=5,
        download_folder: str = "./SCRAPE/",
    ):
        self.is_test = is_test
        self.base_url = base_url
        self.scrape_factory = scrape_factory
        self.executor = ThreadPoolExecutor(max_workers=max_workers)
        self.driver_generator = dg.DriverGenerator(driver_path=driver_path)
        self.download_folder = download_folder

In [27]:
wbs = Scrape_Crawler(
    driver_path="/usr//bin/chromedriver",
    scrape_factory=DomoKB_ScrapeFactory,
    base_url="https://domo-support.domo.com",
)
wbs.__dict__

{'is_test': False,
 'base_url': 'https://domo-support.domo.com',
 'scrape_factory': Scrape_Factory(factory_configs=[Scrape_Config(pattern='.*/s/article/.*', link_extractor_fn=<function domokb_link_extractor_fn at 0x7f97038200e0>, generate_filename_fn=<function generate_filename_from_url at 0x7f970411d580>, content_extractor_fn=<function domokb_article_content_extractor_fn at 0x7f970411ff60>, search_element_type='class name', search_element_text='slds-form-element', max_sleep_time=10), Scrape_Config(pattern='.*/s/topic/.*', link_extractor_fn=<function domokb_link_extractor_fn at 0x7f97038200e0>, generate_filename_fn=<function generate_filename_from_url at 0x7f970411d580>, content_extractor_fn=<function domokb_topic_content_extractor_fn at 0x7f9703820220>, search_element_type='css selector', search_element_text='.section-list-item, .article-list-item', max_sleep_time=10)]),
 'executor': <concurrent.futures.thread.ThreadPoolExecutor at 0x7f970380d950>,
 'driver_generator': <gdoc_sync.scra

In [28]:
# |export


@patch_to(Scrape_Crawler)
def get_pagesource(self: Scrape_Crawler, url: str, debug_prn: bool = False):
    # Find the first pattern that matches the URL and get the corresponding attribute

    scrape_config = self.scrape_factory.get_factory_config(url)

    assert scrape_config

    driver = self.driver_generator.get_webdriver()

    search_criteria_tuple = scrape_config.get_search_tuple()
    max_sleep_time = scrape_config.max_sleep_time

    if debug_prn:
        print(
            {
                "url": url,
                "max_sleep_time": max_sleep_time,
                "search_criteria": search_criteria_tuple,
            }
        )

    pagesource = dg.get_pagesource(
        url=url,
        search_criteria_tuple=search_criteria_tuple,
        driver=driver,
        max_sleep_time=max_sleep_time,
    )

    if not pagesource:
        raise Exception(f"unable to retrieve source {url}")

    return pagesource

In [29]:
test_url = "https://domo-support.domo.com/s/article/36004740075?language=en_US"
test_url = "https://domo-support.domo.com/s/topic/0TO5w000000ZamzGAC/transforming-data-in-domo?language=en_US"

# generate a chrome webdriver for scraping the webpage
wbs = Scrape_Crawler(
    driver_path="/usr//bin/chromedriver",
    scrape_factory=DomoKB_ScrapeFactory,
    base_url="https://domo-support.domo.com",
)

soup = wbs.get_pagesource(url=test_url)

str(soup)[0:100]

'<html dir="ltr" lang="en-US"><head><title>Transforming Data In Domo</title><meta content="default-sr'

## Scrape_Crawler manages URls visited

In [30]:
# | export


@patch_to(Scrape_Crawler)
def _add_url_to_visit(self: Scrape_Crawler, url):
    if url not in self.visited_urls:
        print(f"adding {url} to to_vist list")

        self.urls_to_visit.add(url)




NameError: name 'WebScraper' is not defined

In [None]:
test_url = "https://domo-support.domo.com/s/article/36004740075?language=en_US"
test_url = "https://domo-support.domo.com/s/topic/0TO5w000000ZamzGAC/transforming-data-in-domo?language=en_US"

# generate a chrome webdriver for scraping the webpage
wbs = WebScraper(
    driver_path="/usr//bin/chromedriver",
    scrape_factory=Scrape_Factory(factory_configs),
    base_url="https://domo-support.domo.com",
)

wbs.scrape_page(url=test_url)

In [None]:
# | export


@patch_to(WebScraper)
def quit(self: WebScraper):
    self.executor.shutdown(wait=True)
    return f"Done scraping {len(self.visited_urls)} urls"


@patch_to(WebScraper)
def scrape_next_page(self: WebScraper):
    while self.urls_to_visit:
        url = self.urls_to_visit.pop()
        self.visited_urls.add(url)
        future = self.executor.submit(self.scrape_page, url)
        try:
            result = future.result()
        except Exception as e:
            print(e)

    return self.quit()

In [None]:
test_url = "https://domo-support.domo.com/s/article/36004740075?language=en_US"
test_url = "https://domo-support.domo.com/s/topic/0TO5w000000ZanUGAS/dataflow-management?language=en_US"
test_url = "https://domo-support.domo.com/s/topic/0TO5w000000ZamzGAC/transforming-data-in-domo?language=en_US"

wbs = WebScraper(
    driver_path="/usr//bin/chromedriver",
    scrape_factory=Scrape_Factory(factory_configs),
    base_url="https://domo-support.domo.com",
)

wbs._add_url_to_visit(
    test_url,
)

wbs.scrape_next_page()

In [None]:
# | hide
# import nbdev

# nbdev.nbdev_export()