In [None]:
# pip install mdutils


In [None]:
# pip install markdownify

In [None]:
# Seed article: used below both for the Article_KB smoke test and as the crawler's starting URL.
URL = "https://domo-support.domo.com/s/article/360047400753?language=en_US"
# Site prefix: Article.get_linked_urls resolves root-relative links against it and
# keeps only links that start with it.
BASE_URL = "https://domo-support.domo.com/"

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup


def driversetup():
    """Create and return a Chrome WebDriver started with headless options."""
    chrome_opts = webdriver.ChromeOptions()

    # Run Selenium in headless mode; --no-sandbox is passed as well.
    for flag in ("--headless", "--no-sandbox"):
        chrome_opts.add_argument(flag)

    return webdriver.Chrome(options=chrome_opts)


# Notebook-wide WebDriver instance; later cells pass it to Article_KB.
driver = driversetup()

In [None]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import selenium.webdriver.support.expected_conditions as EC


def pagesource(url, driver=None, timeout: float = 15):
    """Load ``url`` in a browser and return the rendered page as BeautifulSoup.

    The function waits until an element with class ``slds-form-element`` is
    present. If the wait fails, the outcome is printed and whatever page source
    the browser currently holds is still parsed and returned (best effort).

    Args:
        url: Address to load.
        driver: Existing WebDriver to reuse. When omitted, a temporary driver is
            created with ``driversetup()`` and shut down before returning.
        timeout: Maximum number of seconds to wait for the marker element.

    Returns:
        BeautifulSoup built from ``driver.page_source``.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this notebook.
    from selenium.common.exceptions import TimeoutException

    # Only a driver created here is owned (and therefore shut down) by this call.
    owns_driver = driver is None
    if owns_driver:
        driver = driversetup()

    try:
        driver.get(url)

        try:
            WebDriverWait(driver, timeout=timeout, poll_frequency=1).until(
                EC.presence_of_element_located((By.CLASS_NAME, "slds-form-element"))
            )
            print(f"Page {url} is loaded within {timeout} seconds.")
        except TimeoutException:
            print(
                f"Timeout Exception: Page {url} did not load within {timeout} seconds."
            )
        except Exception as exc:
            # Stay best-effort as before, but do not mislabel other failures as timeouts.
            print(f"Error while waiting for page {url}: {exc!r}")

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        return BeautifulSoup(driver.page_source, "html.parser")
    finally:
        if owns_driver:
            # quit() ends the whole browser session; close() only closes the window.
            driver.quit()


# test_page_source = pagesource(url = URL, driver = driver)
# test_page_source

In [None]:
from dataclasses import dataclass, field
import urllib.parse as url_parse
import markdown as md


@dataclass
class Article:
    """A parsed page together with the same-site links found in it.

    Attributes:
        soup: Parsed page; only ``find_all("a")`` is used by this class.
        base_url: Site prefix. Root-relative links are resolved against it and
            only links starting with it are kept.
        linked_url_ls: Set by ``get_linked_urls`` (called from ``__post_init__``);
            not a dataclass field.
    """

    soup: BeautifulSoup = field(repr=False)
    base_url: str
    # linked_url_ls : list[str] = field( default_factory= list)

    def __post_init__(self):
        self.get_linked_urls()

    @classmethod
    def get_from_url(cls, url: str, driver, base_url: str):
        """Fetch ``url`` with ``pagesource`` and wrap the resulting soup."""
        soup = pagesource(driver=driver, url=url)

        return cls(soup=soup, base_url=base_url)

    @staticmethod
    def md_soup(soup, **options):
        """Convert a soup node to Markdown text with markdownify.

        ``options`` are forwarded to ``MarkdownConverter``.
        """
        # Import the converter explicitly: the notebook binds the alias ``md``
        # to two different packages (``markdown`` and ``markdownify``) in
        # different cells, and only markdownify provides MarkdownConverter.
        from markdownify import MarkdownConverter

        return MarkdownConverter(**options).convert_soup(soup)

    def get_linked_urls(self, is_remove_query_string_parameters: bool = True):
        """Collect unique links under ``base_url`` from the page's ``<a>`` tags.

        Args:
            is_remove_query_string_parameters: When true, params, query string
                and fragment are dropped before filtering and de-duplication.

        Returns:
            The list stored on ``self.linked_url_ls``, in document order.
        """
        linked_urls = []
        seen = set()

        for anchor in self.soup.find_all("a"):
            url = anchor.get("href")

            if not url:
                continue

            if url.startswith("/"):
                url = url_parse.urljoin(self.base_url, url)

            if is_remove_query_string_parameters:
                # Blank the components directly. Re-joining the URL with its own
                # path leaves the query in place when the path is empty and
                # treats a path starting with "//" as a host name.
                url = (
                    url_parse.urlparse(url)
                    ._replace(params="", query="", fragment="")
                    .geturl()
                )

            if url.startswith(self.base_url) and url not in seen:
                seen.add(url)
                linked_urls.append(url)

        self.linked_url_ls = linked_urls
        return self.linked_url_ls


# test_art = Article.get_from_url(url = URL, driver = driver, base_url = BASE_URL)
# test_art

In [None]:
import re
import datetime as dt
import markdownify as md
from dataclasses import dataclass, field
from dateutil import parser

# @dataclass
# class Article:
#   soup: BeautifulSoup = field(repr = False)
#   base_url:str

class ArticleKB_GetSoupError(Exception):
    """Raised when no page soup could be retrieved for a URL.

    Must derive from ``Exception``: a plain class cannot be raised, and
    ``object.__init__`` rejects the message argument.
    """

    def __init__(self, url):
        super().__init__(f"failed to retrieve soup for {url}")

class ArticleKB_ProcessSoupError(Exception):
    """Signals that an expected search term was not found in a page's soup."""

    def __init__(self, url, search_term):
        message = f"search term {search_term} does not exist in {url}"
        super().__init__(message)



@dataclass()
class Article_KB:
    """One knowledge-base article: fetched, parsed and converted to Markdown.

    Constructing an instance loads ``url`` through ``pagesource`` and then tries
    to extract the article fields. ``is_success`` is True only when every field
    was extracted; otherwise the reason is printed and the link list
    (``kb_url_ls``) is still available.
    """

    url: str
    base_url: str
    driver: webdriver  # WebDriver instance handed to pagesource

    is_success: bool = False
    article: Article = field(default=None, repr=False)
    # Mapping of form label text -> matching "control" element (built in process_kb_soup).
    kb_soup: dict = field(default=None, repr=False)
    kb_url_ls: list[str] = field(default=None)

    title: str = None
    md_str: str = field(default=None, repr=False)
    article_id: str = None
    # NOTE(review): stored as the Markdown text of the field, not converted to int.
    views: int = None
    created: dt.date = None
    last_updated: dt.date = None

    def __post_init__(self):
        soup = pagesource(driver=self.driver, url=self.url)

        if not soup:
            raise ArticleKB_GetSoupError(url=self.url)

        self.article = Article(soup=soup, base_url=self.base_url)
        self.kb_url_ls = self.article.linked_url_ls

        try:
            self.process_kb_soup(soup)
            self.is_success = True
        except ArticleKB_ProcessSoupError as e:
            # Pages without the expected article fields are reported, not fatal.
            print(e)

    def _field_md(self, label: str) -> str:
        """Return the Markdown text of the form field called ``label``.

        Raises:
            ArticleKB_ProcessSoupError: if the page has no such field (or the
                field has no control element), so ``__post_init__`` can report
                it instead of failing inside the Markdown converter.
        """
        content = self.kb_soup.get(label)
        if content is None:
            raise ArticleKB_ProcessSoupError(url=self.url, search_term=label)
        return self.article.md_soup(content)

    def process_kb_soup(self, soup: BeautifulSoup):
        """Extract the article fields from the ``slds-form-element`` rows.

        Each row contributes one entry to ``self.kb_soup``: the first string of
        its ``slds-form-element__label`` mapped to its
        ``slds-form-element__control`` element. Rows without a label are skipped.

        Returns:
            The label -> element mapping (also stored on ``self.kb_soup``).

        Raises:
            ArticleKB_ProcessSoupError: if the page has no form rows or one of
                the expected fields is missing.
        """
        search_term = "slds-form-element"

        rows = soup.find_all(class_=[search_term])

        if not rows:
            raise ArticleKB_ProcessSoupError(url=self.url, search_term=search_term)

        kb_soup = {}
        for row in rows:
            label_tag = row.find(class_="slds-form-element__label")
            if label_tag is None:
                # e.g. a form row that carries no label element
                continue

            label_strings = list(label_tag.strings)
            if label_strings:
                kb_soup[label_strings[0]] = row.find(
                    class_="slds-form-element__control"
                )

        self.kb_soup = kb_soup

        self.title = self._field_md("Title")

        self.md_str = self._field_md("Article Body")
        self.article_id = self._field_md("Article Number")
        self.views = self._field_md("Article Total View Count")
        self.created = parser.parse(self._field_md("Article Created Date"))

        # NOTE(review): last_updated is read from "First Published Date" -
        # confirm this is the intended source field.
        self.last_updated = parser.parse(self._field_md("First Published Date"))

        return self.kb_soup

# Assigned but not used by the statements below (the smoke test runs against URL).
test_broken_url = 'https://domo-support.domo.com/s/knowledge-base/'
# Smoke test: constructing Article_KB fetches and parses URL in __post_init__.
test_cls = Article_KB(url=URL, base_url=BASE_URL, driver=driver)
test_cls

Page https://domo-support.domo.com/s/article/360047400753?language=en_US is loaded within 10 seconds.


Article_KB(url='https://domo-support.domo.com/s/article/360047400753?language=en_US', base_url='https://domo-support.domo.com/', driver=<selenium.webdriver.chrome.webdriver.WebDriver (session="9b374035ae2b7346d34bad18c6cb2e07")>, is_success=True, kb_url_ls=['https://domo-support.domo.com/s/knowledge-base/', 'https://domo-support.domo.com/s/', 'https://domo-support.domo.com/s/topic/0TO5w000000ZammGAC', 'https://domo-support.domo.com/s/topic/0TO5w000000ZanzGAC', 'https://domo-support.domo.com/s/article/7872485267991', 'https://domo-support.domo.com/s/article/7440921035671', 'https://domo-support.domo.com/s/article/360047400753', 'https://domo-support.domo.com/s/topic/0TO5w000000ZanzGAC/other-connection-methods', 'https://domo-support.domo.com/s/article/360043429933', 'https://domo-support.domo.com/s/article/360042925494', 'https://domo-support.domo.com/s/article/360043429953', 'https://domo-support.domo.com/s/article/360043429913', 'https://domo-support.domo.com/s/login/'], title='Jupyte

In [None]:
import mdutils
import os
from mdutils.mdutils import MdUtils


def convert_to_snake(text_str):
    """Lower-case ``text_str`` and replace every space with an underscore.

    Example: 'Jupyter Workspaces' -> 'jupyter_workspaces'. Characters other
    than spaces are kept as they are (apart from lower-casing).
    """
    lowered = text_str.lower()
    return "_".join(lowered.split(" "))

 def add_frontmatter(article, output_path: str = "kb_md", data):

    with open(output_path, 'r+', encoding='utf-8') as md_file:
        file_data = md_file.read()  # Save all the file's content
        md_file.seek(0, 0)  # Place file pointer at the beginning
        md_file.write(data)  # Write data
        md_file.write('\n' + file_data)

def output_md(
    article: Article_KB, output_folder: str = "kb_md", debug_prn: bool = False
):
    """Write ``article`` to a Markdown file inside ``output_folder``.

    The file name is derived from the article title. A metadata header (title,
    url, linked articles, id, views, dates) is written before the article body.

    Args:
        article: Parsed article; ``title`` and ``md_str`` must be populated.
        output_folder: Destination directory; created if it does not exist.
        debug_prn: When true, print the title and the target path.
    """
    # Titles can contain characters that are invalid in file names on Windows
    # (e.g. ":") or that would be read as path separators ("/", "\\").
    # Replace them and collapse the resulting underscore runs.
    file_stem = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", convert_to_snake(article.title))
    file_stem = re.sub(r"_+", "_", file_stem) or "untitled"

    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, file_stem)

    if debug_prn:
        print(f"outputting {article.title} to {output_path}")

    md_file = MdUtils(file_name=output_path)

    # NOTE(review): the header is emitted through new_line(); confirm in the
    # generated file that the opening '---' really ends up on the first line
    # if it is meant to be parsed as front matter.
    md_file.new_line('---\n')
    md_file.new_line(f"title: {article.title}")
    md_file.new_line(f"url: {article.url}")
    md_file.new_line(
        f'linked_kbs:  { [ md_file.new_inline_link(link) for link in article.kb_url_ls]}')
    md_file.new_line(f'article_id: {article.article_id}')
    md_file.new_line(f'views: {article.views}')
    md_file.new_line(f"created_date: {str(article.created)}")
    md_file.new_line(f"last updated: {str(article.last_updated)}")
    md_file.new_line('---')
    md_file.write('\n\n\n')

    md_file.write(article.md_str)

    md_file.create_md_file()

# Smoke test: write the article parsed above into the default output folder.
output_md(article = test_cls, debug_prn= True)



outputting Jupyter Workspaces to kb_md\jupyter_workspaces


In [None]:
import logging

# Root logger setup: each record is rendered as "<time> <level>:<message>",
# and records below INFO are filtered out.
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)


class Crawler:
    """Breadth-first crawler that saves every successfully parsed article.

    URLs are taken from the front of ``urls_to_visit_ls``; links discovered on
    each page are appended to it unless they are already queued or visited.
    """

    base_url: str
    output_folder: str
    urls_visited_ls: list[str]
    urls_to_visit_ls: list[str]

    driver: webdriver

    def __init__(self,
                 urls_to_visit_ls: list[str] = None,
                 base_url=None,
                 output_folder='kb_md',
                 ):
        """
        Args:
            urls_to_visit_ls: Initial queue of URLs; the list is used (and
                consumed) as given. Omitted means an empty queue.
            base_url: Site prefix handed to ``Article_KB``.
            output_folder: Directory handed to ``output_md``.
        """
        self.base_url = base_url
        self.output_folder = output_folder
        self.urls_visited_ls = []
        # A None default must still give a usable queue for ``in`` / ``append``.
        self.urls_to_visit_ls = urls_to_visit_ls if urls_to_visit_ls is not None else []
        self.article_ls = []
        self.driver = driversetup()

    def close(self):
        """Shut down the crawler's own browser session, if one is open."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None

    def add_url_to_visit(self, url, debug_prn: bool = False):
        """Queue ``url`` unless it was already visited or is already queued."""
        if url not in self.urls_visited_ls and url not in self.urls_to_visit_ls:
            if debug_prn:
                print(f'adding url to list - {url}')

            self.urls_to_visit_ls.append(url)

    def crawl(self, url, debug_prn: bool = False):
        """Fetch one page, queue its links, and save it when parsing succeeded."""
        if debug_prn:
            print(f"starting crawl - {url}")

        if self.driver is None:
            # The session was shut down by close() (e.g. after a previous run).
            self.driver = driversetup()

        # Use the crawler's own driver rather than a notebook-global one.
        article = Article_KB(url=url, base_url=self.base_url, driver=self.driver)

        for linked_url in article.kb_url_ls:
            self.add_url_to_visit(url=linked_url, debug_prn=debug_prn)

        if article.is_success:
            output_md(article=article, output_folder=self.output_folder,
                      debug_prn=debug_prn)

    def run(self, debug_prn: bool = False):
        """Crawl until the queue is empty, then release the browser.

        A failure on one URL is logged with its traceback and does not stop
        the run; the URL is still recorded as visited.

        Returns:
            The crawler itself.
        """
        try:
            while self.urls_to_visit_ls:
                url = self.urls_to_visit_ls.pop(0)

                logging.info('Crawling: %s', url)

                try:
                    self.crawl(url, debug_prn)
                except Exception:
                    logging.exception('Failed to crawl: %s', url)
                finally:
                    self.urls_visited_ls.append(url)
        finally:
            # Do not leave a headless browser process behind, even on interrupt.
            self.close()

        print('done')
        return self


In [None]:
# Start from the seed article and follow same-site links until the queue is empty.
crawler = Crawler(urls_to_visit_ls=[URL], base_url=BASE_URL, output_folder="kb_md")

# debug_prn=True also prints every queued URL and every output path.
crawler.run(debug_prn=True)

2023-02-08 06:46:50,238 INFO:Crawling: https://domo-support.domo.com/s/article/360047400753?language=en_US


starting crawl - https://domo-support.domo.com/s/article/360047400753?language=en_US


2023-02-08 06:46:56,080 INFO:Crawling: https://domo-support.domo.com/s/knowledge-base/


Page https://domo-support.domo.com/s/article/360047400753?language=en_US is loaded within 10 seconds.
adding url to list - https://domo-support.domo.com/s/knowledge-base/
adding url to list - https://domo-support.domo.com/s/
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZammGAC
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZanzGAC
adding url to list - https://domo-support.domo.com/s/article/7872485267991
adding url to list - https://domo-support.domo.com/s/article/7440921035671
adding url to list - https://domo-support.domo.com/s/article/360047400753
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZanzGAC/other-connection-methods
adding url to list - https://domo-support.domo.com/s/article/360043429933
adding url to list - https://domo-support.domo.com/s/article/360042925494
adding url to list - https://domo-support.domo.com/s/article/360043429953
adding url to list - https://domo-support.domo.com/s/article/360043429

2023-02-08 06:47:13,713 INFO:Crawling: https://domo-support.domo.com/s/


Timeout Exception: Page https://domo-support.domo.com/s/knowledge-base/ did not load within 10 seconds.
search term slds-form-element does not exist in https://domo-support.domo.com/s/knowledge-base/
adding url to list - https://domo-support.domo.com/s/knowledge-base/
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZamsGAC
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZamzGAC
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZamoGAC
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZamnGAC
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZamlGAC
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZamqGAC
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000Zan0GAC
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000Zan2GAC
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZampGAC
adding url to list - https://

2023-02-08 06:47:30,827 INFO:Crawling: https://domo-support.domo.com/s/topic/0TO5w000000ZammGAC


Timeout Exception: Page https://domo-support.domo.com/s/ did not load within 10 seconds.
search term slds-form-element does not exist in https://domo-support.domo.com/s/
adding url to list - https://domo-support.domo.com/start/free
starting crawl - https://domo-support.domo.com/s/topic/0TO5w000000ZammGAC


2023-02-08 06:47:47,269 INFO:Crawling: https://domo-support.domo.com/s/topic/0TO5w000000ZanzGAC


Timeout Exception: Page https://domo-support.domo.com/s/topic/0TO5w000000ZammGAC did not load within 10 seconds.
search term slds-form-element does not exist in https://domo-support.domo.com/s/topic/0TO5w000000ZammGAC
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZanLGAS
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZaoPGAS
starting crawl - https://domo-support.domo.com/s/topic/0TO5w000000ZanzGAC


2023-02-08 06:48:03,625 INFO:Crawling: https://domo-support.domo.com/s/article/7872485267991


Timeout Exception: Page https://domo-support.domo.com/s/topic/0TO5w000000ZanzGAC did not load within 10 seconds.
search term slds-form-element does not exist in https://domo-support.domo.com/s/topic/0TO5w000000ZanzGAC
adding url to list - https://domo-support.domo.com/s/article/360048925554
adding url to list - https://domo-support.domo.com/s/article/360043437733
adding url to list - https://domo-support.domo.com/s/article/360042932874
adding url to list - https://domo-support.domo.com/s/article/360043437513
adding url to list - https://domo-support.domo.com/s/article/360042932914
adding url to list - https://domo-support.domo.com/s/article/360042932934
adding url to list - https://domo-support.domo.com/s/article/360043437633
adding url to list - https://domo-support.domo.com/s/article/360043437653
adding url to list - https://domo-support.domo.com/s/article/360043437673
adding url to list - https://domo-support.domo.com/s/article/360043437753
adding url to list - https://domo-support.

2023-02-08 06:48:19,940 INFO:Crawling: https://domo-support.domo.com/s/article/7440921035671


Timeout Exception: Page https://domo-support.domo.com/s/article/7872485267991 did not load within 10 seconds.
search term slds-form-element does not exist in https://domo-support.domo.com/s/article/7872485267991
starting crawl - https://domo-support.domo.com/s/article/7440921035671


2023-02-08 06:48:24,246 INFO:Crawling: https://domo-support.domo.com/s/article/360047400753


Page https://domo-support.domo.com/s/article/7440921035671 is loaded within 10 seconds.
adding url to list - https://domo-support.domo.com/s/article/7440921035671
outputting Jupyter Troubleshooting Guide to kb_md\jupyter_troubleshooting_guide
starting crawl - https://domo-support.domo.com/s/article/360047400753


2023-02-08 06:48:27,785 INFO:Crawling: https://domo-support.domo.com/s/topic/0TO5w000000ZanzGAC/other-connection-methods


Page https://domo-support.domo.com/s/article/360047400753 is loaded within 10 seconds.
adding url to list - https://domo-support.domo.com/s/article/360047400753
outputting Jupyter Workspaces to kb_md\jupyter_workspaces
starting crawl - https://domo-support.domo.com/s/topic/0TO5w000000ZanzGAC/other-connection-methods


2023-02-08 06:48:44,299 INFO:Crawling: https://domo-support.domo.com/s/article/360043429933


Timeout Exception: Page https://domo-support.domo.com/s/topic/0TO5w000000ZanzGAC/other-connection-methods did not load within 10 seconds.
search term slds-form-element does not exist in https://domo-support.domo.com/s/topic/0TO5w000000ZanzGAC/other-connection-methods
starting crawl - https://domo-support.domo.com/s/article/360043429933


2023-02-08 06:48:47,522 INFO:Crawling: https://domo-support.domo.com/s/article/360042925494


Page https://domo-support.domo.com/s/article/360043429933 is loaded within 10 seconds.
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZanAGAS
adding url to list - https://domo-support.domo.com/s/article/360043429933
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZanAGAS/beast-mode
outputting Beast Mode Functions Reference Guide to kb_md\beast_mode_functions_reference_guide
starting crawl - https://domo-support.domo.com/s/article/360042925494


2023-02-08 06:48:50,688 INFO:Crawling: https://domo-support.domo.com/s/article/360043429953


Page https://domo-support.domo.com/s/article/360042925494 is loaded within 10 seconds.
adding url to list - https://domo-support.domo.com/s/article/360043430153
adding url to list - https://domo-support.domo.com/s/article/360043429633
adding url to list - https://domo-support.domo.com/s/article/360043428153
adding url to list - https://domo-support.domo.com/s/article/360042925494
outputting Sample Beast Mode Calculations: Date Transforms to kb_md\sample_beast_mode_calculations:_date_transforms
starting crawl - https://domo-support.domo.com/s/article/360043429953


2023-02-08 06:48:55,112 INFO:Crawling: https://domo-support.domo.com/s/article/360043429913


Page https://domo-support.domo.com/s/article/360043429953 is loaded within 10 seconds.
adding url to list - https://domo-support.domo.com/s/article/360043429953
outputting Date Format Specifier Characters for Beast Mode to kb_md\date_format_specifier_characters_for_beast_mode
starting crawl - https://domo-support.domo.com/s/article/360043429913


2023-02-08 06:48:58,684 INFO:Crawling: https://domo-support.domo.com/s/login/


Page https://domo-support.domo.com/s/article/360043429913 is loaded within 10 seconds.
adding url to list - https://domo-support.domo.com/s/article/360042924034
adding url to list - https://domo-support.domo.com/s/article/360042924094
adding url to list - https://domo-support.domo.com/s/article/360043429993
adding url to list - https://domo-support.domo.com/s/article/360043428753
adding url to list - https://domo-support.domo.com/s/article/360043429913
outputting Adding a Beast Mode Calculation to Your Chart to kb_md\adding_a_beast_mode_calculation_to_your_chart
starting crawl - https://domo-support.domo.com/s/login/
