In [1]:
# use with local installs that don't have nbdev
import sys

sys.path.insert(0, "../../")

In [2]:
from selenium.webdriver.common.by import By
import datacrew.crawler.crawler as dcc
import datacrew.crawler.article as dca


URL = "https://domo-support.domo.com/s/article/360047400753?language=en_US"
URL = "https://domo-support.domo.com/s/article/360043429913"
BASE_URL = "https://domo-support.domo.com"
IMG_BASE_URL = "https://domo-support.domo.com/servlet/rtaImage"

driver = dcc.driversetup(is_headless=False)

test_article = dca.Article_KB(url=URL, driver=driver, base_url=BASE_URL)

💤 loading https://domo-support.domo.com/s/article/360043429913 💤
Page https://domo-support.domo.com/s/article/360043429913 is loaded within 10 seconds.


# Utility Functions

## string manipulation


In [3]:
import re

def convert_to_snake(text_str):
    """converts 'snake_case_str' to 'snakeCaseStr'"""

    return text_str.replace(" ", "_").lower()


def clean_url_name(path_name):
    valid_chars = r"[^a-zA-Z0-9_]"

    return re.sub(valid_chars, "", path_name)

## process html files


In [4]:
import requests


def download_img(image_url, image_path, debug_prn: bool = False):

    img_data = requests.get(image_url).content

    with open(image_path, "wb") as handler:
        if debug_prn:
            print(f"downloading {image_url} to {image_path}")
        handler.write(img_data)


def get_images(
    article: dca.Article,
    output_path: str,
    test_base_url: str = None,
    debug_prn: bool = False,
):

    # process image soup
    image_ls = [
        {
            "url": f"{BASE_URL if item.get('src').startswith('/') else ''}{item.get('src')}",
            "relative_url": item.get("src"),
            "name": item.get("alt"),
        }
        for item in article.soup.find_all("img")
    ]

    if test_base_url:
        image_ls = [img for img in image_ls if img.get("url").startswith(test_base_url)]

    # download images
    for img in image_ls:

        img_name = img.get("name")

        if not img_name:
            continue

        img_url = img.get("url")
        img_path = f"{output_path}/{img_name}"
        img_rel_path = img.get("relative_url")

        if debug_prn:
            print(
                f"downloading {img_url} to {img_path}.  replacing article with {img_rel_path} with {img_name}"
            )

        download_img(image_url=img_url, image_path=img_path)

        article.md_str = article.md_str.replace(img_rel_path, img_name)

    return image_ls


test_base_url = "https://domo-support.domo.com/servlet/rtaImage"
test_output_path = "../../raw_kb/adding_a_beast_mode_calculation_to_your_chart"

get_images(
    article=test_article,
    test_base_url=test_base_url,
    output_path=test_output_path,
    debug_prn=True,
)

downloading https://domo-support.domo.com/servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOEH to ../../raw_kb/adding_a_beast_mode_calculation_to_your_chart/Beast_Mode_Dialog.jpg.  replacing article with /servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOEH with Beast_Mode_Dialog.jpg
downloading https://domo-support.domo.com/servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOED to ../../raw_kb/adding_a_beast_mode_calculation_to_your_chart/Beast_Mode_Modal.png.  replacing article with /servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOED with Beast_Mode_Modal.png


[{'url': 'https://domo-support.domo.com/servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOEH',
  'relative_url': '/servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOEH',
  'name': 'Beast_Mode_Dialog.jpg'},
 {'url': 'https://domo-support.domo.com/servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOEE',
  'relative_url': '/servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOEE',
  'name': ''},
 {'url': 'https://domo-support.domo.com/servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOED',
  'relative_url': '/servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOED',
  'name': 'Beast_Mode_Modal.png'},
 {'url': 'https://domo-support.domo.com/servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOEE',
  'relative_url': '/servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOEE',
  'name': 

## handle markdown


In [5]:
from tempfile import mkstemp
from os import fdopen, remove
import textwrap
from shutil import move, copymode


def dedent_frontmatter(file_path):
    # Create temp file
    fh, abs_path = mkstemp()
    with fdopen(fh, "w") as new_file:
        with open(file_path) as old_file:
            count_frontmatter = 0
            for line in old_file:
                if count_frontmatter < 2:
                    if "---" in line:
                        count_frontmatter += 1
                    new_file.write(textwrap.dedent(line))
                else:
                    new_file.write(line)
    # Copy the file permissions from the old file to the new file
    copymode(file_path, abs_path)
    # Remove original file
    remove(file_path)
    # Move new file
    move(abs_path, file_path)


def add_frontmatter(front_matter, file_path: str):
    with open(file_path, "r+", encoding="utf-8") as md_file:
        file_data = md_file.read()  # Save all the file's content
        md_file.seek(0, 0)  # Place file pointer at the beginning
        md_file.write(front_matter)
        md_file.write("\n" + file_data)

    # dedent_frontmatter(file_path)

In [6]:
import re

def article_cleansing(article: dca.Article):
    compiled = re.compile(re.escape("youtube-nocookie.com"), re.IGNORECASE)
    article.md_str = compiled.sub("youtube.com", article.md_str)

In [21]:
from mdutils.mdutils import MdUtils


def output_md(article: dca.Article, output_index: str, debug_prn: bool = False):
    
    md_file = MdUtils(file_name=output_index)

    md_file.write(article.md_str)

    md_file.create_md_file()

    frontmatter_obj = f"""---
title : {article.title}
categories: { [link for link in article.url_ls if '/s/topic/' in link]}
date: {str(article.last_updated)}

url : {article.url}
linked_kbs :  { list(set([ md_file.new_inline_link(link) for link in article.url_ls]))}
article_id : {article.article_id}
views : {article.views}
created_date : {str(article.created)}
last updated : {str(article.last_updated)}
---"""
    if debug_prn:
        print(f"front_matter {output_index}")

    add_frontmatter(front_matter=frontmatter_obj, file_path=f"{output_index}.md")

In [14]:
def output_qmd(output_index):
    qmd_path = f"{output_index}.qmd"

    if os.path.exists(qmd_path):
        os.remove(qmd_path)

    os.rename(f"{output_index}.md", qmd_path)


In [9]:
def output_html(output_html, soup):
    with open(output_html, 'w') as f:
        f.write(str(soup))

In [22]:
import os


def output_article(
    article: str,
    output_folder: str = "../../raw_kb",
    debug_prn: bool = False,
):
    article_title = article.title

    output_path = os.path.join(
        output_folder, clean_url_name(convert_to_snake(article_title))
    )

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    if debug_prn:
        print(f"outputing '{article_title}' to {output_path}")

    get_images(
        article=article,
        test_base_url=IMG_BASE_URL,
        output_path=output_path,
        debug_prn=debug_prn,
    )

    article_cleansing(article = article)

    output_index = f"{output_path}\index"

    output_html(output_html = f"{output_path}/doc.html", soup = article.soup)


    output_md(article=article, output_index=output_index, debug_prn=debug_prn)

    output_qmd(output_index=output_index)

    return output_path


test_article.get_images()
output_article(article=test_article, debug_prn=True)

outputing 'Adding a Beast Mode Calculation to Your Chart' to ../../raw_kb\adding_a_beast_mode_calculation_to_your_chart
downloading https://domo-support.domo.com/servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOEH to ../../raw_kb\adding_a_beast_mode_calculation_to_your_chart/Beast_Mode_Dialog.jpg.  replacing article with /servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOEH with Beast_Mode_Dialog.jpg
downloading https://domo-support.domo.com/servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOED to ../../raw_kb\adding_a_beast_mode_calculation_to_your_chart/Beast_Mode_Modal.png.  replacing article with /servlet/rtaImage?eid=ka05w000001248e&feoid=00N5w00000Ri7BU&refid=0EM5w000005vOED with Beast_Mode_Modal.png
front_matter ../../raw_kb\adding_a_beast_mode_calculation_to_your_chart\index


'../../raw_kb\\adding_a_beast_mode_calculation_to_your_chart'

In [11]:
import logging
import selenium.webdriver

logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s", level=logging.INFO)


class Crawler:
    base_url: str
    output_folder: str
    urls_visited_ls: list[str]
    urls_to_vist_ls: list[str]

    driver: selenium.webdriver

    def __init__(
        self,
        urls_to_visit_ls: list[str] = None,
        base_url=None,
        output_folder="../../raw_kb/",
    ):

        self.base_url = base_url
        self.output_folder = output_folder
        self.urls_visited_ls = []
        self.urls_to_visit_ls = urls_to_visit_ls
        self.article_ls = []
        self.driver = dcc.driversetup(is_headless=False)

    def add_url_to_visit(self, url, debug_prn: bool = False):
        if url not in self.urls_visited_ls and url not in self.urls_to_visit_ls:
            if debug_prn:
                print(f"adding url to list - {url}")

            self.urls_to_visit_ls.append(url)

    def crawl(self, url, debug_prn: bool = False):
        if debug_prn:
            print(f"starting crawl - {url}")

        article = dca.Article_KB(url=url, base_url=self.base_url, driver=driver)

        for url in article.url_ls:
            self.add_url_to_visit(url=url, debug_prn=debug_prn)

        if article.is_success:
            output_md(
                article=article, output_folder=self.output_folder, debug_prn=debug_prn
            )

    def run(self, debug_prn: bool = False):
        while self.urls_to_visit_ls:
            url = self.urls_to_visit_ls.pop(0)

            logging.info(f"Crawling: {url}")

            try:
                self.crawl(url, debug_prn)
            except Exception:
                logging.exception(f"Failed to crawl: {url}")
            finally:
                self.urls_visited_ls.append(url)

        print("done")
        return self

In [12]:
crawler = Crawler(
    urls_to_visit_ls=[URL], base_url=BASE_URL, output_folder="../../raw_kb"
)

crawler.run(debug_prn=True)

2023-04-09 20:18:09,011 INFO:Crawling: https://domo-support.domo.com/s/article/360043429913


starting crawl - https://domo-support.domo.com/s/article/360043429913
💤 loading https://domo-support.domo.com/s/article/360043429913 💤


2023-04-09 20:18:12,131 ERROR:Failed to crawl: https://domo-support.domo.com/s/article/360043429913
Traceback (most recent call last):
  File "C:\Users\jwilson1\AppData\Local\Temp\1\ipykernel_12464\4246888015.py", line 57, in run
    self.crawl(url, debug_prn)
  File "C:\Users\jwilson1\AppData\Local\Temp\1\ipykernel_12464\4246888015.py", line 46, in crawl
    output_md(
TypeError: output_md() got an unexpected keyword argument 'output_folder'
2023-04-09 20:18:12,132 INFO:Crawling: https://domo-support.domo.com/s/knowledge-base


Page https://domo-support.domo.com/s/article/360043429913 is loaded within 10 seconds.
adding url to list - https://domo-support.domo.com/s/knowledge-base
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZamwGAC/release-notes
adding url to list - https://domo-support.domo.com/s/knowledge-base/
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZamoGAC
adding url to list - https://domo-support.domo.com/s/topic/0TO5w000000ZanAGAS
adding url to list - https://domo-support.domo.com/s/article/360042924034
adding url to list - https://domo-support.domo.com/s/article/360043429933
adding url to list - https://domo-support.domo.com/s/article/360043429953
adding url to list - https://domo-support.domo.com/s/article/360042924094
adding url to list - https://domo-support.domo.com/s/article/360043429993
adding url to list - https://domo-support.domo.com/s/article/360043428753
adding url to list - https://domo-support.domo.com/s/article/360043429913
adding url

2023-04-09 20:18:28,737 INFO:Crawling: https://domo-support.domo.com/s/topic/0TO5w000000ZamwGAC/release-notes


Message: 
Stacktrace:
Backtrace:
	GetHandleVerifier [0x002FDCE3+50899]
	(No symbol) [0x0028E111]
	(No symbol) [0x00195588]
	(No symbol) [0x001C08F9]
	(No symbol) [0x001C0AFB]
	(No symbol) [0x001EF902]
	(No symbol) [0x001DB944]
	(No symbol) [0x001EE01C]
	(No symbol) [0x001DB6F6]
	(No symbol) [0x001B7708]
	(No symbol) [0x001B886D]
	GetHandleVerifier [0x00563EAE+2566302]
	GetHandleVerifier [0x005992B1+2784417]
	GetHandleVerifier [0x0059327C+2759788]
	GetHandleVerifier [0x00395740+672048]
	(No symbol) [0x00298872]
	(No symbol) [0x002941C8]
	(No symbol) [0x002942AB]
	(No symbol) [0x002871B7]
	BaseThreadInitThunk [0x767100F9+25]
	RtlGetAppContainerNamedObjectPath [0x77097BBE+286]
	RtlGetAppContainerNamedObjectPath [0x77097B8E+238]
	(No symbol) [0x00000000]

Timeout Exception: Page https://domo-support.domo.com/s/knowledge-base did not load within 10 seconds.
search term slds-form-element does not exist in https://domo-support.domo.com/s/knowledge-base
adding url to list - https://domo-suppor

2023-04-09 20:18:45,099 INFO:Crawling: https://domo-support.domo.com/s/knowledge-base/


Message: 
Stacktrace:
Backtrace:
	GetHandleVerifier [0x002FDCE3+50899]
	(No symbol) [0x0028E111]
	(No symbol) [0x00195588]
	(No symbol) [0x001C08F9]
	(No symbol) [0x001C0AFB]
	(No symbol) [0x001EF902]
	(No symbol) [0x001DB944]
	(No symbol) [0x001EE01C]
	(No symbol) [0x001DB6F6]
	(No symbol) [0x001B7708]
	(No symbol) [0x001B886D]
	GetHandleVerifier [0x00563EAE+2566302]
	GetHandleVerifier [0x005992B1+2784417]
	GetHandleVerifier [0x0059327C+2759788]
	GetHandleVerifier [0x00395740+672048]
	(No symbol) [0x00298872]
	(No symbol) [0x002941C8]
	(No symbol) [0x002942AB]
	(No symbol) [0x002871B7]
	BaseThreadInitThunk [0x767100F9+25]
	RtlGetAppContainerNamedObjectPath [0x77097BBE+286]
	RtlGetAppContainerNamedObjectPath [0x77097B8E+238]
	(No symbol) [0x00000000]

Timeout Exception: Page https://domo-support.domo.com/s/topic/0TO5w000000ZamwGAC/release-notes did not load within 10 seconds.
search term slds-form-element does not exist in https://domo-support.domo.com/s/topic/0TO5w000000ZamwGAC/releas

KeyboardInterrupt: 