In [1]:
# !pip install bs4
# !pip install lxml
# !pip install markdownify


In [2]:
# Root folder that is crawled for downloaded ".html" pages (relative path,
# resolved against the notebook's working directory).
BASE_FOLDER = "../../raw_domo_kb"

In [3]:
import os
from os import listdir
from os.path import isfile, join


def crawl_directory(folder_path):
    """Recursively collect the paths of all ``.html`` files under ``folder_path``.

    Args:
        folder_path: Directory to walk, including every subdirectory.

    Returns:
        list[str]: One entry per file whose name ends with ".html", built by
        joining the directory reported by ``os.walk`` with the file name.
    """
    return [
        os.path.join(dir_path, file_name)
        for dir_path, _subdirs, file_names in os.walk(folder_path)
        for file_name in file_names
        if file_name.endswith(".html")
    ]


# Gather every ".html" file found under BASE_FOLDER.
test_ls = crawl_directory(BASE_FOLDER)
# test_ls[0:5]
# Take the first path containing "article"; next() is called without a default,
# so this raises StopIteration when no path matches.
test_article_path = next((url for url in test_ls if "article" in url))
# test_article_path

In [4]:
import os


def test_file_exists(file_path):
    if not os.path.exists(file_path):
        raise Exception(f"{file_path} does not exist")

    return True

In [5]:
def read_file(file_name):
    """Return the full text of ``file_name``, decoded as UTF-8.

    The path is validated with ``test_file_exists`` first, so a missing file
    surfaces that helper's error instead of the one ``open`` would raise.

    Args:
        file_name: Path of the file to read.

    Returns:
        str: The complete file contents.
    """
    test_file_exists(file_name)

    with open(file_name, encoding="utf-8") as handle:
        contents = handle.read()
    return contents


# read_file(test_ls[0])[0:100]

In [6]:
from enum import Enum


class PageList_Enum(Enum):
    """Names of the list-style sections that may appear on a page.

    Each member's value is the string handed to ``soup.find(class_=...)`` when
    checking whether a page contains that list (see ``test_is_type``).
    """

    BlocksList = "blocks-list"
    TopicsList = "topics-list"
    ArticleList = "article-list"
    SectionList = "section-list"


class PageContent_Enum(Enum):
    """Identifier(s) for page content layouts.

    NOTE(review): this enum is not referenced in the visible cells; presumably
    the value is a class name to look up like ``PageList_Enum`` - confirm.
    """

    Article = "selfServiceArticleLayout"


def test_is_type(soup, page_type):
    class_text = page_type.value
    class_name = page_type.name

    if not soup.find(class_=[class_text]):
        return False
    return True

In [18]:
from bs4 import BeautifulSoup


def extract_content_soup(path_name, debug_prn: bool = False):  # returns content_soup
    """Parse the HTML file at ``path_name`` and return its "content" element.

    Args:
        path_name: Path of the HTML file; read through ``read_file``.
        debug_prn: Accepted for call compatibility; not used in this function.

    Returns:
        The first element found by ``find(class_=["content"])`` in the document
        parsed with the "lxml" parser.

    Raises:
        Exception: If no element with class "content" is found.
    """
    raw_html = read_file(path_name)
    parsed = BeautifulSoup(raw_html, features="lxml")

    content_soup = parsed.find(class_=["content"])
    if content_soup:
        return content_soup

    raise Exception(f"content not available in {path_name}.  Check the download.")


# Point the sample at one specific category page under BASE_FOLDER
# (this assignment replaces any earlier value of test_article_path).
test_article_path = os.path.join(
    BASE_FOLDER, 'category/0TO5w000000ZamlGAC/index.html')
# Parse the sample page; debug_prn is accepted by extract_content_soup but its
# body does not read it.
test_article_soup = extract_content_soup(test_article_path, debug_prn= True)
# test_soups = [extract_content_soup(article_path ) for article_path in test_ls[0:40]]
# print(test_article_soup.prettify())

In [8]:
import re
from markdownify import markdownify as md


def convert_to_snake(text_str):
    """Lower-case ``text_str`` and replace every space with an underscore.

    Example: ``"Article Body"`` becomes ``"article_body"``. Only the space
    character is replaced; other whitespace and punctuation are left untouched.
    """
    lowered = text_str.lower()
    return lowered.replace(" ", "_")


def clean_url_name(path_name):
    """Remove every character that is not an ASCII letter, digit, or underscore.

    Args:
        path_name: Text to sanitise.

    Returns:
        str: ``path_name`` with all other characters deleted (not replaced).
    """
    disallowed = re.compile(r"[^a-zA-Z0-9_]")
    return disallowed.sub("", path_name)


def process_html_str(html):
    """Convert an HTML fragment to Markdown and normalise its whitespace.

    Steps:
      1. Convert with ``markdownify`` (``md``), stripping ``a`` and ``img`` tags.
      2. Right-strip every line and terminate each with a newline.
      3. Collapse runs of blank lines into a single blank line.

    Args:
        html: HTML markup as a string.

    Returns:
        str: The cleaned Markdown text.
    """
    markdown = md(html, strip=["a", "img"])

    # After rstrip, whitespace-only lines become empty lines.
    markdown = "".join(line.rstrip() + "\n" for line in markdown.splitlines())

    # Three or more consecutive newlines mean more than one blank line; squeeze
    # them to exactly one. The earlier pattern r"(\n\n.?)+" also matched (and
    # therefore deleted) the first character following a blank line, and it did
    # not collapse three consecutive newlines at all.
    return re.sub(r"\n{3,}", "\n\n", markdown)


def extract_article(soup, return_raw=False):
    """Collect the labelled fields of every "slds-form" on the page.

    For each "slds-form-element" inside each form, the stripped label text is
    turned into a key via ``convert_to_snake`` and ``clean_url_name``, and the
    element's control markup is converted with ``process_html_str``.

    Args:
        soup: Parsed page (or sub-tree) exposing ``find_all``.
        return_raw: Accepted for signature parity with the other extractors;
            not used in this function.

    Returns:
        dict | None: Mapping of cleaned label to processed control text (later
        duplicates overwrite earlier ones). ``None`` if any error occurs; the
        error is printed rather than raised.
    """
    try:
        fields = {}

        for form in soup.find_all(class_="slds-form"):
            for element in form.find_all(class_="slds-form-element"):
                label_text = element.find(class_="slds-form-element__label").text.strip()
                key = clean_url_name(convert_to_snake(label_text))

                control_html = str(element.find(class_="slds-form-element__control"))
                fields[key] = process_html_str(control_html)

        return fields

    except Exception as e:
        print(e)
        return None


from pprint import pprint

# pprint(extract_article(test_article_soup))

In [9]:
def extract_title(soup, return_raw=False):
    """Extract the page title.

    Lookup order:
      1. The first ``h1`` following the element with class "page-header".
      2. Otherwise the element with class "article-head".
      3. Otherwise, if the page has a "homePage_BrowseResources" element,
         the literal title "Home".

    Args:
        soup: Parsed page (or sub-tree) exposing ``find``.
        return_raw: When true, return the matched element instead of its text
            (the "Home" fallback is still returned as a string).

    Returns:
        The stripped title text, the raw element when ``return_raw`` is true,
        or ``None`` when no title can be found (mirroring
        ``extract_description`` instead of failing on ``None.text``).
    """
    # Look the header up once and reuse the result.
    header = soup.find(class_="page-header")
    title_soup = header.find_next("h1") if header else None

    if not title_soup:
        title_soup = soup.find(class_="article-head")

    if not title_soup and soup.find(class_="homePage_BrowseResources"):
        return "Home"

    if return_raw:
        return title_soup

    if not title_soup:
        return None

    return title_soup.text.strip()

# Try the title extractor on the sample page parsed earlier (test_article_soup).
extract_title(test_article_soup)

'Executing DataSets'

In [19]:
def extract_description(soup, return_raw: bool = False):
    """Extract the text of the "page-header-description" element.

    Args:
        soup: Parsed page (or sub-tree) exposing ``find``.
        return_raw: When true, return the matched element (or ``None``) as-is.

    Returns:
        The stripped description text; ``None`` when the element is missing or
        its stripped text is empty. With ``return_raw`` the element itself.
    """
    description_soup = soup.find(class_="page-header-description")

    if return_raw:
        return description_soup

    if not description_soup:
        return None

    # An empty string after stripping is reported as None as well.
    return description_soup.text.strip() or None

# Try the description extractor on the sample page parsed earlier (test_article_soup).
extract_description(test_article_soup)


'Learn how to manage users and groups, control access to content, configure security settings, and more.'

In [11]:
def extract_page_list(soup, list_type, return_raw=False):
    """Extract the entries of the list element whose class is ``list_type``.

    Args:
        soup: Parsed page (or sub-tree) exposing ``find``.
        list_type: Class name of the list element; also used as the key of the
            returned dictionary.
        return_raw: When true, return the matched list element itself.

    Returns:
        ``None`` when no such element exists; the raw element when
        ``return_raw`` is true; otherwise ``{list_type: [...]}`` with one
        ``{"text", "url"}`` dict per child whose type is named "Tag". "text" is
        the child's stripped text passed through ``convert_to_snake`` and
        ``clean_url_name``; "url" is the ``href`` of the child's ``a`` element,
        or ``None`` when it has none.
    """
    list_soup = soup.find(class_=list_type)

    if not list_soup:
        return None

    if return_raw:
        return list_soup

    entries = []
    for item in list_soup:
        # Only children whose type is named "Tag" are kept; everything else
        # yielded by iterating the list element is skipped.
        if type(item).__name__ != "Tag":
            continue

        entries.append(
            {
                "text": clean_url_name(convert_to_snake(item.text.strip())),
                "url": item.a["href"] if item.a else None,
            }
        )

    return {list_type: entries}

In [21]:
import os
import json


def process_page(file_path, debug_prn: bool = False):
    """Parse one downloaded HTML page and write its extracted data next to it.

    The page's title, description, breadcrumbs, any lists named in
    ``PageList_Enum`` and any article form fields are gathered into a dict.
    That dict is written as ``process.json`` in the page's directory; when the
    content has a non-empty "article_body" it is also written as ``index.md``.

    Args:
        file_path: Path of the HTML file to process.
        debug_prn: Accepted for call compatibility; not used in this function.

    Returns:
        dict: The collected page data, with keys "file_path", "content",
        "title", "title_clean", "description" and, when found, "breadcrumbs".
    """
    print(f"\n{file_path}")

    soup = extract_content_soup(file_path)

    # Extract the title once and derive the cleaned form from that value.
    title = extract_title(soup)
    # A missing title is recorded as None instead of failing in the cleaners.
    title_clean = clean_url_name(convert_to_snake(title)) if title is not None else None

    page_data = {
        "file_path": file_path,
        "content": {},
        "title": title,
        "title_clean": title_clean,
        "description": extract_description(soup),
    }

    breadcrumbs = extract_page_list(soup, "breadcrumbs")
    if breadcrumbs:
        page_data.update(breadcrumbs)

    # Only look up the list types that are actually present on this page.
    page_lists = [
        page_list for page_list in PageList_Enum if test_is_type(soup, page_list)
    ]

    for page_list in page_lists:
        page_data["content"].update(extract_page_list(soup, list_type=page_list.value))

    article = extract_article(soup)

    if article:
        page_data["content"].update(article)

    # Both outputs are written into the directory that holds the source page.
    output_dir = os.path.dirname(file_path)
    output_path_json = os.path.join(output_dir, "process.json")
    output_path_md = os.path.join(output_dir, "index.md")
    print(output_path_md, output_path_json)

    with open(output_path_json, "w", encoding="utf-8") as f:
        f.write(json.dumps(page_data))

    # The Markdown file is only written when an article body was extracted.
    article_body = page_data["content"].get("article_body")
    if article_body:
        with open(output_path_md, "w", encoding="utf-8") as f:
            f.write(article_body)

    return page_data

# Process the single sample page; this writes process.json (and index.md when an
# article body is present) into the sample page's directory.
process_page(test_article_path)


../../raw_domo_kb/category/0TO5w000000ZamlGAC/index.html
../../raw_domo_kb/category/0TO5w000000ZamlGAC/index.md ../../raw_domo_kb/category/0TO5w000000ZamlGAC/process.json


{'file_path': '../../raw_domo_kb/category/0TO5w000000ZamlGAC/index.html',
 'content': {'section-list': [{'text': 'implementing_sso',
    'url': '/s/topic/0TO5w000000ZanoGAC'},
   {'text': 'certifications', 'url': '/s/topic/0TO5w000000ZanGGAS'},
   {'text': 'domo_security_options', 'url': '/s/topic/0TO5w000000ZandGAC'},
   {'text': 'governance_tools', 'url': '/s/topic/0TO5w000000ZannGAC'},
   {'text': 'controlling_access_in_domo',
    'url': '/s/topic/0TO5w000000ZanMGAS'},
   {'text': 'specifying_company_settings',
    'url': '/s/topic/0TO5w000000ZaoGGAS'},
   {'text': 'admin_settings_overview', 'url': '/s/topic/0TO5w000000Zan4GAC'},
   {'text': 'approvals', 'url': '/s/topic/0TO5w000000Zan5GAC'},
   {'text': 'workflows', 'url': '/s/topic/0TO5w000000ZkAMGA0'}]},
 'title': 'Administrating Domo',
 'title_clean': 'administrating_domo',
 'description': 'Learn how to manage users and groups, control access to content, configure security settings, and more.',
 'breadcrumbs': [{'text': 'domo', 

In [13]:
# Re-crawl BASE_FOLDER and process every ".html" page that was found.
article_ls = crawl_directory(BASE_FOLDER)

# Each call prints the paths it handles and writes its output files beside the
# page; an exception raised for any page stops the whole comprehension.
res = [process_page(path_name, debug_prn=True) for path_name in article_ls]


../../raw_domo_kb/category/0TO5w000000ZandGAC/index.html
../../raw_domo_kb/category/0TO5w000000ZandGAC/index.md ../../raw_domo_kb/category/0TO5w000000ZandGAC/process.json

../../raw_domo_kb/category/0TO5w000000ZapJGAS/index.html
../../raw_domo_kb/category/0TO5w000000ZapJGAS/index.md ../../raw_domo_kb/category/0TO5w000000ZapJGAS/process.json

../../raw_domo_kb/category/0TO5w000000ZaoCGAS/index.html
../../raw_domo_kb/category/0TO5w000000ZaoCGAS/index.md ../../raw_domo_kb/category/0TO5w000000ZaoCGAS/process.json

../../raw_domo_kb/category/0TO5w000000Zan0GAC/index.html
../../raw_domo_kb/category/0TO5w000000Zan0GAC/index.md ../../raw_domo_kb/category/0TO5w000000Zan0GAC/process.json

../../raw_domo_kb/category/0TO5w000000ZanCGAS/index.html
../../raw_domo_kb/category/0TO5w000000ZanCGAS/index.md ../../raw_domo_kb/category/0TO5w000000ZanCGAS/process.json

../../raw_domo_kb/category/0TO5w000000ZanAGAS/index.html
../../raw_domo_kb/category/0TO5w000000ZanAGAS/index.md ../../raw_domo_kb/category/