In [1]:
# use with local installs that don't have nbdev
import sys

sys.path.insert(0, "../../")


In [2]:
import datacrew.crawler.article as dca


TEST_ARTICLE_PATH = "../../raw_kb/article/ad_ebis_connector/index.html"
TEST_CATEGORY_PATH = "../../raw_kb/category/available_custom_connectors/index.html"
TEST_HOME_PATH = "../../raw_kb/home/browse_resources_by_topic/index.html"

BASE_URL = 'https://domo-support.domo.com'
IMG_BASE_URL = "https://domo-support.domo.com/servlet/rtaImage"

ARTICLE_PATH = '../../raw_kb/article_listing.csv'
CATEGORY_PATH = '../../raw_kb/category_listing.csv'

OUTPUT_FOLDER = "../../raw_kb"


# UTILITY FUNCTIONS


## general string functions


In [3]:
import os
import re
import pandas as pd


def clean_url_name(path_name):
    """Remove every character that is not an ASCII letter, digit or underscore.

    Args:
        path_name: text to sanitise (typically a snake-cased title).

    Returns:
        ``path_name`` with all other characters deleted (not substituted).
    """
    disallowed_chars = re.compile(r"[^a-zA-Z0-9_]")

    return disallowed_chars.sub("", path_name)


def convert_to_snake(text_str):
    """Lower-case ``text_str`` and replace each space with an underscore.

    Example: ``'Beast Mode'`` -> ``'beast_mode'``.  Only the space character
    is replaced; other whitespace and punctuation are left untouched.
    """
    underscored = "_".join(text_str.split(" "))

    return underscored.lower()


## general html functions


In [4]:
import requests


def download_img(image_url, image_path, debug_prn: bool = False, timeout: float = 30):
    """Download ``image_url`` and write the response body to ``image_path``.

    Args:
        image_url: address of the image to fetch.
        image_path: destination file; it is overwritten if it exists.
        debug_prn: when True, print the source and destination.
        timeout: seconds to wait for the server before giving up, so one
            unresponsive host cannot stall a whole crawl.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
            Without this check the error page would be saved as if it were
            the image.
        requests.RequestException: connection problems or timeout.
    """
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()

    if debug_prn:
        print(f"downloading {image_url} to {image_path}")

    # Only touch the destination once we actually hold valid image bytes.
    with open(image_path, "wb") as handler:
        handler.write(response.content)


## handle markdown


In [5]:
from tempfile import mkstemp
from os import fdopen, remove
import textwrap
from shutil import move, copymode


def dedent_frontmatter(file_path):
    """Strip leading indentation from the front-matter lines of a file, in place.

    Every line up to and including the second line containing ``---`` is
    dedented; all following lines are copied unchanged.  The result is built
    in a temporary file which then replaces ``file_path`` (keeping the
    original permission bits).

    Args:
        file_path: path of the UTF-8 text file to rewrite.
    """
    # Create temp file
    fh, abs_path = mkstemp()
    try:
        # Write with the same encoding we read with; relying on the locale
        # default can fail on non-ASCII content on some platforms.
        with fdopen(fh, "w", encoding="utf8") as new_file, \
                open(file_path, encoding="utf8") as old_file:
            count_frontmatter = 0
            for line in old_file:
                if count_frontmatter < 2:
                    if "---" in line:
                        count_frontmatter += 1
                    new_file.write(textwrap.dedent(line))
                else:
                    new_file.write(line)
    except BaseException:
        # Do not leave a stray temp file behind if reading/writing failed.
        remove(abs_path)
        raise
    # Copy the file permissions from the old file to the new file
    copymode(file_path, abs_path)
    # Remove original file
    remove(file_path)
    # Move new file
    move(abs_path, file_path)


def add_frontmatter(front_matter, file_path: str):
    """Prepend ``front_matter`` and a newline to the file at ``file_path``.

    Args:
        front_matter: text block to place at the very top of the file.
        file_path: existing UTF-8 text file, rewritten in place.
    """
    with open(file_path, "r+", encoding="utf-8") as md_file:
        existing_text = md_file.read()  # keep the current content
        md_file.seek(0)  # rewind so the new text overwrites from the start
        md_file.writelines((front_matter, "\n" + existing_text))

    # dedent_frontmatter(file_path)


In [6]:
def convert_md_to_qmd(output_md_path):
    """Rename a ``.md`` file to ``.qmd``, overwriting any existing ``.qmd``.

    Only the file extension is changed; a ``.md`` that happens to appear in
    a directory name is left alone.

    Args:
        output_md_path: path of the markdown file to rename.

    Returns:
        The new ``.qmd`` path.  If ``output_md_path`` does not end in
        ``.md`` nothing is renamed and the path is returned unchanged
        (previously the file itself was deleted in that case).
    """
    root, extension = os.path.splitext(output_md_path)

    if extension != '.md':
        return output_md_path

    qmd_path = f"{root}.qmd"

    # os.replace overwrites an existing destination in a single step.
    os.replace(output_md_path, qmd_path)

    return qmd_path


# PROJECT IMPLEMENTATION FUNCTIONS


## get listing functions


In [7]:
def generate_output_folder(output_folder, title: str):
    """Build the folder path for a document from its title.

    The title is snake-cased with ``convert_to_snake``, sanitised with
    ``clean_url_name`` and appended to ``output_folder`` after a ``/``.
    """
    folder_name = clean_url_name(convert_to_snake(title))

    return f"{output_folder}/{folder_name}"


def get_listing_df(df_path,  # listing location
                   output_folder,  # base_folder + entity_folder
                   is_update_file: bool = False):
    """Load a listing CSV and attach an ``output_file_path`` column.

    The CSV is read with ``id`` as index, its last column is dropped, only
    the last row of every ``id`` is kept, and each row gets the folder
    produced by ``generate_output_folder`` from its ``name``.

    Args:
        df_path: path of the listing CSV.
        output_folder: parent folder under which per-row folders live.
        is_update_file: when True, write the resulting frame back to
            ``df_path``.

    Returns:
        The processed DataFrame.
    """
    listing_df = pd.read_csv(df_path, index_col='id')

    # del article_listing_df[article_listing_df.columns[0]]

    trailing_column = listing_df.columns[-1]
    del listing_df[trailing_column]

    listing_df = listing_df.groupby('id').tail(1)

    def _row_output_path(row):
        return generate_output_folder(output_folder=f"{output_folder}",
                                      title=row['name'])

    listing_df['output_file_path'] = listing_df.apply(_row_output_path, axis=1)

    if is_update_file:
        listing_df.to_csv(df_path)

    return listing_df


# Load both listings and tag each row with the kind of page it describes.
category_listing_df = get_listing_df(CATEGORY_PATH, f"{OUTPUT_FOLDER}/category")
category_listing_df['type'] = 'category'

article_listing_df = get_listing_df(ARTICLE_PATH, f"{OUTPUT_FOLDER}/article")
article_listing_df['type'] = 'article'

# One lookup table for every known page, used to rewrite links.
mapping_df = pd.concat([category_listing_df, article_listing_df])
# BASE_URL is a literal prefix, not a pattern: regex=False keeps its dots
# from acting as wildcards and makes the result identical across pandas versions.
mapping_df['rel_url'] = mapping_df['url'].str.replace(BASE_URL, '', regex=False)

# category_listing_df[0:5]
mapping_df[0:5]


Unnamed: 0_level_0,name,url,updated,output_file_path,type,rel_url
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0TO5w000000ZamoGAC,Creating Content In Domo,https://domo-support.domo.com/s/topic/0TO5w000...,2023-04-16 19:33:50.939898,../../raw_kb/category/creating_content_in_domo,category,/s/topic/0TO5w000000ZamoGAC
0TO5w000000ZamwGAC,Release Notes,https://domo-support.domo.com/s/topic/0TO5w000...,2023-04-16 19:34:47.307326,../../raw_kb/category/release_notes,category,/s/topic/0TO5w000000ZamwGAC
0TO5w000000ZanAGAS,Beast Mode,https://domo-support.domo.com/s/topic/0TO5w000...,2023-04-16 19:33:26.454061,../../raw_kb/category/beast_mode,category,/s/topic/0TO5w000000ZanAGAS
0TO5w000000ZanDGAS,Card And Dashboard Management,https://domo-support.domo.com/s/topic/0TO5w000...,2023-04-16 19:35:33.265200,../../raw_kb/category/card_and_dashboard_manag...,category,/s/topic/0TO5w000000ZanDGAS
0TO5w000000ZaoEGAS,Sharing Access To Cards And Dashboards,https://domo-support.domo.com/s/topic/0TO5w000...,2023-04-16 19:35:55.374172,../../raw_kb/category/sharing_access_to_cards_...,category,/s/topic/0TO5w000000ZaoEGAS


## project html handling functions


### test implementation


In [8]:
# !pip install chardet

# import the chardet library
import chardet 

# use chardet's detect method to guess the encoding of a downloaded page
# 'rb' means read in the file as binary (detect is given the raw bytes)

# NOTE: despite its name, `url` holds a local file path here
url = '../../raw_kb/article/installing_coursebuilder/index.html'
# url = '../../raw_kb/category/creating_content_in_domo/index.html'
with open(url, 'rb') as file:
    print(chardet.detect(file.read()))


{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}


In [9]:
import pandas as pd

# Sample article: first listing row whose name matches the given text.
row_article = dict(article_listing_df[article_listing_df['name'].str.contains(
    'Adding a Beast Mode Calculation to Your Chart')].iloc[0])

# Article object built from the already-downloaded index.html of that row.
test_article = dca.Article_KB(
    url=row_article.get('url'),
    path_html=f"{row_article.get('output_file_path')}/index.html",
    base_url=BASE_URL)

# Sample category: simply the first row of the category listing.
row_category = dict(category_listing_df.iloc[0])


# Path of the category's HTML file; only used in the print at the end of the cell.
_ = f"{row_category.get('output_file_path')}/index.html"

test_category = dca.Article_Category(
    url=row_category.get('url'),
    path_html=f"{row_category.get('output_file_path')}/index.html",
    base_url=BASE_URL)


# Knowledge-base landing page (same location as TEST_HOME_PATH).
test_home = dca.Article_KB_Home(
    url="https://domo-support.domo.com/s/knowledge-base",
    path_html=f"../../raw_kb/home/browse_resources_by_topic/index.html",
    base_url=BASE_URL)

# Show the home page markdown followed by the category HTML path.
print(test_home.md_str, _)

# assert 1 == 2

❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZanDGAS, 0TO5w000000ZanDGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZaoEGAS, 0TO5w000000ZaoEGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZaoNGAS, 0TO5w000000ZaoNGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZanwGAC, 0TO5w000000ZanwGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZanZGAS, 0TO5w000000ZanZGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000Zao4GAC, 0TO5w000000Zao4GAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZaoHGAS, 0TO5w000000ZaoHGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZanAGAS, 0TO5w000000ZanAGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZdTCGA0, 0TO5w000000ZdTCGA0
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamsGAC, 0TO5w000000ZamsGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZammGAC, 0TO5w00

### handle article links


In [10]:
import re


def update_md_clean_youtube(article: dca.Article):
    """Point youtube-nocookie references in ``article.md_str`` at youtube.com.

    Every case-insensitive occurrence of ``//www.youtube-nocookie.com`` is
    replaced by ``www.youtube.com``; the article is updated in place and the
    new markdown is also returned.
    """
    # NOTE(review): the leading "//" is part of the matched text and is not
    # re-emitted by the replacement - confirm that dropping it is intended.
    article.md_str = re.sub(
        re.escape("//www.youtube-nocookie.com"),
        "www.youtube.com",
        article.md_str,
        flags=re.IGNORECASE,
    )

    return article.md_str


In [11]:
from pprint import pprint
import os


def update_article_url_fn(url_path, mapping_df, current_path, debug_prn: bool = False):
    """Translate a link target into a relative path to the local copy of the page.

    Args:
        url_path: the text between the parentheses of a markdown link, i.e.
            a URL optionally followed by a space and a title.
        mapping_df: frame with a ``url`` column (known page addresses) and an
            ``output_file_path`` column (folder of the local copy).
        current_path: path of the file containing the link; the result is
            relative to its folder.
        debug_prn: print the matching rows and the computed paths.

    Returns:
        ``url_path`` unchanged when the URL is empty or not found in
        ``mapping_df``; otherwise the relative path to the first matching
        page's ``index.md``, followed by the original title (if any).
    """
    # Separate the URL from an optional trailing title.
    url_ls = url_path.split(' ')
    link_url = url_ls[0]

    # An empty needle is "contained" in every URL and would silently map the
    # link to the first row of the table.
    if not link_url:
        return url_path

    # Literal substring match: URLs routinely contain regex metacharacters
    # ("?", "+", "(") and must not be interpreted as patterns.  na=False
    # treats rows without a URL as non-matching.
    contain_values = mapping_df[
        mapping_df['url'].str.contains(link_url, regex=False, na=False)]

    if debug_prn:
        print(contain_values)

    if contain_values.empty:
        return url_path

    row = contain_values.iloc[0]

    new_path = f"{row.output_file_path}/index.html"

    current_dir = os.path.split(current_path)[0]

    # Only the path goes through relpath; the title is re-attached afterwards
    # so path normalisation cannot alter it.
    rel_path = os.path.relpath(new_path, current_dir).strip()

    if rel_path.endswith('index.html'):
        rel_path = f"{rel_path[:-len('index.html')]}index.md"

    if debug_prn:
        print(
            f"update_article_url_nf: {new_path}, {current_dir} : {rel_path}")

    return ' '.join([rel_path, *url_ls[1:]]).strip()


In [12]:
import re
# https://stackoverflow.com/questions/63197371/detecting-all-links-in-markdown-files-in-python-and-replace-them-with-outputs-of


def find_md_links(md):
    """Collect the links found in a markdown string.

    Args:
        md: markdown text.

    Returns:
        A dict with two lists:

        * ``'regular'``: ``(text, target)`` for every inline link
          ``[text](target)``;
        * ``'footnotes'``: ``(number, url)`` for every reference-style link
          ``[text][number]`` whose definition ``[number]: url`` is present.
          References without a definition are skipped (they used to raise
          ``KeyError``).
    """
    # https://stackoverflow.com/a/30738268/2755116

    INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    FOOTNOTE_LINK_TEXT_RE = re.compile(r'\[([^\]]+)\]\[(\d+)\]')
    FOOTNOTE_LINK_URL_RE = re.compile(r'\[(\d+)\]:\s+(\S+)')

    links = list(INLINE_LINK_RE.findall(md))
    # link text -> footnote number
    footnote_links = dict(FOOTNOTE_LINK_TEXT_RE.findall(md))
    # footnote number -> url
    footnote_urls = dict(FOOTNOTE_LINK_URL_RE.findall(md))

    footnotes_linking = [
        (number, footnote_urls[number])
        for number in footnote_links.values()
        if number in footnote_urls
    ]

    return {'regular': links, 'footnotes': footnotes_linking}


def replace_md_links(article, f, mapping_df, debug_prn: bool = False):
    """Rewrite every link target in ``article.md_str`` to ``f(target)``.

    Args:
        article: object exposing ``md_str`` (markdown, updated in place) and
            ``path_html`` (passed to ``f`` as ``current_path``).
        f: callable invoked as ``f(url_path=..., mapping_df=...,
            current_path=..., debug_prn=...)`` returning the new target.
        mapping_df: forwarded to ``f``.
        debug_prn: print each inline link and forward the flag to ``f``.

    Returns:
        The updated markdown string.

    Only the link targets themselves are rewritten - ``](target)`` for inline
    links and ``[n]: target`` for footnote definitions.  A plain substring
    replace over the whole document would also alter other URLs or text that
    merely contain the same characters (e.g. the absolute form of a relative
    link).
    """

    def _new_target(target):
        return f(url_path=target,
                 mapping_df=mapping_df,
                 current_path=article.path_html,
                 debug_prn=debug_prn)

    links = find_md_links(article.md_str)

    handled = set()
    for link in links.get('regular') or []:

        if debug_prn:
            print(link)

        target = link[1]
        # One replacement covers every occurrence; skip repeats.
        if target in handled:
            continue
        handled.add(target)

        replacement = _new_target(target)
        if replacement != target:
            article.md_str = article.md_str.replace(
                f"]({target})", f"]({replacement})")

    handled_footnotes = set()
    for number, target in links.get('footnotes') or []:
        if (number, target) in handled_footnotes:
            continue
        handled_footnotes.add((number, target))

        replacement = _new_target(target)
        if replacement == target:
            continue

        # Match the definition line "[n]: target" and keep its prefix intact.
        definition_re = re.compile(
            r"(\[" + re.escape(number) + r"\]:\s+)" + re.escape(target) + r"(?!\S)")
        article.md_str = definition_re.sub(
            lambda match, new=replacement: match.group(1) + new,
            article.md_str)

    return article.md_str





# test_article = dca.Article_KB(
#     url=row_category.get('url'),
#     path_html="../../raw_kb/article/installing_coursebuilder/index.html",
#     base_url=BASE_URL)

# test_category = dca.Article_Category(
#     url=row_category.get('url'),
#     path_html="../../raw_kb\category\current_release_notes\index.html",
#     base_url=BASE_URL)

# Rebuild the home page object so the demo starts from unmodified markdown.
test_home = dca.Article_KB_Home(
    url="https://domo-support.domo.com/s/knowledge-base",
    path_html=f"../../raw_kb/home/browse_resources_by_topic/index.html",
    base_url=BASE_URL)

# Rewrite the page's links through update_article_url_fn and show the result;
# debug_prn=True also prints every link and lookup along the way.
print(replace_md_links(article=test_home,
                       f=update_article_url_fn,
                       mapping_df=mapping_df,
                       debug_prn=True))



❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamsGAC, 0TO5w000000ZamsGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZammGAC, 0TO5w000000ZammGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamzGAC, 0TO5w000000ZamzGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamoGAC, 0TO5w000000ZamoGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamnGAC, 0TO5w000000ZamnGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamlGAC, 0TO5w000000ZamlGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamqGAC, 0TO5w000000ZamqGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000Zan0GAC, 0TO5w000000Zan0GAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000Zan2GAC, 0TO5w000000Zan2GAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZampGAC, 0TO5w000000ZampGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamwGAC, 0TO5w00

In [13]:

import os


def remove_links_in_mapping(article, mapping_df, is_convert_qmd: bool = True):
    """Replace every known site-relative URL in the article with a local path.

    For each row of ``mapping_df`` all occurrences of ``rel_url`` in
    ``article.md_str`` are replaced by the path, relative to the article's
    folder, of that row's index file.

    Args:
        article: object exposing ``md_str`` (updated in place) and
            ``path_html``.
        mapping_df: frame with ``rel_url`` and ``output_file_path`` columns.
        is_convert_qmd: link to ``index.qmd`` when True, ``index.md`` otherwise.

    Returns:
        The updated markdown string.
    """
    index_name = 'index.qmd' if is_convert_qmd else 'index.md'

    # Folder that contains the article; computed once, it is the same for all rows.
    article_dir = os.path.dirname(article.path_html)

    for _, row in mapping_df.iterrows():
        # os.path.join uses the platform separator instead of a hard-coded
        # backslash, which is an ordinary file-name character on POSIX.
        target = os.path.join(row['output_file_path'], index_name)

        article.md_str = article.md_str.replace(
            row['rel_url'], os.path.relpath(target, article_dir))

    return article.md_str



# Fresh home page object; the cell only displays it (the actual
# remove_links_in_mapping call below is left commented out).
test_home = dca.Article_KB_Home(
    url="https://domo-support.domo.com/s/knowledge-base",
    path_html=f"../../raw_kb/home/browse_resources_by_topic/index.html",
    base_url=BASE_URL)

test_home
# remove_links_in_mapping(article = test_home, mapping_df = mapping_df)


❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamsGAC, 0TO5w000000ZamsGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZammGAC, 0TO5w000000ZammGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamzGAC, 0TO5w000000ZamzGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamoGAC, 0TO5w000000ZamoGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamnGAC, 0TO5w000000ZamnGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamlGAC, 0TO5w000000ZamlGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamqGAC, 0TO5w000000ZamqGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000Zan0GAC, 0TO5w000000Zan0GAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000Zan2GAC, 0TO5w000000Zan2GAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZampGAC, 0TO5w000000ZampGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamwGAC, 0TO5w00

Article_KB_Home(url='https://domo-support.domo.com/s/knowledge-base', base_url='https://domo-support.domo.com', entity_base_url='https://domo-support.domo.com/s/knowledge-base', url_entity_prefix='/s/knowledge-base', url_id='', is_success=True, url_ls=['https://domo-support.domo.com/s/knowledge-base', 'https://domo-support.domo.com/s/topic/0TO5w000000ZamwGAC', 'https://domo-support.domo.com/s/topic/0TO5w000000ZamsGAC', 'https://domo-support.domo.com/s/topic/0TO5w000000ZammGAC', 'https://domo-support.domo.com/s/topic/0TO5w000000ZamzGAC', 'https://domo-support.domo.com/s/topic/0TO5w000000ZamoGAC', 'https://domo-support.domo.com/s/topic/0TO5w000000ZamnGAC', 'https://domo-support.domo.com/s/topic/0TO5w000000ZamlGAC', 'https://domo-support.domo.com/s/topic/0TO5w000000ZamqGAC', 'https://domo-support.domo.com/s/topic/0TO5w000000Zan0GAC', 'https://domo-support.domo.com/s/topic/0TO5w000000Zan2GAC', 'https://domo-support.domo.com/s/topic/0TO5w000000ZampGAC', 'https://domo-support.domo.com/s/topi

### handle article images


In [14]:
import os
import pandas as pd


def update_md_download_images(
    article: dca.Article,
    output_folder: str,
    debug_prn: bool = False,
):
    """Download an article's images and point its markdown at the local copies.

    Args:
        article: object exposing ``image_ls`` / ``get_images()`` (a list of
            dicts read through the keys ``name``, ``url`` and
            ``relative_url``) and ``md_str``, which is updated in place.
        output_folder: folder the images are written to.
        debug_prn: print what is downloaded and replaced.

    Returns:
        The image list, each entry extended with ``image_path``, or ``None``
        when the article has no images.
    """
    image_ls = article.image_ls or article.get_images()

    if not image_ls:
        return

    # download images
    for index, img in enumerate(image_ls):
        # Unnamed images are numbered by their position in the list.
        img_name = img.get("name") or f"{index}.png"

        img_url = img.get("url")
        img_path = f"{output_folder}/{img_name}"
        img_rel_path = img.get("relative_url")

        if debug_prn:
            print(
                f"downloading {img_url} to {img_path}. replacing article with {img_rel_path} with {img_name}"
            )

        download_img(image_url=img_url, image_path=img_path)

        img.update({'image_path': img_path})

        # A missing reference would make str.replace raise, and an empty one
        # would insert the file name between every character of the document.
        if img_rel_path:
            article.md_str = article.md_str.replace(img_rel_path, img_name)
        image_ls[index] = img

    return image_ls


# Try the image step on the home page object created in the previous cell.
test_home.get_images()
# The DataFrame built here is neither stored nor the cell's last expression,
# so it is not displayed.
pd.DataFrame(update_md_download_images(
    article=test_home,
    output_folder="../../raw_kb/home/browse_resources_by_topic",
    debug_prn=False,
))

# Show the markdown after the image references were replaced.
print(test_home.md_str)


# assert 1 == 2

Browse resources by topic:
* [![](0.png)#### Getting To Know Domo](/s/topic/0TO5w000000ZamsGAC)
* [![](1.png)#### Connecting Data To Domo](/s/topic/0TO5w000000ZammGAC)
* [![](2.png)#### Transforming Data In Domo](/s/topic/0TO5w000000ZamzGAC)
* [![](3.png)#### Creating Content In Domo](/s/topic/0TO5w000000ZamoGAC)
* [![](4.png)#### Consuming Data In Domo](/s/topic/0TO5w000000ZamnGAC)
* [![](5.png)#### Administrating Domo](/s/topic/0TO5w000000ZamlGAC)
* [![](6.png)#### Distributing Domo Content](/s/topic/0TO5w000000ZamqGAC)
* [![](7.png)#### Understanding Data](/s/topic/0TO5w000000Zan0GAC)
* [![](8.png)#### Working With DataSets In Domo](/s/topic/0TO5w000000Zan2GAC)
* [![](9.png)#### Creating Custom Visualizations](/s/topic/0TO5w000000ZampGAC)
* [![](10.png)#### Release Notes](/s/topic/0TO5w000000ZamwGAC)
* [![](11.png)#### Using Domo APIs](/s/topic/0TO5w000000Zan1GAC)
* [![](12.png)#### Training](/s/topic/0TO5w000000ZamyGAC)



### handle urls


## Handle Article Output


In [15]:
def clean_article_md(article,
                     output_folder,
                     mapping_df,
                     debug_prn: bool = False):
    """Run every markdown clean-up step on ``article`` and return the result.

    Steps, in order: download images into ``output_folder`` and relink
    them, rewrite youtube-nocookie references, rewrite links through
    ``update_article_url_fn`` using ``mapping_df``, and finally turn every
    remaining ``index.html`` into ``index.md``.  ``debug_prn`` is forwarded
    to the image step only.
    """
    update_md_download_images(article=article,
                              output_folder=output_folder,
                              debug_prn=debug_prn)
    update_md_clean_youtube(article)
    replace_md_links(article, update_article_url_fn, mapping_df=mapping_df)

    html_free_md = article.md_str.replace('index.html', 'index.md')
    article.md_str = html_free_md

    return article.md_str

# print(clean_article_md(
#     article=test_article,
#     mapping_df=mapping_df,
#     debug_prn=False)
# )


In [16]:
from mdutils.mdutils import MdUtils

def output_article_md(article: dca.Article,
                 output_folder: str,
                 mapping_df : pd.DataFrame,
                 generate_frontmatter_fn: callable = None,
                 debug_prn: bool = False,
                 is_convert_qmd: bool = True
                 ):
    """Clean an article's markdown and write it to ``output_folder``.

    Args:
        article: article whose ``md_str`` is cleaned (in place) and written.
        output_folder: destination folder; also receives the images.
        mapping_df: link lookup table forwarded to ``clean_article_md``.
        generate_frontmatter_fn: optional callable ``fn(article) -> str``
            whose result is prepended to the written file.
        debug_prn: accepted for interface compatibility; not used here.
        is_convert_qmd: link to ``index.qmd`` instead of ``index.md`` and
            rename the written file to ``index.qmd``.

    Returns:
        Path of the file that exists on disk afterwards: ``index.qmd`` when
        ``is_convert_qmd`` is True, ``index.md`` otherwise.
    """

    output_md_path = f"{output_folder}/index.md"

    # MdUtils gets the name without extension.  Build it directly rather
    # than stripping ".md" from the full path, which would also alter a
    # folder name containing ".md".
    md_file = MdUtils(
        file_name=f"{output_folder}/index")

    clean_article_md(article, output_folder= output_folder, mapping_df = mapping_df )

    if is_convert_qmd:
        article.md_str = article.md_str.replace('index.md', 'index.qmd')

    md_file.write(article.md_str)

    md_file.create_md_file()

    if generate_frontmatter_fn:
        frontmatter_obj = generate_frontmatter_fn(article)

        add_frontmatter(front_matter=frontmatter_obj,
                        file_path=output_md_path)

    if is_convert_qmd:
        # The .md file no longer exists after the rename, so report the
        # .qmd path instead.
        return convert_md_to_qmd(output_md_path)

    return output_md_path


In [17]:
import os


def generate_kb_frontmatter(article):
    """Build the YAML front-matter block for a knowledge-base article.

    Args:
        article: object exposing ``title``, ``url_ls``, ``last_updated``,
            ``url``, ``article_id``, ``views`` and ``created``.

    Returns:
        A string starting and ending with ``---``.  ``categories`` lists the
        entries of ``url_ls`` containing ``/s/topic/``; ``linked_kbs`` lists
        the distinct entries of ``url_ls``.
    """
    import json  # local import: only needed to quote the title

    # Titles often contain ": " or quotes, which are not allowed in a plain
    # YAML scalar; a JSON string is a valid double-quoted YAML scalar.
    quoted_title = json.dumps(str(article.title), ensure_ascii=False)

    return f"""---
title : {quoted_title}
categories: { [link for link in article.url_ls if '/s/topic/' in link]}
date: {str(article.last_updated)}
url : {article.url}
linked_kbs :  { list(set( article.url_ls))}
article_id : {article.article_id}
views : {article.views}
created_date : {str(article.created)}
last updated : {str(article.last_updated)}
---"""


# test output_article_md

In [18]:
# Smoke test of output_article_md on the first category of the listing.
row = category_listing_df.iloc[0]

# print(row)
url = row.get('url')
output_folder = row.get('output_file_path') 

# try:
print(url)
print(output_folder)

# The category object is built from the HTML already saved in its folder.
article = dca.Article_Category(
    url= url,
    path_html= f"{output_folder}/index.html",
    base_url=BASE_URL)


# No front matter, and is_convert_qmd=False keeps the output as index.md.
output_article_md(article,
                output_folder = output_folder,
                # generate_frontmatter_fn= generate_kb_frontmatter,
                mapping_df = mapping_df,
                debug_prn = True,
                is_convert_qmd = False
                
                )

# except Exception as e:
#     print(f"error: {url} to  {output_folder} - {e}")
    


https://domo-support.domo.com/s/topic/0TO5w000000ZamoGAC
../../raw_kb/category/creating_content_in_domo
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZanDGAS, 0TO5w000000ZanDGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZaoEGAS, 0TO5w000000ZaoEGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZaoNGAS, 0TO5w000000ZaoNGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZanwGAC, 0TO5w000000ZanwGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZanZGAS, 0TO5w000000ZanZGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000Zao4GAC, 0TO5w000000Zao4GAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZaoHGAS, 0TO5w000000ZaoHGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZanAGAS, 0TO5w000000ZanAGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZdTCGA0, 0TO5w000000ZdTCGA0


'../../raw_kb/category/creating_content_in_domo/index.md'

In [19]:
# for index, row in mapping_df.iterrows():

import asyncio


def main(url, output_folder, article_type, is_convert_qmd : bool = True):
    """Convert one downloaded page to markdown, reporting errors instead of raising.

    Args:
        url: original address of the page.
        output_folder: folder holding the page's ``index.html``; the
            markdown is written there as well.
        article_type: ``'article'``, ``'category'`` or ``'home'``; selects
            the ``dca`` class used to parse the page.  Front matter is only
            generated for ``'article'``.
        is_convert_qmd: forwarded to ``output_article_md``.

    Any exception, including an unknown ``article_type``, is caught and
    printed.  The module-level ``mapping_df`` is used for link rewriting.
    """
    try:
        if article_type == 'article':
            article_cls = dca.Article_KB
        elif article_type == 'category':
            article_cls = dca.Article_Category
        elif article_type == 'home':
            article_cls = dca.Article_KB_Home
        else:
            raise Exception(f'invalid type {article_type}')

        article = article_cls(
            url=url,
            path_html=f"{output_folder}/index.html",
            base_url=BASE_URL)

        # Only knowledge-base articles carry front matter.
        frontmatter_fn = None
        if article_type == 'article':
            frontmatter_fn = generate_kb_frontmatter

        output_article_md(article,
                          output_folder=output_folder,
                          generate_frontmatter_fn=frontmatter_fn,
                          mapping_df=mapping_df,
                          is_convert_qmd=is_convert_qmd,
                          debug_prn=True)

    except Exception as e:
        print(f"error: {url} to  {output_folder} - {e}")


# await asyncio.gather([main(
#     url=row.get('url'),
#     output_folder=row.get('output_file_path'),
#     article_type=row.get('type')
# ) for index, row in article_listing_df.iterrows()])


# Convert the knowledge-base home page.  is_convert_qmd=False is passed
# through to output_article_md, so the output stays index.md.
main(url ="https://domo-support.domo.com/s/knowledge-base",
     output_folder = "../../raw_kb/home/browse_resources_by_topic/",
     article_type = "home",
     is_convert_qmd = False
     )


❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamsGAC, 0TO5w000000ZamsGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZammGAC, 0TO5w000000ZammGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamzGAC, 0TO5w000000ZamzGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamoGAC, 0TO5w000000ZamoGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamnGAC, 0TO5w000000ZamnGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamlGAC, 0TO5w000000ZamlGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamqGAC, 0TO5w000000ZamqGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000Zan0GAC, 0TO5w000000Zan0GAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000Zan2GAC, 0TO5w000000Zan2GAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZampGAC, 0TO5w000000ZampGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZamwGAC, 0TO5w00

# Cleansing functions


In [20]:
def decompose_output_folder(output_folder, project_base):
    """Split a document's output folder into base, entity and document parts.

    Example: ``("../../raw_kb/category/beast_mode", "raw_kb")`` gives
    ``base_folder='raw_kb'``, ``entity_folder='/category'`` and
    ``doc_folder='/beast_mode'``.

    Args:
        output_folder: "/"-separated folder path of one document.
        project_base: name from which the path is considered to start;
            everything before its first occurrence is ignored.  If it does
            not occur, the whole ``output_folder`` is decomposed.

    Returns:
        dict with the keys ``project_base``, ``output_folder``,
        ``base_folder``, ``entity_folder`` and ``doc_folder``.
    """
    base_start = output_folder.find(project_base)

    # find() returns -1 when project_base is absent; slicing from -1 would
    # keep only the last character of the path.
    strip_base = output_folder[base_start:] if base_start >= 0 else output_folder

    # Everything except the last two path components.
    base_folder = "/".join(strip_base.split("/")[:-2])

    # Last path component, with a leading slash.
    doc_folder = f"/{os.path.split(strip_base)[1]}"

    # What remains between the base and the document folder.
    entity_folder = strip_base.replace(
        base_folder, '').replace(doc_folder, '').rstrip("/")

    # path_from_base = os.path.split(row['output_file_path'])[1]

    return {
        "project_base": project_base,
        "output_folder": output_folder,
        "base_folder": base_folder,
        "entity_folder": entity_folder,
        "doc_folder": doc_folder
    }


In [21]:
# import os
# import shutil
# folder = '../../raw_kb/category/'

# for root, dirs, files in os.walk(folder):
#     for f in files:
#         if f.endswith('index.html'):
#             continue
#         print(f)
#         os.unlink(os.path.join(root, f))
    # for d in dirs:
    #     shutil.rmtree(os.path.join(root, d))