In [1]:
# use with local installs that don't have nbdev
import sys

# Put the directory two levels above this notebook first on the import path;
# presumably this is where the `datacrew` package imported below lives.
sys.path.insert(0, "../../")

In [2]:
import datacrew.crawler.article as dca


# Sample HTML pages (not referenced by any other cell visible in this notebook).
TEST_ARTICLE_PATH = "../../raw_kb/article/ad_ebis_connector/index.html"
TEST_CATEGORY_PATH = "../../raw_kb/category/administrating_domo/index.html"
# Passed as `base_url` when building the dca article / category objects.
BASE_URL = 'https://domo-support.domo.com'
# Image servlet endpoint, referenced by output_article.
IMG_BASE_URL = "https://domo-support.domo.com/servlet/rtaImage"

# CSV listings: one for articles, one for categories.
LISTING_PATH = '../../raw_kb/article_listing.csv'
CATEGORY_PATH = '../../raw_kb/category_listing.csv'

# Root folder under which per-page output folders are derived.
OUTPUT_FOLDER = "../../raw_kb"

In [3]:
import os
import re
import pandas as pd


def clean_url_name(path_name):
    """Return ``path_name`` with everything but ASCII letters, digits and
    underscores removed, so it can be used as a folder name.

    Args:
        path_name: text to sanitise.

    Returns:
        The kept characters, in their original order.
    """
    kept_characters = re.findall(r"[a-zA-Z0-9_]", path_name)

    return "".join(kept_characters)


def convert_to_snake(text_str):
    """Turn a title such as 'Creating Content' into 'creating_content'.

    Every space becomes an underscore (consecutive spaces yield consecutive
    underscores) and the result is lower-cased. No other character is touched.
    """
    words = text_str.split(" ")

    return "_".join(words).lower()


def create_output_folder(output_folder, title: str):
    """Build the output path for a page title.

    Despite the name, nothing is created on disk: the function only returns
    ``<output_folder>/<sanitised snake_case title>``.

    Args:
        output_folder: parent folder of the page folder.
        title: human readable page title.

    Returns:
        The derived folder path as a string.
    """
    folder_name = clean_url_name(convert_to_snake(title))

    return f"{output_folder}/{folder_name}"


def repair_df(df_path, output_folder):
    """Clean up a listing CSV and rewrite it in place.

    Steps: load the CSV indexed by ``id``, drop its last column, keep only the
    last row for each ``id``, derive an ``output_file_path`` column from each
    row's ``name`` and save the frame back to ``df_path``.

    Args:
        df_path: path of the CSV to read and overwrite.
        output_folder: parent folder used to derive ``output_file_path``.

    Returns:
        The repaired DataFrame.
    """
    listing = pd.read_csv(df_path, index_col='id')

    # The trailing column is discarded unconditionally. After a previous run
    # it is the 'output_file_path' column written below.
    trailing_column = listing.columns[-1]
    del listing[trailing_column]

    # De-duplicate: for repeated ids only the last occurrence survives.
    listing = listing.groupby('id').tail(1)

    def _row_output_path(record):
        return create_output_folder(title=record['name'],
                                    output_folder=f"{output_folder}")

    listing['output_file_path'] = listing.apply(_row_output_path, axis=1)

    listing.to_csv(df_path)

    return listing

# Side effect: repair_df overwrites the CSV at CATEGORY_PATH with the repaired listing.
category_listing_df = repair_df(CATEGORY_PATH, f"{OUTPUT_FOLDER}/category")


In [4]:

import pandas as pd

# Load the article listing and use its first row (label 0) as the test fixture.
article_listing_df = pd.read_csv(LISTING_PATH)

row = dict(article_listing_df.loc[0])
row

# Build the test article from that row's url and its index.html under output_file_path.
test_article = dca.Article_KB(
    url = row.get('url'),
    path_html = f"{row.get('output_file_path')}/index.html",
    base_url = BASE_URL
)

# Utility Functions

## string manipulation


In [5]:
import os
import requests
import pandas as pd

def download_img(image_url, image_path, debug_prn: bool = False, timeout: float = 30):
    """Download ``image_url`` and store the bytes at ``image_path``.

    Args:
        image_url: absolute URL of the image.
        image_path: destination file path (its folder must already exist).
        debug_prn: when True, print what is being downloaded.
        timeout: seconds to wait for the server before giving up, so an
            unresponsive host cannot stall the whole crawl.

    Raises:
        requests.HTTPError: when the server answers with an error status.
            Without this check the error page would be saved as if it were
            the image.
        requests.Timeout: when the server does not answer within ``timeout``.
    """
    if debug_prn:
        print(f"downloading {image_url} to {image_path}")

    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()

    # The file is only opened once the download succeeded, so a failed
    # request never leaves an empty file behind.
    with open(image_path, "wb") as handler:
        handler.write(response.content)


def download_article_images(
    article: dca.Article,
    output_path: str,
    debug_prn: bool = False,
):
    """Download every named image of ``article`` into ``output_path``.

    For each entry of ``article.image_ls`` that has a ``name``:
    the image at ``url`` is saved as ``<output_path>/<name>``, the entry gets
    an ``image_path`` key, and occurrences of the entry's ``relative_url`` in
    ``article.md_str`` are replaced by the bare file name.

    Args:
        article: article whose ``image_ls`` and ``md_str`` are read and updated.
        output_path: folder receiving the image files.
        debug_prn: when True, print one line per downloaded image.

    Returns:
        The (mutated) image list, or None when the article has no images.
    """
    images = article.image_ls

    if not images:
        return

    for position, image in enumerate(images):
        file_name = image.get("name")

        # Entries without a name cannot be stored locally; leave them untouched.
        if file_name:
            source_url = image.get("url")
            relative_url = image.get("relative_url")
            target_path = f"{output_path}/{file_name}"

            if debug_prn:
                print(
                    f"downloading {source_url} to {target_path}.  replacing article with {relative_url} with {file_name}"
                )

            download_img(image_url=source_url, image_path=target_path)

            image['image_path'] = target_path

            # Point the Markdown at the local copy instead of the remote servlet URL.
            article.md_str = article.md_str.replace(relative_url, file_name)
            images[position] = image

    return images


# Download the test article's images (performs HTTP requests and writes files)
# and display the returned image records as a table.
pd.DataFrame(download_article_images(
    article=test_article,
    output_path=row.get('output_file_path'),
    debug_prn=False,
))


Unnamed: 0,url,relative_url,name,image_path
0,https://domo-support.domo.com/servlet/rtaImage...,/servlet/rtaImage?eid=ka05w00000123l8&feoid=00...,New_Analyzer_View.png,../../raw_kb/article/analyzer_layout/New_Analy...


## handle markdown


In [6]:
from tempfile import mkstemp
from os import fdopen, remove
import textwrap
from shutil import move, copymode


def dedent_frontmatter(file_path):
    """Strip leading indentation from the front matter block of a file.

    Every line up to and including the second ``---`` delimiter line is
    dedented; the rest of the file is copied unchanged. The file is rewritten
    through a temporary file and keeps its permission bits.

    Only a line that consists solely of ``---`` (ignoring surrounding
    whitespace) counts as a delimiter, so a value such as ``title : a---b``
    does not end the front matter early.

    Args:
        file_path: path of the file to rewrite in place.
    """
    # Create temp file
    fh, abs_path = mkstemp()
    try:
        # UTF-8 is used explicitly to match add_frontmatter, which writes the
        # same files as UTF-8 regardless of the platform default encoding.
        with fdopen(fh, "w", encoding="utf-8") as new_file, \
                open(file_path, encoding="utf-8") as old_file:
            delimiters_seen = 0
            for line in old_file:
                if delimiters_seen < 2:
                    if line.strip() == "---":
                        delimiters_seen += 1
                    new_file.write(textwrap.dedent(line))
                else:
                    new_file.write(line)
        # Copy the file permissions from the old file to the new file
        copymode(file_path, abs_path)
    except BaseException:
        # Do not leave the temporary file behind when reading/writing failed;
        # the original file has not been touched at this point.
        remove(abs_path)
        raise
    # Remove original file
    remove(file_path)
    # Move new file
    move(abs_path, file_path)


def add_frontmatter(front_matter, file_path: str):
    """Prepend ``front_matter`` (followed by a newline) to an existing file.

    Args:
        front_matter: text to place at the very top of the file.
        file_path: path of the UTF-8 text file to update in place.
    """
    with open(file_path, "r+", encoding="utf-8") as handle:
        existing_text = handle.read()
        # Rewind and rewrite the whole file; the new content is never shorter
        # than the old one, so nothing stale is left at the end.
        handle.seek(0)
        handle.write(front_matter + "\n" + existing_text)

    # dedent_frontmatter(file_path)

In [7]:
import re

def article_cleansing(article: dca.Article):
    """Rewrite 'youtube-nocookie.com' (any letter case) to 'youtube.com' in
    ``article.md_str``. The article is modified in place; nothing is returned.
    """
    article.md_str = re.sub(
        re.escape("youtube-nocookie.com"),
        "youtube.com",
        article.md_str,
        flags=re.IGNORECASE,
    )

In [16]:
from mdutils.mdutils import MdUtils
import pandas as pd


def update_categories(url, lookup_file):
    """Unfinished stub: reads the lookup CSV and discards it.

    Args:
        url: currently unused.
        lookup_file: path of the category lookup CSV; it must be readable.

    Returns:
        None.
    """
    # TODO: the loaded table is not used yet.
    pd.read_csv(lookup_file)


def generate_article_frontmatter(article):
    """Render the front matter block for ``article``.

    Args:
        article: object exposing ``title``, ``url``, ``url_ls``,
            ``article_id``, ``views``, ``created`` and ``last_updated``.

    Returns:
        A string delimited by ``---`` lines. ``categories`` holds the links
        of ``url_ls`` containing '/s/topic/'; ``linked_kbs`` holds the
        distinct links of ``url_ls`` in first-seen order.
    """
    category_links = [link for link in article.url_ls if '/s/topic/' in link]

    # dict.fromkeys removes duplicates while keeping first-seen order. A set
    # would give an order that changes between interpreter runs, so the
    # generated files would differ for no reason.
    linked_kbs = list(dict.fromkeys(article.url_ls))

    return f"""---
title : {article.title}
categories: {category_links}
date: {str(article.last_updated)}
url : {article.url}
linked_kbs :  {linked_kbs}
article_id : {article.article_id}
views : {article.views}
created_date : {str(article.created)}
last updated : {str(article.last_updated)}
---"""


def output_md(article: dca.Article,
              output_index_file: str,
              generate_frontmatter_fn: callable = None,
              debug_prn: bool = False):
    """Write ``article.md_str`` to a Markdown file, optionally with front matter.

    Args:
        article: article whose ``md_str`` is written.
        output_index_file: target path, normally ending in '.md'.
        generate_frontmatter_fn: optional callable taking the article and
            returning the front matter text to prepend to the written file.
        debug_prn: accepted for signature parity with the other output
            helpers; not used here.
    """
    # Strip only a trailing '.md'. A plain str.replace would also rewrite
    # '.md' occurring inside a folder name and send the file elsewhere.
    extension = '.md'
    if output_index_file.endswith(extension):
        base_name = output_index_file[:-len(extension)]
    else:
        base_name = output_index_file

    # MdUtils receives the name without extension, as before.
    # NOTE(review): presumably it appends '.md' itself - confirm against the
    # installed mdutils version.
    md_file = MdUtils(file_name=base_name)

    md_file.write(article.md_str)

    md_file.create_md_file()

    if generate_frontmatter_fn:
        frontmatter_obj = generate_frontmatter_fn(article)

        # Address the '<base>.md' file, so the front matter step still works
        # when the caller passed a path without the '.md' extension.
        add_frontmatter(front_matter=frontmatter_obj,
                        file_path=f"{base_name}{extension}")


In [21]:
def convert_md_to_qmd(output_index_file):
    """Rename a Markdown file to the '.qmd' extension, replacing any existing target.

    Only the file's own extension is changed; '.md' appearing elsewhere in
    the path (for example in a folder name) is left alone. A path that does
    not end in '.md' gets '.qmd' appended, so source and target can never be
    the same file.

    Args:
        output_index_file: path of the existing file to rename.

    Returns:
        The path of the renamed '.qmd' file.
    """
    root, extension = os.path.splitext(output_index_file)

    if extension == '.md':
        qmd_path = f"{root}.qmd"
    else:
        qmd_path = f"{output_index_file}.qmd"

    # os.replace overwrites an existing target in one step on every platform,
    # so the source is never lost between a separate remove and rename.
    os.replace(output_index_file, qmd_path)

    return qmd_path


In [11]:
import os


def output_article(
    article: "dca.Article",
    output_folder: str = "../../raw_kb",
    debug_prn: bool = False,
):
    """Write one article (images, HTML copy and Markdown/qmd page) to disk.

    The article folder is ``<output_folder>/<sanitised snake_case title>`` and
    is created when missing.

    Args:
        article: article providing ``title``, ``soup``, ``md_str`` and ``image_ls``.
        output_folder: parent folder of the article folder.
        debug_prn: when True, print progress information.

    Returns:
        The path of the article folder.
    """
    article_title = article.title

    output_path = os.path.join(
        output_folder, clean_url_name(convert_to_snake(article_title))
    )

    os.makedirs(output_path, exist_ok=True)

    if debug_prn:
        print(f"outputing '{article_title}' to {output_path}")

    # Uses the image helper defined in this notebook; the former `get_images`
    # call has no definition here.
    # NOTE(review): confirm no other get_images was intended.
    download_article_images(
        article=article,
        output_path=output_path,
        debug_prn=debug_prn,
    )

    article_cleansing(article=article)

    # NOTE(review): output_html is not defined in the visible cells - confirm
    # it is provided elsewhere before running this function.
    output_html(output_html=f"{output_path}/doc.html", soup=article.soup)

    # Built with os.path.join: the previous f"{output_path}\index" relied on an
    # invalid '\i' escape and produced a Windows-only path without extension.
    output_index_file = os.path.join(output_path, "index.md")

    # Keyword names match the current helper signatures (output_index_file).
    output_md(article=article, output_index_file=output_index_file, debug_prn=debug_prn)

    convert_md_to_qmd(output_index_file=output_index_file)

    return output_path


# test_article.get_images()
# output_article(article=test_article, debug_prn=True)

In [None]:
import re
# https://stackoverflow.com/questions/63197371/detecting-all-links-in-markdown-files-in-python-and-replace-them-with-outputs-of

def find_md_links(md):
    """Collect the links of a Markdown string.

    Args:
        md: Markdown text.

    Returns:
        A dict with two lists:
        'regular'   - ``(text, url)`` for every inline link ``[text](url)``;
        'footnotes' - ``(number, url)`` for every footnote number that is both
                      referenced as ``[text][number]`` and defined as
                      ``[number]: url``, once each, in order of first reference.
        References to an undefined footnote are skipped.
    """
    # https://stackoverflow.com/a/30738268/2755116

    INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    FOOTNOTE_LINK_TEXT_RE = re.compile(r'\[([^\]]+)\]\[(\d+)\]')
    FOOTNOTE_LINK_URL_RE = re.compile(r'\[(\d+)\]:\s+(\S+)')

    links = list(INLINE_LINK_RE.findall(md))
    footnote_urls = dict(FOOTNOTE_LINK_URL_RE.findall(md))

    # Key the references by footnote number rather than by link text: two
    # links sharing the same text must not hide each other, and one footnote
    # used several times is still reported once.
    referenced_numbers = dict.fromkeys(
        number for _text, number in FOOTNOTE_LINK_TEXT_RE.findall(md)
    )

    footnotes_linking = [
        (number, footnote_urls[number])
        for number in referenced_numbers
        if number in footnote_urls  # dangling reference: no URL to report
    ]

    return {'regular': links, 'footnotes': footnotes_linking}

def update_category_url(url_path, df):
    """Return the rows of ``df`` whose 'url' value contains ``url_path``.

    NOTE(review): the original cell was an unfinished draft (no ``def``, and a
    reference to an undefined ``url``). This implements the filter its
    ``url_path in url`` expression points at - confirm this is the intent.

    Args:
        url_path: text to look for, e.g. '/s/topic/<id>'.
        df: DataFrame with a 'url' column.

    Returns:
        The matching rows; rows whose 'url' is not a string never match.
    """
    matches = df["url"].map(
        lambda value: isinstance(value, str) and url_path in value
    ).astype(bool)

    return df[matches]

def replace_md_links(md, f):
    """Return ``md`` with every link URL ``u`` replaced by ``f(u)``.

    The URLs are those reported by ``find_md_links`` (inline and footnote
    links). Every occurrence of such a URL in the text is replaced, and all
    replacements happen in a single pass over the original text. Earlier
    sequential replacement could rewrite a URL twice when it was listed more
    than once, was a prefix of another URL, or appeared inside an
    already-substituted value.

    Args:
        md: Markdown text.
        f: callable mapping an original URL to its replacement string; it is
            called once per distinct URL.

    Returns:
        The rewritten Markdown (``md`` itself when it contains no links).
    """
    links = find_md_links(md)

    # Distinct URLs, in order of appearance.
    urls = dict.fromkeys(
        url for _label, url in list(links['regular']) + list(links['footnotes'])
    )

    if not urls:
        return md

    replacements = {url: f(url) for url in urls}

    # Longest first, so a URL that is a prefix of another one cannot match
    # inside the longer URL.
    ordered_urls = sorted(replacements, key=len, reverse=True)
    url_pattern = re.compile("|".join(re.escape(url) for url in ordered_urls))

    return url_pattern.sub(lambda match: replacements[match.group(0)], md)

In [35]:
import re

def category_cleansing(article: dca.Article):
    """Placeholder cleansing step for category pages.

    Currently it only prints the page's Markdown; the article is not modified.
    """
    markdown_text = article.md_str
    print(markdown_text)

# NOTE: `test_category` is created in a cell further down this notebook, so on
# a fresh kernel this line only works after that cell has been run.
category_cleansing(test_category)

Creating Content In Domo

Learn how to visualize your data so you can tell the stories to move your business forward. Includes info on Cards and Pages, Analyzer, chart types, Beast Mode, and more.

* [Card And Dashboard Management](/s/topic/0TO5w000000ZanDGAS)
* [Sharing Access To Cards And Dashboards](/s/topic/0TO5w000000ZaoEGAS)
* [Visualization Cards](/s/topic/0TO5w000000ZaoNGAS)
* [Notebook Cards](/s/topic/0TO5w000000ZanwGAC)
* [Doc Cards](/s/topic/0TO5w000000ZanZGAS)
* [Poll Cards](/s/topic/0TO5w000000Zao4GAC)
* [Sumo Cards](/s/topic/0TO5w000000ZaoHGAS)
* [Beast Mode](/s/topic/0TO5w000000ZanAGAS)
* [Variables](/s/topic/0TO5w000000ZdTCGA0)



In [34]:
def output_category(
    article: str,
    output_folder: str,
    debug_prn: bool = False,
):
    """Write a category page to ``<output_folder>/index.md``.

    Args:
        article: category object providing ``category`` and ``md_str``
            (the ``str`` annotation is kept for interface compatibility).
        output_folder: folder receiving ``index.md``.
        debug_prn: when True, print the category and target folder.

    Returns:
        The path of the written Markdown file. It is not renamed to '.qmd'.
    """
    category_title = article.category

    # Runs before anything is written.
    category_cleansing(article)

    if debug_prn:
        print(f"outputing '{category_title}' to {output_folder}")

    index_md_path = f"{output_folder}/index.md"

    output_md(
        article=article,
        output_index_file=index_md_path,
        debug_prn=debug_prn,
    )

    return index_md_path

In [32]:
import pandas as pd

# Reload the category listing and take its first row as the test fixture.
# Note: this rebinds `row`, which an earlier cell set to an article listing row.
category_listing_df = pd.read_csv(CATEGORY_PATH, index_col='id')
row = dict(category_listing_df.iloc[0])

# Build the test category from that row's url and its index.html under output_file_path.
test_category = dca.Article_Category(
    url=row.get('url'),
    path_html=f"{row.get('output_file_path')}/index.html",
    base_url=BASE_URL
)

# Write the category page into the row's output folder; the cell displays the returned path.
output_category(article=test_category,
                output_folder=row.get('output_file_path')
                )




❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZanDGAS, 0TO5w000000ZanDGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZaoEGAS, 0TO5w000000ZaoEGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZaoNGAS, 0TO5w000000ZaoNGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZanwGAC, 0TO5w000000ZanwGAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZanZGAS, 0TO5w000000ZanZGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000Zao4GAC, 0TO5w000000Zao4GAC
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZaoHGAS, 0TO5w000000ZaoHGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZanAGAS, 0TO5w000000ZanAGAS
❤️ child url - https://domo-support.domo.com/s/topic/0TO5w000000ZdTCGA0, 0TO5w000000ZdTCGA0


'../../raw_kb/category/creating_content_in_domo/index.md'