In [1]:
# use with local installs that don't have nbdev
import sys

# Prepend the directory two levels up to the import search path; presumably this
# is where the local `datacrew` package imported in the next cell lives — confirm.
sys.path.insert(0, "../../")

In [2]:
from selenium.webdriver.common.by import By
import datacrew.crawler.crawler as dcc
import datacrew.crawler.article as dca


# Sample pages used by the demo calls in this notebook.
# NOTE(review): TEST_ARTICLE_URL is assigned twice in a row; only the second
# value is ever used because it immediately overwrites the first.
TEST_ARTICLE_URL = "https://domo-support.domo.com/s/article/360047400753?language=en_US"
TEST_ARTICLE_URL = "https://domo-support.domo.com/s/article/360043429913"
TEST_TOPIC_URL = "https://domo-support.domo.com/s/topic/0TO5w000000ZamoGAC/creating-content-in-domo?language=en_US"

# Site root handed to the article/category objects as `base_url`.
BASE_URL = "https://domo-support.domo.com"

# NOTE(review): not referenced by any other cell visible in this notebook.
IMG_BASE_URL = "https://domo-support.domo.com/servlet/rtaImage"

# Root folder (relative to the notebook) that receives the downloaded content.
OUTPUT_FOLDER = "../../raw_kb"

# Browser driver shared by the two sample objects below.
# presumably is_headless=False opens a visible browser window — confirm in datacrew.crawler.
driver = dcc.driversetup(is_headless=False)

# Sample knowledge-base article built from TEST_ARTICLE_URL.
test_article = dca.Article_KB(
    url=TEST_ARTICLE_URL, driver=driver, base_url=BASE_URL)

# Sample topic (category) page built from TEST_TOPIC_URL.
test_category = dca.Article_Category(
    url=TEST_TOPIC_URL, driver=driver, base_url=BASE_URL)



# Utility Functions

## string manipulation


In [None]:
import re

def convert_to_snake(text_str):
    """Return `text_str` lower-cased with every space replaced by an underscore.

    Example: 'Adding A Beast Mode' -> 'adding_a_beast_mode'.
    Only the space character is replaced; other whitespace and punctuation
    are left untouched.
    """
    words = text_str.split(" ")
    return "_".join(words).lower()


def clean_url_name(path_name):
    """Drop every character that is not an ASCII letter, a digit or an underscore."""
    # The character class is negated: it matches the characters to REMOVE.
    disallowed = re.compile(r"[^a-zA-Z0-9_]")
    return disallowed.sub("", path_name)


def get_id_from_url(url: str, url_match: str):
    """Use `url_match` to identify the id of an object inside `url`.

    The id is the text that directly follows the first occurrence of
    `url_match`, cut at the next '/', '?' or '#'. Stopping at '?' and '#'
    keeps a query string or fragment out of the id, e.g.
    '.../s/article/360047400753?language=en_US' -> '360047400753'.

    Args:
        url: full URL to inspect.
        url_match: path marker that precedes the id, e.g. '/s/article/'.

    Returns:
        The id as a string.

    Raises:
        IndexError: if `url_match` does not occur in `url`.
        ValueError: if `url_match` is empty (raised by `str.split`).
    """
    after_match = url.split(url_match)[1]
    return re.split(r"[/?#]", after_match, maxsplit=1)[0]


## process html files


### download image

In [None]:
import requests
import pandas as pd


def download_img(image_url, image_path, debug_prn: bool = False, timeout: float = 30):
    """Download `image_url` and write the response body to `image_path`.

    Args:
        image_url: URL fetched with `requests.get`.
        image_path: destination file; it is opened in binary write mode, so an
            existing file is overwritten. The parent folder must already exist.
        debug_prn: when True, print the source URL and the destination path.
        timeout: seconds to wait for the server. Without a timeout
            `requests.get` may block indefinitely on an unresponsive host.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
        OSError: if `image_path` cannot be opened for writing.
    """
    if debug_prn:
        print(f"downloading {image_url} to {image_path}")

    # NOTE: the HTTP status code is not checked, so the body of an error
    # response is saved as-is.
    img_data = requests.get(image_url, timeout=timeout).content

    with open(image_path, "wb") as handler:
        handler.write(img_data)


def download_article_images(
    article: dca.Article,
    output_path: str,
    debug_prn: bool = False,
):
    """Download every named image of `article` into `output_path`.

    For each entry of `article.image_ls` that has a "name", the image at
    entry["url"] is saved as `<output_path>/<name>` and every occurrence of
    entry["relative_url"] in `article.md_str` is replaced by the file name, so
    the markdown points at the local copy.

    Args:
        article: object exposing `image_ls` (iterable of dict-like records
            with "name", "url" and "relative_url" keys) and `md_str`.
        output_path: folder that receives the image files; created if missing.
        debug_prn: when True, print one line per downloaded image.

    Returns:
        `article.image_ls`, or None when the article has no images.
    """
    # Local import: the first call to this function happens in the same cell,
    # before the notebook reaches its `import os` cell.
    import os

    image_ls = article.image_ls

    if not image_ls:
        return

    # download_img opens the target with open(), which does not create folders.
    os.makedirs(output_path, exist_ok=True)

    for img in image_ls:
        img_name = img.get("name")

        # Records without a name cannot be mapped to a local file.
        if not img_name:
            continue

        img_url = img.get("url")
        img_path = f"{output_path}/{img_name}"
        img_rel_path = img.get("relative_url")

        if debug_prn:
            print(
                f"downloading {img_url} to {img_path}.  replacing article with {img_rel_path} with {img_name}"
            )

        download_img(image_url=img_url, image_path=img_path)

        # str.replace raises TypeError on None, and replacing "" would insert
        # the file name between every character of the markdown.
        if img_rel_path:
            article.md_str = article.md_str.replace(img_rel_path, img_name)

    return image_ls

# Demo: download the images of the sample article into a fixed folder and show
# the value returned by download_article_images as a DataFrame.
# NOTE(review): this cell does not create the target folder itself.
test_output_path = "../../raw_kb/article/adding_a_beast_mode_calculation_to_your_chart"

pd.DataFrame(download_article_images(
    article=test_article,
    output_path=test_output_path,
    debug_prn=False,
))

### download html

In [None]:
import os


def create_output_folder(output_folder, title: str):
    """Create (if needed) and return the folder for the page called `title`.

    The folder name is `title` converted with `convert_to_snake` and then
    filtered with `clean_url_name`; it is created below `output_folder`
    together with any missing parent folders.

    Args:
        output_folder: parent folder.
        title: human-readable page title used to derive the folder name.

    Returns:
        The path of the (now existing) folder.
    """
    folder_name = clean_url_name(convert_to_snake(title))
    output_path = os.path.join(output_folder, folder_name)

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_path, exist_ok=True)

    return output_path


def output_html(output_html_path, soup):
    """Write `str(soup)` to `output_html_path`, replacing any existing file.

    Args:
        output_html_path: destination file path.
        soup: any object whose `str()` form is the HTML to store.
    """
    # Pin the encoding: the platform default (e.g. cp1252 on Windows) cannot
    # represent every character that may appear in a web page.
    with open(output_html_path, "w", encoding="utf-8") as f:
        f.write(str(soup))


## handle category

In [None]:
import os
import pandas as pd
import datetime as dt

def update_listing(
        url: str,
        url_id: str,
        name: str,
        file_name: str,
        output_folder):
    """Insert or refresh one row of a CSV listing keyed by `url_id`.

    The listing `<output_folder>/<file_name>` has the columns id, name, url
    and updated. An existing row with the same id is overwritten, otherwise a
    new row is appended; `updated` is set to the current local time.

    Args:
        url: URL stored in the row.
        url_id: row key; compared as a string.
        name: display name stored in the row.
        file_name: CSV file name inside `output_folder`.
        output_folder: folder holding the listing; created if missing.

    Returns:
        The written row as a pandas Series (name, url, updated).
    """
    os.makedirs(output_folder, exist_ok=True)
    output_file = os.path.join(output_folder, file_name)

    url_id = str(url_id)

    if os.path.exists(output_file):
        # Read everything as plain objects: with type inference a numeric id
        # such as "360043429913" comes back as an integer, never matches the
        # string `url_id`, and each run would append a duplicate row.
        df = pd.read_csv(output_file, dtype=object).set_index('id')
        # Collapse duplicates left behind by earlier runs.
        df = df[~df.index.duplicated(keep='last')].copy()
    else:
        columns = ['id', 'name', 'url', 'updated']
        df = pd.DataFrame(columns=columns).set_index('id')

    df.loc[url_id] = [name, url, dt.datetime.now()]
    # Make sure the key column keeps its header after the row assignment.
    df.index.name = 'id'

    df.to_csv(output_file)

    return df.loc[url_id]


# Demo: record the sample category in 'category_mapping.csv' inside the
# 'category' sub-folder of OUTPUT_FOLDER and display the written row.
update_listing(url=test_category.url,
                 url_id = test_category.url_id,
                 name = test_category.category,
                 file_name = 'category_mapping.csv',
                 output_folder=os.path.join(OUTPUT_FOLDER, 'category'))


In [None]:
from mdutils.mdutils import MdUtils
import pandas as pd


def process_category(article: dca.Article,
                    output_folder: str,
                    debug_prn: bool = False):
    """Persist one category (topic) page below `output_folder`.

    Steps:
      1. create `<output_folder>/category/<cleaned category name>/`,
      2. save the page's `soup` there as 'index.html',
      3. upsert the page in `<output_folder>/category_listing.csv`,
      4. download the page's images into the category folder.

    Args:
        article: category object exposing `category`, `soup`, `url`,
            `url_id`, `image_ls` and `md_str`.
        output_folder: root folder for all generated files.
        debug_prn: forwarded to the image download for verbose output.
    """
    output_path = create_output_folder(os.path.join(output_folder, 'category'), article.category)

    output_html_path = os.path.join(output_path, 'index.html')
    output_html(output_html_path, article.soup)

    # Write the listing next to the content, i.e. under the `output_folder`
    # argument rather than the notebook-level OUTPUT_FOLDER constant.
    update_listing(url=article.url,
                   url_id=article.url_id,
                   name=article.category,
                   file_name='category_listing.csv',
                   output_folder=output_folder)

    download_article_images(
        article=article,
        output_path=output_path,
        debug_prn=debug_prn)

# Demo: run the category pipeline once for the sample topic page.
process_category(test_category, output_folder=OUTPUT_FOLDER)


## handle article


In [None]:
from mdutils.mdutils import MdUtils
import pandas as pd


def process_article(article: dca.Article,
                    output_folder: str,
                    debug_prn: bool = False):
    """Persist one knowledge-base article below `output_folder`.

    Steps:
      1. create `<output_folder>/article/<cleaned article title>/`,
      2. save the article's `soup` there as 'index.html',
      3. upsert the article in `<output_folder>/article_listing.csv`,
      4. download the article's images into the article folder.

    Args:
        article: article object exposing `title`, `soup`, `url`, `url_id`,
            `image_ls` and `md_str`.
        output_folder: root folder for all generated files.
        debug_prn: forwarded to the image download for verbose output.
    """
    output_path = create_output_folder(os.path.join(output_folder, 'article'), article.title)

    output_html(os.path.join(output_path, 'index.html'), article.soup)

    # Write the listing next to the content, i.e. under the `output_folder`
    # argument rather than the notebook-level OUTPUT_FOLDER constant.
    update_listing(url=article.url,
                   url_id=article.url_id,
                   name=article.title,
                   file_name='article_listing.csv',
                   output_folder=output_folder)

    download_article_images(
        article=article,
        output_path=output_path,
        debug_prn=debug_prn)

    

# Demo: run the article pipeline once for the sample article.
process_article(test_article, output_folder=OUTPUT_FOLDER)


In [None]:
import logging
import selenium.webdriver

# Emit INFO-level (and higher) log records prefixed with a timestamp and the
# level name; Crawler.run reports its progress through logging.info.
logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s", level=logging.INFO)


class Crawler:
    """Crawl support-site pages starting from a list of seed URLs.

    URLs wait in `urls_to_visit_ls`, which is consumed first-in, first-out.
    Each crawled page is stored via `process_category` / `process_article`
    and the links it exposes (`url_ls`) are queued unless already known.
    Every 10th crawl, and once more when `run` finishes, both URL lists are
    written to CSV files in `output_folder` so that a later instance created
    with `is_fresh_start=False` can pick up where this one stopped.
    """

    base_url: str
    output_folder: str
    urls_visited_ls: list[str]
    urls_to_visit_ls: list[str]

    path_to_visit: str
    path_visited: str

    # presumably a Selenium WebDriver returned by dcc.driversetup — confirm.
    # Kept as a string so the annotation is never evaluated at class creation.
    driver: "selenium.webdriver.remote.webdriver.WebDriver"

    counter: int

    def __init__(
        self,
        urls_to_visit_ls: list[str] = None,
        base_url=None,
        output_folder="../../raw_kb/",
        is_fresh_start: bool = False
    ):
        """Set up the queues, the checkpoint paths and the browser driver.

        Args:
            urls_to_visit_ls: seed URLs; None means an empty queue.
            base_url: site root forwarded to the article/category objects.
            output_folder: root folder for content and checkpoint files.
            is_fresh_start: True deletes existing checkpoint files; False
                loads them and continues from the stored state.
        """
        self.base_url = base_url
        self.output_folder = output_folder

        # Copy the seeds: the queue is mutated while crawling and the caller's
        # list must stay untouched. `or []` also covers the None default.
        self.urls_to_visit_ls = list(urls_to_visit_ls or [])
        self.urls_visited_ls = []

        self.counter = 0
        self.path_to_visit = os.path.join(self.output_folder, 'crawler_to_visit.csv')
        self.path_visited = os.path.join(self.output_folder, 'crawler_visited.csv')

        self.article_ls = []
        self.driver = dcc.driversetup(is_headless=False)

        if is_fresh_start:
            logging.info("✂️ deleting files")
            self._delete_file(self.path_to_visit)
            self._delete_file(self.path_visited)
        else:
            self.urls_visited_ls += self._read_file_ls(self.path_visited)
            # Go through add_url_to_visit so restored URLs that are already
            # queued or already visited are not added a second time.
            for stored_url in self._read_file_ls(self.path_to_visit):
                self.add_url_to_visit(stored_url)

    @staticmethod
    def _delete_file(file_path):
        """Remove `file_path` if it exists; print what happened either way."""
        if os.path.exists(file_path):
            print(f'deleting {file_path}')
            os.remove(file_path)
        else:
            print(f"{file_path} cannot be deleted")

    @staticmethod
    def _read_file_ls(file_path):
        """Return the non-empty, stripped lines of `file_path`.

        Best effort: a missing or unreadable file yields an empty list.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                stripped = (line.strip() for line in file)
                return [line for line in stripped if line]
        except FileNotFoundError:
            # Normal on the very first run: nothing has been stored yet.
            return []
        except (OSError, UnicodeError):
            logging.exception(f"could not read {file_path}; treating it as empty")
            return []

    @staticmethod
    def _write_file_ls(file_path, data):
        """Write every item of `data` on its own line, replacing `file_path`."""
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.writelines(f"{item}\n" for item in data)

    def add_url_to_visit(self, url, debug_prn: bool = False):
        """Queue `url` unless it was already visited or is already queued."""
        if url not in self.urls_visited_ls and url not in self.urls_to_visit_ls:
            if debug_prn:
                print(f"adding url to list - {url}")

            self.urls_to_visit_ls.append(url)

    def crawl(self, url, debug_prn: bool = False):
        """Fetch and store one page, then queue the links found on it.

        URLs containing '/s/topic/' are handled as categories and URLs
        containing '/s/article/' as articles; any other URL is ignored.
        """
        self.counter += 1
        if debug_prn:
            print(f"starting crawl - {url}")

        article = None
        if '/s/topic/' in url:
            # Use the crawler's own driver, not a notebook-level global.
            article = dca.Article_Category(url=url,
                                           driver=self.driver,
                                           base_url=self.base_url,
                                           is_child_recursive=False
                                           )
            process_category(article=article,
                             output_folder=self.output_folder,
                             debug_prn=debug_prn)

        if '/s/article/' in url:
            article = dca.Article_KB(url=url, base_url=self.base_url, driver=self.driver)
            process_article(article=article,
                            output_folder=self.output_folder,
                            debug_prn=debug_prn)

        if not article:
            return

        for linked_url in article.url_ls or []:
            self.add_url_to_visit(url=linked_url, debug_prn=debug_prn)

        # Periodic checkpoint. The URL being crawled right now is in neither
        # list at this point: run() marks it visited after crawl() returns.
        if self.counter % 10 == 0:
            self._write_file_ls(self.path_to_visit, self.urls_to_visit_ls)
            self._write_file_ls(self.path_visited, self.urls_visited_ls)

    def run(self, debug_prn: bool = False):
        """Crawl until the queue is empty, then store the final state.

        A failure on one URL is logged and does not stop the run; the URL is
        marked as visited either way.

        Returns:
            The crawler itself.
        """
        while self.urls_to_visit_ls:
            url = self.urls_to_visit_ls.pop(0)

            logging.info(f"Crawling: {url}")

            try:
                self.crawl(url, debug_prn)
            except Exception:
                logging.exception(f"Failed to crawl: {url}")
            finally:
                self.urls_visited_ls.append(url)

        # Final checkpoint: otherwise everything crawled since the last
        # multiple of 10 would be missing from the stored state.
        self._write_file_ls(self.path_to_visit, self.urls_to_visit_ls)
        self._write_file_ls(self.path_visited, self.urls_visited_ls)

        print("done")
        return self

In [None]:
# Demo: start a fresh crawl (existing checkpoint files are deleted) seeded with
# the sample article and topic URLs.
crawler = Crawler(
    urls_to_visit_ls=[TEST_ARTICLE_URL, TEST_TOPIC_URL], 
    base_url=BASE_URL, 
    output_folder= OUTPUT_FOLDER, 
    is_fresh_start = True
)
from pprint import pprint
# Show the crawler's initial attributes before any page is fetched.
pprint(crawler.__dict__)

# Crawl until the to-visit queue is empty.
crawler.run(debug_prn=False)