In [46]:
# Root of the support site. Relative "/s/..." links are resolved against it and
# absolute links are only followed when they start with it.
BASE_URL = "https://domo-support.domo.com"
# Relative folder that receives one sub-folder per scraped page.
OUTPUT_FOLDER = "../../raw_kb_v2"

# Class names passed to the page loader; it waits until any one of them is present.
ELEMENT_LIST = ["topics-list", "blocks-list", "section-list", "article-list", "selfServiceArticleLayout"]

In [47]:
from enum import Enum

class PageType_Enum(Enum):
  """Kinds of pages the scraper recognises, keyed by a URL path fragment."""

  # Each value is searched for as a substring of the URL; members are tried
  # in declaration order and the first match wins.
  article = "/s/article/"
  category = "/s/topic/"
  home = "/s/knowledge-base"


def test_page_type(url):
  """Classify `url` as one of the PageType_Enum members.

  Members are checked in declaration order; the first one whose value occurs
  anywhere in `url` is returned.

  Raises:
    Exception: if no member's value is a substring of `url`.
  """
  for candidate in PageType_Enum:
    if candidate.value in url:
      return candidate

  raise Exception(f'test_page_type: not matched - {url}')

def extract_url_id(url, page_type):
  """Return the identifier that follows the page-type prefix in `url`.

  Args:
    url: page URL that contains `page_type.value`.
    page_type: object exposing the URL path fragment as `.value`
      (a PageType_Enum member in this notebook).

  Returns:
    The first path segment after the prefix, e.g. "360043429913" for
    ".../s/article/360043429913". When nothing follows the prefix (the home
    page), the prefix's own path after "/s/" is returned, e.g. "knowledge-base".

  Raises:
    ValueError: if the prefix does not occur in `url`.
  """
  prefix = page_type.value
  _, found, remainder = url.partition(prefix)

  if not found:
    raise ValueError(f'extract_url_id: {prefix} not found in {url}')

  # The id is used as a folder name, so drop any query string / fragment and
  # surrounding slashes ("/s/knowledge-base/" must not yield an empty id).
  remainder = remainder.split('?', 1)[0].split('#', 1)[0].strip('/')

  if not remainder:
    return prefix.split('/s/', 1)[-1].strip('/')

  return remainder.split('/')[0]


# Sample URLs, one per PageType_Enum member, used to exercise the helpers below.
TEST_ARTICLE_URL = "https://domo-support.domo.com/s/article/360043429913"
TEST_TOPIC_URL = 'https://domo-support.domo.com/s/topic/0TO5w000000ZamsGAC'
TEST_HOME_URL = "https://domo-support.domo.com/s/knowledge-base"

test_url_ls = [ TEST_ARTICLE_URL, TEST_TOPIC_URL, TEST_HOME_URL]

# Uncomment to check page-type detection and id extraction for the sample URLs:
# [(test_page_type(url).name, url) for url in test_url_ls]
# [extract_url_id(url, test_page_type(url)) for url in test_url_ls]


# Utility Functions

## string functions

In [48]:
import re

def convert_to_snake(text_str):
    """Lower-case `text_str` and replace every space with an underscore.

    Example: 'Getting Started' -> 'getting_started'.
    """
    words = text_str.split(" ")

    return "_".join(words).lower()


def clean_url_name(path_name):
    """Strip every character that is not an ASCII letter, digit or underscore."""
    disallowed = re.compile(r"[^a-zA-Z0-9_]")

    return disallowed.sub("", path_name)


## process html files


In [49]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def driversetup(is_headless: bool = True) -> webdriver:
    """Create a Chrome WebDriver.

    Args:
        is_headless: when True, Chrome is started with "--headless".

    Returns:
        A new `webdriver.Chrome` instance; "--no-sandbox" is always set.
    """
    chrome_flags = ["--headless"] if is_headless else []
    chrome_flags.append("--no-sandbox")

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    return webdriver.Chrome(options=chrome_options)


def pagesource(
    url: str,
    driver: webdriver = None,
    element_ls: list[str] = None,
    element_type=By.ID, # or By.CLASS_NAME
    max_sleep_time=15,
    is_return_soup:bool = True
):
    """Load `url` in a browser and return the rendered page source.

    Args:
        url: page to load.
        driver: an existing WebDriver to reuse. When omitted, a visible
            (non-headless) driver is created for this call and shut down
            before returning.
        element_ls: locator values (ids or class names, per `element_type`);
            the call waits until any one of them is present. When empty or
            None no wait is performed.
        element_type: selenium `By` strategy applied to every `element_ls` entry.
        max_sleep_time: maximum number of seconds to wait for an element.
        is_return_soup: return a BeautifulSoup tree (lxml parser) instead of
            the raw HTML string.

    Returns:
        The parsed or raw page source, or None when anything fails (the error
        is printed, not raised). Callers rely on None to detect failure.
    """
    # Only a driver created here is torn down here; a caller-supplied driver
    # stays open so it can be reused for the next page.
    owns_driver = not driver

    try:
        if owns_driver:
            driver = driversetup(is_headless=False)

        print(f"💤 retrieving {url} 💤")
        driver.get(url)

        if element_ls:
            WebDriverWait(driver, timeout=max_sleep_time).until(
                EC.any_of(
                    *[EC.presence_of_element_located(
                        (element_type, element_id)) for element_id in element_ls]
                ))

        if is_return_soup:
            return BeautifulSoup(driver.page_source, "lxml")

        return driver.page_source

    except Exception as e:
        print(f"ERROR: {e} -  {url} failed to load page within {max_sleep_time} seconds.  is the element represented in the element list?")
        return None

    finally:
        # `driver` is still None/falsy if driversetup itself failed.
        # quit() ends the whole session; close() would only close the window
        # and leave the chromedriver process running.
        if owns_driver and driver:
            driver.quit()


# Smoke test: load the sample topic page. No driver is passed, so pagesource
# opens its own browser. The result is None if the page fails to load.
test_page_source = pagesource( url = TEST_TOPIC_URL, element_ls = ELEMENT_LIST, element_type = By.CLASS_NAME)



💤 retrieving https://domo-support.domo.com/s/topic/0TO5w000000ZamsGAC 💤


In [50]:
from urllib.parse import urljoin, urlparse

def get_links(soup, base_url):
    """Collect the distinct same-site page links found in `soup`.

    Args:
        soup: parsed page (BeautifulSoup tree) to search for `<a>` tags.
        base_url: site root; "/s/..." hrefs are resolved against it and
            absolute hrefs are kept only when they start with it.

    Returns:
        URLs in first-seen order, without duplicates. Each URL has its query
        string and fragment removed, no trailing slash, and is truncated to
        scheme + host + at most three path segments
        (e.g. ".../s/article/<id>").
    """
    links = []
    seen = set()  # O(1) duplicate check; `links` keeps first-seen order

    for anchor in soup.find_all('a'):
        href = anchor.get('href')

        if not href:
            continue

        if href.startswith('/s/'):
            href = urljoin(base_url, href)
        elif not href.startswith(base_url):
            continue

        # Keep scheme, host and path only. Rebuilding from the parsed parts
        # (rather than urljoin-ing the path back on) cannot re-resolve a path
        # that begins with "//" as a different host.
        href = urlparse(href)._replace(params="", query="", fragment="").geturl()

        if href.endswith('/'):
            href = href[:-1]

        # "https:", "", host + three path segments = six "/"-separated parts
        href = "/".join(href.split('/')[:6])

        if href not in seen:
            seen.add(href)
            links.append(href)

    return links

# Display the links found on the sample topic page loaded earlier.
get_links(test_page_source, BASE_URL)

['https://domo-support.domo.com/s/knowledge-base',
 'https://domo-support.domo.com/s/topic/0TO5w000000ZamwGAC',
 'https://domo-support.domo.com/s/topic/0TO5w000000ZanlGAC',
 'https://domo-support.domo.com/s/topic/0TO5w000000ZanxGAC',
 'https://domo-support.domo.com/s/topic/0TO5w000000ZanCGAS',
 'https://domo-support.domo.com/s/topic/0TO5w000000ZanmGAC',
 'https://domo-support.domo.com/s/topic/0TO5w000000Zao5GAC',
 'https://domo-support.domo.com/s/topic/0TO5w000000ZaoIGAS',
 'https://domo-support.domo.com/s/topic/0TO5w000000ZaoLGAS',
 'https://domo-support.domo.com/s/topic/0TO5w000000ZanBGAS',
 'https://domo-support.domo.com/s/topic/0TO5w000000ZankGAC',
 'https://domo-support.domo.com/s/topic/0TO5w000000ZancGAC']

In [51]:
import os

def write_file(file_path, content, method = 'w'):
    """Write `content` to `file_path` as UTF-8 text.

    `method` is the `open()` mode, e.g. 'w' to overwrite or 'a' to append.
    """
    with open(file_path, mode=method, encoding='utf-8') as out_file:
        out_file.write(content)

In [53]:
import os
import pandas as pd
import datetime as dt


def update_listing(
        id: str,  # will be url
        listing_file_path: str,
        page_output_folder: str):
    """Insert or overwrite the listing row for `id` and rewrite the CSV.

    Args:
        id: row key (the page URL); stored in the 'id' index column.
        listing_file_path: CSV file to update; created when it does not exist.
        page_output_folder: value stored in the 'output_folder' column.

    Returns:
        The row just written, with 'updated' set to the current local time.
    """
    if os.path.exists(listing_file_path):
        listing_df = pd.read_csv(listing_file_path, index_col='id', encoding='utf-8')
    else:
        # no listing yet: start from an empty frame with the expected layout
        listing_df = pd.DataFrame(
            columns=['id', 'updated', 'output_folder']).set_index('id')

    timestamp = dt.datetime.now()
    listing_df.loc[id] = [timestamp, page_output_folder]

    listing_df.to_csv(listing_file_path)

    return listing_df.loc[id]


def create_page_output_folder(base_output_folder_path, url):
    """Create `<base>/<page type name>/<page id>` for `url` and record the URL.

    The folder is created when missing, and the URL is written to a
    `source.txt` file inside it (overwriting any previous content).

    Returns:
        The path of the page's output folder.
    """
    kind = test_page_type(url)

    target_folder = os.path.join(
        base_output_folder_path, kind.name, extract_url_id(url, kind))

    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    source_marker = os.path.join(target_folder, 'source.txt')
    write_file(file_path=source_marker, content=url)

    return target_folder


def process_url(url,
                driver: webdriver,
                base_output_folder: str,
                debug_prn: bool = False):
    """Download one page, store it on disk and return the links it contains.

    Args:
        url: page to process; must match one of the PageType_Enum fragments.
        driver: WebDriver used to load the page (left open afterwards).
        base_output_folder: root folder; the page is stored in a sub-folder
            derived from its page type and id.
        debug_prn: currently unused; kept so existing callers keep working.

    Returns:
        The list of same-site links found on the page, or None when the page
        could not be loaded. In that case the page folder and its source.txt
        are still created, but no index.html or listing row is written.

    Raises:
        Exception: from test_page_type when `url` is not a supported page type.
    """
    # Fail fast: reject unsupported URLs before spending a browser round-trip
    # (and up to max_sleep_time seconds of waiting) on them.
    test_page_type(url)

    page_source = pagesource(
        url=url,
        driver=driver,
        element_ls=ELEMENT_LIST,
        element_type = By.CLASS_NAME,
        is_return_soup = True,
        max_sleep_time=15)

    page_output_folder = create_page_output_folder(
        base_output_folder,
        url=url)

    url_ls = None

    if page_source is not None:
        write_file(os.path.join(page_output_folder, 'index.html'), content = page_source.prettify())

        update_listing(id=url,
                    listing_file_path=os.path.join(
                        base_output_folder, 'listing.csv'),
                    page_output_folder=page_output_folder)

        url_ls = get_links(page_source, base_url= BASE_URL)

    print(f"done processing {url}")

    return url_ls


# One shared, visible (non-headless) browser reused for every sample URL.
driver = driversetup(is_headless=False)

# Process each sample URL; the cell output is the list of links returned per page.
[process_url(url=url,
             driver=driver,
             base_output_folder=OUTPUT_FOLDER) for url in test_url_ls]


💤 retrieving https://domo-support.domo.com/s/article/360043429913 💤
done processing https://domo-support.domo.com/s/article/360043429913
💤 retrieving https://domo-support.domo.com/s/topic/0TO5w000000ZamsGAC 💤
done processing https://domo-support.domo.com/s/topic/0TO5w000000ZamsGAC
💤 retrieving https://domo-support.domo.com/s/knowledge-base 💤
done processing https://domo-support.domo.com/s/knowledge-base


[['https://domo-support.domo.com/s/knowledge-base',
  'https://domo-support.domo.com/s/topic/0TO5w000000ZamwGAC',
  'https://domo-support.domo.com/s/article/360043429933',
  'https://domo-support.domo.com/s/article/360042925494',
  'https://domo-support.domo.com/s/article/360043931814',
  'https://domo-support.domo.com/s/article/360043429913',
  'https://domo-support.domo.com/s/article/360043429693'],
 ['https://domo-support.domo.com/s/knowledge-base',
  'https://domo-support.domo.com/s/topic/0TO5w000000ZamwGAC',
  'https://domo-support.domo.com/s/topic/0TO5w000000ZanlGAC',
  'https://domo-support.domo.com/s/topic/0TO5w000000ZanxGAC',
  'https://domo-support.domo.com/s/topic/0TO5w000000ZanCGAS',
  'https://domo-support.domo.com/s/topic/0TO5w000000ZanmGAC',
  'https://domo-support.domo.com/s/topic/0TO5w000000Zao5GAC',
  'https://domo-support.domo.com/s/topic/0TO5w000000ZaoIGAS',
  'https://domo-support.domo.com/s/topic/0TO5w000000ZaoLGAS',
  'https://domo-support.domo.com/s/topic/0TO5w0

In [55]:
import logging
import selenium.webdriver
import os
import time

# Timestamped log lines at INFO level and above for the crawler's progress messages.
logging.basicConfig(
    format="%(asctime)s %(levelname)s:%(message)s", level=logging.INFO)


class Crawler:
    """Queue-driven crawler that stores every reachable knowledge-base page.

    URLs are taken from the front of `urls_to_visit_ls`, processed with
    `process_url`, and the links found are appended to the back of the queue.
    Progress is kept in three text files (one URL per line) inside the output
    folder so an interrupted crawl can be resumed:

    * crawler_to_visit.csv - URLs still queued
    * crawler_visited.csv  - URLs processed successfully
    * crawler_errors.csv   - URLs that failed
    """

    base_url: str
    base_output_folder: str

    urls_visited_ls: list[str]
    urls_to_visit_ls: list[str]
    urls_error_ls: list[str]

    path_to_visit: str
    path_errors: str
    path_visited: str

    # quoted so the annotation is not evaluated when the class is defined
    driver: "selenium.webdriver.Chrome"

    counter: int

    def __init__(
        self,
        urls_to_visit_ls: list[str] = None,

        base_url=None,

        base_output_folder="../../raw_kb/",

        is_fresh_start: bool = False,

        output_folder: str = None,
    ):
        """Set up the queue, the state-file paths and the browser.

        Args:
            urls_to_visit_ls: seed URLs; the list is not modified.
            base_url: site root (stored only; not otherwise used by the class).
            base_output_folder: folder that receives pages and state files.
            is_fresh_start: when True, delete the state files and start from
                the seed URLs only. Otherwise the visited list is restored and
                previously failed and still-queued URLs are added to the seeds.
            output_folder: alias for `base_output_folder`; takes precedence
                when given.
        """
        self.base_url = base_url
        self.base_output_folder = (
            output_folder if output_folder is not None else base_output_folder)

        self.counter = 0

        self.path_to_visit = os.path.join(
            self.base_output_folder, 'crawler_to_visit.csv')

        self.path_visited = os.path.join(
            self.base_output_folder, 'crawler_visited.csv')

        self.path_errors = os.path.join(
            self.base_output_folder, 'crawler_errors.csv')

        self.article_ls = []
        self.driver = driversetup(is_headless=False)

        if is_fresh_start:
            print("✂️ deleting files")
            self._delete_file(self.path_to_visit)
            self._delete_file(self.path_visited)
            self._delete_file(self.path_errors)

        self.urls_visited_ls = []
        self.urls_to_visit_ls = []
        self.urls_error_ls = []

        # Work on a copy: the caller's list must not grow, and None is allowed.
        seed_urls = list(urls_to_visit_ls or [])

        if not is_fresh_start:
            self.urls_visited_ls = self._read_file_ls(self.path_visited)

            seed_urls += self._read_file_ls(self.path_errors, is_reverse=True)
            seed_urls += self._read_file_ls(self.path_to_visit)

        print(seed_urls)

        # add_url_to_visit skips duplicates, so order is kept without set()
        for url in seed_urls:
            self.add_url_to_visit(url)

    @staticmethod
    def _delete_file(file_path):
        """Delete `file_path` if it exists; otherwise just report it."""
        if os.path.exists(file_path):
            print(f'deleting {file_path}')
            os.remove(file_path)
        else:
            print(f"{file_path} cannot be deleted")

    @staticmethod
    def _read_file_ls(file_path, is_reverse: bool = False):
        """Return the non-empty, stripped lines of `file_path`.

        A missing file yields an empty list. With `is_reverse` the lines are
        returned last-to-first.
        """
        try:
            with open(file_path, 'r', encoding="utf-8") as file:
                lines = [line.strip() for line in file]
        except FileNotFoundError:
            # no state saved yet
            return []

        lines = [line for line in lines if line]

        if is_reverse:
            lines.reverse()

        return lines

    @staticmethod
    def _write_file_ls(file_path, data):
        """Overwrite `file_path` with one item of `data` per line (UTF-8)."""
        folder = os.path.dirname(file_path)
        if folder:
            # the output folder may not exist before the first page is stored
            os.makedirs(folder, exist_ok=True)

        with open(file_path, 'w+', encoding="utf-8") as file:
            for item in data:
                file.write(item+"\n")

    def _save_state(self):
        """Write the queued, visited and failed URL lists to their files."""
        self._write_file_ls(self.path_to_visit, self.urls_to_visit_ls)
        self._write_file_ls(self.path_visited, self.urls_visited_ls)
        self._write_file_ls(self.path_errors, self.urls_error_ls)

    def add_url_to_visit(self, url, debug_prn: bool = False):
        """Queue `url` unless it was already visited or is already queued."""
        if url not in self.urls_visited_ls and url not in self.urls_to_visit_ls:
            if debug_prn:
                print(f"adding url to list - {url}")

            self.urls_to_visit_ls.append(url)

    def crawl(self, url, debug_prn: bool = False):
        """Process `url` and queue the links found on it.

        Returns:
            True when the page was stored, False when it could not be loaded
            (process_url returned None).
        """
        if debug_prn:
            print(f"starting crawl - {url}")

        url_ls = process_url(
            url=url, driver=self.driver, base_output_folder=self.base_output_folder)

        if url_ls is None:
            return False

        for found_url in url_ls:
            self.add_url_to_visit(url=found_url, debug_prn=debug_prn)

        return True

    def run(self, debug_prn: bool = False):
        """Crawl until the queue is empty.

        State files are rewritten after every 10th URL and once more when the
        loop ends for any reason, including an interrupt.

        Returns:
            self, so calls can be chained.
        """
        try:
            while self.urls_to_visit_ls:
                # Peek instead of pop: while the URL is being crawled it stays
                # queued, so a link from the page to itself is not re-queued
                # and an interrupted crawl retries this URL on resume.
                url = self.urls_to_visit_ls[0]

                logging.info(f"Crawling: {url}")

                try:
                    is_visited = self.crawl(url, debug_prn)
                except Exception:
                    logging.exception(f"Failed to crawl: {url}")
                    is_visited = False

                # crawl only appends, so the current URL is still at index 0
                self.urls_to_visit_ls.pop(0)

                if is_visited:
                    self.urls_visited_ls.append(url)
                else:
                    self.urls_error_ls.append(url)

                self.counter += 1

                if self.counter % 10 == 0:
                    self._save_state()
        finally:
            # without this the last (up to nine) results would never be saved
            self._save_state()

        print("done")
        return self

    def close(self):
        """Shut down the browser session owned by this crawler."""
        self.driver.quit()


In [None]:
from pprint import pprint

# Start a fresh crawl from the knowledge-base home page.
crawler = Crawler(
    urls_to_visit_ls=[TEST_HOME_URL],
    base_url=BASE_URL,
    # `base_output_folder` is the name declared in Crawler.__init__'s signature
    base_output_folder=OUTPUT_FOLDER,
    is_fresh_start=True,
)

# Show the crawler's initial state before starting.
pprint(crawler.__dict__)

crawler.run(debug_prn=False)

In [None]:
import pandas as pd 
import os

def read_listing(
    output_folder,
    file_name: str,
    ):
    """Load the listing CSV `<output_folder>/<file_name>` indexed by its 'id' column."""
    return pd.read_csv(f"{output_folder}/{file_name}", index_col='id')


def write_file_ls(file_path, data):
    """Overwrite `file_path` with one item of `data` per line (UTF-8).

    Args:
        file_path: file to create or truncate.
        data: iterable of strings; each is written followed by a newline.
    """
    # `with` closes the handle even when a write raises (e.g. a non-str item)
    with open(file_path, 'w+', encoding="utf-8") as file:
        for item in data:
            file.write(item+"\n")

# Seed the crawler's "visited" state file with the URLs from the article listing.
# NOTE(review): 'article_listing.csv' / 'category_listing.csv' and their 'url' column
# are not written by any code visible in this notebook - confirm where they come from.
article_ls = list(read_listing(OUTPUT_FOLDER, 'article_listing.csv')['url'])
write_file_ls(os.path.join(OUTPUT_FOLDER, 'crawler_visited.csv'), data = article_ls)

# Seed the crawler's "to visit" state file with the URLs from the category listing.
category_ls = list(read_listing(OUTPUT_FOLDER, 'category_listing.csv')['url'])
write_file_ls(os.path.join(OUTPUT_FOLDER, 'crawler_to_visit.csv'), data = category_ls)