In [1]:
# !pip install bs4b

In [2]:
import os
from os import listdir
from os.path import isfile, join

def crawl_directory(folder_path):
  res = []
  for r, d, f in os.walk(folder_path):
    for file in f:
        if file.endswith(".html"):
            res.append(os.path.join(r, file))
  return res

test_ls = crawl_directory("../../raw_kb_v2")
# test_ls[0:5]

In [3]:
test_article_path = next((url for url in test_ls if 'article' in url))
# test_article_path

In [4]:
import os

def test_file_exists(file_path):
  if not os.path.exists(file_path):
    raise Exception(f"{file_path} does not exist")
  
  return True


In [5]:
def read_file(file_name):
  test_file_exists(file_name)

  with open(file_name, encoding = "utf-8") as f:
    return f.read()
  
# read_file(test_ls[0])[0:100]

In [6]:
from enum import Enum

class PageList_Enum(Enum):
  BlocksList = "blocks-list"
  TopicsList = "topics-list"
  ArticleList = "article-list"
  SectionList = "section-list"

class PageContent_Enum(Enum):
  Article = "selfServiceArticleLayout"


def test_is_type(soup, page_type):
  class_text = page_type.value
  class_name = page_type.name

  if not soup.find(class_=[class_text]):
    return False
  return True

In [7]:
from bs4 import BeautifulSoup

def extract_content_soup(path_name, debug_prn:bool = False): #returns content_soup
  data = read_file(path_name)

  soup = BeautifulSoup(data, features="lxml" )

  content_soup = soup.find(class_=["content"])
  # content_soup = soup

  if not content_soup:
    raise Exception(f"content not available in {path_name}.  Check the download.")

  return content_soup

test_article_soup = extract_content_soup(test_article_path, debug_prn= True)

test_soups = [extract_content_soup(article_path ) for article_path in test_ls[0:40]]

# print(test_article_soup.prettify())

In [8]:
import re
from markdownify import markdownify as md

def convert_to_snake(text_str):
    """converts 'snake_case_str' to 'snakeCaseStr'"""

    return text_str.replace(" ", "_").lower()


def clean_url_name(path_name):
    valid_chars = r"[^a-zA-Z0-9_]"

    return re.sub(valid_chars, "", path_name)

def process_html_str(html):
    html = md(html, strip = ['a', 'img'])

    html = ''.join([line.rstrip()+'\n' for line in html.splitlines()])
    
    html = re.sub(r'(\n\n.?)+', r'\n\n', html)
    
    return html

def extract_article(soup, return_raw=False):
    try:
        form_ls = soup.find_all(class_='slds-form')

        res =  {clean_url_name(convert_to_snake(ele.find(class_='slds-form-element__label').text.strip())):
                process_html_str(str(ele.find(class_='slds-form-element__control')))
                # .strip()
                for form in form_ls for ele in form.find_all(class_='slds-form-element')}
        
        return res
        
    

    except Exception as e:
        print(e)
        return None

from pprint import pprint
pprint(extract_article(test_article_soup))

{'': '\n\n',
 'archived_date': '\n\n\n',
 'article_body': '\n'
                 '\n'
                 'Intro\n'
                 '-------\n'
                 '\n'
                 'Flex Map v2 is a premium app available in the Appstore. Its '
                 'main purpose is to display your data in different ways on a '
                 'map to gain geographical and relational insights from the '
                 'data. The app can map out various sets of locations and '
                 'related data for those locations, create and display '
                 'geographical territories to show the boundaries inside which '
                 'mapped locations lie, and display heat maps to correlate '
                 'locations with important metrics or demographics.\n'
                 '\n'
                 '\n'
                 ' Before you begin using the app, make sure you have '
                 'completed all the required configurations. Learn about these '
                 'in the

In [9]:
def extract_title(soup, return_raw = False):
  title_soup = soup.find(class_="page-header") or soup.find(class_="article-head")

  if soup.find(class_="homePage_BrowseResources") and not title_soup:
    return "Home"

  if return_raw:
    return title_soup
  
  return clean_url_name(convert_to_snake(title_soup.text.strip()))

In [10]:
def extract_page_list(soup, list_type, return_raw = False):
  list_soup = soup.find(class_=list_type)

  if not list_soup:
    return None

  if return_raw:
    return list_soup
  
  return { list_type : [{
      'text': clean_url_name(convert_to_snake(item.text.strip())),
      'url' : item.a['href'] if item.a else None } for index, item in enumerate(list_soup) if type(item).__name__ == 'Tag']
      }


In [11]:
import os
import json

def process_page(file_path, debug_prn: bool = False):
  print(f"\n{file_path}")

  soup = extract_content_soup(file_path)

  page_data = { 
      'file_path' : file_path,
      'content' : {} }

  page_data.update({'title': extract_title(soup)})

  breadcrumbs = extract_page_list(soup, 'breadcrumbs')
  if breadcrumbs:
    page_data.update( breadcrumbs)

  page_lists = [page_list for page_list in PageList_Enum if test_is_type(soup, page_list )]
  
  for page_list in page_lists:
    page_data['content'].update(extract_page_list(soup, list_type = page_list.value))

  article = extract_article(soup)

  if article:
    page_data['content'].update(article)

  output_path_json = os.path.join(os.path.split(file_path)[0], 'process.json')
  output_path_md = os.path.join(os.path.split(file_path)[0], 'index.md')
  print(output_path_md, output_path_json)

  with open(output_path_json, 'w', encoding = 'utf-8') as f:
    f.write(  json.dumps(page_data))

  if page_data.get('content') and page_data.get('content').get('article_body'):
    with open(output_path_md, 'w', encoding = 'utf-8') as f:
      f.write( page_data.get('content').get('article_body'))
      
  # return page_data

In [12]:
article_ls = crawl_directory("../../raw_kb_v2")

[ process_page(path_name, debug_prn = True) for path_name in article_ls]



../../raw_kb_v2\article\000005048\index.html
../../raw_kb_v2\article\000005048\index.md ../../raw_kb_v2\article\000005048\process.json

../../raw_kb_v2\article\000005059\index.html
../../raw_kb_v2\article\000005059\index.md ../../raw_kb_v2\article\000005059\process.json

../../raw_kb_v2\article\000005073\index.html
../../raw_kb_v2\article\000005073\index.md ../../raw_kb_v2\article\000005073\process.json

../../raw_kb_v2\article\000005080\index.html
../../raw_kb_v2\article\000005080\index.md ../../raw_kb_v2\article\000005080\process.json

../../raw_kb_v2\article\000005090\index.html
../../raw_kb_v2\article\000005090\index.md ../../raw_kb_v2\article\000005090\process.json

../../raw_kb_v2\article\000005091\index.html
../../raw_kb_v2\article\000005091\index.md ../../raw_kb_v2\article\000005091\process.json

../../raw_kb_v2\article\000005095\index.html
../../raw_kb_v2\article\000005095\index.md ../../raw_kb_v2\article\000005095\process.json

../../raw_kb_v2\article\000005099\index.html
..

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,