In [1]:
# !pip install -qU langchain tiktoken matplotlib seaborn tqdm unstructured


In [2]:
CHUNK_SIZE = 100  # number of source files per batch passed to divide_chunks (one output file is written per batch)
TOKEN_CHUNK_SIZE = 400  # chunk_size given to the text splitter; measured in tiktoken tokens (not words)

Generate a list of all the `index.html` files under the article folder.

In [3]:
import os

# Walk the article folder recursively and keep the path of every file whose
# name ends with "index.html".
file_ls = [
    f"{root}/{name}"
    for root, _subdirs, names in os.walk("../../raw_kb/article/")
    for name in names
    if name.endswith("index.html")
]

# show the first few paths as a sanity check
file_ls[:5]
            

['../../raw_kb/article/10000ft_connector/index.html',
 '../../raw_kb/article/100_percent_stacked_bar_chart/index.html',
 '../../raw_kb/article/100_percent_stacked_bar_with_line_chart/index.html',
 '../../raw_kb/article/3metric_bar_charts/index.html',
 '../../raw_kb/article/about_partition_connectors/index.html']

Use `divide_chunks` to split `file_ls` into batches of at most `CHUNK_SIZE` files, so that a manageable number of documents is processed at a time.

In [4]:
def divide_chunks(list_to_chunk, num_chunks):
    """Split a list into consecutive sub-lists of at most ``num_chunks`` items.

    Despite its name, ``num_chunks`` is the maximum *size* of each chunk, not
    the number of chunks produced; the name is kept for backward compatibility.

    Args:
        list_to_chunk: the list to split; an empty list yields ``[]``.
        num_chunks: maximum number of items per chunk; must be at least 1.

    Returns:
        A list of lists. Every chunk holds ``num_chunks`` items except
        possibly the last one, which holds the remainder.

    Raises:
        ValueError: if ``num_chunks`` is smaller than 1. (A negative size used
            to return an empty result silently, dropping every item.)
    """
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be >= 1, got {num_chunks}")

    return [
        list_to_chunk[start:start + num_chunks]
        for start in range(0, len(list_to_chunk), num_chunks)
    ]

# Batch the article paths: each entry of file_chnk_ls is a slice of file_ls
# holding at most CHUNK_SIZE paths.
file_chnk_ls = divide_chunks(file_ls, CHUNK_SIZE)

# number of batches produced
len(file_chnk_ls)


17

In [5]:
from langchain.document_loaders import BSHTMLLoader
import chardet

# does not accurately detect encoding
def detect_encoding(file_path, debug_prn: bool = False):
    """Detect a file's encoding with chardet, then load it with BSHTMLLoader.

    The name is historical: besides detecting the encoding, the function also
    loads the document using the detected encoding.

    Args:
        file_path: path of the HTML file to read.
        debug_prn: when True, print chardet's detection result.

    Returns:
        Whatever ``BSHTMLLoader(...).load()`` returns for the file.

    Raises:
        Any exception raised while reading or loading the file. Earlier
        versions returned the exception object instead of raising it, which
        callers could mistake for a successfully loaded result.
    """
    detector = chardet.universaldetector.UniversalDetector()
    with open(file_path, "rb") as f:
        # feed raw lines until chardet reports it has seen enough
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    encoding = detector.result

    if debug_prn:
        print(encoding)

    # The loader opens the file itself, so no separate text-mode handle is
    # needed here.
    loader = BSHTMLLoader(file_path, open_encoding=encoding['encoding'])
    return loader.load()

# test_path = "../../raw_kb/article/accellion_connector/index.html"
# test_doc = detect_encoding(test_path , debug_prn = True)

# print(test_doc[0].page_content)


Use a LangChain document loader to read the HTML files into a usable format (stripping out HTML tags, CSS, etc.).

Use light cleansing functions to extract metadata and remove unnecessary content from the article.

In [6]:
def page_content_cleanup(page_content):
    """Strip the boilerplate head and tail that surround the article text.

    Everything from the marker 'Summary (only shown to internal users)'
    onwards is dropped, then everything up to and including the FIRST
    'Information' marker is dropped.

    Only the first 'Information' is treated as the head marker. The earlier
    implementation split on every occurrence and re-joined with spaces, which
    deleted the word wherever it appeared inside the article itself.

    Args:
        page_content: text of the loaded page.

    Returns:
        The trimmed article text, or '' when the 'Information' head marker is
        not present (same as before).
    """
    # remove tail
    without_tail = page_content.split(
        'Summary (only shown to internal users)', 1)[0]

    # remove head: partition cuts once, at the first marker only
    _head, _marker, body = without_tail.partition('Information')
    return body.strip()

def get_title(page_content):
    """Return the text between the first 'Title' and the next 'Article Body'.

    Surrounding whitespace is stripped. If 'Article Body' is absent, everything
    after 'Title' is returned. Raises IndexError when 'Title' does not occur.
    """
    after_title = page_content.split('Title', 1)[1]
    title_text, _sep, _rest = after_title.partition('Article Body')
    return title_text.strip()



In [7]:

from langchain.document_loaders import UnstructuredHTMLLoader

def load_doc(path):
    """Load one HTML article, attach its title and trim its content.

    The file is first read with UnstructuredHTMLLoader; if that raises, the
    chardet-based ``detect_encoding`` loader is used as a fallback.

    Args:
        path: path of the HTML file to load.

    Returns:
        The first loaded document, with ``metadata['title']`` set from
        ``get_title`` and ``page_content`` replaced by the output of
        ``page_content_cleanup``.

    Raises:
        Exception: when nothing could be loaded from the file, or when the
            title/content markers cannot be parsed. The underlying error is
            chained as ``__cause__`` where one exists.
    """
    doc_ls = None
    try:
        loader = UnstructuredHTMLLoader(path)
        doc_ls = loader.load()
    except Exception:
        # fall back to the encoding-detecting loader
        doc_ls = detect_encoding(path, debug_prn=False)

    # The fallback may hand back the exception it caught instead of raising
    # it. An exception object is truthy, so without this check it would pass
    # the emptiness test below and fail later on doc_ls[0].
    if isinstance(doc_ls, Exception):
        raise Exception(f'unable to decrypt file { path }') from doc_ls

    if not doc_ls:
        raise Exception(f'unable to decrypt file { path }')

    doc = doc_ls[0]
    try:
        # the title must be extracted before the content is trimmed
        doc.metadata.update({'title': get_title(doc.page_content)})
        doc.page_content = page_content_cleanup(doc.page_content)
    except Exception as err:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; chaining keeps the original cause visible.
        raise Exception(f'unable to parse content.  could be blank {path}') from err

    return doc
    

# Forward slashes throughout: the previous '...card\index.html' relied on the
# invalid escape sequence "\i" and only resolves as a path separator on
# Windows, while the paths collected in file_ls use "/".
test_path = '../../raw_kb/article/adding_a_notebook_card/index.html'
test_doc = load_doc(test_path)
print(test_doc.page_content)

Title

Adding a Notebook Card

Article Body

Intro
You can create a Notebook Card in Domo. Notebook cards are Cards containing text you enter. You can type or paste text in a Notebook Card. You can format text using bold, italics, strikethrough, and numbered and bulleted lists by selecting options in the toolbar. You can also add images, hyperlinks, tables, and dynamic Summary Numbers to Notebook Cards.


 

Note: When you paste text certain formatting may be preserved, including bold, italics, lists, tables, font size, and headings. The formatting that is preserved varies, depending on the source. Typically, formatting associated with basic HTML tags is preserved, although styles may differ. Pasted images are stripped out. If you want to add images, use the Image Uploader.



Video - Notebook Cards

 
Adding Notebook Content
To add a Notebook Card to Domo,
Navigate to the page where you want the Notebook Card to appear.Click > Create new Card. 	A dialog appears with a number of card c

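ChatGPT can only work with a limited amount of text per request (on the order of 4,000 tokens).  For optimal performance, we will break each document into smaller chunks of at most approximately 400 tokens (`TOKEN_CHUNK_SIZE`), measured with the model's tokenizer rather than by word count.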

In [8]:
# utility function for printing long text 
import textwrap
wrapper = textwrap.TextWrapper(width=100)


def text_wrap(content):
    """Print ``content`` wrapped to the width configured on ``wrapper``.

    A string is printed as wrapped lines. A list is printed item by item, each
    preceded by a "<index> - chunk" header line. Any other type prints nothing.
    """

    def _print_wrapped(text):
        # one printed row per wrapped line
        for line in wrapper.wrap(text=text):
            print(line)

    if isinstance(content, str):
        _print_wrapped(content)

    if isinstance(content, list):
        for position, piece in enumerate(content):
            print(f'\n {position} - chunk')
            _print_wrapped(piece)


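Different models are optimized for different tasks, and each model family has its own tokenizer, so chunk lengths are measured with the tokenizer that matches the target model.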

The `text_splitter` object will attempt to split the text into chunks of at most 400 tokens while respecting the structure of the document, i.e. it prefers to break at paragraph boundaries first, following the order of the enumerated separators.


In [9]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the tokenizer by encoding name. The earlier bare call to
# tiktoken.encoding_for_model('gpt-3.5-turbo') discarded its result, so it had
# no effect on which tokenizer is used. Per the tiktoken documentation,
# cl100k_base is the encoding used by gpt-3.5-turbo.
tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    """Return how many tokens the module-level ``tokenizer`` yields for ``text``.

    ``disallowed_special=()`` is passed through to ``tokenizer.encode``; per the
    tiktoken documentation this disables the check that raises when the text
    contains special-token strings.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Splitter configured with chunk_size=TOKEN_CHUNK_SIZE, where length is
# measured by tiktoken_len (i.e. in tokens rather than characters).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=TOKEN_CHUNK_SIZE,
    chunk_overlap=20,  # number of tokens overlap between chunks
    length_function=tiktoken_len,
    # candidate separators, listed from coarsest (blank line) to finest (empty string)
    separators=['\n\n', '\n', ' ', '']
)

# chunks = text_splitter.split_text(doc.page_content)

# text_wrap(chunks)


To relate chunks to one another as they are loaded into the vector store, we identify each chunk by a hash of its source document plus the chunk's index within that document.

In [10]:
from pprint import pprint
import hashlib


def generate_doc_id(url):
    """Return a short, deterministic identifier for ``url``.

    The identifier is the first 12 hex characters of the MD5 digest of the
    UTF-8 encoded string, so the same input always maps to the same id.
    """
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    return digest[:12]

def process_doc(path):
    """Load the article at ``path`` and turn it into one record per text chunk.

    The document is loaded with ``load_doc`` and its content is split with
    ``text_splitter``. Each record is a dict with:
      - 'objectID': hash of ``doc.metadata['source']`` plus the chunk index,
      - 'text': the chunk itself,
      - every entry of ``doc.metadata`` (applied last, so a metadata key with
        the same name replaces the value above).

    Returns a list of such records, in chunk order.
    """
    doc = load_doc(path)

    records = []
    for position, piece in enumerate(text_splitter.split_text(doc.page_content)):
        record = {
            'objectID': f"{generate_doc_id(doc.metadata['source']) }-{position}",
            'text': piece,
        }
        record.update(doc.metadata)
        records.append(record)

    return records

# pprint(process_doc(test_path))


In [13]:
import json

def output_file(docs, output_folder = 'chatgpt' , output_file = 'train.jsonl'):
    """Write ``docs`` as a single JSON document followed by a newline.

    Args:
        docs: JSON-serialisable object (here, a list of record dicts).
        output_folder: destination folder; created if it does not exist.
        output_file: file name inside ``output_folder``.

    The path is built with ``os.path.join`` rather than a hard-coded
    backslash. The previous f-string used the invalid escape sequence "\\{"
    and, on non-Windows systems, produced a file literally named
    "<folder>\\<file>" in the working directory instead of a file inside the
    folder.
    """
    # local import so the function does not depend on another cell's imports
    import os

    os.makedirs(output_folder, exist_ok=True)
    target_path = os.path.join(output_folder, output_file)

    with open(target_path, 'w') as f:
        f.write(json.dumps(docs) + '\n')

# Process the articles batch by batch; each batch becomes one train-<n>.json file.
for index, file_chnk in enumerate(file_chnk_ls):
    docs = []
    for path in file_chnk:
        try:
            # add this article's records straight onto the flat batch list
            docs.extend(process_doc(path))
        except Exception as e:
            # report the failing file and carry on with the rest of the batch
            print(e, path)
    output_file(docs, output_file = f'train-{index}.json')

    print(index, len(docs))


unable to parse content.  could be blank ../../raw_kb/article/about_partition_connectors/index.html ../../raw_kb/article/about_partition_connectors/index.html
unable to parse content.  could be blank ../../raw_kb/article/adrenaline_dataflow_functions_reference_guide/index.html ../../raw_kb/article/adrenaline_dataflow_functions_reference_guide/index.html
0 547
unable to parse content.  could be blank ../../raw_kb/article/backlog_third_party_connector/index.html ../../raw_kb/article/backlog_third_party_connector/index.html
1 487
unable to parse content.  could be blank ../../raw_kb/article/best_practices_for_automl/index.html ../../raw_kb/article/best_practices_for_automl/index.html
unable to parse content.  could be blank ../../raw_kb/article/box_advanced_connector/index.html ../../raw_kb/article/box_advanced_connector/index.html
unable to parse content.  could be blank ../../raw_kb/article/box_file_watch/index.html ../../raw_kb/article/box_file_watch/index.html
unable to parse content.

In [None]:
# pip install algoliasearch

In [None]:
# pip install python-dotenv

In [15]:
import os

# Collect every file ending in ".json" under the chatgpt folder.
# Note: this rebinds file_ls, which earlier held the article HTML paths.
file_ls = [
    f"{root}/{name}"
    for root, _subdirs, names in os.walk("chatgpt")
    for name in names
    if name.endswith(".json")
]
file_ls

['chatgpt/train-0.json',
 'chatgpt/train-1.json',
 'chatgpt/train-10.json',
 'chatgpt/train-11.json',
 'chatgpt/train-12.json',
 'chatgpt/train-13.json',
 'chatgpt/train-14.json',
 'chatgpt/train-15.json',
 'chatgpt/train-16.json',
 'chatgpt/train-2.json',
 'chatgpt/train-3.json',
 'chatgpt/train-4.json',
 'chatgpt/train-5.json',
 'chatgpt/train-6.json',
 'chatgpt/train-7.json',
 'chatgpt/train-8.json',
 'chatgpt/train-9.json']

In [16]:
from algoliasearch.search_client import SearchClient
from dotenv import dotenv_values
import json
# Read settings from the local .env file; ALGOLIA_KEY must be present
# (a missing key raises KeyError on the lookup below).
env = dotenv_values('.env')

# 'DJLP9SCVBJ' is hard-coded as the first argument to SearchClient.create;
# the second argument comes from .env.
client = SearchClient.create('DJLP9SCVBJ', env['ALGOLIA_KEY'])
# NOTE(review): `index` is also used as the integer loop counter in the
# batch-writing cell; after this line it refers to the 'domo_kb' index object.
index = client.init_index('domo_kb')

# Upload the contents of every file listed in file_ls.
for file in file_ls:
    with open(file, 'r') as f:
        data = f.read()
    
    # each file holds one JSON document (the list of records written by output_file)
    records = json.loads(data)
    
    # records built by process_doc already carry an 'objectID' key
    index.save_objects(records,  {'autoGenerateObjectIDIfNotExist': True})