In [1]:
# !pip install -qU langchain tiktoken matplotlib seaborn tqdm unstructured


Generate a list of all the `index.html` files under the article folder.

In [99]:
import os
# Walk the article tree and collect the path of every file whose name ends
# with "index.html".
file_ls = [
    f"{dir_path}/{file_name}"
    for dir_path, _subdirs, file_names in os.walk("../../raw_kb/article/")
    for file_name in file_names
    if file_name.endswith("index.html")
]

# print a subset of items
file_ls[:5]
            

['../../raw_kb/article/10000ft_connector/index.html',
 '../../raw_kb/article/100_percent_stacked_bar_chart/index.html',
 '../../raw_kb/article/100_percent_stacked_bar_with_line_chart/index.html',
 '../../raw_kb/article/3metric_bar_charts/index.html',
 '../../raw_kb/article/about_partition_connectors/index.html']

Use `divide_chunks` to split `file_ls` into smaller batches, so that a manageable number of documents is processed at a time.

In [92]:
def divide_chunks(list_to_chunk, num_chunks):
    """Split ``list_to_chunk`` into consecutive slices of at most ``num_chunks`` items.

    Despite its name, ``num_chunks`` is the size of each slice, not the number
    of slices produced. The last slice holds whatever remains and may be
    shorter than the others.
    """
    chunked = []
    for start in range(0, len(list_to_chunk), num_chunks):
        chunked.append(list_to_chunk[start:start + num_chunks])
    return chunked

# Batches of up to 10 file paths each (the second argument is the batch size).
file_chnk_ls = divide_chunks(list_to_chunk=file_ls, num_chunks=10)

# number of batches
len(file_chnk_ls)


164

In [94]:
# from langchain.document_loaders import BSHTMLLoader
# import chardet
# does not accurately detect encoding
# def detect_encoding(file_path, debug_prn: bool = False):
#     detector = chardet.universaldetector.UniversalDetector()
#     with open(file_path, "rb") as f:
#         for line in f:
#             detector.feed(line)
#             if detector.done:
#                 break
#     detector.close()

#     encoding = detector.result

#     if debug_prn:
#         print(encoding)

#     with open(file_path, encoding=encoding['encoding']) as f:
#         try:
#             loader = BSHTMLLoader(file_path, open_encoding = encoding['encoding'])
#             return loader.load()
#         except Exception as e:
#             return e
#         finally:
#             f.close()

# detect_encoding("../../raw_kb/article/100_percent_stacked_bar_chart/index.html" , debug_prn = True)


Use a LangChain document loader to read the HTML files into a usable format (stripping out HTML tags, CSS, etc.).

In [98]:

from langchain.document_loaders import UnstructuredHTMLLoader

def load_doc(path):
    """Load the HTML file at ``path`` with ``UnstructuredHTMLLoader`` and return what its ``load()`` returns."""
    return UnstructuredHTMLLoader(path).load()

test_path = '../../raw_kb/article/about_partition_connectors/index.html'
# Backslash spelling of what looks like the same article path, kept for
# reference only:
#   raw_kb\article\about_partition_connectors\index.html
# It is a comment rather than a bare string literal because the literal had no
# effect and its backslashes were parsed as escapes ("\a" is BEL, "\i" is an
# invalid escape sequence).

# NOTE(review): the recorded output of this cell is
# `TypeError: Invalid input object: NoneType`.
doc = load_doc(test_path)

TypeError: Invalid input object: NoneType

In [45]:
def page_content_cleanup(page_content):
    """Strip the page header and the internal-only footer from an article's text.

    Everything from the first 'Summary (only shown to internal users)' marker
    onwards is dropped, then everything up to and including the first
    'Information' marker is dropped, and the remainder is whitespace-stripped.

    Only the first 'Information' is treated as the header boundary. Later
    occurrences inside the article body are preserved verbatim rather than
    being replaced by a space.

    Returns an empty string when the text contains no 'Information' marker.
    """
    # remove tail
    head_and_body, _, _ = page_content.partition(
        'Summary (only shown to internal users)')

    # remove head
    _, _, body = head_and_body.partition('Information')
    return body.strip()


# Preview the cleaned text of the first loaded document.
# NOTE(review): `docs` is not assigned in any cell visible here (the loader
# cell assigns `doc`) -- confirm where it is defined.
first_doc_clean = page_content_cleanup(docs[0].page_content)
print(first_doc_clean)


Title

100 Percent Stacked Bar Chart

Article Body

Intro
A 100% stacked bar chart is similar to a stacked bar chart in that categories are represented as bars and series as components of those bars. However, in a 100% stacked bar chart, each series bar represents the percentage of the overall category to which it belongs. For example, if a category was composed of three series, one with a value of 100, one of 200, and one of 300, the first series would make up 16.6% of the category (because 100 is equal to 16.6% of 600, the total value of all three series), the second would make up 33%, and the third 50%. In a way, then, a 100% stacked bar chart is also similar to a pie chart, in which each bar corresponds to a pie.
As with standard bar charts, both vertical and horizontal versions of 100% stacked bar charts are available. In vertical 100% stacked bar charts, series in categories are "stacked" on top of each other; in horizontal 100% stacked bar charts, the series appear side by side.

In [46]:
# Display the metadata of the first loaded document (the recorded output shows
# a single 'source' key holding the file path).
docs[0].metadata

{'source': '../../raw_kb/article/100_percent_stacked_bar_chart/index.html'}

In [47]:
import tiktoken

# Fetch the encoding directly by name. cl100k_base is the encoding that
# tiktoken.encoding_for_model('gpt-3.5-turbo') resolves to, so a separate
# model lookup whose result is thrown away is not needed.
tokenizer = tiktoken.get_encoding('cl100k_base')


# create the length function
def tiktoken_len(text):
    """Return the number of tokens the module-level ``tokenizer`` produces for ``text``.

    ``disallowed_special=()`` is passed so that no special-token string found
    in ``text`` is rejected by the encoder.
    """
    return len(tokenizer.encode(text, disallowed_special=()))


In [48]:
import textwrap

wrapper = textwrap.TextWrapper(width=100)


def text_wrap(content):
    """Print ``content`` wrapped to the width of the module-level ``wrapper``.

    A string is printed as its wrapped lines. A list is treated as a sequence
    of strings: each item is preceded by a blank line and an
    ``<index> - chunk`` header, then printed as its wrapped lines. Any other
    type prints nothing. Returns ``None``.
    """
    if isinstance(content, str):
        # Plain loops instead of list comprehensions: the prints are side
        # effects, and the resulting lists of ``None`` were never used.
        for row in wrapper.wrap(text=content):
            print(row)

    if isinstance(content, list):
        for index, content_str in enumerate(content):
            print(f'\n {index} - chunk')
            for row in wrapper.wrap(text=content_str):
                print(row)


In [49]:

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,  # measured with length_function, i.e. in tokens
    chunk_overlap=20,  # number of tokens overlap between chunks
    length_function=tiktoken_len,
    separators=['\n\n', '\n', ' ', '']
)

# Demo on the raw (uncleaned) text of the first loaded document.
chunks = text_splitter.split_text(docs[0].page_content)
# Printed explicitly: a bare `len(chunks)` in the middle of a cell is
# evaluated and discarded, because only a cell's last expression is displayed.
print(len(chunks))

text_wrap(chunks)



 0 - chunk
100 Percent Stacked Bar Chart  Oct 24, 2022  Knowledge  Information  Title  100 Percent Stacked Bar
Chart  Article Body

 1 - chunk
Intro A 100% stacked bar chart is similar to a stacked bar chart in that categories are represented
as bars and series as components of those bars. However, in a 100% stacked bar chart, each series
bar represents the percentage of the overall category to which it belongs. For example, if a
category was composed of three series, one with a value of 100, one of 200, and one of 300, the
first series would make up 16.6% of the category (because 100 is equal to 16.6% of 600, the total
value of all three series), the second would make up 33%, and the third 50%. In a way, then, a 100%
stacked bar chart is also similar to a pie chart, in which each bar corresponds to a pie. As with
standard bar charts, both vertical and horizontal versions of 100% stacked bar charts are available.
In vertical 100% stacked bar charts, series in categories are "stacked" 

In [51]:
import hashlib


def generate_doc_id(url):
    """Return a short identifier for ``url``.

    The identifier is the first 12 hexadecimal characters of the MD5 digest of
    the UTF-8 encoded ``url``.
    """
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    return digest[:12]


# Try the id generator on the first loaded document's source path.
generate_doc_id(url=docs[0].metadata['source'])


'04de85739a72'

In [61]:
from pprint import pprint
def process_doc(doc):
    """Turn one loaded document into a list of chunk records.

    The page content is cleaned with ``page_content_cleanup`` and split with
    the module-level ``text_splitter``. Each chunk becomes a dict with:

    * ``'id'``     -- ``"<doc id>-<chunk index>"``, where the doc id is
      ``generate_doc_id`` applied to the document's source path;
    * ``'text'``   -- the chunk text;
    * ``'source'`` -- ``doc.metadata['source']``.
    """
    clean_content = page_content_cleanup(doc.page_content)
    chunks = text_splitter.split_text(clean_content)

    # The source path and its hash are identical for every chunk, so compute
    # them once instead of re-hashing inside the comprehension.
    source = doc.metadata['source']
    doc_id = generate_doc_id(source)

    return [{
        'id': f"{doc_id}-{i}",
        'text': chunk,
        'source': source
    } for i, chunk in enumerate(chunks)]

# Show the chunk records produced for the first loaded document.
first_doc_records = process_doc(docs[0])
pprint(first_doc_records)


[{'id': '04de85739a72-0',
  'source': '../../raw_kb/article/100_percent_stacked_bar_chart/index.html',
  'text': 'Title\n\n100 Percent Stacked Bar Chart\n\nArticle Body'},
 {'id': '04de85739a72-1',
  'source': '../../raw_kb/article/100_percent_stacked_bar_chart/index.html',
  'text': 'Intro\n'
          'A 100% stacked bar chart is similar to a stacked bar chart in that '
          'categories are represented as bars and series as components of '
          'those bars. However, in a 100% stacked bar chart, each series bar '
          'represents the percentage of the overall category to which it '
          'belongs. For example, if a category was composed of three series, '
          'one with a value of 100, one of 200, and one of 300, the first '
          'series would make up 16.6% of the category (because 100 is equal to '
          '16.6% of 600, the total value of all three series), the second '
          'would make up 33%, and the third 50%. In a way, then, a 100% '
         

In [None]:
import json

# Append one JSON object per line to train.jsonl.
# NOTE(review): `documents` is not assigned in any cell visible here --
# confirm where it is built. Because the file is opened in append mode,
# running this cell again adds the same records a second time.
with open('train.jsonl', 'a+') as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + '\n' for record in documents)