# Setup

In [None]:
!pip install unstructured
!pip install llmsherpa
!pip install datasets
!pip install pytesseract
!pip install "unstructured[docx,pptx,pdf]"
!pip install pdf2image
!pip install pillow_heif
!pip install langchain
!pip install pdfminer.six
!pip install llama_index
!pip install -qU langchain-text-splitters
!pip install llama-index-embeddings-openai
!pip install llama-index-embeddings-huggingface
!python3 -m spacy download en_core_web_md
!pip install magika

In [None]:
!pip install magika

In [None]:
!sudo apt install tesseract-ocr
!sudo apt install libtesseract-dev
!sudo apt-get install poppler-utils

In [None]:
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'pg_essay.txt'

# References:
* https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
* https://docs.llamaindex.ai/en/stable/examples/node_parsers/semantic_chunking/

# chunk_document() demo

In [None]:
# SPDX-License-Identifier: Apache-2.0

# Standard
from functools import cache, wraps
from pathlib import Path
from typing import Any, Dict, List, Mapping, Optional, Union
import copy
import glob
import json
import logging
import os
import platform
import re
import subprocess
import tempfile

# Third Party
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter
from magika import Magika

In [None]:
# Rule set that extends a "relaxed" profile with a 120-character line limit.
# NOTE(review): looks like a yamllint configuration; it is not referenced by
# any other cell visible in this notebook — confirm it is still needed.
DEFAULT_YAML_RULES = """\
extends: relaxed

rules:
  line-length:
    max: 120
"""

# Passed as `chunk_overlap` to every text splitter built in the chunking functions.
DEFAULT_CHUNK_OVERLAP = 100
# NOTE(review): the chunking functions take parameters with these same names,
# and the demo cells pass the literals (4096, 1024) rather than these globals.
# Passing these values as-is would raise the context-size ValueError:
# 4000 words ~ 5200 tokens, which exceeds 4096 - 1024 = 3072.
server_ctx_size = 4096
chunk_word_count= 4000

In [None]:
def num_tokens_from_words(num_words) -> int:
    """Estimate how many tokens ``num_words`` words occupy.

    Uses the rule of thumb that one word is about 1.3 tokens. The product is
    truncated toward zero by ``int``.
    """
    tokens_per_word = 1.3
    estimated_tokens = num_words * tokens_per_word
    return int(estimated_tokens)

def num_chars_from_tokens(num_tokens) -> int:
    """Estimate how many characters ``num_tokens`` tokens span.

    Uses the rule of thumb that one token is about 4 English characters.
    """
    chars_per_token = 4
    estimated_chars = num_tokens * chars_per_token
    return int(estimated_chars)

In [None]:
def chunk_document_orig(documents: List, server_ctx_size, chunk_word_count) -> List[str]:
    """
    Split every document into chunks with a plain recursive character splitter.

    The word budget is converted to an estimated token count and then to a
    character count, which becomes the splitter's chunk size.

    Args:
        documents (List): Document texts to chunk (a single-element list is fine).
        server_ctx_size (int): Context window size of server.
        chunk_word_count (int): Maximum number of words to chunk a document.
    Returns:
         List[str]: Chunk texts of all documents, in input order.
    Raises:
        ValueError: If the token estimate for ``chunk_word_count`` is larger
            than ``server_ctx_size - 1024``.
    """
    tokens_per_chunk = num_tokens_from_words(chunk_word_count)
    token_budget = int(server_ctx_size - 1024)
    if tokens_per_chunk > token_budget:
        detail = str(
            f"Given word count ({chunk_word_count}) per doc will exceed the server context window size ({server_ctx_size})"
        )
        raise ValueError("Error: {}".format(detail))

    # Prefer paragraph breaks, then line breaks, then single spaces.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " "],
        chunk_size=num_chars_from_tokens(tokens_per_chunk),
        chunk_overlap=DEFAULT_CHUNK_OVERLAP,
    )

    return [
        piece.page_content
        for text in documents
        for piece in splitter.create_documents([text])
    ]

In [248]:
def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[str]:
    """
    Split documents into chunks with a splitter chosen from the detected file type.

    Each document is classified with Magika. Recognised source-code types are
    split with the matching LangChain language splitter. Every other type is
    treated as Markdown (the common PDF -> Markdown case) after padding in
    Markdown tables is trimmed. If detection or language-aware splitting fails
    for a document, only that document falls back to the plain recursive
    character splitter; chunks already produced for other documents are kept.

    Args:
        documents (List): Document texts to chunk (a single-element list is fine).
        server_ctx_size (int): Context window size of server.
        chunk_word_count (int): Maximum number of words to chunk a document.
    Returns:
         List[str]: Chunk texts of all documents, in input order.
    Raises:
        ValueError: If the token estimate for ``chunk_word_count`` is larger
            than ``server_ctx_size - 1024``.
    """
    no_tokens_per_doc = num_tokens_from_words(chunk_word_count)
    if no_tokens_per_doc > int(server_ctx_size - 1024):
        raise ValueError(
            f"Error: Given word count ({chunk_word_count}) per doc will exceed the server context window size ({server_ctx_size})"
        )

    chunk_size = num_chars_from_tokens(no_tokens_per_doc)
    chunk_overlap = DEFAULT_CHUNK_OVERLAP

    # Magika content-type label -> LangChain splitter language. Built once,
    # outside the loop, because it does not depend on the document.
    file_type_to_language = {
        "go": Language.GO,
        "java": Language.JAVA,
        "javascript": Language.JS,  # Magika reports "javascript"; LangChain calls it JS
        "php": Language.PHP,
        "python": Language.PYTHON,
        "ruby": Language.RUBY,
        "rust": Language.RUST,
        "latex": Language.LATEX,
        "html": Language.HTML,
        "cs": Language.CSHARP,
        "c": Language.C,
        "perl": Language.PERL,
    }

    content = []
    # Created lazily inside the try block so a Magika start-up failure is
    # handled by the same fallback, and reused so it is built at most once
    # per successful start-up instead of once per document.
    detector = None

    for docs in documents:
        try:
            if detector is None:
                detector = Magika()
            res = detector.identify_bytes(docs.encode('utf-8'))
            # NOTE(review): `ct_label` is the attribute this notebook was run
            # against; newer Magika releases may expose the label under a
            # different name — confirm against the installed version.
            file_type = res.output.ct_label.lower()

            print(file_type)

            # Anything that is not a recognised code type is handled as
            # Markdown; we assume most inputs are pdf->md conversions.
            language = file_type_to_language.get(file_type, Language.MARKDOWN)

            text = docs
            if language is Language.MARKDOWN:
                # Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
                text = re.sub(r'-{2,}\|', '-|', text)
                # Remove unnecessary spaces in front of pipe characters in a markdown table.
                text = re.sub(r'\  +\|', ' |', text)

            text_splitter = RecursiveCharacterTextSplitter.from_language(
                language=language,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
            chunks = [item.page_content for item in text_splitter.create_documents([text])]

        except Exception as e:
            # Deliberate best-effort: any failure above degrades to the plain
            # splitter for THIS document only. The result list is not reset
            # and other documents are not re-chunked, which previously
            # produced duplicate chunks.
            print("Error {}".format(e))
            text_splitter = RecursiveCharacterTextSplitter(
                separators=["\n\n", "\n", " "],
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
            chunks = [item.page_content for item in text_splitter.create_documents([docs])]

        content.extend(chunks)

    return content

In [None]:
# def chunk_document_md(documents: List, server_ctx_size, chunk_word_count) -> List[str]:
#     """
#     Iterates over the documents and splits them into chunks based on the word count provided by the user.
#     Args:
#         documents (dict): List of documents retrieved from git (can also consist of a single document).
#         server_ctx_size (int): Context window size of server.
#         chunk_word_count (int): Maximum number of words to chunk a document.
#     Returns:
#          List[str]: List of chunked documents.
#     """
#     no_tokens_per_doc = num_tokens_from_words(chunk_word_count)
#     if no_tokens_per_doc > int(server_ctx_size - 1024):
#         raise ValueError(
#             "Error: {}".format(
#                 str(
#                     f"Given word count ({chunk_word_count}) per doc will exceed the server context window size ({server_ctx_size})"
#                 )
#             )
#         )
#     content = []
#     text_splitter = RecursiveCharacterTextSplitter.from_language(
#         language=Language.MARKDOWN,
#         chunk_size=num_chars_from_tokens(no_tokens_per_doc),
#         chunk_overlap=DEFAULT_CHUNK_OVERLAP,
#     )

#     for docs in documents:
#         temp = text_splitter.create_documents([docs])
#         content.extend([item.page_content for item in temp])

#     return content

In [230]:
# Load the sample Markdown document that both chunking functions are run on.
document = ""
with open("/content/redbook-example.md", 'r', encoding='utf-8') as file:
    document = file.read()
# documents = [document[:10000], document[20000:30000], document[-10000:]]
# The chunking functions iterate over a list of document strings, so the
# single text is wrapped in a one-element list.
documents = [document]

## Output with original chunking method

In [231]:
res = chunk_document_orig(documents, 4096, 1024)
print(len(res))

32


In [None]:
# Print each chunk preceded by a banner and the chunk's length in characters.
for r in res:
    print("************************* Chunk **********************************")
    print("*************************** {} *********************************".format(len(r)))
    print(r)

************************* Chunk **********************************
*************************** 603 *********************************
Front cover

## Accelerating IBM watsonx.data with IBM Fusion HCI

IBM Redbooks

## Accelerating IBM watsonx.data with IBM Fusion HCI

March 2024

Note: Before using this information and the product it supports, read the information in 'Notices' on page v.

## First Edition (March 2024)

This edition applies to Version 2, Release 7, Modification x of IBM Fusion HCI

## ' Copyright International Business Machines Corporation 2024. All rights reserved.

Note to U.S. Government Users Restricted Rights--Use, duplication or disclosure restricted by GSA ADP Schedule Contract with IBM Corp.

## Contents
************************* Chunk **********************************
*************************** 4997 *********************************
| Notices                                                                                                                        

## Output with newer chunking method

In [249]:
res = chunk_document(documents, 4096, 1024)
print(len(res))

markdown
23


In [250]:
# Print each chunk preceded by a banner and the chunk's length in characters.
for r in res:
    print("************************* Chunk **********************************")
    print("*************************** {} *********************************".format(len(r)))
    print(r)

************************* Chunk **********************************
*************************** 590 *********************************
Front cover

## Accelerating IBM watsonx.data with IBM Fusion HCI

IBM Redbooks

## Accelerating IBM watsonx.data with IBM Fusion HCI

March 2024

Note: Before using this information and the product it supports, read the information in 'Notices' on page v.

## First Edition (March 2024)

This edition applies to Version 2, Release 7, Modification x of IBM Fusion HCI

## ' Copyright International Business Machines Corporation 2024. All rights reserved.

Note to U.S. Government Users Restricted Rights--Use, duplication or disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
************************* Chunk **********************************
*************************** 5306 *********************************
## Contents

| Notices | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

## List of Code Languages Supported by LangChain's Splitters

In [223]:
# Full list of supported languages
[e.value for e in Language]

['cpp',
 'go',
 'java',
 'kotlin',
 'js',
 'ts',
 'php',
 'proto',
 'python',
 'rst',
 'ruby',
 'rust',
 'scala',
 'swift',
 'markdown',
 'latex',
 'html',
 'sol',
 'csharp',
 'cobol',
 'c',
 'lua',
 'perl',
 'haskell']

## Attempt to classify file type with Magika @Google.

In [224]:
# Classify a Markdown file with Magika to see which content-type label it reports.
from magika import Magika

document = ""
with open("/content/redbook-example.md", 'r', encoding='utf-8') as file:
    document = file.read()
documents = [document]

m = Magika()
# identify_bytes() is given bytes, so the text read above is encoded as UTF-8 first.
document_bytes = documents[0].encode('utf-8')
res = m.identify_bytes(document_bytes)
# Print the label's type as well as its value, to check it is string-like.
print(type(res.output.ct_label))
print(res.output.ct_label)

<class 'numpy.str_'>
markdown


In [222]:
# Classify a JavaScript file with Magika to see which content-type label it reports.
from magika import Magika

document = ""
with open("/content/gistfile1.js", 'r', encoding='utf-8') as file:
    document = file.read()
documents = [document]

m = Magika()
# identify_bytes() is given bytes, so the text read above is encoded as UTF-8 first.
document_bytes = documents[0].encode('utf-8')
res = m.identify_bytes(document_bytes)
print(res.output.ct_label)

javascript


## Conclusion
Notable improvements:
1. Table contents no longer get cut in half.
2. Section headers, especially H1 and H2, no longer get cut in half.
3. Regular expressions trim the unnecessary whitespace and dashes in front of pipe characters in Markdown tables.

# Markdown Chunking using LLMSherpa

Using LLMSherpa, a library for reading and processing documents, we can easily convert PDF content into Markdown and then divide it into chunks. LLMSherpa's LayoutPDFReader class helps extract text from PDFs accurately, and with the help of a tokenizer, we can create consistent chunk sizes. This process makes the text easier to handle for further analysis or natural language processing tasks.

In [None]:
# Import necessary libraries
from pathlib import Path
import traceback
from llmsherpa.readers import LayoutPDFReader
from tqdm import tqdm
from datasets import Dataset
from transformers import AutoTokenizer
import numpy as np
import json

We use a tokenizer to break down the Markdown content into smaller tokenized chunks. This helps in creating manageable sections of text that are suitable for further processing.

In [None]:
# Load the tokenizer published under "instructlab/granite-7b-lab".
# chunk_markdown() reads this module-level `tokenizer` rather than taking it as a parameter.
tokenizer = AutoTokenizer.from_pretrained("instructlab/granite-7b-lab")

tokenizer_config.json:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


The `chunk_markdown` function takes a document and breaks it into consecutive chunks whose token counts are drawn at random between a minimum and a maximum threshold.

In [None]:
# Function to chunk Markdown content
def chunk_markdown(doc, max_threshold_tokens=3000, min_threshold_tokens=300):
    """Split ``doc`` into consecutive chunks of randomly chosen token length.

    The text is tokenized with the module-level ``tokenizer``. Each chunk
    length is drawn with ``np.random.randint`` from the half-open range
    ``[min_threshold_tokens, max_threshold_tokens)``. The last chunk holds
    whatever tokens remain, so it can be shorter than ``min_threshold_tokens``.

    Args:
        doc: Text to chunk.
        max_threshold_tokens: Exclusive upper bound for a chunk's token count.
            When equal to ``min_threshold_tokens``, every chunk has exactly
            that many tokens.
        min_threshold_tokens: Inclusive lower bound for a chunk's token count.
            A document with fewer tokens than this yields an empty dataset.

    Returns:
        A ``Dataset`` with an ``input`` column (decoded chunk text) and a
        ``metadata`` column (JSON string holding ``num_tokens``). The empty
        result has the same two columns.

    Raises:
        ValueError: If ``min_threshold_tokens`` is below 1 or above
            ``max_threshold_tokens`` (for documents long enough to be chunked).
    """
    # NOTE(review): encode()/decode() are called with default arguments; if the
    # tokenizer adds special tokens they are counted here and may show up in
    # the decoded text — confirm this is intended.
    tokens = tokenizer.encode(doc)
    num_tokens = len(tokens)

    if num_tokens < min_threshold_tokens:
        # Same columns as the non-empty result so callers see a single schema.
        return Dataset.from_dict({'input': [], 'metadata': []})

    # A non-positive chunk length would never advance the loop below.
    if min_threshold_tokens < 1 or min_threshold_tokens > max_threshold_tokens:
        raise ValueError(
            "min_threshold_tokens must be >= 1 and <= max_threshold_tokens, "
            f"got min={min_threshold_tokens}, max={max_threshold_tokens}"
        )

    chunks = []
    i = 0
    while i < num_tokens:
        if min_threshold_tokens == max_threshold_tokens:
            # randint() rejects an empty range; equal bounds mean fixed-size chunks.
            threshold = min_threshold_tokens
        else:
            threshold = np.random.randint(min_threshold_tokens, max_threshold_tokens)
        # Slicing clamps at the end of the list, so the last chunk is simply shorter.
        chunk = tokens[i:i + threshold]
        chunks.append({'input': tokenizer.decode(chunk), 'metadata': json.dumps({'num_tokens': len(chunk)})})
        i += threshold

    return Dataset.from_list(chunks)

In [None]:
# pdf file path
file_path = './pdfs/redbook-example.pdf'

In [None]:
# Initialize llmsherpa LayoutPDFReader, available in github https://github.com/nlmatics/llmsherpa
# NOTE(review): the parser URL points at a hosted service, so the PDF is
# presumably sent over the network — confirm before using private documents.
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
pdf_reader = LayoutPDFReader(llmsherpa_api_url)

# Read the PDF file
doc = pdf_reader.read_pdf(file_path)

# Flatten the parsed document to text.
# NOTE(review): only to_text() is called here; no separate Markdown conversion
# step is visible, even though the result is named `markdown_content`.
markdown_content = doc.to_text()

# Chunk the content into randomly sized token windows (see chunk_markdown)
markdown_chunks = chunk_markdown(markdown_content, min_threshold_tokens=300, max_threshold_tokens=3000)

# Print the number of chunks
print(f"Number of chunks: {len(markdown_chunks)}")

In [None]:
# Print the Markdown chunks
for doc in markdown_chunks:
    print("------------------------------------------------------------------------------------------------------------------------------------------")
    print(doc['input'])

# Document Specific Chunking
A rule-based chunking method that applies different separators to different kinds of content.

LangChain provides various document-specific splitters alongside `RecursiveCharacterTextSplitter`. The document-specific splitting methods it provides include:
* Split by HTML header
* Split by HTML section
* Split code
* MarkdownHeaderTextSplitter
* Recursively split JSON

## Code
Below is a demonstration of splitting source code.
```
# You can also see the separators used for a given language
RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)
```

In [None]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

print(RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON))

In [None]:
# Load a Python source file and split it with LangChain's Python-aware separators.
file_path = '/content/utils.py'

# Read as UTF-8 explicitly, like the other file-loading cells, instead of
# relying on the platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    PYTHON_CODE = file.read()

# chunk_overlap=0: consecutive chunks share no text.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=1024, chunk_overlap=0
)
python_docs = python_splitter.create_documents([PYTHON_CODE])

### Output - Python

In [None]:
for doc in python_docs:
    print("------------------------------------------------------------------------------------------------------------------------------------------")
    print(doc.page_content)

## HTML, Markdown, and JSON
HTML, Markdown, and JSON share a similar trait: each can be chunked using character-level separators.

## PDF
For PDFs, there are two parts: texts in PDF and images. Potential chunking methods are:

**Method 1 (Rule-based)**
1. Use OCR models to extract text from the PDF (all PDF extractors use this underneath).
2. Filter out JSON/Markdown/readable texts with document-specific rules.
3. Tweak these rules and construct chunks.

**Method 2 (Multimodal Embedding)**
1. Use multimodal models to embed text, images, and everything else.
2. Group by context similarity.



In [None]:
import os
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json

In [None]:
# PDF to partition; a relative path, so it is resolved against the working directory.
filename = "SalesforceFinancial.pdf"

# Extracts the elements from the PDF
elements = partition_pdf(
    filename=filename,

    # Unstructured Helpers
    # NOTE(review): presumably infer_table_structure=True is what makes
    # `metadata.text_as_html` available on table elements (inspected in a
    # later cell) — confirm against the unstructured docs.
    # NOTE(review): newer unstructured releases may expect `hi_res_model_name`
    # instead of `model_name` — confirm against the installed version.
    strategy="hi_res",
    infer_table_structure=True,
    model_name="yolox"
)


In [None]:
elements

In [None]:
print(type(elements[-5]))
print(type(elements[-5].metadata))

In [None]:
elements[-5].metadata.text_as_html

# Semantic Chunker

## Concepts
Chunks produced by position-based chunking do not necessarily share the same context. The meaning and context of the text should be taken into consideration when forming chunks. After applying embeddings, chunks with more similar meanings/contexts should have smaller distances between them.

Analogy: grouping books by genre rather than by shelf size.

In [None]:
from llama_index.core import SimpleDirectoryReader

# load documents
# Note: the input here is a Python source file, not natural-language prose.
# This rebinds the notebook-level `documents` used by earlier cells.
documents = SimpleDirectoryReader(input_files=["/content/utils.py"]).load_data()

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from transformers import AutoTokenizer

# Build the embedding model as a local object. No global default is assigned
# in this cell: `Settings` and `AutoTokenizer` are imported but not used here.
bge_small = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

# tokenizer = AutoTokenizer.from_pretrained("instructlab/granite-7b-lab")

In [None]:
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)

# Semantic splitter driven by the bge_small embedding model.
# NOTE(review): per the LlamaIndex docs, buffer_size is presumably the number of
# neighbouring sentences grouped when comparing similarity, and
# breakpoint_percentile_threshold the dissimilarity percentile above which a
# new chunk starts — confirm before tuning.
splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=90, embed_model=bge_small
)

# also baseline splitter
# NOTE(review): base_splitter is not used by any cell visible in this notebook.
base_splitter = SentenceSplitter(chunk_size=512)

In [None]:
# from langchain_text_splitters import RecursiveCharacterTextSplitter

# def base_splitter_wrapper(text):
#     text_splitter = RecursiveCharacterTextSplitter(
#         separators=["\n\n", "\n", " "],
#         chunk_size=num_chars_from_tokens(no_tokens_per_doc),
#         chunk_overlap=DEFAULT_CHUNK_OVERLAP,
#     )
#     return base_splitter.split_text(text)

## Inspection

In [None]:
nodes = splitter.get_nodes_from_documents(documents)
print(len(nodes))

In [None]:
for node in nodes:
    print(len(node.get_content()))

In [None]:
print(nodes[5].get_content())

In [None]:
for node in nodes:
    print(node.get_content())

# Summary
## Conclusions
1. Our current method is to use LLMSherpa to convert PDF to Markdown and then chunk the Markdown.
2. Document-specific chunking may be the best method for our current scenarios.
3. Semantic chunking can be applied to natural language; it is unclear whether it works well for code.

# Split by Tokens

# Semantic Double Merging Chunking

In [None]:
from llama_index.core.node_parser import (
    SemanticDoubleMergingSplitterNodeParser,
    LanguageConfig,
)
from llama_index.core import SimpleDirectoryReader

In [None]:
# Report the size of the downloaded essay in characters.
# NOTE(review): the essay is downloaded to the relative path 'pg_essay.txt' and
# loaded by relative path in the next cell, but opened here by absolute path.
# The two only agree when the working directory is /content.
with open('/content/pg_essay.txt', 'r', encoding='utf-8') as file:
    content = file.read()
    char_length = len(content)

print("Number of characters in the file:", char_length)

In [None]:
# Load the essay; this rebinds the notebook-level `documents` and `splitter`
# that earlier sections also assign.
documents = SimpleDirectoryReader(input_files=["pg_essay.txt"]).load_data()

# Uses the spaCy model `en_core_web_md`, which the setup cell downloads.
config = LanguageConfig(language="english", spacy_model="en_core_web_md")
# NOTE(review): the three thresholds presumably control when a chunk is started,
# extended, and merged with its neighbours, and max_chunk_size caps the chunk
# length — confirm the exact semantics and units in the LlamaIndex docs.
splitter = SemanticDoubleMergingSplitterNodeParser(
    language_config=config,
    initial_threshold=0.4,
    appending_threshold=0.5,
    merging_threshold=0.5,
    max_chunk_size=5000,
)

In [None]:
nodes = splitter.get_nodes_from_documents(documents)

In [None]:
print(nodes[0].get_content())