# Setup

In [None]:
!pip install unstructured

In [None]:
!pip install llmsherpa

In [None]:
# Install the Tesseract OCR engine, its development headers, and the Python
# wrapper. -y answers the confirmation prompt so the cell cannot stall, and
# apt-get is used because plain `apt` does not promise a stable script interface.
!sudo apt-get install -y tesseract-ocr
!sudo apt-get install -y libtesseract-dev
!pip install pytesseract

In [None]:
# Install poppler-utils; -y answers the confirmation prompt so the cell
# cannot stall waiting for input.
!sudo apt-get install -y poppler-utils

In [None]:
!pip install "unstructured[docx,pptx,pdf]"

In [None]:
!pip install pdf2image

In [None]:
!pip install pillow_heif

In [None]:
!pip install pdfminer.six

In [None]:
!python3 -m spacy download en_core_web_md

In [None]:
%pip install langchain

In [None]:
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'pg_essay.txt'

In [None]:
! pip install llama_index

In [None]:
%pip install -qU langchain-text-splitters

In [None]:
%pip install llama-index-embeddings-openai

In [None]:
!pip install llama-index-embeddings-huggingface

# References:
* https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
* https://docs.llamaindex.ai/en/stable/examples/node_parsers/semantic_chunking/

# Markdown Chunking using LLMSherpa

Using LLMSherpa, a library for reading and processing documents, we can easily convert PDF content into Markdown and then divide it into chunks. LLMSherpa's LayoutPDFReader class helps extract text from PDFs accurately, and with the help of a tokenizer, we can create consistent chunk sizes. This process makes the text easier to handle for further analysis or natural language processing tasks.

In [None]:
# Import necessary libraries
from pathlib import Path
import traceback
from llmsherpa.readers import LayoutPDFReader
from tqdm import tqdm
from datasets import Dataset
from transformers import AutoTokenizer
import numpy as np
import json

We use a tokenizer to break down the Markdown content into smaller tokenized chunks. This helps in creating manageable sections of text that are suitable for further processing.

In [None]:
# Load the tokenizer for the "instructlab/granite-7b-lab" checkpoint; chunk_markdown
# uses it to encode text into tokens and to decode token slices back to text.
tokenizer = AutoTokenizer.from_pretrained("instructlab/granite-7b-lab")

The `chunk_markdown` function takes a document and splits it into consecutive chunks of text whose token length is drawn at random between a minimum and a maximum threshold.

In [None]:
# Function to chunk Markdown content
def chunk_markdown(doc, max_threshold_tokens=3000, min_threshold_tokens=300):
    """Split ``doc`` into consecutive chunks of a randomly chosen token length.

    The text is tokenized with the module-level ``tokenizer``. Each chunk's
    length is drawn uniformly from ``[min_threshold_tokens,
    max_threshold_tokens)``; the final chunk holds whatever tokens remain and
    may therefore be shorter than ``min_threshold_tokens``.

    Args:
        doc: Text to split.
        max_threshold_tokens: Exclusive upper bound on the chunk length.
        min_threshold_tokens: Inclusive lower bound on the chunk length.
            Documents with fewer tokens than this are dropped entirely.

    Returns:
        A ``Dataset`` with two columns: ``input`` (the decoded chunk text) and
        ``metadata`` (a JSON string holding ``num_tokens``). The dataset is
        empty, with the same two columns, when the document is too short.

    Raises:
        ValueError: If ``min_threshold_tokens`` is below 1 or larger than
            ``max_threshold_tokens``.
    """
    # A non-positive step would keep the loop below from ever advancing.
    if min_threshold_tokens < 1:
        raise ValueError("min_threshold_tokens must be at least 1")
    if max_threshold_tokens < min_threshold_tokens:
        raise ValueError(
            "max_threshold_tokens must be greater than or equal to "
            "min_threshold_tokens"
        )

    # NOTE(review): encode() may add special tokens (e.g. a BOS marker) that
    # then show up in the first decoded chunk - confirm whether that is wanted.
    tokens = tokenizer.encode(doc)
    num_tokens = len(tokens)

    if num_tokens < min_threshold_tokens:
        # Use the same columns as the non-empty result so that datasets
        # produced from different documents share one schema.
        return Dataset.from_dict({'input': [], 'metadata': []})

    chunks = []
    start = 0
    while start < num_tokens:
        if min_threshold_tokens == max_threshold_tokens:
            # randint rejects an empty range; equal bounds mean fixed-size chunks.
            size = min_threshold_tokens
        else:
            size = int(np.random.randint(min_threshold_tokens, max_threshold_tokens))
        # Slicing past the end of the list simply yields the remaining tokens.
        chunk = tokens[start:start + size]
        chunks.append({
            'input': tokenizer.decode(chunk),
            'metadata': json.dumps({'num_tokens': len(chunk)}),
        })
        start += size

    return Dataset.from_list(chunks)

In [None]:
# pdf file path (relative to the notebook's working directory); read by the
# LayoutPDFReader cell that follows
file_path = './pdfs/redbook-example.pdf'

In [None]:
# Initialize llmsherpa LayoutPDFReader, available in github https://github.com/nlmatics/llmsherpa
# NOTE(review): this URL points at a hosted parsing service, so the PDF is
# presumably sent off-machine - confirm before using confidential documents.
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
pdf_reader = LayoutPDFReader(llmsherpa_api_url)

# Read the PDF file
doc = pdf_reader.read_pdf(file_path)

# Flatten the parsed document to text.
# NOTE(review): the variable is named markdown_content but the call is
# to_text(); confirm whether Markdown output was actually intended.
markdown_content = doc.to_text()

# Chunk the content into random-length token windows bounded by the two
# thresholds passed below
markdown_chunks = chunk_markdown(markdown_content, min_threshold_tokens=300, max_threshold_tokens=3000)

# Print the number of chunks
print(f"Number of chunks: {len(markdown_chunks)}")

In [None]:
# Print the Markdown chunks
for doc in markdown_chunks:
    print("------------------------------------------------------------------------------------------------------------------------------------------")
    print(doc['input'])

# Document Specific Chunking
A rule-based chunking method that applies different separators to different kinds of content (for example code, HTML, or Markdown).

LangChain's `RecursiveCharacterTextSplitter` and its related splitters support various document types. The document-specific splitting methods they provide include:
* Split by HTML header
* Split by HTML section
* Split code
* MarkdownHeaderTextSplitter
* Recursively split JSON

## Code
Below is a demonstration of splitting source code into chunks.
```
# You can also see the separators used for a given language
RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)
```

In [None]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

# Show the separator list the splitter uses for Python source
print(RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON))

In [None]:
# Source file to split. NOTE: this rebinds `file_path`, and the absolute path
# assumes a Colab-style /content directory.
file_path = '/content/utils.py'

# Read with an explicit encoding so the result does not depend on the
# platform's default locale (the essay-reading cell does the same).
with open(file_path, 'r', encoding='utf-8') as file:
    PYTHON_CODE = file.read()

# Split on Python-specific separators with chunk_size=1024 and no overlap
# between neighbouring chunks.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=1024, chunk_overlap=0
)
python_docs = python_splitter.create_documents([PYTHON_CODE])

### Output - Python

In [None]:
# Print each code chunk under a separator line. A dedicated loop variable is
# used so the notebook-level name `doc` is not rebound as a side effect.
for py_doc in python_docs:
    print("------------------------------------------------------------------------------------------------------------------------------------------")
    print(py_doc.page_content)

## HTML, Markdown, and JSON
HTML, Markdown, and JSON share a similar trait: they can all be chunked using character-level separators.

## PDF
For PDFs, there are two parts: texts in PDF and images. Potential chunking methods are:

**Method 1 (Rule-based)**
1. Use OCR models to extract text from the PDF (all PDF extractors use this underneath)
2. Filter out JSON/Markdown/readable texts with document-specific rules.
3. Tweak these rules and construct chunks.

**Method 2 (Multimodal Embedding)**
1. Use multimodal models to embed text, images, and everything else.
2. Group by context similarity.



In [None]:
import os
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json

In [None]:
# PDF to partition (relative path, so it is resolved against the working directory)
filename = "SalesforceFinancial.pdf"

# Extracts the elements from the PDF
elements = partition_pdf(
    filename=filename,

    # Unstructured Helpers
    # NOTE(review): infer_table_structure is presumably what populates
    # metadata.text_as_html, which a later cell reads - confirm against the
    # unstructured documentation. "yolox" names the layout model to use.
    strategy="hi_res",
    infer_table_structure=True,
    model_name="yolox"
)


In [None]:
# Display the extracted elements as the cell's output
elements

In [None]:
# Inspect the types of the fifth-from-last element and of its metadata object.
# NOTE(review): the -5 index is specific to this PDF - presumably it picks a
# table element; confirm before reusing with another file.
print(type(elements[-5]))
print(type(elements[-5].metadata))

In [None]:
# Show the text_as_html attribute stored on the metadata of the fifth-from-last element
elements[-5].metadata.text_as_html

# Semantic Chunker

## Concepts
Chunks produced by position-based chunking do not necessarily share the same context. The meaning and context of the text should be taken into consideration when forming chunks. After applying embeddings, chunks with more similar meanings/contexts should have smaller distances.

Analogy: Grouping books not by shelf-size, instead, by "genre".

In [None]:
from llama_index.core import SimpleDirectoryReader

# load documents
# NOTE: the input here is a Python source file (utils.py), not natural-language prose
documents = SimpleDirectoryReader(input_files=["/content/utils.py"]).load_data()

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from transformers import AutoTokenizer

# Build a HuggingFace embedding model that is handed explicitly to the
# semantic splitter via embed_model; the global Settings default is not
# modified in this cell.
bge_small = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

# tokenizer = AutoTokenizer.from_pretrained("instructlab/granite-7b-lab")

In [None]:
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)

# Semantic splitter driven by the bge_small embedding model.
# NOTE(review): buffer_size and breakpoint_percentile_threshold carry the
# meaning given in the llama_index documentation - confirm these values suit
# the input before relying on them.
splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=90, embed_model=bge_small
)

# also baseline splitter
# NOTE(review): base_splitter is not used by any active code visible in this notebook.
base_splitter = SentenceSplitter(chunk_size=512)

In [None]:
# NOTE(review): disabled draft. It references names that are not defined in the
# visible notebook (num_chars_from_tokens, no_tokens_per_doc,
# DEFAULT_CHUNK_OVERLAP) and returns base_splitter.split_text(text) rather than
# using the text_splitter it builds - resolve both before re-enabling.
# from langchain_text_splitters import RecursiveCharacterTextSplitter

# def base_splitter_wrapper(text):
#     text_splitter = RecursiveCharacterTextSplitter(
#         separators=["\n\n", "\n", " "],
#         chunk_size=num_chars_from_tokens(no_tokens_per_doc),
#         chunk_overlap=DEFAULT_CHUNK_OVERLAP,
#     )
#     return base_splitter.split_text(text)

## Inspection

In [None]:
# Run the splitter over the loaded documents and report how many nodes it produced
nodes = splitter.get_nodes_from_documents(documents)
print(len(nodes))

In [None]:
# Print the length of each node's content to compare chunk sizes
for node in nodes:
    print(len(node.get_content()))

In [None]:
# Show one sample node; the hard-coded index 5 needs at least six nodes
print(nodes[5].get_content())

In [None]:
# Dump the content of every node
for node in nodes:
    print(node.get_content())

# Summary
## Conclusions
1. Using LLMSherpa to convert from PDF to markdown then chunking from markdown is the current method we use.
2. Document-specific chunking may be the best method for our current scenario.
3. Semantic Chunking could be applied to natural language, not sure about code.

# Split by Tokens

# Semantic Double Merging Chunking

In [None]:
from llama_index.core.node_parser import (
    SemanticDoubleMergingSplitterNodeParser,
    LanguageConfig,
)
from llama_index.core import SimpleDirectoryReader

In [None]:
# Read the essay from the working directory - the same relative location the
# wget download writes to and SimpleDirectoryReader reads from - rather than a
# Colab-only absolute path.
with open('pg_essay.txt', 'r', encoding='utf-8') as file:
    content = file.read()

# Character count, for judging the splitter's max_chunk_size against the input.
char_length = len(content)
print("Number of characters in the file:", char_length)

In [None]:
# Load the essay; this rebinds `documents`, which earlier held utils.py
documents = SimpleDirectoryReader(input_files=["pg_essay.txt"]).load_data()

# Uses the en_core_web_md spaCy model downloaded in the setup cells
config = LanguageConfig(language="english", spacy_model="en_core_web_md")
# Rebinds `splitter`, which earlier held the SemanticSplitterNodeParser.
# NOTE(review): the three thresholds and max_chunk_size look hand-tuned -
# confirm their meaning in the llama_index documentation before changing them.
splitter = SemanticDoubleMergingSplitterNodeParser(
    language_config=config,
    initial_threshold=0.4,
    appending_threshold=0.5,
    merging_threshold=0.5,
    max_chunk_size=5000,
)

In [None]:
# Split the loaded documents with the current `splitter`; rebinds `nodes`
nodes = splitter.get_nodes_from_documents(documents)

In [None]:
# Show the content of the first resulting node
print(nodes[0].get_content())