# LlamaIndex Text Chunking Strategies


In [None]:
!pip install llama_index tree_sitter tree_sitter_languages -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m496.7/496.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.3/268.3 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m136.1/136.1 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
# Download for running any text file
!wget https://raw.githubusercontent.com/lancedb/vectordb-recipes/main/README.md
!wget https://frontiernerds.com/files/state_of_the_union.txt

--2024-04-15 10:06:43--  https://raw.githubusercontent.com/lancedb/vectordb-recipes/main/README.md
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29701 (29K) [text/plain]
Saving to: ‘README.md’


2024-04-15 10:06:43 (12.1 MB/s) - ‘README.md’ saved [29701/29701]

--2024-04-15 10:06:43--  https://frontiernerds.com/files/state_of_the_union.txt
Resolving frontiernerds.com (frontiernerds.com)... 172.67.180.189, 104.21.31.232, 2606:4700:3036::6815:1fe8, ...
Connecting to frontiernerds.com (frontiernerds.com)|172.67.180.189|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: ‘state_of_the_union.txt’

state_of_the_union.     [ <=>                ]  39.91K  --.-KB/s    in 0.001s  

2024-04-15 10:06:43 (64.5 M

## File-based Node Parsers

### Node Parser - Simple File
Handles different file types by choosing a suitable node parser for each file automatically.

In [None]:
# Simple File: load README.md as-is and let SimpleFileNodeParser split it into nodes
from pathlib import Path

from llama_index.core.node_parser import SimpleFileNodeParser
from llama_index.readers.file import FlatReader

# Read the downloaded README into document objects.
md_docs = FlatReader().load_data(Path("README.md"))

# Note: this parser can additionally be combined with a text-based parser
# when chunk length needs to be controlled accurately.
parser = SimpleFileNodeParser()
md_nodes = parser.get_nodes_from_documents(md_docs)

# Display the text of the first node.
md_nodes[0].text

'VectorDB-recipes\n<br />\nDive into building GenAI applications!\nThis repository contains examples, applications, starter code, & tutorials to help you kickstart your GenAI projects.\n\n- These are built using LanceDB, a free, open-source, serverless vectorDB that **requires no setup**. \n- It **integrates into python data ecosystem** so you can simply start using these in your existing data pipelines in pandas, arrow, pydantic etc.\n- LanceDB has **native Typescript SDK** using which you can **run vector search** in serverless functions!\n\n<img src="https://github.com/lancedb/vectordb-recipes/assets/5846846/d284accb-24b9-4404-8605-56483160e579" height="85%" width="85%" />\n\n<br />\nJoin our community for support - <a href="https://discord.gg/zMM32dvNtd">Discord</a> •\n<a href="https://twitter.com/lancedb">Twitter</a>\n\n---\n\nThis repository is divided into 3 sections:\n- [Examples](#examples) - Get right into the code with minimal introduction, aimed at getting you from an idea 

### Node Parser - HTML

In [None]:
#  HTML

import requests
from llama_index.core import Document
from llama_index.core.node_parser import HTMLNodeParser

# Page whose HTML is fetched and parsed
url = "https://www.utoronto.ca/"

# A timeout keeps the cell from hanging forever if the server never responds.
response = requests.get(url, timeout=30)
print(response)

# Only parse when the request succeeded (status code 200)
if response.status_code == 200:
    # Wrap the raw HTML in a Document, using the URL as its id
    html_doc = response.text
    document = Document(id_=url, text=html_doc)

    # Restrict node extraction to <p> and <h1> tags
    parser = HTMLNodeParser(tags=["p", "h1"])
    nodes = parser.get_nodes_from_documents([document])
    print(nodes)
else:
    # Report the failing status code instead of parsing an error page
    print("Failed to fetch HTML content:", response.status_code)

<Response [200]>
[TextNode(id_='bf308ea9-b937-4746-8645-c8023e2087d7', embedding=None, metadata={'tag': 'h1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='https://www.utoronto.ca/', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='247fb639a05bc6898fd1750072eceb47511d3b8dae80999f9438e50a1faeb4b2'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='7c280bdf-7373-4be8-8e70-6360848581e9', node_type=<ObjectType.TEXT: '1'>, metadata={'tag': 'p'}, hash='3e989bb32b04814d486ed9edeefb1b0ce580ba7fc8c375f64473ddd95ca3e824')}, text='Welcome to University of Toronto', start_char_idx=2784, end_char_idx=2816, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), TextNode(id_='7c280bdf-7373-4be8-8e70-6360848581e9', embedding=None, metadata={'tag': 'p'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationshi

### Node Parser - JSON

In [None]:
# JSON

import getpass
import os

# Imported here as well so this cell does not depend on the HTML cell having run.
import requests
from llama_index.core import Document
from llama_index.core.node_parser import JSONNodeParser

url = "https://housesigma.com/bkv2/api/search/address_v2/suggest"

payload = {"lang": "en_US", "province": "ON", "search_term": "Mississauga, ontario"}

# Take the bearer token from the environment (prompting once if it is missing)
# instead of committing a credential to the notebook.
if not os.environ.get("HOUSESIGMA_API_TOKEN"):
    os.environ["HOUSESIGMA_API_TOKEN"] = getpass.getpass("HouseSigma API token: ")

headers = {"Authorization": f"Bearer {os.environ['HOUSESIGMA_API_TOKEN']}"}

# A timeout keeps the cell from hanging forever if the server never responds.
response = requests.post(url, headers=headers, data=payload, timeout=30)

if response.status_code == 200:
    # The raw JSON response body becomes the document text
    document = Document(id_=url, text=response.text)
    parser = JSONNodeParser()

    nodes = parser.get_nodes_from_documents([document])
    print(nodes[0])
else:
    print("Failed to fetch JSON content:", response.status_code)

Node ID: 05325093-16a2-41ac-b952-3882c817ac4d
Text: status True data house_list id_listing owJKR7PNnP9YXeLP data
house_list house_type_in_map D data house_list price_abbr 0.75M data
house_list price 749,000 data house_list price_sold 690,000 data
house_list tags Sold data house_list list_status public 1 data
house_list list_status live 0 data house_list list_status s_r Sale
data house_list list_s...


### Node Parser - Markdown

In [None]:
# Markdown: split the README with the Markdown-specific node parser
from llama_index.core.node_parser import MarkdownNodeParser

parser = MarkdownNodeParser()

# FlatReader and Path come from the Simple File cell above.
md_docs = FlatReader().load_data(Path("README.md"))
nodes = parser.get_nodes_from_documents(md_docs)

# Display the text of the first node.
nodes[0].text

'VectorDB-recipes\n<br />\nDive into building GenAI applications!\nThis repository contains examples, applications, starter code, & tutorials to help you kickstart your GenAI projects.\n\n- These are built using LanceDB, a free, open-source, serverless vectorDB that **requires no setup**. \n- It **integrates into python data ecosystem** so you can simply start using these in your existing data pipelines in pandas, arrow, pydantic etc.\n- LanceDB has **native Typescript SDK** using which you can **run vector search** in serverless functions!\n\n<img src="https://github.com/lancedb/vectordb-recipes/assets/5846846/d284accb-24b9-4404-8605-56483160e579" height="85%" width="85%" />\n\n<br />\nJoin our community for support - <a href="https://discord.gg/zMM32dvNtd">Discord</a> •\n<a href="https://twitter.com/lancedb">Twitter</a>\n\n---\n\nThis repository is divided into 3 sections:\n- [Examples](#examples) - Get right into the code with minimal introduction, aimed at getting you from an idea 

## Chunking

In [None]:
# Download for running Code Splitting
!wget https://raw.githubusercontent.com/lancedb/vectordb-recipes/main/applications/talk-with-podcast/app.py

--2024-04-15 10:22:58--  https://raw.githubusercontent.com/lancedb/vectordb-recipes/main/applications/talk-with-podcast/app.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1582 (1.5K) [text/plain]
Saving to: ‘app.py’


2024-04-15 10:22:58 (12.1 MB/s) - ‘app.py’ saved [1582/1582]



### Code Splitting

In [None]:
# Code Splitting

from llama_index.core.node_parser import CodeSplitter

# Load the downloaded script as a plain document
# (FlatReader and Path come from the Simple File cell).
documents = FlatReader().load_data(Path("app.py"))

# Limits: 40 lines per chunk, 15 lines of overlap between chunks,
# and at most 1500 characters per chunk.
splitter = CodeSplitter(
    language="python", chunk_lines=40, chunk_lines_overlap=15, max_chars=1500
)
nodes = splitter.get_nodes_from_documents(documents)

# Display the text of the first chunk.
nodes[0].text



'from youtube_podcast_download import podcast_audio_retreival\nfrom transcribe_podcast import transcribe\nfrom chat_retreival import retrieverSetup, chat\nfrom langroid_utils import configure, agent\n\nimport os\nimport glob\nimport json\nimport streamlit as st\n\nOPENAI_KEY = os.environ["OPENAI_API_KEY"]\n\n\n@st.cache_resource\ndef video_data_retreival(framework):\n    f = open("output.json")\n    data = json.load(f)\n\n    # setting up reteriver\n    if framework == "Langchain":\n        qa = retrieverSetup(data["text"], OPENAI_KEY)\n        return qa\n    elif framework == "Langroid":\n        langroid_file = open("langroid_doc.txt", "w")  # write mode\n        langroid_file.write(data["text"])\n        cfg = configure("langroid_doc.txt")\n        return cfg\n\n\nst.header("Talk with Youtube Podcasts", divider="rainbow")\n\nurl = st.text_input("Youtube Link")\nframework = st.radio(\n    "**Select Framework 👇**",\n    ["Langchain", "Langroid"],\n    key="Langchain",\n)\n\nif url:\n 

### Sentence Splitting

In [None]:
# Sentence Splitting

from llama_index.core.node_parser import SentenceSplitter

# Load the speech; later cells reuse this `documents` variable.
documents = FlatReader().load_data(Path("state_of_the_union.txt"))

splitter = SentenceSplitter(chunk_size=254, chunk_overlap=20)
nodes = splitter.get_nodes_from_documents(documents)

# Display the text of the first chunk.
nodes[0].text

"Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:\n\nOur Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle.\n\nIt's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anything but certain. These were times that tested the courage of our convictions and the strength of our union. And despite all our divisions and disagreements, our hesi

### Node Parser - Sentence Window

In [None]:
# SentenceWindowNodeParser

import nltk
from llama_index.core.node_parser import SentenceWindowNodeParser

# The two *_metadata_key arguments name the metadata entries used for the
# sentence window and for the original sentence.
node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_sentence",
    window_metadata_key="window",
    window_size=3,
)

# `documents` is the speech loaded in the Sentence Splitting cell.
nodes = node_parser.get_nodes_from_documents(documents)

# Display the text of the first node.
nodes[0].text

'Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:\n\nOur Constitution declares that from time to time, the president shall give to Congress information about the state of our union. '

### Node Parser - Semantic Splitting

In [None]:
# SemanticSplitterNodeParser

import getpass
import os

from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding

# Use the OpenAI key already present in the environment and only prompt when it
# is missing. Unconditionally assigning a placeholder here would overwrite a
# real key and make every embedding request fail.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

embed_model = OpenAIEmbedding()
splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
)

# `documents` is the speech loaded in the Sentence Splitting cell.
nodes = splitter.get_nodes_from_documents(documents)
nodes[0].text

'Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:\n\nOur Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. '

### Token Text Splitting

In [None]:
# TokenTextSplitting

from llama_index.core.node_parser import TokenTextSplitter

# Same size and overlap as the sentence splitter above, with a single space
# as the separator.
splitter = TokenTextSplitter(separator=" ", chunk_size=254, chunk_overlap=20)

# `documents` is the speech loaded in the Sentence Splitting cell.
nodes = splitter.get_nodes_from_documents(documents)

# Display the text of the first chunk.
nodes[0].text

"Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:\n\nOur Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle.\n\nIt's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anything but certain. These were times that tested the courage of our convictions and the strength of our union. And despite all our divisions and disagreements, our hesi

## Relation-based Node Parser

### Node Parser - Hierarchical

In [None]:
# HierarchicalNodeParser

from llama_index.core.node_parser import HierarchicalNodeParser

# Three chunk sizes, listed from largest to smallest.
node_parser = HierarchicalNodeParser.from_defaults(
    chunk_sizes=[512, 254, 128],
)

# `documents` is the speech loaded in the Sentence Splitting cell.
nodes = node_parser.get_nodes_from_documents(documents)

# Display the text of the first node.
nodes[0].text

"Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:\n\nOur Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle.\n\nIt's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anything but certain. These were times that tested the courage of our convictions and the strength of our union. And despite all our divisions and disagreements, our hesi

# LangChain Text Chunking Strategies

In [None]:
!pip install -qU langchain-text-splitters
!pip install requests

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.5/287.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.0/113.0 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.8/144.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m


### Text Splitting - Character

In [None]:
# Split with Character

from langchain_text_splitters import CharacterTextSplitter

# Read the whole speech into memory.
with open("state_of_the_union.txt") as f:
    state_of_the_union = f.read()

# Split on blank lines ("\n\n", treated as a literal string rather than a
# regex); chunk size and overlap are measured with len().
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separator="\n\n",
    is_separator_regex=False,
    length_function=len,
)

texts = text_splitter.create_documents([state_of_the_union])

# Show the first chunk.
print(texts[0].page_content)



Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:

Our Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle.


### Text Splitting - Recursive Character

In [None]:
# Recursive Split Character

from langchain_text_splitters import RecursiveCharacterTextSplitter

# The speech is a long document that can be split up.
with open("state_of_the_union.txt") as f:
    state_of_the_union = f.read()

# Chunks of up to 1000 units with an overlap of 100, measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    is_separator_regex=False,
    length_function=len,
)

texts = text_splitter.create_documents([state_of_the_union])

# Print two consecutive chunks so they can be compared.
print("Chunk 2: ", texts[1].page_content)
print("Chunk 3: ", texts[2].page_content)

Chunk 2:  It's tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anything but certain. These were times that tested the courage of our convictions and the strength of our union. And despite all our divisions and disagreements, our hesitations and our fears, America prevailed because we chose to move forward as one nation and one people.

Again, we are tested. And again, we must answer history's call.
Chunk 3:  Again, we are tested. And again, we must answer history's call.

One year ago, I took office amid two wars, an economy rocked by severe recession, a financial system on the verge of collapse and a government deeply in debt. Experts from across the political spectrum warne

### Text Splitting - HTML Header

In [None]:
# Split with HTML Tags

from langchain_text_splitters import HTMLHeaderTextSplitter
import requests

# URL of the website to fetch HTML from
url = "https://www.utoronto.ca/"

# A timeout keeps the cell from hanging forever if the server never responds.
response = requests.get(url, timeout=30)
if response.status_code != 200:
    # Stop here: continuing would hit an undefined html_doc, or silently reuse
    # a stale one left in the kernel by an earlier cell.
    raise RuntimeError(f"Failed to fetch HTML content: {response.status_code}")
html_doc = response.text

# (tag, metadata label) pairs for the headers to split on
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_header_splits = html_splitter.split_text(html_doc)
html_header_splits[0].page_content

'Welcome to University of Toronto  \nMain menu tools'

### Text Splitting - Code

In [None]:
# Code Splitting

from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# Read the downloaded script as plain text.
with open("app.py") as f:
    code = f.read()

# Python-aware splitter: chunk size 100, no overlap.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_overlap=0,
    chunk_size=100,
    language=Language.PYTHON,
)
python_docs = python_splitter.create_documents([code])

# Display the first chunk.
python_docs[0].page_content

'from youtube_podcast_download import podcast_audio_retreival'

### Text Splitting - Recursive JSON

In [None]:
# Recursive Split Json

from langchain_text_splitters import RecursiveJsonSplitter
import json
import requests

# Fetch the OpenAPI spec. The timeout prevents an indefinite hang, and
# raise_for_status() surfaces HTTP errors directly instead of letting
# .json() fail on an error page with a confusing decode error.
response = requests.get("https://api.smith.langchain.com/openapi.json", timeout=30)
response.raise_for_status()
json_data = response.json()

# Split the parsed JSON with a maximum chunk size of 300
splitter = RecursiveJsonSplitter(max_chunk_size=300)
json_chunks = splitter.split_json(json_data=json_data)
json_chunks[0]

{'openapi': '3.1.0',
 'info': {'title': 'LangSmith', 'version': '0.1.0'},
 'servers': [{'url': 'https://api.smith.langchain.com',
   'description': 'LangSmith API endpoint.'}]}

### Semantic Splitting

In [None]:
# Semantic Chunking

!pip install --quiet langchain_experimental langchain_openai

import os
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Add OpenAI API key as environment variable
os.environ["OPENAI_API_KEY"] = "sk-****"

with open("state_of_the_union.txt") as f:
    state_of_the_union = f.read()

text_splitter = SemanticChunker(OpenAIEmbeddings())

docs = text_splitter.create_documents([state_of_the_union])
print(docs[0].page_content)

Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:

Our Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty.


### Splitting by Tokens

In [None]:
# Splits by Tokens

# Using Tiktoken
!pip install --upgrade --quiet tiktoken

with open("state_of_the_union.txt") as f:
    state_of_the_union = f.read()

from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100, chunk_overlap=0
)
texts = text_splitter.split_text(state_of_the_union)

print(texts[0])



Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:

Our Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle.
