# Document Splitting

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter

# Deliberately tiny limits so the splitting behaviour is visible on toy strings.
chunk_size, chunk_overlap = 26, 4

# Both splitters get the same size/overlap settings; every other option is left unset.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# text1 is exactly chunk_size (26) characters long; text2 runs 7 characters past it.
text1 = "abcdefghijklmnopqrstuvwxyz"
text2 = text1 + "abcdefg"

In [6]:
# Recursive splitter on both sample strings, one result list per output line.
print(
    r_splitter.split_text(text1),
    r_splitter.split_text(text2),
    sep="\n",
)

['abcdefghijklmnopqrstuvwxyz']
['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']


In [7]:
# Character splitter on the same two sample strings, one result list per output line.
print(
    c_splitter.split_text(text1),
    c_splitter.split_text(text2),
    sep="\n",
)

['abcdefghijklmnopqrstuvwxyz']
['abcdefghijklmnopqrstuvwxyzabcdefg']


In [8]:
# The lowercase alphabet with a single space between consecutive letters.
text3 = " ".join("abcdefghijklmnopqrstuvwxyz")

In [9]:
# Recursive splitter first, then the character splitter, on the space-separated text.
print(
    r_splitter.split_text(text3),
    c_splitter.split_text(text3),
    sep="\n",
)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']
['a b c d e f g h i j k l m n o p q r s t u v w x y z']


In [10]:
# Rebuild the character splitter, this time passing a single space as its separator.
c_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

## Recursive splitting details

In [11]:
# Sample paragraph pair used by the following cells. Adjacent literals are
# concatenated into one string; the only line breaks in it are the two explicit
# "\n" escapes (followed by two spaces) between the paragraphs.
some_text = (
    "When writing documents, writers will use document structure to group content. "
    "This can convey to the reader, which idea's are related. For example, closely related ideas "
    "are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  "
    "Paragraphs are often delimited with a carriage return or two carriage returns. "
    'Carriage returns are the "backslash n" you see embedded in this string. '
    "Sentences have a period at the end, but also, have a space."
    "and words are separated by space."
)

len(some_text)

496

In [12]:
# Character splitter configured with a single space as its separator.
c_splitter = CharacterTextSplitter(separator=" ", chunk_size=450, chunk_overlap=0)

# Recursive splitter with an explicit separator list: blank line, newline,
# space, then the empty string.
r_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=450,
    chunk_overlap=0,
)

In [16]:
# For every chunk from the character splitter: its length, its text,
# a "########" divider, and a trailing blank line.
for chunk in c_splitter.split_text(some_text):
    print(len(chunk), chunk, "########", "", sep="\n")

448
When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. 

 Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,
########

46
have a space.and words are separated by space.
########



In [18]:
# For every chunk from the recursive splitter: its length, its text,
# a "########" divider, and a trailing blank line.
for chunk in r_splitter.split_text(some_text):
    print(len(chunk), chunk, "########", "", sep="\n")

248
When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.
########

243
Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.
########



In [33]:
# Smaller chunks this time, with "." added to the separator list ahead of the space.
r_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=150,
    chunk_overlap=0,
)

# Length, text, divider and blank line for each resulting chunk.
for chunk in r_splitter.split_text(some_text):
    print(len(chunk), chunk, "########", "", sep="\n")

133
When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related
########

115
. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.
########

77
Paragraphs are often delimited with a carriage return or two carriage returns
########

132
. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space
########

34
.and words are separated by space.
########



In [36]:
# NOTE(review): recent LangChain releases appear to expose loaders from
# `langchain_community.document_loaders` — confirm against the installed version.
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter

# Read the lecture PDF; load() returns the collection kept in `pages`.
loader = PyPDFLoader("docs/MachineLearning-Lecture01.pdf")
pages = loader.load()

# Newline-separated splitting, measuring chunk length with the built-in len.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len,
)
docs = text_splitter.split_documents(pages)

# Number of chunks first, then the number of loaded pages.
print(len(docs), len(pages), sep="\n")

78
22


### Token splitting

In [38]:
from langchain_text_splitters import TokenTextSplitter

# chunk_size=1 with no overlap, so every chunk is limited to a single token.
text_splitter = TokenTextSplitter(chunk_overlap=0, chunk_size=1)

# Rebinds text1, replacing the alphabet string assigned in the first cell.
text1 = "foo bar bazzyfoo"
text_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [40]:
# Switch to 10-token chunks and split the loaded PDF pages (rebinds `docs`).
text_splitter = TokenTextSplitter(chunk_overlap=0, chunk_size=10)
docs = text_splitter.split_documents(pages)

# First chunk, a blank line, then the first page's metadata for comparison.
print(docs[0], "", pages[0].metadata, sep="\n")

page_content='MachineLearning-Lecture01  
' metadata={'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2008-07-11T11:25:23-07:00', 'author': '', 'moddate': '2008-07-11T11:25:23-07:00', 'title': '', 'source': 'docs/MachineLearning-Lecture01.pdf', 'total_pages': 22, 'page': 0, 'page_label': '1'}

{'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2008-07-11T11:25:23-07:00', 'author': '', 'moddate': '2008-07-11T11:25:23-07:00', 'title': '', 'source': 'docs/MachineLearning-Lecture01.pdf', 'total_pages': 22, 'page': 0, 'page_label': '1'}


### Context-aware splitting

In [43]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Small markdown sample with three heading levels. Adjacent literals are
# concatenated; the lone "\n" closing the "Lance" piece is a real line break
# that sits directly before "## Chapter 2".
markdown_document = (
    "# Title\n\n "
    "## Chapter 1\n\n "
    "Hi this is Jim\n\n Hi this is Joe\n\n "
    "### Section \n\n "
    "Hi this is Lance \n\n \n"
    "## Chapter 2\n\n "
    "Hi this is Molly"
)

# (markdown prefix, metadata key) pairs for heading depths 1 to 3:
# ("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3").
headers_to_split_on = [("#" * depth, f"Header {depth}") for depth in range(1, 4)]

# Build the header-based splitter and run it over the sample document.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)

# First two resulting splits, with a starred divider line between them.
print(
    md_header_splits[0],
    "***********************",
    md_header_splits[1],
    sep="\n",
)

page_content='Hi this is Jim  
Hi this is Joe' metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'}
***********************
page_content='Hi this is Lance' metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'}


In [None]:
# Load from the docs/ directory with the Notion loader (rebinds `loader` and `docs`),
# then merge every page_content into one space-joined string.
loader = NotionDirectoryLoader("docs/")
docs = loader.load()
txt = " ".join(doc.page_content for doc in docs)

# Only "#" and "##" headings are configured as split points here.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [45]:
# Run the header splitter over `txt`; the result replaces the earlier md_header_splits.
md_header_splits = markdown_splitter.split_text(txt)

In [50]:
# Iterating a split yields (field, value) pairs; print each pair followed by a
# blank line, then a starred divider and another blank line, for the first five splits.
for section in md_header_splits[:5]:
    for field, value in section:
        print(field, ": ", value, end="\n\n")
    print("**************************", end="\n\n")

id :  None

metadata :  {'Header 1': "Blendle's Employee Handbook"}

page_content :  This is a living document with everything we've learned working with people while running a startup. And, of course, we continue to learn. Therefore it's a document that will continue to change.  
**Everything related to working at Blendle and the people of Blendle, made public.**  
These are the lessons from three years of working with the people of Blendle. It contains everything from [how our leaders lead](https://www.notion.so/ecfb7e647136468a9a0a32f1771a8f52?pvs=21) to [how we increase salaries](https://www.notion.so/Salary-Review-e11b6161c6d34f5c9568bb3e83ed96b6?pvs=21), from [how we hire](https://www.notion.so/Hiring-451bbcfe8d9b49438c0633326bb7af0a?pvs=21) and [fire](https://www.notion.so/Firing-5567687a2000496b8412e53cd58eed9d?pvs=21) to [how we think people should give each other feedback](https://www.notion.so/Our-Feedback-Process-eb64f1de796b4350aeab3bc068e3801f?pvs=21) — and much more.  
W