### Lesson 3: Document Splitting

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)

In [3]:
chunk_size = 26
chunk_overlap = 4

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [4]:
text1 = "abcdefghijklmnopqrstuvwxyz"

r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [5]:
text2 = "abcdefghijklmnopqrstuvwxyzabcdefg"

r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [6]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"

r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [7]:
c_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [8]:
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator=" ",
)

c_splitter.split_text(text=text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

### Recursive splitting details

In [9]:
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [10]:
len(some_text)

496

In [11]:
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator=" ",
)

c_splitter.split_text(some_text)

['When writing documents,',
 'writers will use document',
 'structure to group',
 'content. This can convey',
 'to the reader, which',
 "idea's are related. For",
 'For example, closely',
 'related ideas are in',
 'in sentances. Similar',
 'ideas are in paragraphs.',
 'Paragraphs form a',
 'a document. \n\n Paragraphs',
 'are often delimited with a',
 'a carriage return or two',
 'two carriage returns.',
 'Carriage returns are the',
 'the "backslash n" you see',
 'see embedded in this',
 'this string. Sentences',
 'have a period at the end,',
 'end, but also, have a',
 'a space.and words are',
 'are separated by space.']

In [12]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separators=["\n\n", "\n", " ", ""],
)

r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [13]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", " ", ""],
)

r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [14]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150, chunk_overlap=0, separators=["\n\n", "\n", r"(?<=\. )", " ", ""]
)

r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [15]:
from langchain.document_loaders import PyPDFLoader

In [16]:
loader = PyPDFLoader(file_path="./docs/cs229_lectures/MachineLearning-Lecture01.pdf")

pages = loader.load()

In [17]:
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len,
)

In [18]:
docs = text_splitter.split_documents(pages)

In [19]:
len(docs)

77

In [20]:
len(pages)

22

In [21]:
from langchain.document_loaders import NotionDirectoryLoader

In [22]:
loader = NotionDirectoryLoader("docs/Notion_DB")

notion_db = loader.load()

docs = text_splitter.split_documents(notion_db)

In [23]:
len(notion_db)

52

In [24]:
len(docs)

353

### Token splitting

In [25]:
from langchain.text_splitter import TokenTextSplitter

In [26]:
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [27]:
text1 = "foo bar bazzyfoo"

text_splitter.split_text(text=text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [28]:
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

In [29]:
docs = text_splitter.split_documents(documents=pages)

In [30]:
len(docs)

1557

In [31]:
docs[990]

Document(page_content=' this quarter is what if your \ndata doesn', metadata={'source': './docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 13})

In [32]:
pages[0].metadata

{'source': './docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 0}

### Context aware splitting

Chunking aims to keep text with common context together.

A text splitting often uses sentences or other delimiters to keep related text together but many documents (such as Markdown) have structure (headers) that can be explicitly used in splitting.

In [33]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [34]:
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n
## Chapter 2\n\n \
Hi this is Molly"""

In [35]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [36]:
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
)

md_header_splits = markdown_splitter.split_text(
    text=markdown_document,
)

In [37]:
md_header_splits[0]

Document(page_content='Hi this is Jim  \nHi this is Joe', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'})

In [38]:
md_header_splits[1]

Document(page_content='Hi this is Lance', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'})

In [39]:
md_header_splits[2]

Document(page_content='Hi this is Molly', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 2'})

In [40]:
docs = NotionDirectoryLoader("docs/Notion_DB").load()

txt = " ".join([d.page_content for d in docs])

In [41]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
)

In [42]:
md_header_splits = markdown_splitter.split_text(text=txt)

In [43]:
md_header_splits[0]

Document(page_content="This is a living document with everything we've learned working with people while running a startup. And, of course, we continue to learn. Therefore it's a document that will continue to change.  \n**Everything related to working at Blendle and the people of Blendle, made public.**  \nThese are the lessons from three years of working with the people of Blendle. It contains everything from [how our leaders lead](https://www.notion.so/ecfb7e647136468a9a0a32f1771a8f52?pvs=21) to [how we increase salaries](https://www.notion.so/Salary-Review-e11b6161c6d34f5c9568bb3e83ed96b6?pvs=21), from [how we hire](https://www.notion.so/Hiring-451bbcfe8d9b49438c0633326bb7af0a?pvs=21) and [fire](https://www.notion.so/Firing-5567687a2000496b8412e53cd58eed9d?pvs=21) to [how we think people should give each other feedback](https://www.notion.so/Our-Feedback-Process-eb64f1de796b4350aeab3bc068e3801f?pvs=21) â€” and much more.  \nWe've made this document public because we want to learn f