-
Notifications
You must be signed in to change notification settings - Fork 0
/
document_chunks.py
49 lines (38 loc) · 1.83 KB
/
document_chunks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
### Retrieve only the pieces of document chunks that are relevant to the query, because the context window of LLMs is limited.
### Different ways to split the documents:
#### Characters, tokens, or context-aware splitting such as the Markdown header splitter.
### Parameters that need to be tuned: separator, chunk size, chunk overlap, length function, etc.
from langchain.text_splitter import MarkdownTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
def markdown_text_splitter(documents, chunk_size=100, chunk_overlap=0):
    """Split documents into chunks using Markdown-structure-aware splitting.

    Args:
        documents: Sequence of langchain Document objects to split.
        chunk_size: Maximum size of each chunk (default 100, unchanged).
        chunk_overlap: Overlap between adjacent chunks (default 0, unchanged).

    Returns:
        List of Document chunks produced by MarkdownTextSplitter.
    """
    # Renamed the local variable so it no longer shadows this function's name.
    splitter = MarkdownTextSplitter(chunk_size=chunk_size,
                                    chunk_overlap=chunk_overlap)
    return splitter.split_documents(documents)
def character_splitter(documents, chunk_size=1024, chunk_overlap=5, separator="\n"):
    """Split documents into fixed-size character chunks.

    Args:
        documents: Sequence of langchain Document objects to split.
        chunk_size: Maximum characters per chunk (default 1024, unchanged).
        chunk_overlap: Characters shared by adjacent chunks (default 5, unchanged).
        separator: Boundary string to split on (default newline, unchanged).

    Returns:
        List of Document chunks produced by CharacterTextSplitter.
    """
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size,
                                          chunk_overlap=chunk_overlap,
                                          separator=separator)
    return text_splitter.split_documents(documents)
def recursive_character_spliter(documents, chunk_size=512, chunk_overlap=5):
    """Split documents with the recursive character splitter.

    NOTE(review): the public name keeps its original spelling ("spliter")
    so existing callers are not broken.

    Args:
        documents: Sequence of langchain Document objects to split.
        chunk_size: Maximum characters per chunk (default 512, unchanged).
        chunk_overlap: Characters shared by adjacent chunks (default 5, unchanged).

    Returns:
        List of Document chunks produced by RecursiveCharacterTextSplitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)
def test_doc_splitter(document_chunks, page_index):
print("Number of document chunks created : ", len(document_chunks))
print("\n")
print("*** Testing document chunk at index : " + str(page_index) + " ***")
print("\n")
page = document_chunks[page_index]
print("chunk content :", page.page_content)
print("\n")
print("chunk metadata : ", page.metadata)
print("\n")
print("*** End resutls ****")