### Text Splitters

#### Splitting by character

In [11]:
filepath = "../../Sessions_Part2\datasets\Harry Potter 1 - Sorcerer_s Stone.txt"
with open(filepath,'r') as f:
    hp_book = f.read()
    
print("Number of character letters in the document: ", len(hp_book))
print("Number of words in the document: ", len(hp_book.split()))
print("Number of lines in the document: ", len(hp_book.split("\n")))

Number of character letters in the document:  439742
Number of words in the document:  78451
Number of lines in the document:  10703


In [18]:
from collections import Counter
line_len_list = []

for line in hp_book.split("\n"):
    curr_line_len = len(line)
    line_len_list.append(curr_line_len)
    
print(len(line_len_list))
    
Counter(line_len_list) # there are 881 chunks with 71 length, similarly there are 57 chunks with 37 length

10703


Counter({0: 3057,
         71: 881,
         72: 864,
         70: 830,
         69: 710,
         68: 562,
         67: 385,
         66: 285,
         65: 170,
         64: 135,
         63: 89,
         32: 63,
         26: 63,
         9: 62,
         62: 61,
         22: 60,
         31: 59,
         37: 57,
         40: 56,
         35: 56,
         43: 56,
         7: 54,
         36: 53,
         6: 52,
         19: 52,
         34: 52,
         8: 52,
         53: 51,
         27: 50,
         42: 50,
         46: 50,
         13: 50,
         52: 50,
         23: 49,
         21: 48,
         24: 48,
         47: 47,
         18: 47,
         61: 47,
         33: 46,
         16: 46,
         45: 45,
         60: 45,
         41: 45,
         29: 45,
         48: 45,
         17: 44,
         30: 44,
         38: 44,
         28: 44,
         44: 43,
         5: 43,
         10: 42,
         20: 42,
         51: 41,
         54: 41,
         25: 41,
         14: 40,
         

#### Character Text Splitter

##### Splitting the text at a specific character only if the chunk exceeds the given chunk size

In [45]:
from langchain.text_splitter import CharacterTextSplitter

def len_func(text):
    return len(text)

text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size = 1200,
    chunk_overlap = 100,
    length_function = len_func,
    is_separator_regex= False
    
)

para_list = text_splitter.create_documents(texts = [hp_book])
para_list

[Document(page_content="Harry Potter and the Sorcerer's Stone\n\n\nCHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last\npeople you'd expect to be involved in anything strange or mysterious,\nbecause they just didn't hold with such nonsense.\n\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did\nhave a very large mustache. Mrs. Dursley was thin and blonde and had\nnearly twice the usual amount of neck, which came in very useful as she\nspent so much of her time craning over garden fences, spying on the\nneighbors. The Dursleys had a small son called Dudley and in their\nopinion there was no finer boy anywhere."),
 Document(page_content="The Dursleys had everything they wanted, but they also had a secret, and\ntheir greatest fear was that somebody would discover it. They didn

##### To add metadata for the document objects

In [46]:
first_chunk = para_list[0]

first_chunk.metadata = {"source":filepath}
first_chunk.metadata

{'source': '../../Sessions_Part2\\datasets\\Harry Potter 1 - Sorcerer_s Stone.txt'}

##### What if the text exceeds the chunk length and there is not separator to chunk the text?

In [47]:
# adding an extra line
extra_line = " ".join(["word"] * 500)

para_list = text_splitter.create_documents(texts = [extra_line + hp_book])

first_chunk_text = para_list[0].page_content

len(first_chunk_text)

Created a chunk of size 2536, which is longer than the specified 1200


2536

Can we add multiple separators to make it working better?

That's where Recursive Character Text Splitter comes in.

#### Recursive Character Splitter

It tries to split on them in order until the chunks are small enough. The default list is ["\n\n", "\n", " ", ""].( Order - First paragraphs, then sentences, then words, then characters)

In [48]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n","\n", " "],
    chunk_size = 200,
    chunk_overlap = 100,
    length_function = len_func,
    is_separator_regex=False
)

# Here, the split first happens at "\n\n", if the chunk size exceeds, it will move to the next separator, if it still exceeds, it will move to the next separator which is a " ".

chunk_list = text_splitter.create_documents(texts = [hp_book])

chunk_list

[Document(page_content="Harry Potter and the Sorcerer's Stone\n\n\nCHAPTER ONE\n\nTHE BOY WHO LIVED"),
 Document(page_content='Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last'),
 Document(page_content="that they were perfectly normal, thank you very much. They were the last\npeople you'd expect to be involved in anything strange or mysterious,\nbecause they just didn't hold with such nonsense."),
 Document(page_content='Mr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did'),
 Document(page_content='drills. He was a big, beefy man with hardly any neck, although he did\nhave a very large mustache. Mrs. Dursley was thin and blonde and had'),
 Document(page_content='have a very large mustache. Mrs. Dursley was thin and blonde and had\nnearly twice the usual amount of neck, which came in very useful as she'),
 D

In [49]:
chunk_list = text_splitter.create_documents(texts = [extra_line + hp_book]) # Adding the extra line

chunk_list

[Document(page_content='word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word'),
 Document(page_content='word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word'),
 Document(page_content='word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word'),
 Document(page_content='word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word'),
 Document(page_content='word word word word word word word word word word word word word wor

#### Split by tokens

tiktoken is a python library developed by openAI to count the number of tokens in a string without making an API call.

In [50]:
!pip install tiktoken



You should consider upgrading via the 'C:\Users\HP\OneDrive\Desktop\LLMs_Intro\langchain_new_env\Scripts\python.exe -m pip install --upgrade pip' command.


In [51]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator = "\n\n",
    chunk_size = 1200,
    chunk_overlap = 100,
    is_separator_regex = False,
    model_name='text-embedding-3-small',
    encoding_name='text-embedding-3-small'
)

doc_list = text_splitter.create_documents([hp_book])
doc_list

[Document(page_content='Harry Potter and the Sorcerer\'s Stone\n\n\nCHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last\npeople you\'d expect to be involved in anything strange or mysterious,\nbecause they just didn\'t hold with such nonsense.\n\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did\nhave a very large mustache. Mrs. Dursley was thin and blonde and had\nnearly twice the usual amount of neck, which came in very useful as she\nspent so much of her time craning over garden fences, spying on the\nneighbors. The Dursleys had a small son called Dudley and in their\nopinion there was no finer boy anywhere.\n\nThe Dursleys had everything they wanted, but they also had a secret, and\ntheir greatest fear was that somebody would discover it. They didn\'t\nthink they could

The model name here refers to the model used for calculating the tokens.

To split the text and return the text chunks

In [55]:
line_list = text_splitter.split_text(hp_book)

line_list

['Harry Potter and the Sorcerer\'s Stone\n\n\nCHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last\npeople you\'d expect to be involved in anything strange or mysterious,\nbecause they just didn\'t hold with such nonsense.\n\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did\nhave a very large mustache. Mrs. Dursley was thin and blonde and had\nnearly twice the usual amount of neck, which came in very useful as she\nspent so much of her time craning over garden fences, spying on the\nneighbors. The Dursleys had a small son called Dudley and in their\nopinion there was no finer boy anywhere.\n\nThe Dursleys had everything they wanted, but they also had a secret, and\ntheir greatest fear was that somebody would discover it. They didn\'t\nthink they could bear it if anyone fou

If you want to convert the split text back to list of document objects

In [58]:

from langchain.docstore.document import Document

doc_list = []
for line in line_list:
    curr_doc = Document(page_content = line, metadata = {"source":filepath})
    doc_list.append(curr_doc)
doc_list

[Document(page_content='Harry Potter and the Sorcerer\'s Stone\n\n\nCHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last\npeople you\'d expect to be involved in anything strange or mysterious,\nbecause they just didn\'t hold with such nonsense.\n\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did\nhave a very large mustache. Mrs. Dursley was thin and blonde and had\nnearly twice the usual amount of neck, which came in very useful as she\nspent so much of her time craning over garden fences, spying on the\nneighbors. The Dursleys had a small son called Dudley and in their\nopinion there was no finer boy anywhere.\n\nThe Dursleys had everything they wanted, but they also had a secret, and\ntheir greatest fear was that somebody would discover it. They didn\'t\nthink they could

#### Code Splitting

Let's learn a generic way of splitting code that's written in any language. For this let's convert the previous peer_review function code into text.

In [59]:
python_code = """def peer_review(article_id):
    chat = ChatOpenAI()
    loader = ArxivLoader(query=article_id, load_max_docs=2)
    data = loader.load()
    first_record = data[0]
    page_content = first_record.page_content
    title = first_record.metadata['Title']
    summary = first_record.metadata['Summary']

    summary_list = []
    for record in data:
        summary_list.append(record.metadata['Summary'])
    full_summary = "\n\n".join(summary_list)

    system_template = "You are a Peer Reviewer"
    human_template = "Read the paper with the title: '{title}'\n\nAnd Content: {content} and critically list down all the issues in the paper"

    systemp_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    chat_prompt = ChatPromptTemplate.from_messages([systemp_message_prompt, human_message_prompt])
    prompt = chat_prompt.format_prompt(title=title, content=page_content)

    response = chat(messages = prompt.to_messages())

    return response.content"""
    

In [60]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size = 50,
    chunk_overlap = 10
)

text_splitter.create_documents(texts = [python_code])

[Document(page_content='def peer_review(article_id):'),
 Document(page_content='chat = ChatOpenAI()'),
 Document(page_content='loader = ArxivLoader(query=article_id,'),
 Document(page_content='load_max_docs=2)'),
 Document(page_content='data = loader.load()'),
 Document(page_content='first_record = data[0]'),
 Document(page_content='page_content = first_record.page_content'),
 Document(page_content="title = first_record.metadata['Title']"),
 Document(page_content="summary = first_record.metadata['Summary']"),
 Document(page_content='summary_list = []\n    for record in data:'),
 Document(page_content="summary_list.append(record.metadata['Summary'])"),
 Document(page_content='full_summary = "'),
 Document(page_content='".join(summary_list)'),
 Document(page_content='system_template = "You are a Peer Reviewer"'),
 Document(page_content='human_template = "Read the paper with the'),
 Document(page_content="with the title: '{title}'"),
 Document(page_content='And Content: {content} and crit