In [1]:
from tqdm.notebook import tqdm

# Stage 0

In [2]:
import pdfplumber

file_path = "book1.pdf"
pages = []

pdf = pdfplumber.open(file_path)
length = len(pdf.pages)
print("Number of pages is : ", length)

with pdfplumber.open(file_path) as pdf:
    for i, page in tqdm(enumerate(pdf.pages, start=1)):
        text = page.extract_text()
        pages.append({"page_number":i, "text":text})

print("Example of a page text would be -> \n", pages[210]["text"][:200])

text = pages[200]["text"][:350]

Number of pages is :  228


0it [00:00, ?it/s]

Example of a page text would be -> 
 ‚Äî CHAPTER SEVENTEEN ‚Äî
The Man with Two Faces
It was Quirrell.
‚ÄòYou!‚Äô gasped Harry.
Quirrell smiled. His face wasn‚Äôt twitching at all.
‚ÄòMe,‚Äô he said calmly. ‚ÄòI wondered whether I‚Äôd be meeting you
here,


Need to detect Chapter Boundaries, Pages : Paragraphs separately ? Will see if frameworks internal tooling not strong enough

* https://docs.langchain.com/oss/python/integrations/splitters

## Programmatic chunking

In [3]:
from langchain_text_splitters import CharacterTextSplitter #Splitting text based on characters 

splitter = CharacterTextSplitter(chunk_size=50, chunk_overlap=5)
chunks = splitter.split_text(text)
print(chunks[0:2])

['Through the Trapdoor 199\ntogether so he couldn‚Äôt speak. Only his eyes were moving, looking\nat them in horror.\n‚ÄòWhat‚Äôve you done to him?‚Äô Harry whispered.\n‚ÄòIt‚Äôs the full Body-Bind,‚Äô said Hermione miserably. ‚ÄòOh, Neville,\nI‚Äôm so sorry.‚Äô\n‚ÄòWe had to, Neville, no time to explain,‚Äô said Harry.\n‚ÄòYou‚Äôll understand later, Neville,‚Äô said Ron, as they stepped']


In [4]:
from langchain_text_splitters import TokenTextSplitter

splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=0)
chunks = splitter.split_text(text)
print(chunks)

#Works on Tiktoken library, developed by openAI. So really helpful to decide the number of tokens for each chunk, could also specify the model name if required. Not sure how well it translates to other open source models.

['Through the Trapdoor 199\ntogether so he couldn‚Äôt speak. Only his eyes were moving, looking\nat them in horror.\n‚ÄòWhat‚Äôve you done to him?‚Äô Harry whispered.\n‚ÄòIt‚Äôs the full Body-Bind,‚Äô said Hermione miserably. ‚ÄòOh, Neville,\nI‚Äôm so sorry.‚Äô\n‚ÄòWe had to, Neville, no time to explain,‚Äô said Harry.\n‚ÄòYou‚Äôll understand later, Neville,‚Äô said Ron, as they stepped']


* ‚ùî Could just do it hardcoded instead of library.
* ‚ùî Consistency for all models or is it for only OpenAI models
* ‚úîÔ∏è Easy to implement
* ‚úîÔ∏è Can preserve partial context by overlapping chunks
* ‚úîÔ∏è Good for explarotary analysis and quick preprocessing
* ‚úîÔ∏è Simple keyword matching
* ‚ùå Ignores meaning - may cut sentences completely

* üóíÔ∏è These are Length Based strategies 

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=120, chunk_overlap=10)
chunks = splitter.split_text(text)
print(chunks)

#For languages like Japanese which do not have defined word boundaries would need to change the seperators to custom ones

['Through the Trapdoor 199\ntogether so he couldn‚Äôt speak. Only his eyes were moving, looking\nat them in horror.', '‚ÄòWhat‚Äôve you done to him?‚Äô Harry whispered.\n‚ÄòIt‚Äôs the full Body-Bind,‚Äô said Hermione miserably. ‚ÄòOh, Neville,', 'I‚Äôm so sorry.‚Äô\n‚ÄòWe had to, Neville, no time to explain,‚Äô said Harry.', '‚ÄòYou‚Äôll understand later, Neville,‚Äô said Ron, as they stepped']


* üóíÔ∏è Text Structure Based Strategy
* üóíÔ∏è Attempst to keep largert units like paragraphs as intact as possible. If unit exceeds chunk size it moves to next level (sentences), process continues to word level if necessary
* ‚ùå Will most probably produce variable sized chunks that are harder to manage/index.
* ‚ùå Slightly more complex 
* ‚úîÔ∏è Handles nested structures like paragraphs and/or sections 
* ‚úîÔ∏è Better context handling than fixed size splitting
* ‚úîÔ∏è Tries to adpat to different levels of text granularity and create splits that maintain natural language flow

In [6]:
headers_to_split_on = [
    ("#", "Header1"),
    ('##', 'Header2')
]

from langchain_text_splitters import MarkdownHeaderTextSplitter
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
chunks = splitter.split_text(text)

import json 
from langchain_text_splitters import RecursiveJsonSplitter 

from langchain_text_splitters import Language
from langchain_text_splitters import RecursiveCharacterTextSplitter
python_splitter = RecursiveCharacterTextSplitter.from_language(language=Language.PYTHON, chunk_size=50, chunk_overlap=0)

from langchain_text_splitters import HTMLHeaderTextSplitter
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3")
]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
chunks = html_splitter.split_text("Text ll be here")

* ‚úèÔ∏è Have not done anything on the previous examples as its related to Markdown, HTML Tags or creating splits based on code functions which are not in the scope of the novel pdf.

## Embedding based chunking

In [7]:
from sentence_transformers import SentenceTransformer
import torch
import nltk 
import numpy as np 
from nltk.tokenize import sent_tokenize

nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\prish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Semantic Chunking

* Semantic chunking is about splitting text based on meaning and not fixed sizes
* So the split should happen when the topic "changes" or when the max length for each chunk is reached
* So fixed chunks with multiple meanings should be solved 
* ‚ùå getting long paragraphs with multiple meanings and contexts
* poor chunks ‚ùå with irrelevant boundaries

In [8]:
model_id = "KaLM-Embedding/KaLM-embedding-multilingual-mini-instruct-v2.5"
model_arguments = {"torch_dtype":torch.bfloat16,
                   "device_map":"cuda"
                   #"attn_implementation":"flash_attention_2"
                   }

model = SentenceTransformer(model_id,
                            cache_folder = "models",
                            model_kwargs = model_arguments)

`torch_dtype` is deprecated! Use `dtype` instead!


In [9]:
with open("book.txt", "w") as w:
    for i in tqdm(pages):
        w.write(i["text"])

text = open("book.txt").read()
sentences = sent_tokenize(text)
print("Number of sentences are ", len(sentences))

embs = model.encode(sentences, 
                    convert_to_numpy = True, show_progress_bar=True)

print(f"Every embedding is of shape {embs.shape}")

  0%|          | 0/228 [00:00<?, ?it/s]

Number of sentences are  4863


Batches:   0%|          | 0/152 [00:00<?, ?it/s]

Every embedding is of shape (4863, 896)


In [10]:
def cosine(a, b):
    cosine_similarity = np.dot(a,b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return cosine_similarity

similarities = []
for i in tqdm(range(len(embs)-1)):
    cosine_similarity = cosine(embs[i], embs[i+1])
    similarities.append(cosine_similarity)

  0%|          | 0/4862 [00:00<?, ?it/s]

In [11]:
boundaries = [0] #we ll start from 0 as the starting boundary
threshold = 0.70
for index, similarity in enumerate(similarities):
    if similarity < threshold:
        boundaries.append(index+1)

boundaries.append(len(sentences)) #Adding the last boundary limit

In [12]:
chunks = []
for i in range(len(boundaries)-1):
    start = boundaries[i]
    end = boundaries[i+1]
    chunk = " ".join(sentences[start:end])
    chunks.append(chunk)
print("Number of chunks created are -> ", len(chunks))

Number of chunks created are ->  3198


In [13]:
sentences[0:3]

['When a letter arrives for unhappy but\nordinary Harry Potter, a decade-old secret\nis revealed to him.',
 'His parents were\nwizards, killed by a Dark Lord‚Äôs curse\nwhen Harry was just a baby, and which he\nsomehow survived.',
 'Escaping from his\nunbearable Muggle guardians to Hogwarts,\na wizarding school brimming with ghosts\nand enchantments, Harry stumbles into a\nsinister adventure when he finds a three-\nheaded dog guarding a room on the third\nfloor.']

### Agentic Chunking

* Use an LLM to :-
    * Read the document
    * Understand it
    * Understand the structure, topics, boundaries etc. 
    * Determine how to chunk adaptively 
    * Sometime generate hierarchial or multilayer chunks 
    * Chunk differently depending on the end use case

* Call it kind of a __task aware__ chunking

In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
torch.random.manual_seed(0)

model_name = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir = "models")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir = "models",
    dtype="auto",
    device_map="auto"
) 
#Context length for this odel is 32,768


In [20]:
prompt = """You are an expert text segmentation agent. 
Your task is to read the given text and split it into meaningful, semantically coherent chunks.

Rules :- 
1. Use natural boundaries 
2. No fixed size 
3. Consider topic changes 
4. Keep chunks at max size of 1000 tokens
5. Output format should be in a Json format 

[{"chunk" : "1", 
"text" : "..."} ]

The text to be read is given below 

"""

In [19]:
#Taking an assumption that tokens ~ number of chars/4

chars_per_chunk = 20000 # 20000/4 = 5000 Tokens approx
current = []
current_len = 0
chunks = []

for index, sent in enumerate(sentences):
    current.append(sent)
    current_len += len(sent)
    if(current_len >= 7000):
        chunks.append(" ".join(current))
        current_len = 0
        current = []

print("Number of chunks are ", len(chunks))

Number of chunks are  62


In [23]:
results = []
for i in tqdm(range(len(chunks))):
    text = chunks[i]
    final_prompt = prompt + "\n" + text
    messages = [{"role":"user", "content":final_prompt}]

    text = tokenizer.apply_chat_template(
                                        messages,
                                        tokenize=False,
                                        add_generation_prompt=True,
                                        enable_thinking=True # Switches between thinking and non-thinking modes. Default is True
                                        )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(**model_inputs, 
                                   max_new_tokens=32768)
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 
    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
    try:
        # rindex finding 151668 (</think>)
        index = len(output_ids) - output_ids[::-1].index(151668)
    except ValueError:
        index = 0

    thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
    results.append(content)
    break


  0%|          | 0/62 [00:00<?, ?it/s]

In [32]:
json.loads(results[0])

[{'chunk': '1',
  'text': "When a letter arrives for unhappy but ordinary Harry Potter, a decade-old secret is revealed to him. His parents were wizards, killed by a Dark Lord‚Äôs curse when Harry was just a baby, and which he somehow survived. Escaping from his unbearable Muggle guardians to Hogwarts, a wizarding school brimming with ghosts and enchantments, Harry stumbles into a sinister adventure when he finds a three-headed dog guarding a room on the third floor. Then he hears of a missing stone with astonishing powers which could be valuable, dangerous, or both. 'Funny, imaginative, magical ... Rowling has woken up a whole generation to reading. In the 2020s, thirty-something book-lovers will know each other by smug references to Diagon Alley and Quidditch' The Times 'This is a terrific book' Sunday Telegraph 'Has all the makings of a classic ... Rowling uses classic narrative devices with flair and originality and delivers a complex and demanding plot in the form of a hugely ente

* Need to evaluate more on the context length and type of llm model
* Basic gist is that it should be able to receive the large text that is snippets or parts from the book and create chunks out of it
* The json format would make it easier further downstream to handle the tasks