In [1]:
import os
from typing import List, Dict, Any
import pandas as pd




In [2]:
# Document is the container type used throughout this notebook
# (text in `page_content`, a free-form dict in `metadata`).
from langchain_core.documents import Document
# The three splitter classes compared in the "Text Splitting" section.
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)

# Reaching this line means every import above resolved.
print("setup is completed!")

  from .autonotebook import tqdm as notebook_tqdm


setup is completed!


## Understanding Document structure in LangChain

In [6]:
## create a simple doc
# A Document pairs the raw text (page_content) with a free-form metadata dict.
sample_metadata = {
    "source": "sample_source",
    "page": 1,
    "author": "Test Author",
    "date_created": "2024-06-01",
}
doc = Document(
    page_content="This is a sample document for testing purposes.",
    metadata=sample_metadata,
)

# Show both halves of the document, one item per line.
print(
    "Document structure",
    f"Content: {doc.page_content}",
    f"Metadata: {doc.metadata}",
    sep="\n",
)


Document structure
Content: This is a sample document for testing purposes.
Metadata: {'source': 'sample_source', 'page': 1, 'author': 'Test Author', 'date_created': '2024-06-01'}


In [7]:
# Inspect the concrete class of the object built in the previous cell.
type(doc)

langchain_core.documents.base.Document

## Reading Text files (.txt)

In [8]:
## create a simple dir
# Folder that will hold the sample .txt files; exist_ok=True makes the
# cell safe to re-run when the folder is already there.
text_dir = "data/text_files"
os.makedirs(text_dir, exist_ok=True)

In [9]:
# Sample corpus: maps each target file path to the exact text written into it.
# NOTE: the second entry's leading space and four-space indentation are inside
# the string literal, so they are written to the file verbatim.
sample_texts = {
    "data/text_files/python_intro.txt": """Anyone can learn Python programming.
Python is a versatile language used for web development, data analysis, artificial intelligence, and more.
It has a simple syntax that makes it easy to read and write code.
It has a large community and extensive libraries that support various applications.
It is a great language for beginners and experienced developers alike.""",

    "data/text_files/machine_learning.txt": """ Machine learning is a subset of artificial intelligence that focuses on building systems that can learn from data.
    It involves algorithms that improve their performance as they are exposed to more data over time.
    Common machine learning tasks include classification, regression, clustering, and recommendation.
    Machine learning is widely used in various applications such as image recognition, natural language processing, and predictive analytics.
    Types of machine learning include supervised learning, unsupervised learning, and reinforcement learning.""",
}

for file_path, content in sample_texts.items():
    # Create the parent folder on demand so this cell also works when the
    # directory-creation cell has not been executed (or the folder was removed).
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)

print(" Sample text files created.")

 Sample text files created.


## TextLoader - Read single file.

In [10]:

from langchain_community.document_loaders import TextLoader

## Loading single text file
# load() hands back a list; here it is indexed at position 0 for the preview.
loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")
documents = loader.load()

print(f"Loaded {len(documents)} document(s)")

first_document = documents[0]
preview = first_document.page_content[:100]  # first 100 characters only
print(f"content preview: {preview}...")
print(f"metadata: {first_document.metadata}")


Loaded 1 document(s)
content preview: Anyone can learn Python programming.
Python is a versatile language used for web development, data a...
metadata: {'source': 'data/text_files/python_intro.txt'}


### Directory Loader - Multiple Text files.


In [12]:
from langchain_community.document_loaders import DirectoryLoader

# Load every .txt file found under data/text_files with a single loader.
dir_loader = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",                      # pattern to match files
    loader_cls=TextLoader,                # loader class applied to each match
    loader_kwargs={"encoding": "utf-8"},  # forwarded to loader_cls
    show_progress=True,                   # show loading progress
)
docs = dir_loader.load()

print(f"Loaded {len(docs)} document(s) from directory 'data/text_files'")

# Summarise each loaded document: where it came from and how long it is.
for position, document in enumerate(docs, start=1):
    source_path = document.metadata['source']
    char_count = len(document.page_content)
    print(f"\n Document {position}:")
    print(f" source : {source_path}")
    print(f" Length : {char_count} characters")


100%|██████████| 2/2 [00:00<00:00, 2890.63it/s]

Loaded 2 document(s) from directory 'data/text_files'

 Document 1:
 source : data/text_files/python_intro.txt
 Length : 364 characters

 Document 2:
 source : data/text_files/machine_learning.txt
 Length : 571 characters





### Directory loader characteristics :

#### Advantages:
- Loads multiple files at once
- Supports glob patterns
- Progress tracking (`show_progress=True`)
- Recursive directory scanning

#### Disadvantages:
- All files must be of the same type (one `loader_cls` is applied to every match)
- Limited error handling per file
- Can be memory intensive for large directories



## Text Splitting



In [14]:
## Different text splitting strategies

# NOTE(review): these three classes are already imported in the setup cell
# near the top of the notebook; this re-import is harmless but redundant.
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter
)

# Show the list produced by the single-file TextLoader cell, i.e. the raw
# input that the splitters below will work on.
print(documents)

[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='Anyone can learn Python programming.\nPython is a versatile language used for web development, data analysis, artificial intelligence, and more.\nIt has a simple syntax that makes it easy to read and write code.\nIt has a large community and extensive libraries that support various applications.\nIt is a great language for beginners and experienced developers alike.')]


### 1. Character text splitting

Character splitting means dividing your text into pieces based on a set number of characters, regardless of what the text says. It’s a simple way to start understanding splitting, but it’s not great for real use.

Pros: Easy & Simple

Cons: Doesn’t consider text structure well




In [21]:
## character text splitter

# Pull the plain string out of the first loaded Document; every splitter
# below is called with this string via split_text().
text = documents[0].page_content
# Bare expression so the notebook displays the value as the cell output.
text

'Anyone can learn Python programming.\nPython is a versatile language used for web development, data analysis, artificial intelligence, and more.\nIt has a simple syntax that makes it easy to read and write code.\nIt has a large community and extensive libraries that support various applications.\nIt is a great language for beginners and experienced developers alike.'

In [24]:
print("Character Text Splitter")

# Split on newlines only, then pack the pieces into chunks of at most
# 200 characters (measured with len) with a 20-character overlap.
char_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
)
char_chunks = char_splitter.split_text(text)

print(f"Created {len(char_chunks)} chunks ")

# Preview only the first 100 characters of the first chunk.
first_char_chunk = char_chunks[0]
print(f"first chunk : {first_char_chunk[:100]}...")


Character Text Splitter
Created 3 chunks 
first chunk : Anyone can learn Python programming.
Python is a versatile language used for web development, data a...


In [25]:
# Print every chunk with a divider line between neighbours.
# Iterating (instead of indexing 0, 1, 2) keeps this cell working when the
# splitter settings change and produce fewer or more than three chunks.
chunk_divider = "--------------------------------"
for position, chunk in enumerate(char_chunks):
    if position:  # no divider before the first chunk
        print(chunk_divider)
    print(chunk)


Anyone can learn Python programming.
Python is a versatile language used for web development, data analysis, artificial intelligence, and more.
--------------------------------
It has a simple syntax that makes it easy to read and write code.
It has a large community and extensive libraries that support various applications.
--------------------------------
It is a great language for beginners and experienced developers alike.


## 2. Recursive Character Text Splitter

Character splitting has a flaw: it doesn’t consider how our document is organized. We just split it by a set number of characters.

The Recursive Character Text Splitter fixes this.
We give it an ordered list of separators, and it tries each one in turn, falling back to the next only when a piece is still larger than the chunk size:

- `"\n\n"` — paragraph breaks
- `"\n"` — new lines
- `" "` — spaces
- `""` — individual characters

In [26]:
# Recursive Character Text Splitter
print("Recursive Character Text Splitter")

# Separators are listed from coarsest to finest: paragraph break, newline,
# space, and finally the empty string (individual characters).
separator_hierarchy = ["\n\n", "\n", " ", ""]
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=separator_hierarchy,
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
)
recursive_chunks = recursive_splitter.split_text(text)

print(f"Created {len(recursive_chunks)} chunks ")

# Preview only the first 100 characters of the first chunk.
first_recursive_chunk = recursive_chunks[0]
print(f"first chunk : {first_recursive_chunk[:100]}...")


Recursive Character Text Splitter
Created 3 chunks 
first chunk : Anyone can learn Python programming.
Python is a versatile language used for web development, data a...


In [27]:
# Print every chunk with a divider line between neighbours.
# Iterating (instead of indexing 0, 1, 2) keeps this cell working when the
# splitter settings change and produce fewer or more than three chunks.
chunk_divider = "--------------------------------"
for position, chunk in enumerate(recursive_chunks):
    if position:  # no divider before the first chunk
        print(chunk_divider)
    print(chunk)


Anyone can learn Python programming.
Python is a versatile language used for web development, data analysis, artificial intelligence, and more.
--------------------------------
It has a simple syntax that makes it easy to read and write code.
It has a large community and extensive libraries that support various applications.
--------------------------------
It is a great language for beginners and experienced developers alike.


In [33]:
# Text containing no newlines, so a space is the only separator offered to
# the splitter below.
simple_text = " In this world of AI, we are witnessing rapid advancements in technology. we see AI being integrated into various industries, from healthcare to finance. The potential for AI to revolutionize the way we live and work is immense. However, it also raises important ethical considerations that need to be addressed. As we continue to develop AI technologies, it is crucial to ensure that they are used responsibly and for the benefit of all humanity."

splitter = RecursiveCharacterTextSplitter(
    separators=[" "],
    chunk_size=80,
    chunk_overlap=20,
    length_function=len,
)
chunks = splitter.split_text(simple_text)

print(f"\nsimple text example - {len(chunks)} chunks : \n")

# Walk through neighbouring chunks in pairs so the text shared between the
# end of one chunk and the start of the next can be compared side by side.
adjacent_pairs = zip(chunks, chunks[1:])
for number, (current_chunk, following_chunk) in enumerate(adjacent_pairs, start=1):
    print(f"Chunk {number} : '{current_chunk}' ")
    print(f"Chunk {number + 1} : '{following_chunk}' ")
    print("--------------------------------")


print()




simple text example - 7 chunks : 

Chunk 1 : 'In this world of AI, we are witnessing rapid advancements in technology. we see' 
Chunk 2 : 'technology. we see AI being integrated into various industries, from healthcare' 
--------------------------------
Chunk 2 : 'technology. we see AI being integrated into various industries, from healthcare' 
Chunk 3 : 'from healthcare to finance. The potential for AI to revolutionize the way we' 
--------------------------------
Chunk 3 : 'from healthcare to finance. The potential for AI to revolutionize the way we' 
Chunk 4 : 'the way we live and work is immense. However, it also raises important ethical' 
--------------------------------
Chunk 4 : 'the way we live and work is immense. However, it also raises important ethical' 
Chunk 5 : 'important ethical considerations that need to be addressed. As we continue to' 
--------------------------------
Chunk 5 : 'important ethical considerations that need to be addressed. As we continue to' 
Chunk 6

## 3. Token Text Splitter

A Token Text Splitter is a specialized text splitting utility designed to divide large documents into smaller pieces, or chunks, where the size of those chunks is measured by the count of tokens, rather than the number of characters.

This method is crucial for Large Language Model (LLM) applications, especially in Retrieval-Augmented Generation (RAG), because LLMs have a strict limit on the number of tokens they can process in their context window (for example, 4096 or 8192 tokens, depending on the model).

In [34]:
# Token Text Splitter
print("Token Text Splitter")

# Both limits are expressed in tokens: at most 50 per chunk, with 10 shared
# between consecutive chunks.
token_limits = {"chunk_size": 50, "chunk_overlap": 10}
token_splitter = TokenTextSplitter(**token_limits)
token_chunks = token_splitter.split_text(text)

print(f"Created {len(token_chunks)} chunks ")

# Preview only the first 100 characters of the first chunk.
first_token_chunk = token_chunks[0]
print(f"first chunk : {first_token_chunk[:100]}...")


Token Text Splitter
Created 2 chunks 
first chunk : Anyone can learn Python programming.
Python is a versatile language used for web development, data a...


In [35]:
# Print every chunk, each followed by a divider line.
# Iterating (instead of indexing 0 and 1) keeps this cell working when the
# token limits change and produce a different number of chunks.
chunk_divider = "--------------------------------"
for chunk in token_chunks:
    print(chunk)
    print(chunk_divider)

Anyone can learn Python programming.
Python is a versatile language used for web development, data analysis, artificial intelligence, and more.
It has a simple syntax that makes it easy to read and write code.
It has a large community and extensive
--------------------------------
 code.
It has a large community and extensive libraries that support various applications.
It is a great language for beginners and experienced developers alike.
--------------------------------


## comparison

### Comparison of different text splitting strategies

#### Character Text Splitter:
- Simple and effective for basic splitting tasks.
- Good for structured text with clear delimiters.
- May break mid-sentence.

Use When : Simple text with clear delimiters

#### Recursive Character Text Splitter:
- More sophisticated; respects natural text boundaries.
- Better for complex or unstructured text.
- Best general-purpose splitter.
- Tries multiple separators in order.
- Slightly more complex than the basic character splitter.

Use When : Default choice for most text splitting needs

#### Token Text Splitter:
- Respects model token limits accurately.
- Accurate for embedding and language model tasks.
- Slower than character-based splitters due to the tokenization step.

Use When : Working with token-limited models or embeddings



