In [3]:
import os
import pandas as pd
from typing import List,Dict,Any

In [1]:
from langchain_core.documents import Document
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter,
)


In [2]:
# creating the Langchain Document Structure 

doc =Document(
    page_content="This is the page main content",
    metadata ={
        "source":"Bics_global",
        "page":1,
        "Created_by":"Hemanth"
    }
)
print(doc.page_content)
print(doc.metadata)

This is the page main content
{'source': 'Bics_global', 'page': 1, 'Created_by': 'Hemanth'}


In [3]:
# Reading the text file 

import os
os.makedirs("data/text_files",exist_ok=True)



In [4]:
sample_texts ={
    "data/text_files/python_intro.txt":'''Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python is widely used across many domains.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is commonly used in web development, data science, machine learning, artificial intelligence,
automation, and backend development.''',

"data/text_files/machine_learning.txt":'''Machine Learning Basics

Machine Learning is a subset of artificial intelligence that enables systems to learn from data
without being explicitly programmed.

Core Concepts:
- Supervised learning
- Unsupervised learning
- Reinforcement learning
- Model training and evaluation

Machine learning is used in recommendation systems, fraud detection, image recognition,
natural language processing, and predictive analytics.''',

"data/text_files/rag_overview.txt": """Retrieval-Augmented Generation (RAG)

RAG is an AI architecture that combines information retrieval with text generation.
It retrieves relevant documents from a knowledge base and uses them to generate accurate responses.

RAG Workflow:
- Load and preprocess documents
- Convert text into embeddings
- Store embeddings in a vector database
- Retrieve relevant chunks at query time
- Generate grounded responses using an LLM

RAG reduces hallucinations and improves factual accuracy in AI systems.
""",
}

for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)


In [None]:
from langchain_community.document_loaders import TextLoader

loader =TextLoader("data/text_files/machine_learning.txt",encoding="utf-8")
documents =loader.load()

print(documents)

[Document(metadata={'source': 'data/text_files/machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine Learning is a subset of artificial intelligence that enables systems to learn from data\nwithout being explicitly programmed.\n\nCore Concepts:\n- Supervised learning\n- Unsupervised learning\n- Reinforcement learning\n- Model training and evaluation\n\nMachine learning is used in recommendation systems, fraud detection, image recognition,\nnatural language processing, and predictive analytics.')]


In [9]:
from langchain_community.document_loaders import DirectoryLoader

dir_loader =DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={'encoding':'utf-8'},
    show_progress=True
)
doc =dir_loader.load()
print(doc)


docs = dir_loader.load()

for doc in docs:
    print(f" Source: {doc.metadata.get('source')}")
    print(f" Length: {len(doc.page_content)} characters")


100%|██████████| 3/3 [00:00<00:00, 735.11it/s]


[Document(metadata={'source': 'data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine Learning is a subset of artificial intelligence that enables systems to learn from data\nwithout being explicitly programmed.\n\nCore Concepts:\n- Supervised learning\n- Unsupervised learning\n- Reinforcement learning\n- Model training and evaluation\n\nMachine learning is used in recommendation systems, fraud detection, image recognition,\nnatural language processing, and predictive analytics.'), Document(metadata={'source': 'data\\text_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python is widely used across many domains.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is commonly used in

100%|██████████| 3/3 [00:00<00:00, 4285.73it/s]

 Source: data\text_files\machine_learning.txt
 Length: 420 characters
 Source: data\text_files\python_intro.txt
 Length: 495 characters
 Source: data\text_files\rag_overview.txt
 Length: 497 characters





In [10]:
from langchain_text_splitters import(
    CharacterTextSplitter,
    RecursiveJsonSplitter,
    TokenTextSplitter
)

In [11]:
print(documents)

[Document(metadata={'source': 'data/text_files/machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine Learning is a subset of artificial intelligence that enables systems to learn from data\nwithout being explicitly programmed.\n\nCore Concepts:\n- Supervised learning\n- Unsupervised learning\n- Reinforcement learning\n- Model training and evaluation\n\nMachine learning is used in recommendation systems, fraud detection, image recognition,\nnatural language processing, and predictive analytics.')]


In [18]:
text =documents[0].page_content
char_splitter =RecursiveCharacterTextSplitter(
    separators=[""],
    chunk_size =200,
    chunk_overlap=20,
    length_function =len
)
recursive_chunk=char_splitter.split_text(text)
print(len(recursive_chunk))

print(recursive_chunk[0])
print(recursive_chunk[1])
print(recursive_chunk[2])



3
Machine Learning Basics

Machine Learning is a subset of artificial intelligence that enables systems to learn from data
without being explicitly programmed.

Core Concepts:
- Supervised learning
- Un
rvised learning
- Unsupervised learning
- Reinforcement learning
- Model training and evaluation

Machine learning is used in recommendation systems, fraud detection, image recognition,
natural langua
tion,
natural language processing, and predictive analytics.
