In [2]:
import os
from typing import List, Dict, Any
import pandas as pd

In [3]:
from langchain_core.documents import Document
from langchain_text_splitters import(
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter,
)

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
doc = Document(
    page_content="This is the main text content that ill be embedded and searched.",
    metadata={
    "source":"example.txt",
    "page":1,
    "author":"kunal",
    "created_on":"2026-01-01",
    }
)

print("Document Structure")
print(f"Content: {doc.page_content}")
print(f"Metadata: {doc.metadata}")

Document Structure
Content: This is the main text content that ill be embedded and searched.
Metadata: {'source': 'example.txt', 'page': 1, 'author': 'kunal', 'created_on': '2026-01-01'}


Text file

In [5]:
import os
os.makedirs("data/text_files",exist_ok=True)

In [6]:
sample_texts={
    "data/text_files/p_intro.txt":"""Python is a high-level, general-purpose programming language known for its simple, English-like syntax and high readability, making it a popular choice for beginners. It is dynamically typed and interpreted, supporting multiple programming paradigms including object-oriented and functional programming. 
Key Features
Simple and Readable Syntax: Python's design emphasizes code readability, using indentation instead of curly brackets to define code blocks, which makes it easier to learn and use than many other languages.
Interpreted Language: Code is executed line by line, which allows for rapid prototyping and easy debugging.
Dynamically Typed: You do not need to declare variable types explicitly; Python determines them at runtime, making the code more flexible and quicker to write.
Extensive Standard Library and Ecosystem: Python has a vast collection of free, reusable code modules and libraries that simplify complex tasks, from web development to machine learning.
Platform Compatibility: Python runs on major operating systems, including Windows, macOS, and Linux.
Large Community Support: A large and active community contributes to a wealth of online resources, tutorials, and support forums, making it easy to find help when needed. """
}

In [7]:
for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("sample text created")


sample text created


In [8]:
from langchain_community.document_loaders import TextLoader
loader= TextLoader("data/text_files/p_intro.txt",encoding="utf-8")

documents=loader.load()
print(type(documents[0]))
print(documents)

<class 'langchain_core.documents.base.Document'>
[Document(metadata={'source': 'data/text_files/p_intro.txt'}, page_content="Python is a high-level, general-purpose programming language known for its simple, English-like syntax and high readability, making it a popular choice for beginners. It is dynamically typed and interpreted, supporting multiple programming paradigms including object-oriented and functional programming. \nKey Features\nSimple and Readable Syntax: Python's design emphasizes code readability, using indentation instead of curly brackets to define code blocks, which makes it easier to learn and use than many other languages.\nInterpreted Language: Code is executed line by line, which allows for rapid prototyping and easy debugging.\nDynamically Typed: You do not need to declare variable types explicitly; Python determines them at runtime, making the code more flexible and quicker to write.\nExtensive Standard Library and Ecosystem: Python has a vast collection of free

### multiple text files

In [9]:
from langchain_community.document_loaders import DirectoryLoader
dir_loader=DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding":"utf-8"},
    show_progress=True,
)

print(f"loaded {len(documents)} documents")

for i,doc in enumerate(documents):
    print(f"\n Document {i+1}:")
    print(f"  Source: {doc.metadata['source']}")
    print(f'  Length: {len(doc.page_content)} characters')

loaded 1 documents

 Document 1:
  Source: data/text_files/p_intro.txt
  Length: 1251 characters


In [10]:
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter
)
print(documents)

[Document(metadata={'source': 'data/text_files/p_intro.txt'}, page_content="Python is a high-level, general-purpose programming language known for its simple, English-like syntax and high readability, making it a popular choice for beginners. It is dynamically typed and interpreted, supporting multiple programming paradigms including object-oriented and functional programming. \nKey Features\nSimple and Readable Syntax: Python's design emphasizes code readability, using indentation instead of curly brackets to define code blocks, which makes it easier to learn and use than many other languages.\nInterpreted Language: Code is executed line by line, which allows for rapid prototyping and easy debugging.\nDynamically Typed: You do not need to declare variable types explicitly; Python determines them at runtime, making the code more flexible and quicker to write.\nExtensive Standard Library and Ecosystem: Python has a vast collection of free, reusable code modules and libraries that simpli

In [11]:
##method 1 character text splitter

text=documents[0].page_content
text


"Python is a high-level, general-purpose programming language known for its simple, English-like syntax and high readability, making it a popular choice for beginners. It is dynamically typed and interpreted, supporting multiple programming paradigms including object-oriented and functional programming. \nKey Features\nSimple and Readable Syntax: Python's design emphasizes code readability, using indentation instead of curly brackets to define code blocks, which makes it easier to learn and use than many other languages.\nInterpreted Language: Code is executed line by line, which allows for rapid prototyping and easy debugging.\nDynamically Typed: You do not need to declare variable types explicitly; Python determines them at runtime, making the code more flexible and quicker to write.\nExtensive Standard Library and Ecosystem: Python has a vast collection of free, reusable code modules and libraries that simplify complex tasks, from web development to machine learning.\nPlatform Compa

In [12]:
char_splitter=CharacterTextSplitter(
    separator="\n",
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,

)

char_chunks=char_splitter.split_text(text)
print(f"created {len(char_chunks)} chunks")
print(f"First chunk: {char_chunks[0][:100]} ...")

Created a chunk of size 304, which is longer than the specified 200
Created a chunk of size 205, which is longer than the specified 200


created 8 chunks
First chunk: Python is a high-level, general-purpose programming language known for its simple, English-like synt ...


In [13]:
#Method 2 Recersive character splitter

recursive_splitter= RecursiveCharacterTextSplitter(
    separators=["\n"],
    chunk_size=200,
    chunk_overlap=20,
    length_function=len
)

recursive_chunk=recursive_splitter.split_text(text)
print(f"created {len(recursive_chunk)} chunks")
print(f"First chunk: {recursive_chunk[0][:100]} ...")

created 8 chunks
First chunk: Python is a high-level, general-purpose programming language known for its simple, English-like synt ...


In [14]:
print(recursive_chunk[0])
print(recursive_chunk[1])

Python is a high-level, general-purpose programming language known for its simple, English-like syntax and high readability, making it a popular choice for beginners. It is dynamically typed and interpreted, supporting multiple programming paradigms including object-oriented and functional programming. 
Key Features


In [15]:
#method 3, token based splitting

token_split= TokenTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
)


token_chunk=token_split.split_text(text)
print(f"created {len(token_chunk)} chunks")
print(f"First chunk: {token_chunk[0][:100]} ...")

created 6 chunks
First chunk: Python is a high-level, general-purpose programming language known for its simple, English-like synt ...


## load pdf files

In [16]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader,
)

In [17]:
print("PyPdfLoader")

try:
    pypdf_loader=PyPDFLoader("docs/pdf/attention.pdf")
    pypdf_docs=pypdf_loader.load()
    print(pypdf_docs)
    print(f"Loaded {len(pypdf_docs)} pages")
    print(f"Page 1 content: {pypdf_docs[0].page_content[:100]}")
    print(f"Metadata: {pypdf_docs[0].metadata}")

except Exception as e:
    print(f"Error : {e}") 

PyPdfLoader
[Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2026-02-02T10:56:47+05:30', 'author': 'VICHAVE Kunal OBS/OBI', 'moddate': '2026-02-02T10:56:47+05:30', 'title': 'Efficient Attention Mechanisms for Large Language Models_ A Survey', 'source': 'docs/pdf/attention.pdf', 'total_pages': 27, 'page': 0, 'page_label': '1'}, page_content=''), Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2026-02-02T10:56:47+05:30', 'author': 'VICHAVE Kunal OBS/OBI', 'moddate': '2026-02-02T10:56:47+05:30', 'title': 'Efficient Attention Mechanisms for Large Language Models_ A Survey', 'source': 'docs/pdf/attention.pdf', 'total_pages': 27, 'page': 1, 'page_label': '2'}, page_content=''), Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2026-02-02T10:56:47+05:30', 'author': 'VICHAVE Kunal OBS/OBI', 'moddate': '2026-02-02T10:56:47+05:30', 'title': 'Efficient Attenti

In [18]:
print("PyMuPDFLoader")

try:
    pymupdf_loader=PyMuPDFLoader("docs/pdf/attention.pdf")
    pymupdf_docs=pymupdf_loader.load()
    print(pymupdf_docs)
    print(f"Loaded {len(pymupdf_docs)} pages")
    print(f"Page 1 content: {pymupdf_docs[4].page_content[:]}")
    print(f"Metadata: {pymupdf_docs[0].metadata}")

except Exception as e:
    print(f"Error : {e}") 

PyMuPDFLoader
[Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2026-02-02T10:56:47+05:30', 'source': 'docs/pdf/attention.pdf', 'file_path': 'docs/pdf/attention.pdf', 'total_pages': 27, 'format': 'PDF 1.7', 'title': 'Efficient Attention Mechanisms for Large Language Models_ A Survey', 'author': 'VICHAVE Kunal OBS/OBI', 'subject': '', 'keywords': '', 'moddate': '2026-02-02T10:56:47+05:30', 'trapped': '', 'modDate': "D:20260202105647+05'30'", 'creationDate': "D:20260202105647+05'30'", 'page': 0}, page_content=''), Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2026-02-02T10:56:47+05:30', 'source': 'docs/pdf/attention.pdf', 'file_path': 'docs/pdf/attention.pdf', 'total_pages': 27, 'format': 'PDF 1.7', 'title': 'Efficient Attention Mechanisms for Large Language Models_ A Survey', 'author': 'VICHAVE Kunal OBS/OBI', 'subject': '', 'keywords': '', 'moddate': '2026-02-02T10:56:47+05:30', 'trapped': '', 'modDate

In [19]:
raw_pdf_file="""Python is a high-level, general-purpose programming language known for its simple, English-like syntax and high readability, making it a popular choice for beginners. It is dynamically typed and interpreted, supporting multiple programming paradigms including object-oriented and functional programming. 
Key Features
Simple and Readable Syntax: Python's design emphasizes code readability, using indentation instead of curly brackets to define code blocks, which makes it easier to learn and use than many other languages.
Interpreted Language: Code is executed line by line, which allows for rapid prototyping and easy debugging.
Dynamically Typed: You do not need to declare variable types explicitly; Python determines them at runtime, making the code more flexible and quicker to write.
Extensive Standard Library and Ecosystem: Python has a vast collection of free, reusable code modules and libraries that simplify complex tasks, from web development to machine learning.
Platform Compatibility: Python runs on major operating systems, including Windows, macOS, and Linux.
Large Community Support: A large and active community contributes to a wealth of online resources, tutorials, and support forums, making it easy to find help when needed.  """


In [20]:
def clean_text(text):
    ligatures = {
            "ﬁ": "fi",
            "ﬂ": "fl",
            "ﬀ": "ff",
            "ﬃ": "ffi",
            "ﬄ": "ffl",
            "ﬅ": "st",
            "ﬆ": "st",
        }
        
        # 3. Replace ligatures if they exist as single Unicode characters
    for ligature, replacement in ligatures.items():
            text = text.replace(ligature, replacement)
    return text





In [21]:
cleaned= clean_text(raw_pdf_file)
print("BEFORE")
print(repr(raw_pdf_file[:100]))
print("\n AFTER")
print(repr(cleaned[:100]))

BEFORE
'Python is a high-level, general-purpose programming language known for its simple, English-like synt'

 AFTER
'Python is a high-level, general-purpose programming language known for its simple, English-like synt'


In [22]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
from typing import List
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_core.document_loaders import UnstructuredPDFLoader


class SmartPDFProcessor:
    def __init__(self, chunk_size=1000, chunk_overlap=100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""]
        )

    def process_pdf(self, pdf_path: str) -> List[Document]:
        print("Processing PDF...")
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        processed_chunks = []

        for page_num, page in enumerate(pages):
            raw_text = page.page_content or ""
            print(f"Page {page_num+1}: raw chars = {len(raw_text)}")

            cleaned_text = self._clean_text(raw_text)
            print(f"Page {page_num+1}: cleaned chars = {len(cleaned_text)}")

            if not cleaned_text.strip():
                continue

            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page": page_num + 1,
                    "total_pages": len(pages),
                    "chunk_method": "smart_pdf_processor",
                    "char_count": len(cleaned_text)
                }]
            )

            processed_chunks.extend(chunks)

        return processed_chunks

    def _clean_text(self, text: str) -> str:
        ligatures = {
            "ﬁ": "fi",
            "ﬂ": "fl",
            "ﬀ": "ff",
            "ﬃ": "ffi",
            "ﬄ": "ffl",
            "ﬅ": "st",
            "ﬆ": "st",
        }

        for ligature, replacement in ligatures.items():
            text = text.replace(ligature, replacement)

        return text


In [55]:
preprocessor = SmartPDFProcessor()

try:
    smart_chunks = preprocessor.process_pdf("docs/pdf/a.pdf")
    print(f"Processed into {len(smart_chunks)} smart chunks")

    if smart_chunks:
        print("\nSample chunk content (first 300 chars):")
        print(smart_chunks[0].page_content[:300])

        print("\nSample chunk metadata:")
        for key, value in smart_chunks[0].metadata.items():
            print(f" {key}: {value}")
    else:
        print("No chunks were generated.")

except Exception as e:
    print("Error occurred while processing PDF:")
    raise e


Processing PDF...
Page 1: raw chars = 0
Page 1: cleaned chars = 0
Page 2: raw chars = 0
Page 2: cleaned chars = 0
Page 3: raw chars = 0
Page 3: cleaned chars = 0
Page 4: raw chars = 0
Page 4: cleaned chars = 0
Page 5: raw chars = 0
Page 5: cleaned chars = 0
Page 6: raw chars = 0
Page 6: cleaned chars = 0
Page 7: raw chars = 0
Page 7: cleaned chars = 0
Page 8: raw chars = 0
Page 8: cleaned chars = 0
Page 9: raw chars = 0
Page 9: cleaned chars = 0
Processed into 0 smart chunks
No chunks were generated.


## Data parsing with docs

### word doc processing

In [2]:
from docx import Document as DocxDocument
import os

In [3]:
from langchain_community.document_loaders import Docx2txtLoader, UnstructuredWordDocumentLoader

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [8]:
print('Using Docx2WLoader')

try:
    docx_loader=Docx2txtLoader("docs/genai.docx")
    docs=docx_loader.load()
    print(f"loaded{len(docs)} documents")
    print(f"Content preview: {docs[0].page_content[:200]} ....")
    print(f"Metadata: {docs[0].metadata}")
except Exception as e:
    print(f"Error: {e}")

Using Docx2WLoader
loaded1 documents
Content preview: Detailed Prerequisites

Required (must have)

Python 3.8+: Comfort writing functions, classes, async/await, pip/virtualenv

Basic data structures: Lists, dicts, JSON handling, pandas basics

HTTP/REST ....
Metadata: {'source': 'docs/genai.docx'}


In [11]:
from unicodedata import category


print("using unstructuredWord Document")

try:
    unstructured_loader=UnstructuredWordDocumentLoader("docs/genai.docx",mode="elements")
    unstructured_doc=unstructured_loader.load()
    print(f"Loaded {len(unstructured_doc)} elements")
    for i,doc in enumerate(unstructured_doc[:3]):
        print(f"\n Element {i+1}")
        print(f"Type: {doc.metadata.get("category","unknown")}")
        print(f"Content: {doc.page_content[:100]} ...")
except Exception as e:
    print(f"Error: {e}")

using unstructuredWord Document
Loaded 168 elements

 Element 1
Type: UncategorizedText
Content: Detailed Prerequisites ...

 Element 2
Type: NarrativeText
Content: Required (must have) ...

 Element 3
Type: ListItem
Content: Python 3.8+: Comfort writing functions, classes, async/await, pip/virtualenv ...


## working with csv and exel data

In [2]:
import pandas as pd
import langchain 
import os

In [3]:
os.makedirs("data/structured_files",exist_ok=True)

In [5]:
data = {
    "id": [1, 2, 3, 4],
    "name": ["Alice", "Bob", "Charlie", "David"],
    "age": [24, 27, 22, 29],
    "department": ["HR", "IT", "Finance", "Marketing"],
    "salary": [45000, 60000, 52000, 58000]
}

# Create DataFrame
df = pd.DataFrame(data)

# Display DataFrame
print(df)

   id     name  age department  salary
0   1    Alice   24         HR   45000
1   2      Bob   27         IT   60000
2   3  Charlie   22    Finance   52000
3   4    David   29  Marketing   58000


In [6]:
df.to_csv("data/structured_files/company.csv")

In [7]:
with pd.ExcelWriter('data/structured_files/company.xlsx') as writer:
    df.to_excel(writer, sheet_name='Products',index=False)

    summary_data = {
        'Category':['Electronics','Accessories'],
        'Total_Items':[3,2],
        'Total_Value': [1389.97,109.98]
    }
    pd.DataFrame(summary_data).to_excel(writer, sheet_name="Summary", index=False)

In [8]:
df.to_excel("data/structured_files/company2.xlsx")

In [9]:
from langchain_community.document_loaders import CSVLoader, UnstructuredCSVLoader


  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm
