### Data Ingestion


In [3]:
### document structure

from langchain_core.documents import Document

In [4]:
# A Document pairs raw text (page_content) with a free-form metadata dict.
doc = Document(
    metadata={"source": "user_manual.pdf", "pages": 1, "author": "John Doe"},
    page_content="This is the content of the document.",
)

# Bare last expression so the notebook renders the Document repr.
doc

Document(metadata={'source': 'user_manual.pdf', 'pages': 1, 'author': 'John Doe'}, page_content='This is the content of the document.')

In [5]:
## make sure the folder that will hold the sample text files exists
import os

# exist_ok=True keeps this cell re-runnable when the folder is already there.
os.makedirs(name="../data/text_files", exist_ok=True)

In [6]:
# Sample corpus: each key is a destination path, each value the text written to it.
# NOTE(review): the "Intoduction" spelling in the heading is part of the file
# content (it shows up in the recorded loader outputs), so it is left verbatim.
sample_texts = {
    "../data/text_files/doc1.txt": """RAG Intoduction
    Retrieval-Augmented Generation (RAG) is a technique used to enhance large language models by combining text generation with information retrieval. Instead of relying only on the knowledge stored inside the model during training, RAG systems dynamically fetch relevant information from external data sources before generating a response.

In a typical RAG pipeline, documents such as PDFs, text files, or web pages are first loaded and converted into a structured format. These documents are then split into smaller chunks to ensure efficient processing. Each chunk is transformed into a numerical representation called an embedding, which captures the semantic meaning of the text.

When a user asks a question, the query is also converted into an embedding and compared against the stored document embeddings using a vector similarity search. The most relevant chunks are retrieved and passed to a language model as additional context. This allows the model to produce more accurate, up-to-date, and context-aware answers.

RAG is especially useful in applications like chatbots, question-answering systems, and enterprise knowledge assistants, where factual accuracy and domain-specific information are critical. By grounding model responses in retrieved documents, RAG helps reduce hallucinations and improves trustworthiness.""",
}

# Write every sample to disk; the target folder must already exist.
for file_path in sample_texts:
    content = sample_texts[file_path]
    with open(file_path, mode="w", encoding="utf-8") as f:
        f.write(content)

In [9]:
## TextLoader: read one text file into a list of Document objects
from langchain_community.document_loaders import TextLoader

loader = TextLoader(
    file_path="../data/text_files/doc1.txt",
    encoding="utf-8",
)
documents = loader.load()

# Display the loaded list as the cell output.
documents


[Document(metadata={'source': '../data/text_files/doc1.txt'}, page_content='RAG Intoduction\n    Retrieval-Augmented Generation (RAG) is a technique used to enhance large language models by combining text generation with information retrieval. Instead of relying only on the knowledge stored inside the model during training, RAG systems dynamically fetch relevant information from external data sources before generating a response.\n\nIn a typical RAG pipeline, documents such as PDFs, text files, or web pages are first loaded and converted into a structured format. These documents are then split into smaller chunks to ensure efficient processing. Each chunk is transformed into a numerical representation called an embedding, which captures the semantic meaning of the text.\n\nWhen a user asks a question, the query is also converted into an embedding and compared against the stored document embeddings using a vector similarity search. The most relevant chunks are retrieved and passed to a 

In [11]:
## DirectoryLoader: load every matching file in a folder
from langchain_community.document_loaders import DirectoryLoader

# Each file matching the glob is handed to loader_cls, which is constructed
# with loader_kwargs (here: TextLoader reading the file as UTF-8).
dir_loader = DirectoryLoader(
    path="../data/text_files",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    glob="*.txt",
    show_progress=False,
)

# Rebinds `documents` from the single-file result to the directory-wide result.
documents = dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\doc1.txt'}, page_content='RAG Intoduction\n    Retrieval-Augmented Generation (RAG) is a technique used to enhance large language models by combining text generation with information retrieval. Instead of relying only on the knowledge stored inside the model during training, RAG systems dynamically fetch relevant information from external data sources before generating a response.\n\nIn a typical RAG pipeline, documents such as PDFs, text files, or web pages are first loaded and converted into a structured format. These documents are then split into smaller chunks to ensure efficient processing. Each chunk is transformed into a numerical representation called an embedding, which captures the semantic meaning of the text.\n\nWhen a user asks a question, the query is also converted into an embedding and compared against the stored document embeddings using a vector similarity search. The most relevant chunks are retrieved and passed to

In [14]:
from reportlab.lib.pagesizes import LETTER
from reportlab.pdfgen import canvas
import os


def _write_pdf(path, lines, left=72, top=750, line_gap=30):
    """Write a LETTER-sized PDF containing one drawn string per entry in `lines`.

    Args:
        path: Destination file path handed to ``canvas.Canvas``.
        lines: Text strings, drawn in order from the top downwards.
        left: x coordinate passed to ``drawString`` for every line.
        top: y coordinate of the first line.
        line_gap: Amount subtracted from y for each subsequent line.
    """
    pdf = canvas.Canvas(path, pagesize=LETTER)
    for row, text in enumerate(lines):
        # Rows land on y = 750, 720, 690, ... with the default arguments.
        pdf.drawString(left, top - row * line_gap, text)
    pdf.save()


# Ensure folder exists
os.makedirs("../data/pdf_files", exist_ok=True)

# ---------- PDF 1 ----------
pdf1_path = "../data/pdf_files/rag_intro.pdf"
_write_pdf(
    pdf1_path,
    [
        "Introduction to Retrieval-Augmented Generation (RAG)",
        "RAG combines information retrieval with text generation.",
        "It improves factual accuracy by grounding LLMs in documents.",
    ],
)

# ---------- PDF 2 ----------
pdf2_path = "../data/pdf_files/rag_pipeline.pdf"
_write_pdf(
    pdf2_path,
    [
        "RAG Pipeline Overview",
        "1. Load documents (PDFs, text files, etc.)",
        "2. Chunk documents",
        "3. Generate embeddings",
        "4. Retrieve relevant chunks for a query",
    ],
)

print("PDF files created successfully.")


PDF files created successfully.


In [15]:
# PyMuPDFLoader is the loader used below; PyPDFLoader is imported but unused
# here (it was the commented-out alternative for loader_cls).
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

# DirectoryLoader comes from the earlier directory-loader cell.
# This rebinds `dir_loader` to point at the PDF folder.
dir_loader = DirectoryLoader(
    path="../data/pdf_files",
    loader_cls=PyMuPDFLoader,  # alternative: loader_cls=PyPDFLoader
    glob="*.pdf",
    show_progress=False,
)

pdf_documents = dir_loader.load()

# Display the loaded Documents as the cell output.
pdf_documents

[Document(metadata={'producer': 'ReportLab PDF Library - (opensource)', 'creator': 'anonymous', 'creationdate': '2026-01-24T10:59:30+05:00', 'source': '..\\data\\pdf_files\\rag_intro.pdf', 'file_path': '..\\data\\pdf_files\\rag_intro.pdf', 'total_pages': 1, 'format': 'PDF 1.3', 'title': 'untitled', 'author': 'anonymous', 'subject': 'unspecified', 'keywords': '', 'moddate': '2026-01-24T10:59:30+05:00', 'trapped': '', 'modDate': "D:20260124105930+05'00'", 'creationDate': "D:20260124105930+05'00'", 'page': 0}, page_content='Introduction to Retrieval-Augmented Generation (RAG)\nRAG combines information retrieval with text generation.\nIt improves factual accuracy by grounding LLMs in documents.'),
 Document(metadata={'producer': 'ReportLab PDF Library - (opensource)', 'creator': 'anonymous', 'creationdate': '2026-01-24T10:59:30+05:00', 'source': '..\\data\\pdf_files\\rag_pipeline.pdf', 'file_path': '..\\data\\pdf_files\\rag_pipeline.pdf', 'total_pages': 1, 'format': 'PDF 1.3', 'title': 'un

In [18]:
# Show the container type and the type of its first element, one per line.
print(type(pdf_documents), type(pdf_documents[0]), sep="\n")

<class 'list'>
<class 'langchain_core.documents.base.Document'>


### Embeddings and Vector DB