### Data Ingestion

In [12]:
# NOTE(review): the visible cells never use this imported `dir_loader`; later cells rebind the
# name to DirectoryLoader instances. Looks like an IDE auto-import — confirm and remove.
from langchain_community.utilities.pebblo import dir_loader
# Document data structure

from langchain_core.documents import Document
# NOTE(review): in the visible cells `metadata` only appears as a keyword-argument name, so this
# import from SQLAlchemy's test suite appears unneeded (likely an IDE auto-import) — confirm and remove.
from sqlalchemy.testing.suite.test_reflection import metadata


In [4]:
# Minimal example: a Document pairs the text content with a metadata dict
doc = Document(
    metadata=dict(
        source="example.txt",
        pages=1,
        author="Imon Hosen",
        date_create="2024-01-01",
    ),
    page_content="this is the main text content I am using to create RAG",
)
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Imon Hosen', 'date_create': '2024-01-01'}, page_content='this is the main text content I am using to create RAG')

In [6]:
# Prepare the folder that will hold the sample .txt files
import os

# exist_ok=True keeps this cell safe to re-run when the folder is already there
os.makedirs(name="../data/text_files", exist_ok=True)


In [9]:
# Sample documents, keyed by the path each one is written to

sample_texts = {
    "../data/text_files/python_intro.txt": """Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems


    """,
}

# Write each sample to its path as UTF-8; mode "w" replaces any existing file
for target_path in sample_texts:
    with open(target_path, mode="w", encoding="utf-8") as handle:
        handle.write(sample_texts[target_path])

print("Sample text files created!")


Sample text files created!


In [16]:
## Text loader: read a single file into a list of Document objects
from langchain_community.document_loaders import TextLoader

loader = TextLoader(file_path="../data/text_files/python_intro.txt", encoding="utf-8")
document = loader.load()
document

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]

In [20]:
### Directory loader
from langchain_community.document_loaders import DirectoryLoader

## Load every .txt file found under the directory, one TextLoader per file
dir_loader = DirectoryLoader(
    path="../data/text_files",
    glob="**/*.txt",  # pattern to match files
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},  # forwarded to each TextLoader
    show_progress=False,
)
doc_txt = dir_loader.load()
doc_txt


[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.'),
 Document(metadata={'source': '../data/text_files/machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervise

In [22]:
## PDF loader

# NOTE(review): PyPDFLoader is imported here but not used in this cell.
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

## Load every PDF file found under the directory
dir_loader = DirectoryLoader(
    path="../data/pdf",
    glob="**/*.pdf",  # pattern to match files
    loader_cls=PyMuPDFLoader,  # loader class to use
    show_progress=False,
)
doc_pdf = dir_loader.load()
doc_pdf



[Document(metadata={'producer': 'macOS Version 15.1.1 (Build 24B91) Quartz PDFContext', 'creator': '', 'creationdate': "D:20250425210232Z00'00'", 'source': '../data/pdf/9_Counterintuitive_Investment_Principles_from_Mohnish_Pabrai.pdf', 'file_path': '../data/pdf/9_Counterintuitive_Investment_Principles_from_Mohnish_Pabrai.pdf', 'total_pages': 13, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': "D:20250425210232Z00'00'", 'trapped': '', 'modDate': "D:20250425210232Z00'00'", 'creationDate': "D:20250425210232Z00'00'", 'page': 0}, page_content=''),
 Document(metadata={'producer': 'macOS Version 15.1.1 (Build 24B91) Quartz PDFContext', 'creator': '', 'creationdate': "D:20250425210232Z00'00'", 'source': '../data/pdf/9_Counterintuitive_Investment_Principles_from_Mohnish_Pabrai.pdf', 'file_path': '../data/pdf/9_Counterintuitive_Investment_Principles_from_Mohnish_Pabrai.pdf', 'total_pages': 13, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': ''

In [26]:
# Excel loader

from langchain_community.document_loaders import DirectoryLoader, UnstructuredExcelLoader

## Load every .xlsx file found under the directory
dir_loader = DirectoryLoader(
    path="../data/excel",  # Excel folder path
    glob="**/*.xlsx",  # pattern to match Excel files
    loader_cls=UnstructuredExcelLoader,
    show_progress=False,
)

doc_excel = dir_loader.load()
doc_excel


[Document(metadata={'source': '../data/excel/leads_2025-10-26_12-56-30.xlsx'}, page_content='name email phone address website facebook instagram description CODOS Coffee Bartelsstraße 26, Hamburg, Germany A minimalistic cafe offering discounts for reusable cups. Balz und Balz Lehmweg 6, Hamburg, Germany A cafe known for great reviews and delicious food, promoting eco-friendly coffee consumption. Playground Coffee Detlev-Bremer-Straße 21, St. Pauli, Hamburg, Germany A cafe in St. Pauli known for roasting coffee with love and passion. Klein und Kaiserlich Am Kaiserkai 26, 20457 Hamburg, Germany A coffee house offering traditional Austrian coffee specialties with a view of Vasco-da-Gama square. Café Unter den Linden Juliusstraße 16, 22769 Hamburg, Germany A quieter cafe in Hamburg known for its design and attention to detail. Gretchens Villa Marktstraße 142, 20357 Hamburg, Germany A cafe offering a French way of life with breakfast and tarts. Milch Ditmar-Koel-Straße 22, 20459 Hamburg, Ge