Data Ingestion

In [1]:
###Document Structure

from langchain_core.documents import Document


In [2]:
# Minimal hand-built Document: the text payload goes in page_content and
# free-form descriptive fields go in the metadata mapping.
doc = Document(
    metadata=dict(
        source="test.pdf",
        page=1,
        author="Kashish Jethmalani",
        date_created="2025-01-01",
    ),
    page_content="This is a test document",
)

In [3]:
## Create the folder that will hold the sample .txt files

import os

# exist_ok=True keeps this cell re-runnable when the folder is already there.
os.makedirs(name="../data/text_files", exist_ok=True)



In [4]:
from pathlib import Path

# Mapping of output file path -> text to write into that file.
sample_text = {
    "../data/text_files/python_intro.txt": """
    Python is a high-level, interpreted programming language that is easy to learn and use.
    It is widely used for web development, data analysis, artificial intelligence, and more.
    Python is known for its clear and concise syntax, which makes it ideal for beginners.
    """,
    "../data/text_files/machine_learning_intro.txt": """
    Machine learning is a field of artificial intelligence that focuses on building systems that learn from data.
    It is used for tasks such as image recognition, natural language processing, and predictive modeling.
    Machine learning is a subset of artificial intelligence and is closely related to data science.
    """
}

for filepath, content in sample_text.items():
    # Create the target folder here as well, so this cell does not fail with
    # FileNotFoundError when it is run without the directory-creation cell.
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    # Write explicitly as UTF-8 so the files do not depend on the OS default encoding.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)

print("Sample text files created successfully.")



Sample text files created successfully.


In [5]:
### Text Loader

from langchain_community.document_loaders import TextLoader

# Load a single text file. The sample files are written as UTF-8, so the same
# encoding is passed here instead of relying on the platform default
# (this also matches the loader_kwargs used by the DirectoryLoader cell).
loader=TextLoader("../data/text_files/python_intro.txt", encoding="utf-8")
document=loader.load()
print(document)

  from .autonotebook import tqdm as notebook_tqdm


[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='\n    Python is a high-level, interpreted programming language that is easy to learn and use.\n    It is widely used for web development, data analysis, artificial intelligence, and more.\n    Python is known for its clear and concise syntax, which makes it ideal for beginners.\n    ')]


In [6]:
###Directory Loader

from langchain_community.document_loaders import DirectoryLoader

# Load every .txt file found directly in the sample folder, reading each one
# with TextLoader as UTF-8.
directory_loader = DirectoryLoader(
    path="../data/text_files",
    glob="*.txt",
    show_progress=False,
    loader_cls=TextLoader,
    loader_kwargs=dict(encoding="utf-8"),
)

# Bare last expression so the notebook displays the loaded Document list.
documents = directory_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\machine_learning_intro.txt'}, page_content='\n    Machine learning is a field of artificial intelligence that focuses on building systems that learn from data.\n    It is used for tasks such as image recognition, natural language processing, and predictive modeling.\n    Machine learning is a subset of artificial intelligence and is closely related to data science.\n    '),
 Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content='\n    Python is a high-level, interpreted programming language that is easy to learn and use.\n    It is widely used for web development, data analysis, artificial intelligence, and more.\n    Python is known for its clear and concise syntax, which makes it ideal for beginners.\n    ')]

In [7]:
### PDF loader

from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader

# Load every .pdf file found directly in ../data/pdf, parsing each one with
# PyMuPDFLoader. NOTE: this rebinds `directory_loader`, which previously
# pointed at the text-file loader.
directory_loader = DirectoryLoader(
    path="../data/pdf",
    glob="*.pdf",
    show_progress=False,
    loader_cls=PyMuPDFLoader,
)

# Bare last expression so the notebook displays the loaded Document list.
pdf_documents = directory_loader.load()
pdf_documents



[Document(metadata={'producer': 'MiKTeX pdfTeX-1.40.25', 'creator': 'TeX', 'creationdate': '2025-11-03T06:54:25-08:00', 'source': '..\\data\\pdf\\exam2-Solution-Annotated.pdf', 'file_path': '..\\data\\pdf\\exam2-Solution-Annotated.pdf', 'total_pages': 6, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-11-04T19:22:22-08:00', 'trapped': '', 'modDate': "D:20251104192222-08'00'", 'creationDate': "D:20251103065425-08'00'", 'page': 0}, page_content='CECS 528, Exam 2 Solutions, Fall 2025, Dr. Ebert\nIMPORTANT: READ THE FOLLOWING DIRECTIONS. Directions,\n\x88 For each problem, write your solution using ONE SHEET OF PAPER ONLY (BOTH\nFRONT AND BACK). Write NAME and PROBLEM NUMBER on each sheet.\n\x88 Write solutions to different problems on SEPARATE SHEETS of paper.\n\x88 It is OK to use the same sheet for parts of different make up problems.\nUnit 2 LO Problems\nLO5. Do the following.\n(a) The dynamic-programming algorithm that solves the Runaway