In [2]:
# Document Loaders - Imports
from pathlib import Path
from typing import List, Union
from langchain_core.documents import Document
from langchain_community.document_loaders import (
    PyPDFLoader,
    Docx2txtLoader,
    TextLoader,
)

In [4]:
def get_loader(file_path: Union[str, Path]):
    """
    Pick and instantiate the document loader matching a file's extension.

    The extension is compared case-insensitively, so ``report.PDF`` and
    ``report.pdf`` resolve to the same loader class.

    Args:
        file_path: Location of the document, as a string or ``Path``.

    Returns:
        An instance of the matching loader class, constructed with the
        path converted to ``str``.

    Raises:
        ValueError: If the extension is not one of the supported types.
            NOTE(review): the loader constructors themselves may also
            raise (e.g. for a path that does not exist) - confirm against
            the installed langchain_community version.
    """
    path = Path(file_path)
    suffix = path.suffix.lower()

    # Dispatch table: lower-cased extension -> loader class.
    loader_by_suffix = {
        ".pdf": PyPDFLoader,
        ".docx": Docx2txtLoader,
        ".txt": TextLoader,
    }

    loader_cls = loader_by_suffix.get(suffix)
    if loader_cls is None:
        raise ValueError(
            f"Unsupported file type: {suffix}. "
            f"Supported types: {', '.join(loader_by_suffix)}"
        )

    return loader_cls(str(path))

In [3]:
# load_document() - Load a single document with enriched metadata
def load_document(file_path: Union[str, Path]) -> List[Document]:
    """
    Load one file and tag every resulting Document with its origin.

    Args:
        file_path: Path to the document file.

    Returns:
        The Documents produced by the loader that ``get_loader`` selects.
        Each one has ``metadata["source"]`` set to the bare file name and
        ``metadata["file_path"]`` set to the path as a string; any
        ``source`` value the loader had already stored is overwritten.

    Raises:
        ValueError: Propagated from ``get_loader`` for unsupported file types.
    """
    documents = get_loader(file_path).load()

    # The provenance is identical for every Document, so build it once.
    path = Path(file_path)
    provenance = {"source": path.name, "file_path": str(path)}
    for document in documents:
        document.metadata.update(provenance)

    return documents

In [None]:
# load_documents() - Batch load multiple documents with error handling
def load_documents(file_paths: List[Union[str, Path]]) -> List[Document]:
    """
    Load several files and return all of their Documents in one flat list.

    Loading is best-effort: if any file raises while being loaded, the error
    is printed and that file is skipped, so one bad input does not abort the
    whole batch.

    Args:
        file_paths: Paths of the files to load, processed in the given order.

    Returns:
        The Documents from every file that loaded successfully, in input
        order. Empty if nothing could be loaded.
    """
    collected = []
    for path in file_paths:
        try:
            collected += load_document(path)
        except Exception as exc:
            # Report and carry on with the remaining files.
            print(f"Error loading {path}: {exc}")

    return collected

In [None]:
# Example: Test get_loader with different file types
def _show_loader(label, sample_name):
    """
    Print which loader class get_loader selects for sample_name.

    Some loader classes may validate the path in their constructor and raise
    ValueError when the sample file is absent (NOTE(review): confirm for the
    installed langchain_community version). That case is reported instead of
    aborting the cell, so the notebook still survives Restart & Run All.

    Args:
        label: Short tag used as the prefix of the printed line.
        sample_name: File name handed to get_loader.

    Returns:
        The loader instance, or None if it could not be constructed.
    """
    try:
        sample_loader = get_loader(sample_name)
    except ValueError as e:
        print(f"{label} Loader: unavailable ({e})")
        return None
    print(f"{label} Loader: {type(sample_loader).__name__}")
    return sample_loader


pdf_loader = _show_loader("PDF", "sample.pdf")
docx_loader = _show_loader("DOCX", "sample.docx")
txt_loader = _show_loader("TXT", "sample.txt")

In [None]:
# Example: an unsupported extension is rejected with a ValueError,
# which is caught here so the message can be displayed.
try:
    loader = get_loader("file.xlsx")
except ValueError as unsupported:
    print(f"Error: {unsupported}")

In [None]:
# Example: Load a PDF document (requires actual file)
# docs = load_document("path/to/your/file.pdf")
# for doc in docs:
#     print(f"Page {doc.metadata.get('page', 'N/A')}")
#     print(f"Source: {doc.metadata.get('source')}")
#     print(f"Content preview: {doc.page_content[:200]}...")
#     print("-" * 50)

In [None]:
# Example: Batch load multiple documents
# file_paths = [
#     "documents/report.pdf",
#     "documents/notes.txt",
#     "documents/summary.docx",
# ]
# 
# all_docs = load_documents(file_paths)
# print(f"Loaded {len(all_docs)} documents from {len(file_paths)} files")

In [None]:
# Example: Load all files from a directory
# from pathlib import Path
# 
# docs_dir = Path("documents")
# files = list(docs_dir.glob("*.pdf")) + list(docs_dir.glob("*.txt")) + list(docs_dir.glob("*.docx"))
# 
# all_docs = load_documents(files)
# print(f"Loaded {len(all_docs)} documents")

In [None]:
# Internal: What PyPDFLoader does
# 1. Opens PDF in binary mode
# 2. Uses pypdf library to parse
# 3. Extracts text from each page
# 4. Returns one Document per page

# Internal: What Docx2txtLoader does
# 1. DOCX is a ZIP file containing XML
# 2. Extracts word/document.xml
# 3. Parses XML and extracts text from <w:t> tags
# 4. Returns single Document with all text

# Internal: What TextLoader does
# 1. Opens file in text mode using its `encoding` argument (default None = platform default encoding, not necessarily UTF-8)
# 2. Reads entire content
# 3. Returns single Document