In [5]:
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from typing import List
from langchain_core.documents import Document
import logging

# Configure logging at INFO level for the process, then create the logger
# used by the code in this module (named after the module itself).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [19]:
class DocumentLoader:
    """Load files into ``Document`` objects, choosing a loader by file extension.

    The set of supported file types is defined by ``loader_mapping``, which
    maps a lower-case extension (without the leading dot) to the loader class
    that is instantiated with the file path and asked to ``load()`` it.
    """

    def __init__(self):
        # Extension (lower-case, no dot) -> loader class.
        self.loader_mapping = {
            'pdf': PyPDFLoader,
        }

    def load_document(self, file_path: str) -> List[Document]:
        """Load a single file with the loader registered for its extension.

        Args:
            file_path: Path of the file to load.

        Returns:
            The documents returned by the loader's ``load()`` method.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file extension has no registered loader.
            Exception: Any error raised by the loader itself is logged and
                re-raised, so callers never receive ``None`` in place of a
                list of documents.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"file not found: {file_path}")

        extension = path.suffix.lower().lstrip('.')
        loader_class = self.loader_mapping.get(extension)
        if loader_class is None:
            raise ValueError(
                f"Unsupported file type: {extension}. "
                f"Supported types: {list(self.loader_mapping.keys())}"
            )

        try:
            return loader_class(file_path).load()
        except Exception:
            # Log with traceback, then propagate: swallowing the error here
            # would make this method return None and hide the real cause
            # from callers such as load_documents().
            logger.exception("Error loading document: %s", file_path)
            raise

    def load_documents(self, file_paths: List[str]) -> List[Document]:
        """Load several files, skipping any that fail to load.

        Args:
            file_paths: Paths of the files to load.

        Returns:
            The documents of every file that loaded successfully, in the
            order the paths were given. Files that fail are logged at
            WARNING level and skipped.
        """
        all_documents = []
        for file_path in file_paths:
            try:
                all_documents.extend(self.load_document(file_path))
            except Exception as e:
                # Best-effort batch load: one bad file must not abort the rest.
                logger.warning(f"skipping {file_path}: {str(e)}")

        logger.info(f"Total documents loaded {len(all_documents)}")
        return all_documents

    def load_directory(self, directory_path: str, glob_pattern: str) -> List[Document]:
        """Load every file in a directory that matches a glob pattern.

        Args:
            directory_path: Directory to search.
            glob_pattern: Pattern passed to ``Path.glob`` relative to
                ``directory_path`` (for example ``"*.pdf"``).

        Returns:
            The documents of every matching file that loaded successfully.

        Raises:
            ValueError: If ``directory_path`` does not exist or is not a
                directory.
        """
        directory = Path(directory_path)
        # Validate before globbing so a bad path is reported as such.
        if not directory.exists() or not directory.is_dir():
            raise ValueError(f"Invalid directory {directory} ")

        # Sort for a deterministic document order (glob order is OS-dependent)
        # and convert to str to honour load_documents' List[str] contract.
        files = [str(match) for match in sorted(directory.glob(glob_pattern))]
        logger.info("Loading documents from %s (%d matching paths)", directory, len(files))

        # Load exactly once; parsing files is the expensive part.
        loaded_documents = self.load_documents(files)
        logger.info("Loaded %d documents from %s", len(loaded_documents), directory)
        return loaded_documents

    
        