In [23]:
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
from pathlib import Path
from datetime import datetime
import pytz

# Concurrency settings passed to DirectoryLoader by ProcessMultiplePDFFiles.
ISMULTITHREADING = True
MAXTHREDS = 5

# Time zone used when stamping chunk metadata with "Created Time".
tz = pytz.timezone("America/New_York")

class ProcessMultiplePDFFiles:
    """Load every file of one type under a directory and split it into chunks.

    A ``DirectoryLoader`` and a ``RecursiveCharacterTextSplitter`` are built
    once in ``__init__`` from the constructor arguments. Re-assigning
    ``chunkSize``, ``chunkOverlap``, ``fileType`` or ``basePath`` afterwards
    only updates the stored value; it does not rebuild ``dirLoader`` or
    ``textSplitter``.
    """

    # Compiled once; cleanText runs for every loaded page.
    _WHITESPACE_RUN = re.compile(r'\s+')

    def __init__(self, basePath: Path, chunkSize: int = 200, chunkOverlap: int = 100, fileType : str ="txt"):
        """Configure the loader and the splitter.

        Args:
            basePath: Directory searched recursively; created if missing.
            chunkSize: Maximum chunk length, measured with ``len``.
            chunkOverlap: Number of characters shared by adjacent chunks.
            fileType: File extension to load, ``"txt"`` or ``"pdf"``.

        Raises:
            TypeError: If ``fileType`` is neither ``"txt"`` nor ``"pdf"``.
        """
        self.basePath = basePath
        self.seperators = ["\n\n", "\n", " ", ""]
        self.chunkSize  = chunkSize
        self.chunkOverlap  = chunkOverlap
        self.fileType = fileType
        # The assigned value is ignored: the setter derives the loader class
        # from fileType, so fileType must already be set at this point.
        self.loaderType = None
        self.dirLoader = DirectoryLoader(
                            self.basePath,
                            glob=f"**/*.{self.fileType}",
                            loader_cls=self.loaderType,
                            use_multithreading=ISMULTITHREADING,
                            max_concurrency=MAXTHREDS,
                            show_progress=True
                        )
        self.textSplitter = RecursiveCharacterTextSplitter(
            separators=self.seperators,
            chunk_size = self.chunkSize,
            chunk_overlap = self.chunkOverlap,
            length_function = len
        )

    @property
    def basePath(self) -> Path:
        """Directory that is searched for input files."""
        return self.__basePath

    @basePath.setter
    def basePath(self, basePath: Path):
        # Accept plain strings as well; the original crashed on ``str``
        # because it called ``.exists()`` directly on the argument.
        basePath = Path(basePath)

        if not basePath.exists():
            basePath.mkdir(parents=True, exist_ok=True)

        self.__basePath = basePath

    @property
    def chunkSize(self) -> int:
        """Maximum chunk length passed to the text splitter."""
        return self.__chunkSize

    @chunkSize.setter
    def chunkSize(self, chunkSize : int):
        self.__chunkSize = chunkSize

    @property
    def chunkOverlap(self) -> int:
        """Overlap between adjacent chunks passed to the text splitter."""
        return self.__chunkOverlap

    @chunkOverlap.setter
    def chunkOverlap(self, chunkOverlap : int):
        self.__chunkOverlap = chunkOverlap

    @property
    def fileType(self) -> str:
        """File extension (without a leading dot) used for the glob pattern."""
        return self.__fileType

    @fileType.setter
    def fileType(self, fileType : str):
        # ".pdf" would otherwise yield the glob "**/*..pdf" and fail the
        # loader lookup, so a leading dot is dropped.
        self.__fileType = fileType.lstrip(".")

    @property
    def loaderType(self):
        """Loader class matching ``fileType``."""
        return self.__loaderType

    @loaderType.setter
    def loaderType(self, loaderType):
        """Select the loader class from ``fileType``; the argument is ignored.

        Raises:
            TypeError: If ``fileType`` is neither ``"txt"`` nor ``"pdf"``.
        """
        loadersByExtension = {"txt": TextLoader, "pdf": PyMuPDFLoader}
        try:
            self.__loaderType = loadersByExtension[self.fileType.lower()]
        except KeyError:
            raise TypeError("Loader Type is Not Specified. It is configured for Text and PDF only.") from None

    @property
    def loadPDFFromDir(self) -> list:
        """Run the directory loader and return the loaded documents.

        This is a property that performs the load on every access.
        """
        return self.dirLoader.load()

    def cleanText(self, text):
        """Collapse every whitespace run to one space and trim both ends."""
        return self._WHITESPACE_RUN.sub(' ', text).strip()

    @staticmethod
    def _resolvePageNumbers(metadata: dict, position: int, totalLoaded: int) -> tuple:
        """Return ``(page number, total pages)`` for one loaded document.

        PyMuPDFLoader stores a zero-based ``"page"`` and a per-file
        ``"total_pages"`` in each document's metadata; those are preferred so
        that numbering restarts for every file. When a key is absent (for
        example for text files) the position within the whole load and the
        number of loaded documents are used instead.
        """
        pageIndex = metadata.get("page")
        pageNumber = pageIndex + 1 if isinstance(pageIndex, int) else position
        totalPages = metadata.get("total_pages")
        if not isinstance(totalPages, int):
            totalPages = totalLoaded
        return pageNumber, totalPages

    @property
    def pageToChunkGenerator(self) -> list:
        """Load all documents, clean their text and split them into chunks.

        Each chunk carries the source document's metadata plus
        ``"Page Number"``, ``"Total Pages"`` and ``"Created Time"``.

        Returns:
            The chunks of all loaded documents, in load order.
        """
        pages = self.loadPDFFromDir
        totalLoaded = len(pages)
        # Taken once so that every chunk of this run carries the same stamp.
        createdTime = datetime.now(tz=tz).strftime('%Y-%m-%d %H:%M:%S')

        processed_chunks = []
        for position, page in enumerate(pages, start=1):
            pageNumber, totalPages = self._resolvePageNumbers(
                page.metadata, position, totalLoaded
            )
            chunks = self.textSplitter.create_documents(
                texts=[self.cleanText(page.page_content)],
                metadatas=[
                    {
                        **page.metadata,
                        "Page Number": pageNumber,
                        "Total Pages": totalPages,
                        "Created Time": createdTime
                    }
                ]
            )
            processed_chunks.extend(chunks)

        return processed_chunks


# Demo run: chunk every PDF found under ../data/PDFs and print each chunk.
path = "../data/PDFs"

# Note: constructing the object creates the directory if it does not exist.
pdfsobj = ProcessMultiplePDFFiles(
    Path(path),
    chunkSize=500,
    chunkOverlap=80,
    fileType='pdf',
)
chunks = pdfsobj.pageToChunkGenerator
print(f"Processes into {len(chunks)} Chunks")
for item in chunks:
    print(item.page_content)
    print('---------------------')
    

100%|██████████| 3/3 [00:00<00:00, 25.35it/s]

Processes into 88 Chunks
While providing a list of the top 200 Power BI interview questions is challenging due to the dynamic nature of technology and specific company needs, I can offer a comprehensive set of questions covering various aspects of Power BI. These questions are categorized to help you prepare effectively for different interview focuses and experience levels. I. Fundamentals of Power BI 1.​ What is Power BI? Briefly describe its purpose and capabilities as a business analytics tool. 2.​ What are the main
---------------------
purpose and capabilities as a business analytics tool. 2.​ What are the main components of Power BI? List and describe Power BI Desktop, Power BI Service, and Power BI Mobile. 3.​ What are the building blocks of Power BI? Explain Datasets, Visualizations, Reports, Dashboards, and Tiles. 4.​ What is the typical workflow in Power BI? Outline the steps involved in connecting to data, transforming it, creating visuals, and sharing insights. 5.​ How does


