In [None]:
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path
import re

# Directory containing the PDF files, relative to the current working
# directory. NOTE(review): not referenced by the code visible in this cell,
# which builds the same path again under the __main__ guard — confirm whether
# other cells rely on it.
path = Path("../data/PDFs")

class PDFProcessor:
    """Load a PDF from a directory and tidy up its extracted text.

    The directory and the file name are validated by the ``pdfPath`` and
    ``pdfName`` property setters, so the same checks run at construction
    time and on every later reassignment.
    """

    def __init__(self, pdfPath: Path, pdfName: str):
        """Create a processor for the file ``pdfName`` inside ``pdfPath``.

        Args:
            pdfPath: Directory holding the PDF. A plain ``str`` is accepted
                as well and converted to ``Path``. The directory is created
                when it does not exist yet.
            pdfName: File name of the PDF; must end with ``.pdf``
                (upper/lower case is ignored).

        Raises:
            ValueError: If ``pdfName`` does not end with ``.pdf``.
        """
        self.pdfPath = pdfPath
        self.pdfName = pdfName

    @property
    def pdfPath(self) -> Path:
        """Directory in which the PDF file is looked up."""
        return self.__pdfPath

    @pdfPath.setter
    def pdfPath(self, pdfPath: Path):
        # Accept str as well as Path so callers passing a plain string do not
        # fail with an AttributeError on ``.exists()``.
        pdfPath = Path(pdfPath)

        # Only create the directory when nothing exists at that location; an
        # existing entry (directory or otherwise) is left untouched.
        if not pdfPath.exists():
            pdfPath.mkdir(parents=True, exist_ok=True)

        self.__pdfPath = pdfPath

    @property
    def pdfName(self) -> str:
        """File name of the PDF, stored exactly as it was supplied."""
        return self.__pdfName

    @pdfName.setter
    def pdfName(self, pdfName: str):
        # Compare case-insensitively so names such as "Report.PDF" are
        # accepted; the stored name keeps its original casing.
        if not pdfName.lower().endswith('.pdf'):
            raise ValueError("The file name must end with .pdf")

        self.__pdfName = pdfName

    @property
    def load_pdf(self) -> list:
        """Load the PDF with ``PyPDFLoader`` and return what ``load()`` yields.

        This is a property for backward compatibility, but every access
        builds a new loader and reads the file again, so keep the result in
        a local variable instead of accessing it repeatedly.

        Returns:
            The list returned by ``PyPDFLoader.load()``.
        """
        pdfLoader = PyPDFLoader(self.pdfPath.joinpath(self.pdfName))
        return pdfLoader.load()

    def cleanPDFText(self, text: str) -> str:
        """Collapse whitespace in ``text`` and trim both ends.

        Every run of whitespace (spaces, tabs, newlines) is replaced by a
        single space, then leading and trailing whitespace is stripped.

        Args:
            text: Raw text, e.g. the content of one loaded page.

        Returns:
            The normalized single-line text.
        """
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()

    
if __name__ == "__main__":
    # Demo run: load one PDF and print, for each loaded item, its metadata
    # followed by a whitespace-normalized preview of the first 200 characters.
    try:
        processor = PDFProcessor(
            Path("../data/PDFs"),
            "PBIInterviewQuestions.pdf",
        )
        pages = processor.load_pdf

        for page in pages:
            preview = page.page_content[:200]
            print(page.metadata)
            print(processor.cleanPDFText(preview))
    except Exception as e:
        # Report the failure on stdout, then re-raise so the traceback is
        # still shown.
        print(f"An error occurred: {e}")
        raise

  from .autonotebook import tqdm as notebook_tqdm


{'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'show me line by line not table', 'source': '..\\data\\PDFs\\PBIInterviewQuestions.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}
Certainly! Here are the most critical Power BI interview questions and answers, presented line-by-line for easier reading, categorized by topic. 1. Power BI Fundamenta
{'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'show me line by line not table', 'source': '..\\data\\PDFs\\PBIInterviewQuestions.pdf', 'total_pages': 4, 'page': 1, 'page_label': '2'}
11. Q: Difference between merging and appending queries? A: Merging (JOIN) combines two queries based on a common column, adding columns. Appending (UNION) stacks two or
{'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'show me line by line not table', 'source': '..\\data\\PDFs\\PBIInterviewQuestions.pdf',