In [None]:
from langchain.document_loaders import Docx2txtLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import re
from datetime import datetime
from pytz import timezone

# Time zone used when stamping "Created Time" into chunk metadata.
tz = timezone('America/New_York')
# Value handed to DirectoryLoader as `max_concurrency`.
MAXCONCURRENCY = 4
# Value handed to DirectoryLoader as `use_multithreading`.
MULTITREADING = True

class DocProcessing:
    """Load documents from a directory and split their text into chunks.

    The directory is scanned recursively for files matching
    ``**/*.<fileType>``. Every loaded document is whitespace-normalised and
    cut into chunks by a ``RecursiveCharacterTextSplitter``; each chunk
    carries the source document's metadata plus a few bookkeeping fields.

    NOTE(review): the loader class is always ``Docx2txtLoader`` regardless of
    ``fileType``, so only .docx input is expected to work -- confirm before
    passing another extension.
    """

    def __init__(self, path: "Path | str" = "", fileType: str = "docx",
                 chunkSize: int = 500, chunkOverlap: int = 80):
        """Configure the directory loader and the text splitter.

        Args:
            path: Directory to scan. Strings are converted to ``Path``; the
                directory is created when it does not exist yet.
            fileType: File extension (without the dot) used in the glob.
            chunkSize: Maximum chunk length, measured with ``len``.
            chunkOverlap: Overlap between consecutive chunks, measured with
                ``len``.

        The loader and splitter are built once here; changing ``dirPath``,
        ``fileType``, ``chunkSize`` or ``chunkOverlap`` afterwards does not
        rebuild them.
        """
        self.dirPath = path
        self.fileType = fileType
        self.seperators = ["\n\n", "\n", " ", ""]
        # Coerce so numeric strings such as "500" work instead of failing
        # later inside the splitter with a str/int comparison error.
        self.chunkSize = int(chunkSize)
        self.chunkOverlap = int(chunkOverlap)
        self.dirLoader = DirectoryLoader(
            self.dirPath,
            glob=f"**/*.{self.fileType}",
            max_concurrency=MAXCONCURRENCY,
            use_multithreading=MULTITREADING,
            loader_cls=Docx2txtLoader,
        )
        self.textSplitter = RecursiveCharacterTextSplitter(
            separators=self.seperators,
            chunk_size=self.chunkSize,
            chunk_overlap=self.chunkOverlap,
            length_function=len,
        )

    @property
    def fileType(self) -> str:
        """File extension (without the dot) used to build the glob."""
        return self.__fileType

    @fileType.setter
    def fileType(self, fileType):
        self.__fileType = fileType

    @property
    def dirPath(self) -> Path:
        """Directory that is scanned for documents."""
        return self.__dirPath

    @dirPath.setter
    def dirPath(self, path: "Path | str"):
        """Store *path* as a ``Path``, creating the directory if it is missing."""
        # Accept plain strings (including the "" default) as well as Path
        # objects; a bare str has no .exists()/.mkdir().
        path = Path(path)

        if not path.exists():
            path.mkdir(parents=True, exist_ok=True)

        self.__dirPath = path

    @property
    def LoadDocxFromDir(self) -> list:
        """Run the directory loader and return the loaded documents.

        Every access calls ``dirLoader.load()`` again.
        """
        return self.dirLoader.load()

    def cleanText(self, text) -> str:
        """Collapse every whitespace run in *text* to one space and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    @property
    def pageToChunkCreation(self) -> list:
        """Load all documents, clean their text and return the resulting chunks.

        Each chunk's metadata is the source document's metadata extended with:
        ``"Page Number"`` (1-based position of the document in the loaded
        list), ``"Total Pages"`` (number of loaded documents) and
        ``"Created Time"`` (timestamp in the module-level ``tz`` zone).

        Every access reloads and re-splits the documents, and prints each
        document's metadata and chunk count to stdout.
        """
        processedChunks = []
        pages = self.LoadDocxFromDir
        totalPages = len(pages)  # invariant across the loop
        for pageNumber, page in enumerate(pages, start=1):
            print(page.metadata)
            chunks = self.textSplitter.create_documents(
                [self.cleanText(page.page_content)],
                metadatas=[{
                    **page.metadata,
                    "Page Number": pageNumber,
                    "Total Pages": totalPages,
                    "Created Time": datetime.now(tz=tz).strftime('%Y-%m-%d %H:%M:%S'),
                }],
            )
            print(len(chunks))
            processedChunks.extend(chunks)

        return processedChunks


# Directory scanned for .docx files.
path = "../data/worddocx"

# Build the processor with 500-unit chunks and an 80-unit overlap.
pdfsobj = DocProcessing(
    path=Path(path),
    fileType='docx',
    chunkSize=500,
    chunkOverlap=80,
)
# Property access: this is where the documents are loaded and split.
chunks = pdfsobj.pageToChunkCreation
print(f"Processes into {len(chunks)} Chunks")

# Print every chunk's metadata followed by a separator line.
for i, item in enumerate(chunks):
    print(item.metadata, '---------------------', sep='\n')


