In [2]:
from langchain.document_loaders import UnstructuredWordDocumentLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import re
from datetime import datetime
import pytz
import os

# Module-level configuration consumed by WordDocProcessUnstructuredLoader below.

# Passed to DirectoryLoader as ``use_multithreading``.
MULTITHREADING = True
# Passed to DirectoryLoader as ``max_concurrency``.
MAXCONCURRENCY = 3
# Forwarded to the document loader as its ``mode`` keyword argument.
UNSTRUCTURED_MODE = "elements"
# Default chunk size for the text splitter; also used as the length threshold
# (measured with ``len``) above which a loaded element is split.
SPLITTERTHRSHD = 1000
# Default chunk overlap handed to the text splitter.
OVERLAP = 80
# Time zone used when stamping chunk metadata with created/updated dates.
tz = pytz.timezone('America/New_York')

class WordDocProcessUnstructuredLoader:
    """Load ``.docx`` files from a directory and return cleaned, size-bounded chunks.

    The constructor wires a ``DirectoryLoader`` (recursive ``**/*.docx`` glob,
    ``UnstructuredWordDocumentLoader`` per file) to a
    ``RecursiveCharacterTextSplitter``. Reading ``cleanChunks`` loads the
    documents, collapses whitespace, splits elements longer than ``chunkSize``
    and stamps every returned document with audit metadata.

    Args:
        path: Directory to scan. A ``str`` or ``Path`` is accepted; the
            directory is created when it does not exist yet.
        fileFormat: File extension to load. Only ``"docx"`` (any letter case)
            is supported.
        chunkSize: Maximum chunk length in characters.
        chunkOverlap: Number of characters shared between consecutive chunks.

    Raises:
        ValueError: If ``path`` is empty/``None`` or ``fileFormat`` is not DOCX.

    NOTE: ``dirLoader`` and ``textSplitter`` are built once in ``__init__``;
    reassigning ``dirPath``, ``chunkSize`` or ``chunkOverlap`` afterwards does
    not rebuild them.
    """

    def __init__(self, path: "Path | str" = "", fileFormat: str = "docx",
                 chunkSize: int = SPLITTERTHRSHD, chunkOverlap: int = OVERLAP):
        # Every assignment below goes through the matching property setter,
        # so validation happens before the loader/splitter are constructed.
        self.dirPath = path
        self.fileFormat = fileFormat
        # The assigned value is ignored; the setter derives the loader class
        # from ``fileFormat``.
        self.loaderType = None
        self.seperatorList = ["\n\n", "\n", " ", ""]
        self.chunkSize = chunkSize
        self.chunkOverlap = chunkOverlap
        self.dirLoader = DirectoryLoader(
            # Use the validated property rather than the raw argument.
            path=str(self.dirPath),
            glob=f"**/*.{self.fileFormat}",
            loader_cls=self.loaderType,
            loader_kwargs={"mode": UNSTRUCTURED_MODE},
            show_progress=True,
            use_multithreading=MULTITHREADING,
            max_concurrency=MAXCONCURRENCY,
        )
        self.textSplitter = RecursiveCharacterTextSplitter(
            separators=self.seperatorList,
            chunk_size=self.chunkSize,
            chunk_overlap=self.chunkOverlap,
            length_function=len,
        )

    @property
    def dirPath(self) -> Path:
        """Directory that is scanned for documents."""
        return self.__dirPath

    @dirPath.setter
    def dirPath(self, path: "Path | str") -> None:
        """Validate and store the directory, creating it if it is missing.

        Raises:
            ValueError: If ``path`` is ``None`` or an empty/blank string.
        """
        # Check emptiness BEFORE converting: Path("") silently becomes ".".
        if path is None or str(path).strip() == "":
            raise ValueError("Path is Empty")

        path = Path(path)
        if not path.exists():
            path.mkdir(parents=True, exist_ok=True)
        # Always store the path; previously a freshly created directory was
        # never recorded and reading ``dirPath`` raised AttributeError.
        self.__dirPath = path

    @property
    def fileFormat(self) -> str:
        """Lower-cased file extension handled by this loader."""
        return self.__fileFormat

    @fileFormat.setter
    def fileFormat(self, fileformat: str) -> None:
        """Accept only ``docx`` (case-insensitive) and store it lower-cased.

        Raises:
            ValueError: For any other format.
        """
        normalized = fileformat.lower()
        if normalized != "docx":
            raise ValueError("File Format is other then DOCX")
        self.__fileFormat = normalized

    @property
    def loaderType(self):
        """Loader class handed to ``DirectoryLoader`` as ``loader_cls``."""
        return self.__loaderType

    @loaderType.setter
    def loaderType(self, val) -> None:
        """Pick the loader class from ``fileFormat``; ``val`` is ignored.

        Raises:
            ValueError: If ``fileFormat`` is not DOCX.
        """
        if self.fileFormat.lower() != "docx":
            raise ValueError("File Type Other then DOCX")
        self.__loaderType = UnstructuredWordDocumentLoader

    @property
    def chunkSize(self) -> int:
        """Maximum chunk length in characters."""
        return self.__chunkSize

    @chunkSize.setter
    def chunkSize(self, chunkSize: int) -> None:
        self.__chunkSize = chunkSize

    @property
    def chunkOverlap(self) -> int:
        """Characters shared between consecutive chunks."""
        return self.__chunkOverlap

    @chunkOverlap.setter
    def chunkOverlap(self, chunkOverlap: int) -> None:
        self.__chunkOverlap = chunkOverlap

    @property
    def loadDOCFromDir(self) -> list:
        """Load and return all documents; each access re-reads the directory."""
        return self.dirLoader.load()

    def cleanText(self, text) -> str:
        """Collapse every whitespace run to one space and trim both ends."""
        return re.sub(r'\s+', ' ', text).strip()

    @property
    def cleanChunks(self) -> list:
        """Return the loaded documents cleaned, split and stamped with audit metadata.

        Elements whose cleaned text is longer than ``chunkSize`` are split by
        ``textSplitter``; shorter ones are kept as a single document. In both
        cases the text is whitespace-normalised and the metadata is extended
        with ``createddate``/``updateddate``/``createdby``/``updatedby``.
        """
        # One timestamp per run so created/updated never differ by a tick.
        stamp = datetime.now(tz=tz).strftime('%Y-%m-%d %H:%M:%S')
        audit = {
            "createddate": stamp,
            "updateddate": stamp,
            "createdby": "kkhanvilkar",
            "updatedby": "kkhanvilkar",
        }

        processed_chunks = []
        for page in self.loadDOCFromDir:
            cleaned = self.cleanText(page.page_content)
            metadata = {**page.metadata, **audit}

            # Compare against the instance's chunk size (not the module
            # default) so a custom ``chunkSize`` is honoured.
            if len(cleaned) > self.chunkSize:
                processed_chunks.extend(
                    self.textSplitter.create_documents(
                        [cleaned],
                        metadatas=[metadata],
                    )
                )
            else:
                # Short elements get the same cleaning and metadata as split
                # ones, so every returned chunk has a uniform shape.
                page.page_content = cleaned
                page.metadata = metadata
                processed_chunks.append(page)

        return processed_chunks


# Directory holding the sample .docx files, relative to the working directory.
path = Path("../data")

# Build the loader, then read the chunk list (this triggers the actual load).
obj = WordDocProcessUnstructuredLoader(path=path, fileFormat="docx")
elements = obj.cleanChunks

# Print every chunk's text followed by a separator line.
for item in elements:
    print(item.page_content, '---------------------------', sep='\n')


100%|██████████| 8/8 [00:00<00:00, 11.88it/s]

That's a frustrating situation. While East US is generally one of the regions that supports AI features, there are several common requirements and settings that can still prevent you from creating an AI Agent (also referred to as Fabric Data Agent).
---------------------------
Here are the most likely reasons and steps to check:
---------------------------
1. Check Your Capacity Tier (This is Critical)
---------------------------
The AI Agent feature requires a paid Microsoft Fabric capacity.
---------------------------
Requirement: Your workspace must be assigned to a paid Microsoft Fabric capacity (F2 or higher) or a Power BI Premium capacity (P1 or higher) with Fabric features enabled.
---------------------------
Action:
---------------------------
Go to your Workspace settings.
---------------------------
Check the assigned Capacity. A free/trial capacity usually will not work for this specific feature. You need at least an F2 SKU.
---------------------------
2. Verify Tenant Admin


