
#### Semantic Chunking using Azure AI Document Intelligence and Vectorization using AI Search

**Setup**

In [1]:
# Install the Python packages for this notebook into the kernel environment.
# -q keeps pip's output quiet. Only azure-search-documents is pinned (to a beta
# release); the other packages are installed at whatever version pip resolves.

%pip install python-dotenv -q
%pip install langchain -q
%pip install langchain-community -q
%pip install langchain-openai  -q
%pip install langchainhub -q
%pip install openai --upgrade -q
%pip install tiktoken -q
%pip install azure-ai-documentintelligence -q
%pip install azure-identity -q
%pip install azure-search-documents==11.6.0b3 -q


StatementMeta(, b74fc4a9-bb55-4427-ad4e-1ea1a17872aa, 17, Finished, Available, Finished)

[33mDEPRECATION: notebookutils 3.5.0-20240224.2 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of notebookutils or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[33mDEPRECATION: notebookutils 3.5.0-20240224.2 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of notebookutils or contact the author to suggest that they release a version with a conforming version number

In [2]:
from langchain import hub
from langchain_openai import AzureChatOpenAI
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain_openai  import AzureOpenAI
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch
from dotenv import load_dotenv
load_dotenv()
from datetime import datetime
import os

StatementMeta(, b74fc4a9-bb55-4427-ad4e-1ea1a17872aa, 19, Finished, Available, Finished)

**Configure Notebook Parameters**

In [3]:
# --- Azure OpenAI (embeddings) ---
AZURE_OPENAI_ENDPOINT = "https://test-openai-swe-central1.openai.azure.com"
OPENAI_API_VERSION = "2023-12-01-preview"
# Passed as the Azure deployment name when the embeddings client is created.
EMBEDDING_MODEL = "text-embedding-ada-002"

# --- Azure AI Document Intelligence (document loading) ---
DOC_INTELLIGENCE_ENDPOINT = "https://doc-intelligence-singhealth1.cognitiveservices.azure.com/"

# --- Azure AI Search (vector store) ---
VECTOR_STORE_ADDRESS = "https://cog-search-pnh3sidy433ua.search.windows.net"
INDEX_NAME = "fabric-chunk-index"

StatementMeta(, b74fc4a9-bb55-4427-ad4e-1ea1a17872aa, 20, Finished, Available, Finished)

**Configure AI Services Keys using Key Vault**

In [4]:
from notebookutils.mssparkutils.credentials import getSecret

# Key Vault that stores the service credentials, so no key appears in the notebook.
KEYVAULT_ENDPOINT = "https://fabric-pipeline-vault.vault.azure.net/"


def _read_secret(secret_name):
    """Return the value of `secret_name` from the vault at KEYVAULT_ENDPOINT."""
    return getSecret(KEYVAULT_ENDPOINT, secret_name)


DOC_INTELLIGENCE_KEY = _read_secret("DOC-INTELLIGENCE-KEY")
AZURE_OPENAI_API_KEY = _read_secret("AZURE-OPENAI-API-KEY")
VECTOR_STORE_PASSWORD = _read_secret("VECTOR-STORE-PASSWORD")

StatementMeta(, b74fc4a9-bb55-4427-ad4e-1ea1a17872aa, 21, Finished, Available, Finished)

**Load the document using AI Document Intelligence prebuilt-layout mode and split it into semantic chunks using MarkdownHeaderTextSplitter**

**Initialize lakehouse raw and processed folder paths**

In [5]:
# Base directory of the lakehouse files; both working folders live beneath it.
lakehouse_path = "/lakehouse/default/Files/"
# Incoming documents are read from here.
raw_folder_path = f"{lakehouse_path}raw/"
# Chunk output folders and already-processed documents end up here.
processed_folder_path = f"{lakehouse_path}processed/"

StatementMeta(, b74fc4a9-bb55-4427-ad4e-1ea1a17872aa, 22, Finished, Available, Finished)

**Initialize Azure OpenAI Embedding Model**

In [6]:
# Create the Azure OpenAI embeddings client; it is used later to embed the chunks when they are indexed into Azure AI Search
aoai_embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    openai_api_version=OPENAI_API_VERSION,
    azure_deployment=EMBEDDING_MODEL,  # deployment name, taken from the notebook parameters
)

StatementMeta(, b74fc4a9-bb55-4427-ad4e-1ea1a17872aa, 23, Finished, Available, Finished)

**Chunk documents and output the semantic chunks (splits) into a timestamped folder under the processed folder**

In [8]:
def chunk_documents(file_name):
    """Load one raw file with Azure AI Document Intelligence and split it by Markdown headers.

    The file is read from ``raw_folder_path``, analysed with the
    ``prebuilt-layout`` model, and the page content of the first loaded
    document is split on level-2 (``##``) headers. Each chunk is also written
    to ``split_<i>.MD`` inside a new ``chunks_<file_name>_<timestamp>`` folder
    under ``processed_folder_path``.

    Args:
        file_name: Name of the file inside the raw folder.

    Returns:
        The list of chunks produced by ``MarkdownHeaderTextSplitter.split_text``.
    """
    # The loader accepts either file_path or url_path; here the file comes from the raw folder.
    loader = AzureAIDocumentIntelligenceLoader(
        file_path=raw_folder_path + file_name,
        api_key=DOC_INTELLIGENCE_KEY,
        api_endpoint=DOC_INTELLIGENCE_ENDPOINT,
        api_model="prebuilt-layout",
    )
    docs = loader.load()

    # Split on level-2 headers only. Add ("#", "Header 1") or ("###", "Header 3")
    # to this list to split on other header levels as well.
    headers_to_split_on = [
        ("##", "Header 2"),
    ]
    text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    # Only the first loaded document is split.
    docs_string = docs[0].page_content
    splits = text_splitter.split_text(docs_string)
    print("Length of splits: " + str(len(splits)))

    # A timestamp in the folder name keeps the output of separate runs apart.
    run_prefix = datetime.now().strftime("%Y%m%d%H%M%S")
    output_folder = processed_folder_path + f"chunks_{file_name}_{run_prefix}"
    os.makedirs(output_folder, exist_ok=True)

    # Write each chunk exactly once. Explicit UTF-8 so the Markdown text is
    # written the same way regardless of the platform's default encoding.
    for i, split in enumerate(splits):
        file_path = os.path.join(output_folder, f"split_{i}.MD")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(split.page_content)
    print("Chunks generated in folder : "+ output_folder)
    return splits

StatementMeta(, b74fc4a9-bb55-4427-ad4e-1ea1a17872aa, 25, Finished, Available, Finished)

**Embed and index the chunks**

In [10]:
def embedd_index_documents(splits):
    """Add the given chunks to the Azure AI Search index named by INDEX_NAME.

    Builds an ``AzureSearch`` vector store that embeds text through
    ``aoai_embeddings.embed_query`` and passes ``splits`` to its
    ``add_documents`` method.

    Args:
        splits: The chunks to index, as returned by ``chunk_documents``.
    """
    search_store = AzureSearch(
        azure_search_endpoint=VECTOR_STORE_ADDRESS,
        azure_search_key=VECTOR_STORE_PASSWORD,
        index_name=INDEX_NAME,
        embedding_function=aoai_embeddings.embed_query,
    )
    search_store.add_documents(documents=splits)

    print("Embeding of chunks and Indexing to AI Search is completed")

StatementMeta(, b74fc4a9-bb55-4427-ad4e-1ea1a17872aa, 27, Finished, Available, Finished)

**Move the raw file to processed folder**

In [11]:
def mov_raw_processed(file_name):
    """Move `file_name` from the raw folder to the processed folder, keeping its name."""
    source_path = raw_folder_path + file_name
    destination_path = processed_folder_path + file_name
    os.rename(source_path, destination_path)
    print("Moved file from raw folder to processed folder")

StatementMeta(, b74fc4a9-bb55-4427-ad4e-1ea1a17872aa, 28, Finished, Available, Finished)

**Main Method: Invoke File Processing in Raw Folder**

In [12]:
def process_files(file_name):
    """Run the full pipeline for one raw file: chunk, embed and index, then move it.

    Args:
        file_name: Name of the file inside the raw folder.
    """
    print("Processing File Started : " + file_name)

    # Chunk the document, then hand the chunks straight to the indexer.
    embedd_index_documents(chunk_documents(file_name))

    # The source file leaves the raw folder only after indexing has returned.
    mov_raw_processed(file_name)

    print("Processing File Completed : " + file_name)

StatementMeta(, b74fc4a9-bb55-4427-ad4e-1ea1a17872aa, 29, Finished, Available, Finished)

In [14]:
# Process every regular file currently in the raw folder.
# os.listdir also returns sub-directories; those are skipped so that a
# directory name is never handed to process_files as if it were a document.
# Sorting makes the processing order deterministic between runs.
files = sorted(
    entry
    for entry in os.listdir(raw_folder_path)
    if os.path.isfile(os.path.join(raw_folder_path, entry))
)

for file_name in files:
    process_files(file_name)

StatementMeta(, b74fc4a9-bb55-4427-ad4e-1ea1a17872aa, 31, Finished, Available, Finished)

Processing File Started : 2310.07488.pdf
Length of splits: 8
Chunks generated in folder : /lakehouse/default/Files/processed/chunks_2310.07488.pdf_20240730013514
Embeding of chunks and Indexing to AI Search is completed
Moved file from raw folder to processed folder
Processing File Completed : 2310.07488.pdf
