In [0]:
# import required packages
from tqdm import tqdm
from langchain_openai import AzureOpenAIEmbeddings
from langchain_openai import AzureChatOpenAI
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from langchain_core.runnables import ConfigurableField
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, AnalyzeDocumentRequest, ContentFormat
import base64

In [0]:
# Service configuration for the rest of the notebook.
# Every value except the API version is read through dbutils from the "myscope"
# secret scope (the original note describes these as loaded from Azure Key Vault).

def _read_secret(key):
    """Return the secret stored under `key` in the "myscope" secret scope."""
    return dbutils.secrets.get(scope="myscope", key=key)


# Azure OpenAI
azure_openai_api_version = "2024-02-15-preview"
azure_openai_endpoint = _read_secret("aoai-endpoint")
azure_openai_api_key = _read_secret("aoai-api-key")
azure_openai_embedding_deployment = _read_secret("aoai-embedding-deployment")

# Azure AI Document Intelligence
doc_intelligence_endpoint = _read_secret("docintelligence-endpoint")
doc_intelligence_key = _read_secret("docintelligence-key")

In [0]:
# Connect to Blob Storage
# Client chain: connection string -> service client -> container client.
blob_container_name = "document-list"
blob_connection_string = dbutils.secrets.get(scope="myscope", key="blobstore-connstr")

blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(blob_container_name)

# Base URL of the container; other cells append blob names to it to address a document.
container_url = container_client.url

# Listing of the blobs held in the container.
blobs = container_client.list_blobs()

print(container_url)

In [0]:
# Extract every blob in the container with Azure AI Document Intelligence and split the
# resulting markdown into header-based chunks.
# Result: book_chunk_map maps a display name (title-cased blob name without its extension)
# to the list of chunk Documents produced for that blob.
book_chunk_map = {}

# Loop-invariant setup: the credentials and the splitter are the same for every document,
# so they are prepared once instead of on every iteration.
doc_intelligence_endpoint = dbutils.secrets.get(scope="myscope", key="docintelligence-endpoint")
doc_intelligence_key = dbutils.secrets.get(scope="myscope", key="docintelligence-key")

# Split the document into semantic chunks based on markdown headers, using the MarkdownHeaderTextSplitter class.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

for book in container_client.list_blob_names():
    book_url = container_url + "/" + book
    print(f"{book_url}\n\n")
    print("---------------------------------------------")

    # Instantiate the Langchain Azure AI Document Intelligence loader to load the document. You can either specify file_path or url_path to load the document.
    # Ensure that the Document Intelligence managed identity is configured with SBDC RBAC on the Blob storage resource
    # (SBDC presumably means "Storage Blob Data Contributor" - confirm the required role).
    loader = AzureAIDocumentIntelligenceLoader(
        url_path=book_url,
        api_key=doc_intelligence_key,
        api_endpoint=doc_intelligence_endpoint,
        api_model="prebuilt-layout",
    )
    docs = loader.load()

    # Guard against a blob that yields no document: indexing docs[0] would raise IndexError
    # and abort the extraction of every remaining blob.
    if not docs:
        print(f"No content extracted from {book}; skipping")
        continue

    docs_string = docs[0].page_content
    splits = text_splitter.split_text(docs_string)

    # Record the real blob URL on every chunk. The display name below is title-cased and has
    # its extension removed, so the original URL cannot be rebuilt from it reliably.
    for chunk in splits:
        chunk.metadata["source"] = book_url

    print("Length of splits: " + str(len(splits)))

    # Strip only the final extension (rsplit) so names containing extra dots, such as
    # "vol.1 intro.pdf", are not truncated at the first dot.
    book_name = book.rsplit(".", 1)[0].title()

    book_chunk_map[book_name] = splits

#### Create and populate an Azure AI Search index with the chunked data from the PDF documents

In [0]:
# Create the search index fields and vector search configuration

from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    AzureOpenAIParameters,
    AzureOpenAIVectorizer,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    HnswAlgorithmConfiguration,
    HnswParameters,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SearchableField,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
)

# Admin client used to create / update the index definition.
search_endpoint = dbutils.secrets.get(scope="myscope", key="aisearch-endpoint")
search_credential = AzureKeyCredential(dbutils.secrets.get(scope="myscope", key="aisearch-adminkey"))
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)

# Index schema: a string key, three text fields and one vector field.
fields = [
    SimpleField(
        name="parent_id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
        sortable=True,
        facetable=True,
    ),
    SearchableField(
        name="title",
        type=SearchFieldDataType.String,
        filterable=True,
        searchable=True,
        retrievable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
        sortable=True,
        facetable=True,
        retrievable=True,
    ),
    SearchableField(
        name="location",
        type=SearchFieldDataType.String,
        searchable=True,
        filterable=True,
        retrievable=True,
    ),
    SearchField(
        name="vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        retrievable=True,
        hidden=False,
        # NOTE(review): 1536 presumably matches the output size of the embedding deployment - confirm.
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
]

# Vector search: one HNSW algorithm using the cosine metric ...
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="myHnsw",
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)

# ... one profile that ties that algorithm to the vectorizer (this is the profile the
# "vector" field refers to by name) ...
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
    vectorizer="myOpenAI",
)

# ... and one Azure OpenAI vectorizer pointing at the embedding deployment.
aoai_vectorizer = AzureOpenAIVectorizer(
    name="myOpenAI",
    kind="azureOpenAI",
    azure_open_ai_parameters=AzureOpenAIParameters(
        resource_uri=azure_openai_endpoint,
        deployment_id=azure_openai_embedding_deployment,
        api_key=azure_openai_api_key,
    ),
)

vector_search = VectorSearch(
    algorithms=[hnsw_algorithm],
    profiles=[hnsw_profile],
    vectorizers=[aoai_vectorizer],
)

# Semantic search: a single configuration whose only prioritized content field is "content".
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="content")],
    ),
)
semantic_search = SemanticSearch(configurations=[semantic_config])

In [0]:
# Create the search index
index_name = "aisearch-index-mdsplitter"

# Assemble the index definition from the schema, vector search and semantic settings.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)

# Submit the definition through the index client and report the name it returns.
result = index_client.create_or_update_index(index=index)
print(f"{result.name} created")

In [0]:
# LangChain Azure OpenAI embeddings client; it produces the values stored in the index's
# "vector" field when chunks are uploaded.
# Reference: https://python.langchain.com/v0.1/docs/integrations/vectorstores/azuresearch/#create-embeddings-and-vector-store-instances

aoai_embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_api_key,
    openai_api_version=azure_openai_api_version,
    azure_deployment=azure_openai_embedding_deployment,
)

In [0]:
import os
import base64

def random_text_to_base64(length=16):
    """Return a random identifier as URL-safe Base64 text.

    Args:
        length: Number of random bytes drawn from os.urandom before encoding.

    Returns:
        The URL-safe Base64 encoding of those bytes (padding included), as a str.
    """
    return base64.urlsafe_b64encode(os.urandom(length)).decode('utf-8')

#### Upload the semantically chunked documents and their vectors to the Azure AI Search index

In [0]:
# Show how many chunks each book produced rather than echoing the full text of every chunk,
# which would flood the cell output.
{book_name: len(chunks) for book_name, chunks in book_chunk_map.items()}

In [0]:
# upload data to the search index
# Each chunk is embedded and uploaded as its own search document. A failure on one chunk
# is reported and counted, and the loop carries on with the next chunk.

from azure.search.documents import SearchClient

# The client does not depend on the book being processed, so it is created once.
search_client = SearchClient(search_endpoint, index_name, credential=search_credential)
failed_chunks = 0

for key, value in book_chunk_map.items():
    # Fallback location rebuilt from the display name. It assumes a ".pdf" blob whose name
    # matches the title-cased key, so it is only used when a chunk carries no source URL.
    default_book_url = container_url + "/" + key + ".pdf"

    for doc in value:
        try:
            content = doc.page_content
            # Prefer a "source" URL recorded in the chunk metadata when one is present.
            book_url = doc.metadata.get("source") or default_book_url
            upload_payload = {
                "parent_id": random_text_to_base64(length=16),
                "title": key,
                "content": content,
                "location": book_url,
                # Empty chunks are embedded as a placeholder string instead of "".
                "vector": aoai_embeddings.embed_query(content if content!="" else "-------"),
            }

            result_upload = search_client.upload_documents(documents=[upload_payload])

            # upload_documents returns one result per document; a document can be rejected
            # without an exception being raised, so check the result before reporting success.
            rejected = [item for item in result_upload if not item.succeeded]
            if rejected:
                failed_chunks += 1
                print(f"Failed to upload chunk for :{key} - {rejected[0].error_message}")
            else:
                print(f"Successfully uploaded chunk for :{key}")
        except Exception as e:
            # Best effort: report the failure and continue with the remaining chunks.
            failed_chunks += 1
            print("Exception:", e)

print(f"Upload finished with {failed_chunks} failed chunk(s)")