###Key Points:
##### This implementation is part of a PoC to evaluate the Azure AI Document Intelligence SDK for complex document extraction. This notebook tests single-page table extraction together with the associated text, and uses the RecursiveCharacterTextSplitter class for chunking. The output is uploaded to Azure AI Search and used to power a Q&A knowledge-base AI chat application, with the goal of improving response accuracy and relevance.
##### Extract Table Function: The extract_tables function is in development and will be designed to extract multi-page table data. It identifies the column headers and row indexes, then extracts the cell values and adds them to a string.
##### Extract PDF: The extract_pdf_content function is run for each PDF book in a storage account container; it extracts text and single-page table content in markdown format, slicing each page out of the result by min span offset and max span length. The extracted content is then chunked with the RecursiveCharacterTextSplitter class, using a specified chunk size and a list of separators.
https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/data-chunking/langchain-data-chunking-example.ipynb
Chunking Strategies: https://learn.microsoft.com/en-us/azure/architecture/ai-ml/guide/rag/rag-chunking-phase

Example: https://learn.microsoft.com/en-us/answers/questions/1608976/using-document-intelligence-to-create-chunks-for-i

In [0]:
# import required packages
from tqdm import tqdm
from langchain_openai import AzureOpenAIEmbeddings
from langchain_openai import AzureChatOpenAI
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from langchain_core.runnables import ConfigurableField
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, AnalyzeDocumentRequest, ContentFormat
import time
from azure.identity import DefaultAzureCredential
from openai import AzureOpenAI
from azure.identity import get_bearer_token_provider
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from time import monotonic

In [0]:
"""
This code loads and sets the necessary variables for Azure services.
The variables are loaded from Azure Key Vault.
"""

# All secrets below are read with dbutils.secrets.get from the secret scope "myscope".
# NOTE(review): the cell header says Azure Key Vault — presumably "myscope" is a
# Key Vault-backed scope; confirm.

# Azure OpenAI endpoint and key (used by the index vectorizer and the embedding clients).
azure_openai_endpoint=dbutils.secrets.get(scope="myscope", key="aoai-endpoint")
azure_openai_api_key=dbutils.secrets.get(scope="myscope", key="aoai-api-key")
# API version is pinned in code rather than read from the secret scope.
azure_openai_api_version = "2024-02-15-preview"
azure_openai_embedding_deployment = dbutils.secrets.get(scope="myscope", key="aoai-embedding-deployment")
# Azure AI Document Intelligence endpoint and key (used by extract_pdf_content).
doc_intelligence_endpoint = dbutils.secrets.get(scope="myscope", key="docintelligence-endpoint")
doc_intelligence_key = dbutils.secrets.get(scope="myscope", key="docintelligence-key")

In [0]:
# Connect to Blob Storage
# The connection string comes from the "myscope" secret scope; the container name is hard-coded.
blob_connection_string = dbutils.secrets.get(scope="myscope", key="blobstore-connstr")
blob_container_name = "document-list"
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(blob_container_name)
# NOTE(review): `blobs` is not referenced again in the visible notebook; the
# extraction loop calls container_client.list_blob_names() instead.
blobs = container_client.list_blobs()
# Base URL of the container; later cells append "/<blob name>" to it.
container_url = container_client.url
#print(container_url)

In [0]:
# Function to extract tables from the analysed document, ordered by page
def extract_tables(result: "AnalyzeResult"):
    """Return the cells of every table, ordered by the page each table starts on.

    A table is attributed to the page of its first bounding region. Tables
    without any bounding region cannot be attributed to a page and are skipped,
    as are tables whose page number does not appear in ``result.pages``.

    Args:
        result: Analysis result exposing ``pages`` (items with ``page_number``)
            and ``tables`` (items with ``bounding_regions`` and ``cells``).
            Either collection may be ``None`` or empty.

    Returns:
        A list with one entry per table; each entry is a list of dicts with the
        keys ``"row_index"``, ``"column_index"`` and ``"content"``.
    """
    # Group tables by starting page in a single pass instead of rescanning
    # every table for every page.
    tables_by_page = {}
    for table in result.tables or []:
        regions = table.bounding_regions
        if not regions:
            continue
        table_data = [
            {
                "row_index": cell.row_index,
                "column_index": cell.column_index,
                "content": cell.content,
            }
            for cell in table.cells or []
        ]
        tables_by_page.setdefault(regions[0].page_number, []).append(table_data)

    # Emit in page order so the output order matches a page-by-page walk.
    tables = []
    for page in result.pages or []:
        tables.extend(tables_by_page.get(page.page_number, []))
    return tables

In [0]:
import base64

# Function to convert text to a deterministic, key-safe id for the search index field
def text_to_base64(text):
    """Encode ``text`` as URL-safe Base64 so it can be used as a document key.

    The result is deterministic (the same text always yields the same id), not
    random. The URL-safe alphabet is used because the standard Base64 alphabet
    contains ``+`` and ``/``, which are not allowed in Azure AI Search document
    keys; ``-`` and ``_`` are emitted in their place.

    Args:
        text: The string to encode; it is encoded as UTF-8 first.

    Returns:
        The URL-safe Base64 representation of ``text`` as a ``str``.
    """
    encoded_bytes = base64.urlsafe_b64encode(text.encode('utf-8'))
    return encoded_bytes.decode('utf-8')

In [0]:
# Scratch initialisation of the notebook-level name. extract_pdf_content uses its
# own local variable of the same name, and the extraction loop rebinds this global.
page_documents = ""

In [0]:
# Inspection cell: evaluates to the type of page_documents; it has no other effect.
type(page_documents)

In [0]:
# Function to crack and extract PDF documents using Azure AI Document Intelligence
def extract_pdf_content(book_url: str):
    """Analyse one document with the "prebuilt-layout" model and return its text.

    The service is asked for markdown output. The returned string is rebuilt
    page by page from ``result.content`` using every span listed on each page,
    so pages that report more than one span are not truncated and pages that
    report no span contribute nothing instead of raising ``IndexError``.

    Args:
        book_url: URL of the document; it is handed to the service as
            ``url_source``, so the service must be able to fetch it.

    Returns:
        The concatenated content of all pages, in page order, as one string
        (empty when the result has no content or no pages).
    """
    print(f"{book_url}\n\n")
    print("---------------------------------------------")

    document_intelligence_client = DocumentIntelligenceClient(endpoint=doc_intelligence_endpoint, credential=AzureKeyCredential(key=doc_intelligence_key))

    poller = document_intelligence_client.begin_analyze_document(model_id="prebuilt-layout", analyze_request=AnalyzeDocumentRequest(url_source=book_url), output_content_format="markdown")

    # Blocks until the analysis has finished.
    result: AnalyzeResult = poller.result()

    content = result.content or ""
    page_parts = []
    for page in result.pages or []:
        # Each span is an (offset, length) window into result.content.
        for span in page.spans or []:
            page_parts.append(content[span.offset:span.offset + span.length])

    # Join once at the end rather than growing a string inside the loop.
    return "".join(page_parts)

In [0]:
# Create the search index fields and vector search configuration
# This cell only builds the definition objects (fields, vector_search,
# semantic_search) and the index client; the index itself is created in the next cell.

from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import SearchField, SearchFieldDataType, VectorSearch, SimpleField, SearchableField, HnswAlgorithmConfiguration, HnswParameters, VectorSearchAlgorithmMetric, ExhaustiveKnnAlgorithmConfiguration, ExhaustiveKnnParameters, VectorSearchProfile, AzureOpenAIVectorizer, AzureOpenAIParameters, SemanticConfiguration, SemanticSearch, SemanticPrioritizedFields, SemanticField, SearchIndex

# Admin key and endpoint for Azure AI Search, read from the "myscope" secret scope.
search_credential = AzureKeyCredential(dbutils.secrets.get(scope="myscope", key="aisearch-adminkey"))
search_endpoint = dbutils.secrets.get(scope="myscope", key="aisearch-endpoint")
# Create a search index client required to create the index
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)

# Index schema: "id" is the document key; "vector" holds the embedding and is
# bound to the "myHnswProfile" vector search profile defined below.
# NOTE(review): `retrievable=` is passed to the field helpers here — confirm the
# installed SDK honours it (the field models expose `hidden`).
# NOTE(review): vector_search_dimensions=1536 must equal the output size of the
# embedding deployment — confirm against the deployed model.
fields = [
    SimpleField(name="id", key=True, type=SearchFieldDataType.String, filterable=True, sortable=True, facetable=True),
    SearchableField(name="title", type=SearchFieldDataType.String, filterable=True, searchable=True, retrievable=True),
    SearchableField(name="content", type=SearchFieldDataType.String, searchable=True, sortable=True, facetable=True, retrievable=True),
    SearchableField(name="location", type=SearchFieldDataType.String, searchable=True, filterable=True, retrievable=True),
    SearchField(name="vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), searchable=True, retrievable=True, hidden=False, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile")
]

# Configure the vector search config
# One HNSW algorithm ("myHnsw", cosine metric), one profile ("myHnswProfile") that
# ties it to the "myOpenAI" vectorizer, and the vectorizer itself.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE
            )
        )
    ],
    profiles=[  
        VectorSearchProfile(  
            name="myHnswProfile",  
            algorithm_configuration_name="myHnsw",  
            vectorizer="myOpenAI",  
        ),
    ],
    # The vectorizer is configured with the Azure OpenAI endpoint, embedding
    # deployment and API key, so the key becomes part of the index definition.
    vectorizers=[  
        AzureOpenAIVectorizer(  
            name="myOpenAI",  
            kind="azureOpenAI",  
            azure_open_ai_parameters=AzureOpenAIParameters(  
                resource_uri=azure_openai_endpoint,  
                deployment_id=azure_openai_embedding_deployment,  
                api_key=azure_openai_api_key,  
            ),  
        ),  
    ]
)

# Configure semantic search on the index
# Only the "content" field is registered as a prioritized content field; the
# configuration name is what the semantic query cell passes as
# semantic_configuration_name.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[
            SemanticField(field_name="content")
        ]
    )
)
# Create the semantic search config
semantic_search = SemanticSearch(configurations=[semantic_config])

In [0]:
# Create the search index
# Builds the index definition from the objects prepared in the previous cell and
# sends it with create_or_update_index, so re-running the cell updates an existing
# index of the same name rather than failing.
index_name = "aisearch-index-recursive"
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
# NOTE: `result` is a notebook global that later cells reuse as a loop variable.
result = index_client.create_or_update_index(index=index)
print(f"{result.name} created")

In [0]:
# Create the langchain azure open ai embedding object. This will be used to embed the vector field content
# https://python.langchain.com/v0.1/docs/integrations/vectorstores/azuresearch/#create-embeddings-and-vector-store-instances

# Create azure open ai embedding
# Key-based auth when an API key is configured, otherwise Azure AD token auth
# through DefaultAzureCredential.
# NOTE(review): azure_openai_client is not referenced by any other visible cell;
# the upload cell embeds through aoai_embeddings below.
azure_openai_client = None
if azure_openai_api_key:
    azure_openai_client = AzureOpenAI(
        api_key=azure_openai_api_key, 
        api_version=azure_openai_api_version,
        azure_deployment=azure_openai_embedding_deployment,
        azure_endpoint=azure_openai_endpoint)
else:
    azure_openai_client = AzureOpenAI(
        azure_ad_token_provider=get_bearer_token_provider(DefaultAzureCredential(), scope="https://cognitiveservices.azure.com/.default"),
        api_version=azure_openai_api_version,
        azure_deployment=azure_openai_embedding_deployment,
        azure_endpoint=azure_openai_endpoint)
    

# LangChain embedding wrapper used by the upload cell (embed_query).
# NOTE(review): unlike the client above, this always passes api_key and has no
# token-auth fallback — confirm that is intended when the key is empty.
aoai_embeddings = AzureOpenAIEmbeddings(
    azure_deployment=azure_openai_embedding_deployment,
    openai_api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_api_key,
)

In [0]:
def chunk_text(text: str, chunk_size: int = 600, chunk_overlap: int = 125):
    """Split ``text`` into overlapping chunks with a recursive character splitter.

    The splitter is built with ``from_tiktoken_encoder``, so ``chunk_size`` and
    ``chunk_overlap`` are measured with the tiktoken encoder selected by
    ``model_name``. Splitting tries the separators in order: blank line,
    newline, space, and finally no separator at all.

    Args:
        text: The text to split.
        chunk_size: Maximum size of a chunk. Defaults to 600.
        chunk_overlap: Size of the overlap between consecutive chunks.
            Defaults to 125.

    Returns:
        The list of chunk strings produced by ``split_text``.
    """
    # NOTE(review): the value stored under "aoai-deploymentname" is passed as the
    # tiktoken model name — presumably the deployment is named after its model;
    # confirm, because tiktoken has to recognise this name.
    recursive_text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=dbutils.secrets.get(scope="myscope", key="aoai-deploymentname"),
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )

    return recursive_text_splitter.split_text(text=text)

In [0]:
# Dictionary mapping each book name to the list of text chunks extracted from it
book_pages_map = {}

for book in container_client.list_blob_names():
    print(f"Extracting content from {book}...")

    # Capture the start time. A monotonic clock is used because it cannot jump
    # backwards or forwards when the system clock is adjusted mid-run.
    start_time = monotonic()

    # Let the SDK build the blob URL: it URL-encodes the blob name and keeps any
    # SAS query string in the right place, which manual "/" concatenation onto
    # the container URL does not.
    book_url = container_client.get_blob_client(book).url

    # Start extraction
    page_documents = extract_pdf_content(book_url=book_url)
    # The map key is the blob name up to its first ".", title-cased.
    # NOTE(review): the upload cell rebuilds the document location from this key,
    # which only matches the real blob when its name is already title-cased and
    # ends in ".pdf" — confirm.
    book_name = book.split(sep=".")[0].title()
    chunks = chunk_text(page_documents)
    book_pages_map[book_name] = chunks

    # Capture the end time and calculate the elapsed time
    end_time = monotonic()
    elapsed_time = end_time - start_time

    print(f"Parsing took: {elapsed_time:.6f} seconds")
    print(f"The {book_name} book contains {len(chunks)} chunks\n")

In [0]:
# Inspection cell: displays `chunks`, which at this point holds the chunk list of
# the last book processed by the extraction loop.
chunks

In [0]:
from azure.search.documents import SearchClient

# Embed every chunk of every book and upload it to the search index, one document
# per chunk. Upload is best-effort: a failure is reported and the loop continues.
search_client = SearchClient(search_endpoint, index_name, credential=search_credential)

for bookname, chunks in book_pages_map.items():
    title = f"{bookname}"
    # NOTE(review): the location is rebuilt from the title-cased map key, so it
    # only matches the real blob when its name is already title-cased and ends in
    # ".pdf" — confirm.
    location = container_url + "/" + bookname + ".pdf"

    for chunk_index, chunk in enumerate(chunks):
        try:
            # Key the document on the chunk's position within its book. A key
            # derived from the first characters of the chunk text collides for
            # chunks that start alike, and a colliding upload overwrites the
            # earlier document.
            document_key = text_to_base64(text=f"{bookname}-{chunk_index}")
            upload_payload = {
                        "id": document_key,
                        "title": title,
                        "content": chunk,
                        "location": location,
                        # An empty chunk is embedded as a placeholder string.
                        "vector": aoai_embeddings.embed_query(chunk if chunk!="" else "-------")
            }

            result_upload = search_client.upload_documents(documents=[upload_payload])
            # Per-document failures are reported in the returned results rather
            # than raised, so check them explicitly.
            for indexing_result in result_upload:
                if not indexing_result.succeeded:
                    print(f"Upload failed for {bookname} chunk {chunk_index}: {indexing_result.error_message}")
        except Exception as e:
            print("Exception:", e)

In [0]:
# Inspection cell: displays the (book name, chunks) pairs collected by the extraction loop.
book_pages_map.items()

## Test Specific Search Types and Queries.

## Perform a vector similarity search

This example shows a pure vector search using the vectorizable text query. All you need to do is pass in the query text; the vectorizer configured on the index handles the query vectorization.

If you indexed the health plan PDF file, send queries that ask plan-related questions.

In [0]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Pure Vector Search
# The query text is sent only as a vector query (search_text is None), so the
# index-side vectorizer embeds it and no keyword matching takes place.
# query = "Which is more comprehensive, Northwind Health Plus vs Northwind Standard?"
query = "What determines the venue of a legal action brought against Northwind Health?"  
  
search_client = SearchClient(search_endpoint, index_name, credential=search_credential)
# Ask for the single nearest neighbour on the "vector" field.
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=1, fields="vector", exhaustive=True)
# Use the below query to pass in the raw vector query instead of the query vectorization
# NOTE(review): neither RawVectorQuery nor generate_embeddings is imported or
# defined in the visible notebook; both would be needed before uncommenting this.
# vector_query = RawVectorQuery(vector=generate_embeddings(query), k_nearest_neighbors=3, fields="vector")
  
results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    select=["id", "title", "content"],
    top=1
)  
  
# Print the returned document(s) with their search score.
for result in results:  
    print(f"Id: {result['id']}")  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['content']}")   


## Perform a hybrid search

In [0]:
# NOTE: QueryType, QueryCaptionType and QueryAnswerType are imported here but not
# used in this cell; the semantic reranking cell is the one that uses them.
from azure.search.documents.models import (
    QueryType,
    QueryCaptionType,
    QueryAnswerType
)
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Hybrid Search query
# The same query string is sent both as keyword search text and as a vector query.
#query = "How much is the employee's cost per pay check for the north wind standard?"  
# query = "Can you summarize the employee handbook for me?"
query = "What determines the venue of a legal action brought against Northwind Health?"
  
search_client = SearchClient(search_endpoint, index_name, credential=search_credential)
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=1, fields="vector", exhaustive=True)
  
results = search_client.search(  
    search_text=query,  # keyword (full-text) half of the hybrid query
    vector_queries= [vector_query], # vector half; combined with search_text above this makes the query hybrid
    select=["id", "title", "content"],
    top=1
)  
  
# Print the returned document(s) with their search score.
for result in results:  
    print(f"Id: {result['id']}")  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['content']}")  


## Perform a hybrid search + semantic reranking

In [0]:
from azure.search.documents.models import (
    QueryType,
    QueryCaptionType,
    QueryAnswerType
)

from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Semantic Hybrid Search
# Hybrid query (search text + vector query) with query_type set to semantic,
# using the "my-semantic-config" configuration and requesting extractive
# captions and answers.
# query = "Which is more comprehensive, Northwind Health Plus vs Northwind Standard?"
query = "What determines the venue of a legal action brought against Northwind Health?"

search_client = SearchClient(search_endpoint, index_name, search_credential)
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=1, fields="vector", exhaustive=True)

results = search_client.search(  
    search_text=query,
    vector_queries=[vector_query],
    select=["id", "title", "content"],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='my-semantic-config',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=1
)

# Semantic answers are read from the result object before the documents are
# iterated; get_answers() may return nothing, hence the truthiness check.
semantic_answers = results.get_answers()
if semantic_answers:
    for answer in semantic_answers:
        # Prefer the highlighted form of the answer when one is provided.
        if answer.highlights:
            print(f"Semantic Answer: {answer.highlights}")
        else:
            print(f"Semantic Answer: {answer.text}")
        print(f"Semantic Answer Score: {answer.score}\n")

# Print each document with its reranker score and, when present, its first caption.
for result in results:
    print(f"Id: {result['id']}")  
    print(f"Title: {result['title']}")  
    print(f"Reranker Score: {result['@search.reranker_score']}")
    print(f"Content: {result['content']}")  

    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        # Prefer the highlighted form of the caption when one is provided.
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")
