#### In this notebook I demonstrate the Azure AI Search integrated vectorization feature, using the Split skill and the Azure OpenAI Embedding skill to index a glossary dataset in CSV format and build an agentic RAG solution on it.
* Key vault retrievals are implemented using the azure key vault sdk
* The data source object connection string parameter was updated to use the storage account resource id: ResourceId=/subscriptions/00000000-0000-8888-7777-555555555555/resourceGroups/rgxxx/providers/Microsoft.Storage/storageAccounts/blob00000store
* The NativeBlobSoftDeleteDeletionDetectionPolicy is not supported when the indexer's parsingMode configuration is set to delimitedText. Further research is required.
* It's important to use the pinned packages from the requirements.txt file in this directory, to avoid breaking changes in newer versions for now
* https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-document-intelligence-layout
* https://learn.microsoft.com/en-us/azure/search/search-how-to-semantic-chunking
* SplitSkill to chunk the data
* AzureOpenAIEmbedding skill to embed the dataset "Definition" field 
* Deploy the following services in the same region: Azure AI Document Intelligence, Azure AI Search, Azure OpenAI, AI Foundry, Azure Blob Storage
* Enable system assigned managed identity
* Deploy text-embedding-3-small on Azure OpenAI (in Azure AI Foundry) for embeddings
* Deploy gpt-4o on Azure OpenAI for chat completion
* Configure search engine RBAC to Azure Blob Storage by adding a role for Storage Blob Data Reader, assigned to the search service system-managed identity
* Configure search engine RBAC to Azure Open AI by adding a role for Cognitive Services OpenAI User, assigned to the search service system-managed identity
* The model names and endpoint should be saved in AKV. Embedding skills and vectorizers assemble the full endpoint internally, so only the resource URI is needed. For example, given https://MY-FAKE-ACCOUNT.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2024-06-01, the endpoint that should be provided in skill and vectorizer definitions is https://MY-FAKE-ACCOUNT.openai.azure.com.
* The Azure AI multiservice account is used for skills processing. The multiservice account key must be provided, even if RBAC is in use. The key isn't used on the connection, but it's currently used for billing purposes.

#### Import Required Packages

In [1]:
from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient
import base64
from openai import AzureOpenAI
import azure.identity
from azure.identity import DefaultAzureCredential, EnvironmentCredential, ManagedIdentityCredential, SharedTokenCacheCredential
from azure.identity import ClientSecretCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, AnalyzeDocumentRequest, ContentFormat
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    SearchIndex,
    BlobIndexerParsingMode,
    SemanticConfiguration, SemanticSearch, SemanticPrioritizedFields, SemanticField
)
from azure.search.documents.indexes import SearchIndexClient

import os
from azure.search.documents import SearchClient
from azure.identity import DefaultAzureCredential, AzureAuthorityHosts
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
import os

#### Define Required Variables

In [3]:
# Resolve the Key Vault name: use the KEY_VAULT_NAME environment variable when
# it is set; otherwise prompt for the name once and store it in the process
# environment so later cells in this session can read it back.
keyVaultName = os.environ.get("KEY_VAULT_NAME")
if keyVaultName is None:
    keyVaultName = input("Please enter your Key Vault name: ")
    os.environ["KEY_VAULT_NAME"] = keyVaultName

In [4]:
# Re-read the vault name from the environment (set by the previous cell) and
# derive the vault URI from it.
keyVaultName = os.environ["KEY_VAULT_NAME"]
KVUri = f"https://{keyVaultName}.vault.azure.net"

# Secrets client used by the rest of the notebook; authentication is delegated
# to DefaultAzureCredential.
credential = DefaultAzureCredential()
client = SecretClient(
    vault_url=KVUri,
    credential=credential,
)

In [5]:
"""
This code loads and sets the necessary variables for Azure services.
The variables are loaded from Azure Key Vault.
"""


def _secret(name):
    """Return the value of the Key Vault secret called *name*."""
    return client.get_secret(name=name).value


# --- Azure OpenAI -----------------------------------------------------------
azure_openai_endpoint = _secret("aoai-endpoint")
azure_openai_api_key = _secret("aoai-api-key")
azure_openai_api_version = "2024-02-15-preview"
azure_openai_embedding_deployment = _secret("aoai-embedding-deployment")
azure_openai_embedding_model = _secret("aoai-embedding-model")
# Must match the dimension configured on the index's vector field.
azure_openai_vector_dimension = 1536

# --- Azure AI Search --------------------------------------------------------
search_credential = AzureKeyCredential(_secret("aisearch-key"))
search_endpoint = _secret("aisearch-endpoint")
index_name = "csv-glossary-index"
data_source_connection_name = "csv-glossary-ds"

# --- Azure AI services (attached to the skillset) ---------------------------
azure_ai_services_key = _secret("azure-ai-services-key")
azure_ai_services_endpoint = _secret("azure-ai-services-endpoint")

# --- Azure Blob Storage -----------------------------------------------------
blob_container_name = "excel-data"
blob_storage_name = _secret("blobstore-account-name")

#### Create Azure AI Search Datasource Object

In [None]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
    SearchIndexerDataIdentity,
    SearchIndexerDataUserAssignedIdentity,
    SearchIndexerDataNoneIdentity
)
from azure.search.documents.indexes.models import (
    NativeBlobSoftDeleteDeletionDetectionPolicy,
    HighWaterMarkChangeDetectionPolicy,
    DataChangeDetectionPolicy
)

# Client for indexer-side resources; reused by later cells for the skillset
# and the indexer.
indexer_client = SearchIndexerClient(
    endpoint=search_endpoint,
    credential=search_credential,
)

# The blob container to index, and the connection string (kept in Key Vault
# under "ds-resource-id") that points at the storage account.
indexer_container = SearchIndexerDataContainer(name=blob_container_name)
resource_id = client.get_secret(name="ds-resource-id").value

data_source_connection = SearchIndexerDataSourceConnection(
    name=data_source_connection_name,
    type="azureblob",
    connection_string=resource_id,
    container=indexer_container,
)

# Register (or overwrite) the data source on the search service.
data_source = indexer_client.create_or_update_data_source_connection(
    data_source_connection=data_source_connection
)

print(f"Data source {data_source.name} created or updated successfully.")

In [8]:
# Display the name of the data source object returned by the search service.
data_source.name

'csv-glossary-ds'

In [7]:
# Display the type of the object returned by create_or_update_data_source_connection.
type(data_source)

azure.search.documents.indexes.models._models.SearchIndexerDataSourceConnection

#### Create Azure AI Search Index

In [6]:
# Names that tie the vector field, the vector profile, the HNSW algorithm and
# the vectorizer together; each must be spelled identically wherever it is used.
_HNSW_ALGORITHM = "myHsnw"
_VECTOR_PROFILE = "myHnswProfile"
_VECTORIZER = "myOpenAI"

# Key of each projected chunk document, and the key of the row it came from.
_id_field = SearchField(
    name="id",
    type=SearchFieldDataType.String,
    key=True,
    sortable=True,
    filterable=True,
    facetable=True,
    analyzer_name="keyword",
)
_parent_id_field = SearchField(
    name="parent_id",
    type=SearchFieldDataType.String,
    sortable=True,
    filterable=True,
    facetable=True,
)

# Descriptive text fields that share the same attribute set.
_text_fields = [
    SearchField(
        name=field_name,
        type=SearchFieldDataType.String,
        searchable=True,
        sortable=True,
        filterable=True,
        facetable=True,
    )
    for field_name in ("title", "category", "notes")
]

# Chunk text (full-text searchable only) and its embedding.
_chunk_field = SearchField(
    name="chunk",
    type=SearchFieldDataType.String,
    searchable=True,
    sortable=False,
    filterable=False,
    facetable=False,
)
_vector_field = SearchField(
    name="vector",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    vector_search_dimensions=azure_openai_vector_dimension,
    vector_search_profile_name=_VECTOR_PROFILE,
)

fields = [_id_field, _parent_id_field, *_text_fields, _chunk_field, _vector_field]

# Vector search: one HNSW algorithm, one profile that uses it, and an Azure
# OpenAI vectorizer attached to that profile so text queries can be embedded
# at query time.
_openai_vectorizer = AzureOpenAIVectorizer(
    vectorizer_name=_VECTORIZER,
    kind="azureOpenAI",
    parameters=AzureOpenAIVectorizerParameters(
        resource_url=azure_openai_endpoint,
        deployment_name=azure_openai_embedding_deployment,
        model_name=azure_openai_embedding_model,
    ),
)
vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name=_HNSW_ALGORITHM)],
    profiles=[
        VectorSearchProfile(
            name=_VECTOR_PROFILE,
            algorithm_configuration_name=_HNSW_ALGORITHM,
            vectorizer_name=_VECTORIZER,
        )
    ],
    vectorizers=[_openai_vectorizer],
)

# Semantic ranking: "title" is the title field and "chunk" the content field.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        content_fields=[SemanticField(field_name="chunk")],
    ),
)
semantic_search = SemanticSearch(configurations=[semantic_config])

# No custom scoring profiles are defined for this index.
scoring_profiles = []

In [7]:
# Create a search index client required to create the index
# Client for index-level operations on the search service.
index_client = SearchIndexClient(
    endpoint=search_endpoint,
    credential=search_credential,
)

# Assemble the index from the pieces defined in the previous cell, then create
# it on the service (or update it if it already exists).
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    scoring_profiles=scoring_profiles,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index=index)
print(f"{result.name} created")

csv-glossary-index created


> #### Create Required Skillsets for the document extraction processes and operations.
Skills drive integrated vectorization. Text Split provides data chunking. AzureOpenAIEmbedding handles calls to Azure OpenAI, using the connection information retrieved from Azure Key Vault earlier in this notebook. An indexer projection specifies secondary indexes used for chunked data.

In [8]:
# Import required libraries
from azure.search.documents.indexes.models import (
    SplitSkill,
    AzureOpenAIEmbeddingSkill,
    OcrSkill,
    SearchIndexerSkillset,
    DocumentIntelligenceLayoutSkill,
    DocumentIntelligenceLayoutSkillMarkdownHeaderDepth,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchIndexerIndexProjection,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    AIServicesAccountKey,
    AIServicesAccountIdentity
)

# Skillset name derived from the index name; the indexer cell refers to the
# skillset by this name.
skillset_name = f"{index_name}-skillset"

def create_skillset():
    """Build the skillset definition that chunks and embeds each document.

    The skillset holds two skills, in order:

    1. a ``SplitSkill`` that splits ``/document/Definition`` into pages and
       exposes them as ``/document/chunks``;
    2. an ``AzureOpenAIEmbeddingSkill`` that runs once per chunk and writes
       its embedding to ``/document/chunks/*/vector``.

    An index projection then writes one search document per chunk into
    ``index_name``, keyed back to its source via ``parent_id``, and skips
    indexing of the parent documents themselves.

    Reads the module-level configuration (``index_name``, ``skillset_name``
    and the Azure OpenAI / Azure AI services settings); takes no arguments.

    Returns:
        SearchIndexerSkillset: the skillset definition. It is only built
        here, not sent to the service.
    """
    chunk_path = "/document/chunks/*"

    chunker = SplitSkill(
        description="Split skill to chunk documents",
        text_split_mode="pages",
        default_language_code="en",
        context="/document",
        maximum_page_length=2000,
        page_overlap_length=500,
        inputs=[InputFieldMappingEntry(name="text", source="/document/Definition")],
        outputs=[OutputFieldMappingEntry(name="textItems", target_name="chunks")],
    )

    embedder = AzureOpenAIEmbeddingSkill(
        description="Skill to generate embeddings via Azure OpenAI",
        context=chunk_path,
        resource_url=azure_openai_endpoint,
        deployment_name=azure_openai_embedding_deployment,
        model_name=azure_openai_embedding_model,
        dimensions=azure_openai_vector_dimension,
        api_key=azure_openai_api_key,
        inputs=[InputFieldMappingEntry(name="text", source=chunk_path)],
        outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
    )

    # Index field -> enrichment-tree path, in the order the mappings are emitted.
    projected_sources = {
        "chunk": chunk_path,
        "vector": "/document/chunks/*/vector",
        "title": "/document/Term",
        "category": "/document/Category",
        "notes": "/document/Notes",
    }
    chunk_selector = SearchIndexerIndexProjectionSelector(
        target_index_name=index_name,
        parent_key_field_name="parent_id",
        source_context=chunk_path,
        mappings=[
            InputFieldMappingEntry(name=field_name, source=source_path)
            for field_name, source_path in projected_sources.items()
        ],
    )
    projection = SearchIndexerIndexProjection(
        selectors=[chunk_selector],
        parameters=SearchIndexerIndexProjectionsParameters(
            projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
        ),
    )

    # Azure AI services account attached to the skillset, identified by key
    # and endpoint.
    ai_services_account = AIServicesAccountKey(
        key=azure_ai_services_key,
        subdomain_url=azure_ai_services_endpoint,
    )

    return SearchIndexerSkillset(
        name=skillset_name,
        description="Skillset to chunk documents and generating embeddings",
        skills=[chunker, embedder],
        index_projection=projection,
        cognitive_services_account=ai_services_account,
    )

# Build the skillset definition locally.
skillset = create_skillset()


# Create the skillset on the search service, or update it if it already exists.
indexer_client.create_or_update_skillset(skillset)
print(f"Created skillset {skillset.name}")

Created skillset csv-glossary-index-skillset


#### Create Indexer

In [14]:
from azure.search.documents.indexes.models import (
    SearchIndexer,
    IndexingParameters,
    IndexingParametersConfiguration,
    BlobIndexerImageAction
)

# Define indexer name  
indexer_name = f"{index_name}-indexer"

index_parameters = IndexingParameters(
    configuration=IndexingParametersConfiguration(
      data_to_extract="contentAndMetadata",
      parsing_mode="delimitedText",
      first_line_contains_headers=True,
      query_timeout=None,
    )
  )

indexer = SearchIndexer(
  name=indexer_name,
  description="Indexer to orchestrate the document indexing and embedding generation",
  skillset_name=skillset_name,
  target_index_name=index_name,
  data_source_name=data_source.name,
  parameters=index_parameters
)

indexer_result = indexer_client.create_or_update_indexer(indexer)

# Run the indexer to kick off the indexing process
indexer_client.run_indexer(indexer_name)
print(f' {indexer_name} is created and running. If queries return no results, please wait a bit and try again.')

# Schedule an indexer to run every 24 hours
#https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/search/azure-search-documents/samples/sample_indexer_datasource_skillset.py

 csv-glossary-index-indexer is created and running. If queries return no results, please wait a bit and try again.


#### Perform a vector similarity search

This example shows a pure vector search using the vectorizable text query: all you need to do is pass in text, and your vectorizer will handle the query vectorization.

Because this notebook indexes the glossary CSV file, send queries that ask about glossary terms (for example, "what is an API?").

In [15]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Pure vector search: the query text is sent as-is and embedded by the
# vectorizer attached to the index's vector profile.
# Alternative phrasing to try:
#   "What can you tell me about application programming interface ?"
query = "what is an API ?"

search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=search_credential,
)

# Request the single nearest neighbour on the "vector" field, with
# exhaustive=True set on the query.
# An earlier draft of this cell passed a precomputed embedding instead:
#   RawVectorQuery(vector=generate_embeddings(query), k_nearest_neighbors=3, fields="vector")
# generate_embeddings is not defined in this notebook.
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=1,
    fields="vector",
    exhaustive=True,
)

# search_text=None disables the keyword part, so only the vector query runs.
results = search_client.search(search_text=None, vector_queries=[vector_query], top=1)

for result in results:
    print(f"id: {result['id']}")
    print(f"notes: {result['notes']}")
    print(f"Score: {result['@search.score']}")
    print(f"Content: {result['chunk']}")

id: 99d593c454b5_aHR0cHM6Ly9ibG9ic3RvcmUwNS5ibG9iLmNvcmUud2luZG93cy5uZXQvZXhjZWwtZGF0YS9nbG9zc2FyeV9kYXRhc2V0LmNzdjsx0_chunks_0
notes: Used in web development to connect services.
Score: 0.68176585
Content: Application Programming Interface: a set of rules that allows different software entities to interact.


In [16]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Pure vector search with a comparative question; the index vectorizer embeds
# the query text.
query = "What's the difference between a sprint and agile?"

search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=search_credential,
)

# Request the single nearest neighbour on the "vector" field, with
# exhaustive=True set on the query.
# An earlier draft of this cell passed a precomputed embedding instead:
#   RawVectorQuery(vector=generate_embeddings(query), k_nearest_neighbors=3, fields="vector")
# generate_embeddings is not defined in this notebook.
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=1,
    fields="vector",
    exhaustive=True,
)

# search_text=None disables the keyword part, so only the vector query runs.
results = search_client.search(search_text=None, vector_queries=[vector_query], top=1)

for result in results:
    print(f"id: {result['id']}")
    print(f"notes: {result['notes']}")
    print(f"Score: {result['@search.score']}")
    print(f"Content: {result['chunk']}")

id: 99d593c454b5_aHR0cHM6Ly9ibG9ic3RvcmUwNS5ibG9iLmNvcmUud2luZG93cy5uZXQvZXhjZWwtZGF0YS9nbG9zc2FyeV9kYXRhc2V0LmNzdjs50_chunks_0
notes: Common in Scrum methodology.
Score: 0.6210777
Content: A short, time-boxed period in which a scrum team works to complete a set amount of work.


#### Perform a hybrid search + semantic reranking

In [17]:
from azure.search.documents.models import (
    QueryType,
    QueryCaptionType,
    QueryAnswerType
)
# Semantic hybrid search: the same text is used for the keyword query and the
# vector query, and the request asks for semantic ranking with extractive
# captions and answers.
query = "What's the difference between a sprint and agile?"

search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=search_credential,
)
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=2,
    fields="vector",
    exhaustive=True,
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["id", "notes", "chunk"],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='my-semantic-config',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=1,
)

# Extractive answers, if any were returned; show the highlighted form when
# there is one.
semantic_answers = results.get_answers()
for answer in semantic_answers or []:
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

# Matching documents with their reranker score and, when present, the first
# caption.
for result in results:
    print(f"id: {result['id']}")
    print(f"notes: {result['notes']}")
    print(f"Reranker Score: {result['@search.reranker_score']}")
    print(f"Content: {result['chunk']}")

    captions = result["@search.captions"]
    if not captions:
        continue
    caption = captions[0]
    if caption.highlights:
        print(f"Caption: {caption.highlights}\n")
    else:
        print(f"Caption: {caption.text}\n")

Semantic Answer: <em>A short, time-boxed period in which a scrum team works to complete a set amount of work.</em>
Semantic Answer Score: 0.9850000143051147

id: 99d593c454b5_aHR0cHM6Ly9ibG9ic3RvcmUwNS5ibG9iLmNvcmUud2luZG93cy5uZXQvZXhjZWwtZGF0YS9nbG9zc2FyeV9kYXRhc2V0LmNzdjs50_chunks_0
notes: Common in Scrum methodology.
Reranker Score: 2.586292028427124
Content: A short, time-boxed period in which a scrum team works to complete a set amount of work.
Caption: <em>A short, time-boxed period in which a scrum team works to complete a set amount of work.</em>

