In [22]:
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
import os

# Load settings from a local .env file. override=True lets values from .env
# replace variables that are already present in the process environment.
load_dotenv(override=True)

# --- Azure AI Search ---------------------------------------------------------
# Service endpoint and admin key (wrapped in AzureKeyCredential for the SDK clients).
aisearch_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
aisearch_key = AzureKeyCredential(os.environ["AZURE_SEARCH_API_KEY"])
# Base name: the data source, skillset and indexer names are derived from it.
index_name = "indexer-demo"

# --- Storage account holding the documents to index --------------------------
storage_account_subscription_id = os.environ["AZURE_STORAGE_ACCOUNT_SUB_ID"]
storage_account_resource_group_name = os.environ["AZURE_STORAGE_ACCOUNT_RG_NAME"]
# Backward-compatible alias for the original (misspelled) variable name, kept so
# that any cell still referring to it continues to work.
stroage_account_ressource_group_name = storage_account_resource_group_name
storage_account_name = os.environ["AZURE_STORAGE_ACCOUNT_NAME"]
# Connection string that identifies the storage account by its ARM resource ID
# only; no account key is embedded in it.
aisearch_datasource_connection_string = (
    "ResourceId="
    f"/subscriptions/{storage_account_subscription_id}"
    f"/resourceGroups/{storage_account_resource_group_name}"
    "/providers/Microsoft.Storage"
    f"/storageAccounts/{storage_account_name}/;"
)

# --- Azure OpenAI (text embeddings) ------------------------------------------
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.environ["AZURE_OPENAI_KEY"]
azure_openai_embedding_deployment = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]
azure_openai_model_name = os.environ["AZURE_OPENAI_EMBEDDING_MODEL_NAME"]
# Optional setting: fall back to 3072 when the variable is missing *or* empty
# (an empty value such as `AZURE_OPENAI_EMBEDDING_DIMENSIONS=` would otherwise
# make int() raise ValueError).
azure_openai_model_dimensions = int(os.getenv("AZURE_OPENAI_EMBEDDING_DIMENSIONS") or 3072)

# --- Azure AI Services (image embeddings) -------------------------------------
azure_ai_services_endpoint = os.environ["AZURE_AI_SERVICES_ENDPOINT"]
azure_ai_services_key = os.environ["AZURE_AI_SERVICES_KEY"]

Create a blob data source connector on Azure AI Search


In [None]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection
)
# Client for indexer-side resources; reused below for the data source and the indexer.
indexer_client = SearchIndexerClient(endpoint=aisearch_endpoint, credential=aisearch_key)

# Describe the blob container the indexer will read from. The connection string
# built in the configuration cell identifies the storage account by resource ID.
data_source = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    connection_string=aisearch_datasource_connection_string,
    container=SearchIndexerDataContainer(name="demo-indexer-storage"),
)

# Upsert, so re-running this cell is safe when the connection already exists.
indexer_client.create_or_update_data_source_connection(data_source)

print(f"Data source '{data_source.name}' created or updated.")
# Manual follow-up step for whoever runs the notebook (role assignment is not done here).
print("ASSIGN THE STORAGE BLOB DATA READER ROLE TO YOUR AI SEARCH INSTANCE OVER THE STORAGE ACCOUNT")

Create a search index


In [None]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    AIServicesVisionVectorizer,
    AIServicesVisionParameters,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex
)
# Client for index management (despite the name, this is a SearchIndexClient,
# not a query client).
search_client = SearchIndexClient(aisearch_endpoint, aisearch_key)

# Index schema. One index holds both text chunks and image vectors:
#   * text_parent_id / image_parent_id - parent-key fields filled by the skillset's
#     index projections (one per projection selector)
#   * chunk_id    - document key; uses the "keyword" analyzer
#   * chunk/title - text content and source name, used by the semantic configuration
#   * textvector  - text embedding, dimensions taken from the configuration cell
#   * imagevector - image embedding, fixed at 1024 dimensions
fields = [
    SearchField(name="text_parent_id", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
    SearchField(name="image_parent_id", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
    SearchField(name="title", type=SearchFieldDataType.String),
    SearchField(name="chunk_id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True, analyzer_name="keyword"),
    SearchField(name="chunk", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False),
    SearchField(name="textvector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=azure_openai_model_dimensions, vector_search_profile_name="myHnswAOAIProfile"),
    SearchField(name="imagevector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=1024, vector_search_profile_name="myHnswVisionProfile"),
]

# Vector search configuration: a single HNSW algorithm shared by two profiles,
# each profile bound to the vectorizer matching its field (text vs. image).
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(name="myHnsw"),
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswAOAIProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer="DemoAOAIVectorizer"
        ),
        VectorSearchProfile(
            name="myHnswVisionProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer="DemoAIServicesVectorizer"
        )
    ],
    vectorizers=[
        # Image-side vectorizer, backed by the Azure AI Services resource.
        AIServicesVisionVectorizer(
            name="DemoAIServicesVectorizer",
            kind="aiServicesVision",
            ai_services_vision_parameters=AIServicesVisionParameters(
                model_version="2023-04-15",
                resource_uri=azure_ai_services_endpoint,
                api_key=azure_ai_services_key,
            )
        ),

        # Text-side vectorizer, backed by the Azure OpenAI embedding deployment.
        AzureOpenAIVectorizer(
            name="DemoAOAIVectorizer",
            kind="azureOpenAI",
            azure_open_ai_parameters=AzureOpenAIParameters(
                resource_uri=azure_openai_endpoint,
                deployment_id=azure_openai_embedding_deployment,
                model_name=azure_openai_model_name,
                # Must be the key of the Azure OpenAI resource addressed by
                # resource_uri (previously the AI Services key was passed here,
                # which does not match the endpoint; the embedding skill already
                # uses azure_openai_key for the same endpoint).
                api_key=azure_openai_key,
            ),
        )
    ],
)

# Semantic configuration: "title" is the title field, "chunk" the content field.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        content_fields=[SemanticField(field_name="chunk")]
    )
)
# Wrap the configuration in the index-level semantic settings.
semantic_search = SemanticSearch(configurations=[semantic_config])

# Assemble the index definition from schema, vector search and semantic settings.
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, semantic_search=semantic_search)

# Upsert, so re-running the cell updates an existing index of the same name.
search_client.create_or_update_index(index)

print(f"Index '{index.name}' created or updated")

Create a skillset

In [None]:
from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    OcrSkill,
    MergeSkill,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset,
    CognitiveServicesAccountKey,
    VisionVectorizeSkill 
)

# ---------------------------------------------------------------------------
# Skills
# ---------------------------------------------------------------------------

# Text chunking: reads /document/content and emits the chunks as "pages" under
# /document (2000-character pages with a 500-character overlap).
split_skill = SplitSkill(
    description="Split skill to chunk documents",
    context="/document",
    text_split_mode="pages",
    maximum_page_length=2000,
    page_overlap_length=500,
    inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)

# Image embeddings: runs once per entry of /document/normalized_images and
# stores the resulting vector as "imagevector" on that entry.
vision_skill = VisionVectorizeSkill(
    description="Vision skill to generate vector representation of images",
    context="/document/normalized_images/*",
    model_version="2023-04-15",
    inputs=[InputFieldMappingEntry(name="image", source="/document/normalized_images/*")],
    outputs=[OutputFieldMappingEntry(name="vector", target_name="imagevector")],
)

# Text embeddings: runs once per chunk produced by the split skill and stores
# the resulting vector as "textvector" on that chunk.
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    context="/document/pages/*",
    resource_uri=azure_openai_endpoint,
    api_key=azure_openai_key,
    deployment_id=azure_openai_embedding_deployment,
    model_name=azure_openai_model_name,
    dimensions=azure_openai_model_dimensions,
    inputs=[InputFieldMappingEntry(name="text", source="/document/pages/*")],
    outputs=[OutputFieldMappingEntry(name="embedding", target_name="textvector")],
)

# ---------------------------------------------------------------------------
# Index projections: two selectors write into the same target index, one for
# text chunks and one for images, each with its own parent-key field.
# ---------------------------------------------------------------------------
index_projections = SearchIndexerIndexProjections(
    selectors=[
        # Text chunks -> chunk, textvector and title fields.
        SearchIndexerIndexProjectionSelector(
            target_index_name=index_name,
            parent_key_field_name="text_parent_id",
            source_context="/document/pages/*",
            mappings=[
                InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
                InputFieldMappingEntry(name="textvector", source="/document/pages/*/textvector"),
                InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
            ],
        ),
        # Images -> imagevector field only.
        SearchIndexerIndexProjectionSelector(
            target_index_name=index_name,
            parent_key_field_name="image_parent_id",
            source_context="/document/normalized_images/*",
            mappings=[
                InputFieldMappingEntry(name="imagevector", source="/document/normalized_images/*/imagevector"),
            ],
        ),
    ],
    # Only the projected child documents are indexed; parent documents are skipped.
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

# ---------------------------------------------------------------------------
# Skillset
# ---------------------------------------------------------------------------
skills = [split_skill, embedding_skill, vision_skill]

skillset = SearchIndexerSkillset(
    name=f"{index_name}-skillset",
    description="Skillset to chunk documents and generating embeddings",
    skills=skills,
    index_projections=index_projections,
    # AI Services account key attached to the skillset.
    cognitive_services_account=CognitiveServicesAccountKey(
        key=azure_ai_services_key,
        description="AI Vision Multi Service Account",
    ),
)

# Upsert the skillset on the search service.
client = SearchIndexerClient(endpoint=aisearch_endpoint, credential=aisearch_key)
client.create_or_update_skillset(skillset)
print(f"{skillset.name} created")

In [None]:
from azure.search.documents.indexes.models import (
    SearchIndexer,
    FieldMapping,
    IndexingParameters,
    IndexingParametersConfiguration,
    BlobIndexerImageAction
)

# The indexer name is derived from the shared index name, like the other resources.
indexer_name = f"{index_name}-indexer"

# Indexer behaviour for the blob source:
#   * data_to_extract - pull both file content and metadata
#   * image_action    - presumably what populates /document/normalized_images/*
#                       for the vision skill; TODO confirm
#   * query_timeout   - explicitly cleared
#     NOTE(review): looks like this overrides an SDK default that does not apply
#     to blob data sources - confirm before removing.
indexer_parameters = IndexingParameters(
    configuration=IndexingParametersConfiguration(
        data_to_extract="contentAndMetadata",
        image_action="generateNormalizedImages",
        query_timeout=None,
    )
)

# Wire data source -> skillset -> index together.
indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    data_source_name=data_source.name,
    skillset_name=f"{index_name}-skillset",
    target_index_name=index_name,
    parameters=indexer_parameters,
    # Copy the blob's file name into "title" so search results can show the document title.
    field_mappings=[
        FieldMapping(source_field_name="metadata_storage_name", target_field_name="title"),
    ],
)

# Upsert the indexer definition, then trigger a run explicitly.
indexer_result = indexer_client.create_or_update_indexer(indexer)
indexer_client.run_indexer(indexer_name)
print(f' {indexer_name} is created and running.')